2727#include "zend_shared_alloc.h"
2828#include "zend_observer.h"
2929
30+ #ifdef __SSE2__
31+ /* For SSE2 adler32 */
32+ #include <immintrin.h>
33+ #endif
34+
3035typedef int (* id_function_t )(void * , void * );
3136typedef void (* unique_copy_ctor_func_t )(void * pElement );
3237
@@ -451,11 +456,62 @@ zend_op_array* zend_accel_load_script(zend_persistent_script *persistent_script,
451456#define ADLER32_NMAX 5552
452457/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
453458
454- #define ADLER32_DO1 (buf ) {s1 += *(buf); s2 += s1;}
455- #define ADLER32_DO2 (buf , i ) ADLER32_DO1(buf + i); ADLER32_DO1(buf + i + 1);
456- #define ADLER32_DO4 (buf , i ) ADLER32_DO2(buf, i); ADLER32_DO2(buf, i + 2);
457- #define ADLER32_DO8 (buf , i ) ADLER32_DO4(buf, i); ADLER32_DO4(buf, i + 4);
458- #define ADLER32_DO16 (buf ) ADLER32_DO8(buf, 0); ADLER32_DO8(buf, 8);
459+ #define ADLER32_SCALAR_DO1 (buf ) {s1 += *(buf); s2 += s1;}
460+ #define ADLER32_SCALAR_DO2 (buf , i ) ADLER32_SCALAR_DO1(buf + i); ADLER32_SCALAR_DO1(buf + i + 1);
461+ #define ADLER32_SCALAR_DO4 (buf , i ) ADLER32_SCALAR_DO2(buf, i); ADLER32_SCALAR_DO2(buf, i + 2);
462+ #define ADLER32_SCALAR_DO8 (buf , i ) ADLER32_SCALAR_DO4(buf, i); ADLER32_SCALAR_DO4(buf, i + 4);
463+ #define ADLER32_SCALAR_DO16 (buf ) ADLER32_SCALAR_DO8(buf, 0); ADLER32_SCALAR_DO8(buf, 8);
464+
465+ static zend_always_inline void adler32_do16_loop (unsigned char * buf , unsigned char * end , unsigned int * s1_out , unsigned int * s2_out )
466+ {
467+ unsigned int s1 = * s1_out ;
468+ unsigned int s2 = * s2_out ;
469+
470+ #ifdef __SSE2__
471+ const __m128i zero = _mm_setzero_si128 ();
472+
473+ __m128i accumulate_s2 = zero ;
474+ unsigned int accumulate_s1 = 0 ;
475+
476+ do {
477+ __m128i read = _mm_loadu_si128 ((__m128i * ) buf ); /* [A:P] */
478+
479+ /* Split the 8-bit-element vector into two 16-bit-element vectors where each element gets zero-extended from 8-bits to 16-bits */
480+ __m128i lower = _mm_unpacklo_epi8 (read , zero ); /* [A:H] zero-extended to 16-bits */
481+ __m128i higher = _mm_unpackhi_epi8 (read , zero ); /* [I:P] zero-extended to 16-bits */
482+ lower = _mm_madd_epi16 (lower , _mm_set_epi16 (9 , 10 , 11 , 12 , 13 , 14 , 15 , 16 )); /* [A * 16:H * 9] */
483+ higher = _mm_madd_epi16 (higher , _mm_set_epi16 (1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 )); /* [I * 8:P * 1] */
484+
485+ /* We'll cheat here: it's difficult to add 16-bit elementwise, but we can do 32-bit additions.
486+ * The highest value the sum of two elements of the vectors can take is 0xff * 16 + 0xff * 8 < 0xffff.
487+ * That means there is no carry possible from 16->17 bits so the 32-bit addition is safe. */
488+ __m128i sum = _mm_add_epi32 (lower , higher ); /* [A * 16 + I * 8:H * 9 + P * 1] */
489+ accumulate_s2 = _mm_add_epi32 (accumulate_s2 , sum );
490+ accumulate_s1 += s1 ;
491+
492+ /* Computes 8-bit element-wise abs(buf - zero) and then sums the elements into two 16 bit parts */
493+ sum = _mm_sad_epu8 (read , zero );
494+ s1 += _mm_cvtsi128_si32 (sum ) + _mm_extract_epi16 (sum , 4 );
495+
496+ buf += 16 ;
497+ } while (buf != end );
498+
499+ /* For convenience, let's do a rename of variables and let accumulate_s2 = [X, Y, Z, W] */
500+ __m128i shuffled = _mm_shuffle_epi32 (accumulate_s2 , _MM_SHUFFLE (1 , 0 , 0 , 2 )); /* [Y, X, X, Z] */
501+ accumulate_s2 = _mm_add_epi32 (accumulate_s2 , shuffled ); /* [X + Y, Y + X, Z + X, W + Z] */
502+ shuffled = _mm_shuffle_epi32 (accumulate_s2 , _MM_SHUFFLE (3 , 3 , 3 , 3 )); /* [X + Y, X + Y, X + Y, X + Y] */
503+ accumulate_s2 = _mm_add_epi32 (accumulate_s2 , shuffled ); /* [/, /, /, W + Z + X + Y] */
504+ s2 += accumulate_s1 * 16 + _mm_cvtsi128_si32 (accumulate_s2 );
505+ #else
506+ do {
507+ ADLER32_SCALAR_DO16 (buf );
508+ buf += 16 ;
509+ } while (buf != end );
510+ #endif
511+
512+ * s1_out = s1 ;
513+ * s2_out = s2 ;
514+ }
459515
460516unsigned int zend_adler32 (unsigned int checksum , unsigned char * buf , uint32_t len )
461517{
@@ -466,10 +522,8 @@ unsigned int zend_adler32(unsigned int checksum, unsigned char *buf, uint32_t le
466522 while (len >= ADLER32_NMAX ) {
467523 len -= ADLER32_NMAX ;
468524 end = buf + ADLER32_NMAX ;
469- do {
470- ADLER32_DO16 (buf );
471- buf += 16 ;
472- } while (buf != end );
525+ adler32_do16_loop (buf , end , & s1 , & s2 );
526+ buf = end ;
473527 s1 %= ADLER32_BASE ;
474528 s2 %= ADLER32_BASE ;
475529 }
@@ -478,15 +532,13 @@ unsigned int zend_adler32(unsigned int checksum, unsigned char *buf, uint32_t le
478532 if (len >= 16 ) {
479533 end = buf + (len & 0xfff0 );
480534 len &= 0xf ;
481- do {
482- ADLER32_DO16 (buf );
483- buf += 16 ;
484- } while (buf != end );
535+ adler32_do16_loop (buf , end , & s1 , & s2 );
536+ buf = end ;
485537 }
486538 if (len ) {
487539 end = buf + len ;
488540 do {
489- ADLER32_DO1 (buf );
541+ ADLER32_SCALAR_DO1 (buf );
490542 buf ++ ;
491543 } while (buf != end );
492544 }
0 commit comments