Implement an SSE2 accelerated version of zend_adler32 (#10507)

ndossche · web-flow · commit 722fbd01a377 · 2023-02-05T15:58:39.000Z
When benchmarking the file cache of opcache on index.php from a dummy
WordPress install, I noticed that 36.42% of the time was spent in
zend_adler32 to verify the checksums of the files. Callgrind reported
that 332,731,216 instructions were executed during that run and average
time to execute the index file was around 91ms.

This patch implements an SSE2 accelerated version of zend_adler32, which
reduces the number of instructions executed on that bench to
248,600,983, which is a reduction of ~25%. There is also a decrease in
wallclock time measurable: around 10ms. Now only 16.05% of the time is
spent computing checksums.

The benchmark tests were performed using Callgrind, and time for the
wallclock time. These tests were executed multiple times and their
results were averaged. The WordPress install only contains two
almost-blank posts.
diff --git a/ext/opcache/zend_accelerator_util_funcs.c b/ext/opcache/zend_accelerator_util_funcs.c
@@ -27,6 +27,11 @@
 #include "zend_shared_alloc.h"
 #include "zend_observer.h"
 
+#ifdef __SSE2__
+/* For SSE2 adler32 */
+#include <immintrin.h>
+#endif
+
 typedef int (*id_function_t)(void *, void *);
 typedef void (*unique_copy_ctor_func_t)(void *pElement);
 
@@ -451,11 +456,62 @@ zend_op_array* zend_accel_load_script(zend_persistent_script *persistent_script,
 #define ADLER32_NMAX 5552
 /* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
 
-#define ADLER32_DO1(buf)        {s1 += *(buf); s2 += s1;}
-#define ADLER32_DO2(buf, i)     ADLER32_DO1(buf + i); ADLER32_DO1(buf + i + 1);
-#define ADLER32_DO4(buf, i)     ADLER32_DO2(buf, i); ADLER32_DO2(buf, i + 2);
-#define ADLER32_DO8(buf, i)     ADLER32_DO4(buf, i); ADLER32_DO4(buf, i + 4);
-#define ADLER32_DO16(buf)       ADLER32_DO8(buf, 0); ADLER32_DO8(buf, 8);
+#define ADLER32_SCALAR_DO1(buf)        {s1 += *(buf); s2 += s1;}
+#define ADLER32_SCALAR_DO2(buf, i)     ADLER32_SCALAR_DO1(buf + i); ADLER32_SCALAR_DO1(buf + i + 1);
+#define ADLER32_SCALAR_DO4(buf, i)     ADLER32_SCALAR_DO2(buf, i); ADLER32_SCALAR_DO2(buf, i + 2);
+#define ADLER32_SCALAR_DO8(buf, i)     ADLER32_SCALAR_DO4(buf, i); ADLER32_SCALAR_DO4(buf, i + 4);
+#define ADLER32_SCALAR_DO16(buf)       ADLER32_SCALAR_DO8(buf, 0); ADLER32_SCALAR_DO8(buf, 8);
+
+static zend_always_inline void adler32_do16_loop(unsigned char *buf, unsigned char *end, unsigned int *s1_out, unsigned int *s2_out)
+{
+	unsigned int s1 = *s1_out;
+	unsigned int s2 = *s2_out;
+
+#ifdef __SSE2__
+	const __m128i zero = _mm_setzero_si128();
+
+	__m128i accumulate_s2 = zero;
+	unsigned int accumulate_s1 = 0;
+
+	do {
+		__m128i read = _mm_loadu_si128((__m128i *) buf); /* [A:P] */
+
+		/* Split the 8-bit-element vector into two 16-bit-element vectors where each element gets zero-extended from 8-bits to 16-bits */
+		__m128i lower = _mm_unpacklo_epi8(read, zero);									/* [A:H] zero-extended to 16-bits */
+		__m128i higher = _mm_unpackhi_epi8(read, zero);									/* [I:P] zero-extended to 16-bits */
+		lower = _mm_madd_epi16(lower, _mm_set_epi16(9, 10, 11, 12, 13, 14, 15, 16));	/* [A * 16:H * 9] */
+		higher = _mm_madd_epi16(higher, _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8)); 		/* [I * 8:P * 1] */
+
+		/* We'll cheat here: it's difficult to add 16-bit elementwise, but we can do 32-bit additions.
+			* The highest value the sum of two elements of the vectors can take is 0xff * 16 + 0xff * 8 < 0xffff.
+			* That means there is no carry possible from 16->17 bits so the 32-bit addition is safe. */
+		__m128i sum = _mm_add_epi32(lower, higher); /* [A * 16 + I * 8:H * 9 + P * 1] */
+		accumulate_s2 = _mm_add_epi32(accumulate_s2, sum);
+		accumulate_s1 += s1;
+
+		/* Computes 8-bit element-wise abs(buf - zero) and then sums the elements into two 16 bit parts */
+		sum = _mm_sad_epu8(read, zero);
+		s1 += _mm_cvtsi128_si32(sum) + _mm_extract_epi16(sum, 4);
+
+		buf += 16;
+	} while (buf != end);
+
+	/* For convenience, let's do a rename of variables and let accumulate_s2 = [X, Y, Z, W] */
+	__m128i shuffled = _mm_shuffle_epi32(accumulate_s2, _MM_SHUFFLE(1, 0, 0, 2));	/* [Y, X, X, Z] */
+	accumulate_s2 = _mm_add_epi32(accumulate_s2, shuffled);							/* [X + Y, Y + X, Z + X, W + Z] */
+	shuffled = _mm_shuffle_epi32(accumulate_s2, _MM_SHUFFLE(3, 3, 3, 3));			/* [X + Y, X + Y, X + Y, X + Y] */
+	accumulate_s2 = _mm_add_epi32(accumulate_s2, shuffled);							/* [/, /, /, W + Z + X + Y] */
+	s2 += accumulate_s1 * 16 + _mm_cvtsi128_si32(accumulate_s2);
+#else
+	do {
+		ADLER32_SCALAR_DO16(buf);
+		buf += 16;
+	} while (buf != end);
+#endif
+
+	*s1_out = s1;
+	*s2_out = s2;
+}
 
 unsigned int zend_adler32(unsigned int checksum, unsigned char *buf, uint32_t len)
 {
@@ -466,10 +522,8 @@ unsigned int zend_adler32(unsigned int checksum, unsigned char *buf, uint32_t le
 	while (len >= ADLER32_NMAX) {
 		len -= ADLER32_NMAX;
 		end = buf + ADLER32_NMAX;
-		do {
-			ADLER32_DO16(buf);
-			buf += 16;
-		} while (buf != end);
+		adler32_do16_loop(buf, end, &s1, &s2);
+		buf = end;
 		s1 %= ADLER32_BASE;
 		s2 %= ADLER32_BASE;
 	}
@@ -478,15 +532,13 @@ unsigned int zend_adler32(unsigned int checksum, unsigned char *buf, uint32_t le
 		if (len >= 16) {
 			end = buf + (len & 0xfff0);
 			len &= 0xf;
-			do {
-				ADLER32_DO16(buf);
-				buf += 16;
-			} while (buf != end);
+			adler32_do16_loop(buf, end, &s1, &s2);
+			buf = end;
 		}
 		if (len) {
 			end = buf + len;
 			do {
-				ADLER32_DO1(buf);
+				ADLER32_SCALAR_DO1(buf);
 				buf++;
 			} while (buf != end);
 		}