@@ -4728,8 +4728,9 @@ finish_up_remaining_bytes: ;
47284728 if (p == e ) {
47294729 uint8_t remaining_bytes = ZSTR_LEN (str ) & (sizeof (__m128i ) - 1 ); /* Not including terminating null */
47304730
4731- /* Crazy hack here... we want to use the above vectorized code to check a block of less than 16
4732- * bytes, but there is no good way to read a variable number of bytes into an XMM register
4731+ /* Crazy hack here for cases where 9 or more bytes are remaining...
4732+ * We want to use the above vectorized code to check a block of less than 16 bytes,
4733+ * but there is no good way to read a variable number of bytes into an XMM register
47334734 * However, we know that these bytes are part of a zend_string, and a zend_string has some
47344735 * 'header' fields which occupy the memory just before its content
47354736 * And, those header fields occupy more than 16 bytes...
@@ -4744,20 +4745,17 @@ finish_up_remaining_bytes: ;
47444745 * shift distance, so the compiler will choke on _mm_srli_si128(operand, shift_dist)
47454746 */
47464747 switch (remaining_bytes ) {
4747- case 0 :
4748- operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 15 )), 15 );
4749- goto check_operand ;
4748+ case 0 : ;
4749+ __m128i bad_mask = _mm_set_epi8 (-64 , -32 , -16 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
4750+ __m128i bad = _mm_cmpeq_epi8 (_mm_and_si128 (last_block , bad_mask ), bad_mask );
4751+ return _mm_movemask_epi8 (bad ) == 0 ;
47504752 case 1 :
4751- operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 14 )), 14 );
4752- goto check_operand ;
47534753 case 2 :
4754- operand = _mm_srli_si128 ( _mm_loadu_si128 (( __m128i * )( p - 13 )), 13 );
4754+ operand = _mm_set_epi16 ( 0 , 0 , 0 , 0 , 0 , 0 , 0 , * (( uint16_t * ) p ) );
47554755 goto check_operand ;
47564756 case 3 :
4757- operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 12 )), 12 );
4758- goto check_operand ;
47594757 case 4 :
4760- operand = _mm_srli_si128 ( _mm_loadu_si128 (( __m128i * )( p - 11 )), 11 );
4758+ operand = _mm_set_epi32 ( 0 , 0 , 0 , * (( uint32_t * ) p ) );
47614759 goto check_operand ;
47624760 case 5 :
47634761 operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 10 )), 10 );
@@ -4766,10 +4764,8 @@ finish_up_remaining_bytes: ;
47664764 operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 9 )), 9 );
47674765 goto check_operand ;
47684766 case 7 :
4769- operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 8 )), 8 );
4770- goto check_operand ;
47714767 case 8 :
4772- operand = _mm_srli_si128 ( _mm_loadu_si128 (( __m128i * )( p - 7 )), 7 );
4768+ operand = _mm_set_epi64x ( 0 , * (( uint64_t * ) p ) );
47734769 goto check_operand ;
47744770 case 9 :
47754771 operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 6 )), 6 );
0 commit comments