@@ -4587,13 +4587,215 @@ MBSTRING_API bool php_mb_check_encoding(const char *input, size_t length, const
45874587 return true;
45884588}
45894589
/* Check whether the content of `str` is valid UTF-8.
 *
 * When __SSE2__ is defined, the string is examined in 16-byte blocks with SSE2 intrinsics; the
 * final partial block (which always includes the terminating null byte) is loaded by reading a few
 * bytes from in front of the string content and shifting them out of the register.
 * When __SSE2__ is not defined, the check is delegated to php_mb_check_encoding() with
 * mbfl_encoding_utf8.
 *
 * str: the string to check; both its length and its terminating null byte are read.
 * Returns false as soon as a block containing an invalid byte or sequence is found, true otherwise. */
4590+ static bool mb_fast_check_utf8 (zend_string * str )
4591+ {
4592+ #ifdef __SSE2__
4593+ unsigned char * p = (unsigned char * )ZSTR_VAL (str );
4594+ /* `e` points 1 byte past the last full 16-byte block of string content
4595+ * Note that we include the terminating null byte which is included in each zend_string
4596+ * as part of the content to check; this ensures that multi-byte characters which are
4597+ * truncated abruptly at the end of the string will be detected as invalid */
4598+ unsigned char * e = p + ((ZSTR_LEN (str ) + 1 ) & ~(sizeof (__m128i ) - 1 ));
4599+
/* All constants below are signed bytes: -117 is 0x8B, -97 is 0x9F, -113 is 0x8F, -64 is 0xC0,
 * -126 is 0x82, -32 is 0xE0 and -16 is 0xF0 */
4600+ /* For checking for illegal bytes 0xF5-FF */
4601+ const __m128i over_f5 = _mm_set1_epi8 (-117 );
4602+ /* For checking for overlong 3-byte code units and reserved codepoints U+D800-DFFF */
4603+ const __m128i over_9f = _mm_set1_epi8 (-97 );
4604+ /* For checking for overlong 4-byte code units and invalid codepoints > U+10FFFF */
4605+ const __m128i over_8f = _mm_set1_epi8 (-113 );
4606+ /* For checking for illegal bytes 0xC0-C1 */
4607+ const __m128i find_c0 = _mm_set1_epi8 (-64 );
4608+ const __m128i c0_to_c1 = _mm_set1_epi8 (-126 );
4609+ /* For checking structure of continuation bytes */
4610+ const __m128i find_e0 = _mm_set1_epi8 (-32 );
4611+ const __m128i find_f0 = _mm_set1_epi8 (-16 );
4612+
/* `last_block` holds the most recent block which went through the full set of checks below, so
 * that sequences straddling a block boundary can be validated; it starts out as all zeroes */
4613+ __m128i last_block = _mm_setzero_si128 ();
4614+ __m128i operand ;
4615+
4616+ while (p < e ) {
4617+ operand = _mm_loadu_si128 ((__m128i * )p ); /* Load 16 bytes */
4618+
/* The tail-handling code after the loop jumps back to this label with `operand` holding the
 * final partial block (zero-filled after the terminating null byte) */
4619+ check_operand :
4620+ /* If all 16 bytes are single-byte characters, then a number of checks can be skipped */
/* NOTE(review): _mm_movemask_epi8 already collects the high bit of each byte, so the signed
 * compare against zero looks redundant here and at the second test below - confirm before
 * simplifying */
4621+ if (!_mm_movemask_epi8 (_mm_cmplt_epi8 (operand , _mm_setzero_si128 ()))) {
4622+ /* Even if this block only contains single-byte characters, there may have been a
4623+ * multi-byte character at the end of the previous block, which was supposed to
4624+ * have continuation bytes in this block
4625+ * This bitmask will pick out a 2/3/4-byte character starting from the last byte of
4626+ * the previous block, a 3/4-byte starting from the 2nd last, or a 4-byte starting
4627+ * from the 3rd last */
/* _mm_set_epi8 takes its arguments from the highest byte down, so -64/-32/-16 line up with the
 * last, 2nd last and 3rd last bytes of `last_block`
 * The remaining lanes use 0xFF, which can only compare equal if `last_block` contains a 0xFF
 * byte; that never happens, since `last_block` is either all zeroes or a block which already
 * passed the check for bytes >= 0xF5 */
4628+ __m128i bad_mask = _mm_set_epi8 (-64 , -32 , -16 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
4629+ __m128i bad = _mm_cmpeq_epi8 (_mm_and_si128 (last_block , bad_mask ), bad_mask );
4630+ if (_mm_movemask_epi8 (bad )) {
4631+ return false;
4632+ }
4633+
4634+ /* Consume as many full blocks of single-byte characters as we can */
/* `last_block` is deliberately left untouched while skipping these blocks
 * The stale value has just passed the `bad_mask` test, so none of its last 3 bytes can start a
 * sequence which reaches into the next block; for the boundary checks below it therefore
 * behaves exactly like a block of single-byte characters */
4635+ while (true) {
4636+ p += sizeof (__m128i );
4637+ if (p >= e ) {
4638+ goto finish_up_remaining_bytes ;
4639+ }
4640+ operand = _mm_loadu_si128 ((__m128i * )p );
4641+ if (_mm_movemask_epi8 (_mm_cmplt_epi8 (operand , _mm_setzero_si128 ()))) {
4642+ break ;
4643+ }
4644+ }
4645+ }
4646+
4647+ /* Check for >= 0xF5, which are illegal byte values in UTF-8
4648+ * AVX512 has instructions for vectorized unsigned compare, but SSE2 only has signed compare
4649+ * So we add an offset to shift 0xF5-FF to the far low end of the signed byte range
4650+ * Then a single signed compare will pick out any bad bytes
4651+ * `bad` is a vector of 16 good/bad values, where 0x00 means good and 0xFF means bad */
4652+ __m128i bad = _mm_cmplt_epi8 (_mm_add_epi8 (operand , over_f5 ), over_f5 );
4653+
4654+ /* Check for overlong 3-byte code units AND reserved codepoints U+D800-DFFF
4655+ * 0xE0 followed by a byte < 0xA0 indicates an overlong 3-byte code unit, and
4656+ * 0xED followed by a byte >= 0xA0 indicates a reserved codepoint
4657+ * We can check for both problems at once by generating a vector where each byte in 0x80-0x9F
4658+ * is mapped to 0xE0, and every other byte (>= 0xA0, or a single-byte character) is mapped to 0xED
4659+ * Move the original block up by one byte position (pulling in the last byte of the previous
4660+ * block at the front), and XOR the shifted block with the bitmask
4660+ * Any matches will give a 0x00 byte; do a compare with a zero vector to pick out the
4661+ * bad positions, and OR them into `bad`
 * (A single-byte character after 0xE0 is not matched here, but it is rejected by the check on
 * the structure of continuation bytes further down) */
4662+ __m128i operand2 = _mm_or_si128 (_mm_slli_si128 (operand , 1 ), _mm_srli_si128 (last_block , 15 ));
4663+ __m128i mask1 = _mm_or_si128 (find_e0 , _mm_and_si128 (_mm_set1_epi8 (0xD ), _mm_cmpgt_epi8 (operand , over_9f )));
4664+ bad = _mm_or_si128 (bad , _mm_cmpeq_epi8 (_mm_setzero_si128 (), _mm_xor_si128 (operand2 , mask1 )));
4665+
4666+ /* Check for overlong 4-byte code units AND invalid codepoints > U+10FFFF
4667+ * Similar to the previous check; 0xF0 followed by < 0x90 indicates an overlong 4-byte
4668+ * code unit, and 0xF4 followed by >= 0x90 indicates a codepoint over U+10FFFF
4669+ * Build the bitmask (0xF0 for bytes in 0x80-0x8F, 0xF4 for every other byte), XOR it with
 * the shifted block, check for 0x00 bytes in the result */
4670+ __m128i mask2 = _mm_or_si128 (find_f0 , _mm_and_si128 (_mm_set1_epi8 (0x4 ), _mm_cmpgt_epi8 (operand , over_8f )));
4671+ bad = _mm_or_si128 (bad , _mm_cmpeq_epi8 (_mm_setzero_si128 (), _mm_xor_si128 (operand2 , mask2 )));
4672+
4673+ /* Check for overlong 2-byte code units
4674+ * Any 0xC0 or 0xC1 byte can only be the first byte of an overlong 2-byte code unit
4675+ * Same deal as before; add an offset to shift 0xC0-C1 to the far low end of the signed
4676+ * byte range, do a signed compare to pick out any bad bytes */
4677+ bad = _mm_or_si128 (bad , _mm_cmplt_epi8 (_mm_add_epi8 (operand , find_c0 ), c0_to_c1 ));
4678+
4679+ /* Check structure of continuation bytes
4680+ * A UTF-8 byte should be a continuation byte if, and only if, it is:
4681+ * 1) 1 byte after the start of a 2-byte, 3-byte, or 4-byte character
4682+ * 2) 2 bytes after the start of a 3-byte or 4-byte character
4683+ * 3) 3 bytes after the start of a 4-byte character
4684+ * We build 3 bitmasks with 0xFF in each such position, and OR them together to
4685+ * get a single bitmask with 0xFF in each position where a continuation byte should be */
4686+ __m128i cont_mask = _mm_cmpeq_epi8 (_mm_and_si128 (operand2 , find_c0 ), find_c0 );
4687+ __m128i operand3 = _mm_or_si128 (_mm_slli_si128 (operand , 2 ), _mm_srli_si128 (last_block , 14 ));
4688+ cont_mask = _mm_or_si128 (cont_mask , _mm_cmpeq_epi8 (_mm_and_si128 (operand3 , find_e0 ), find_e0 ));
4689+ __m128i operand4 = _mm_or_si128 (_mm_slli_si128 (operand , 3 ), _mm_srli_si128 (last_block , 13 ));
4690+ cont_mask = _mm_or_si128 (cont_mask , _mm_cmpeq_epi8 (_mm_and_si128 (operand4 , find_f0 ), find_f0 ));
4691+
4692+ /* Now, use a signed comparison to get another bitmask with 0xFF in each position where
4693+ * a continuation byte actually is
4694+ * XOR those two bitmasks together; if everything is good, the result should be zero
4695+ * However, if a byte which should have been a continuation wasn't, or if a byte which
4696+ * shouldn't have been a continuation was, we will get 0xFF in that position */
4697+ __m128i continuation = _mm_cmplt_epi8 (operand , find_c0 );
4698+ bad = _mm_or_si128 (bad , _mm_xor_si128 (continuation , cont_mask ));
4699+
4700+ /* Pick out the high bit of each byte in `bad` as a 16-bit value (into a scalar register)
4701+ * If that value is non-zero, then we found a bad byte somewhere! */
4702+ if (_mm_movemask_epi8 (bad )) {
4703+ return false;
4704+ }
4705+
/* Keep this block around so the next iteration can validate sequences which cross the boundary */
4706+ last_block = operand ;
4707+ p += sizeof (__m128i );
4708+ }
4709+
4710+ finish_up_remaining_bytes : ;
4711+ /* Finish up 1-15 remaining bytes */
/* `p == e` only holds the first time we get here; once the partial block has been sent through
 * `check_operand`, `p` has moved 16 bytes past `e`, so this branch is skipped and we return true */
4712+ if (p == e ) {
4713+ uint8_t remaining_bytes = ZSTR_LEN (str ) & (sizeof (__m128i ) - 1 ); /* Not including terminating null */
4714+
4715+ /* Crazy hack here... we want to use the above vectorized code to check a block of less than 16
4716+ * bytes, but there is no good way to read a variable number of bytes into an XMM register
4717+ * However, we know that these bytes are part of a zend_string, and a zend_string has some
4718+ * 'header' fields which occupy the memory just before its content
4719+ * And, those header fields occupy more than 16 bytes...
4720+ * So if we go back up to 15 bytes from `p`, and load 16 bytes from there (ending exactly on the
4720+ * terminating null byte),
4721+ * we may pick up some 'junk' bytes from the zend_string header fields, but we will get the 1-15
4722+ * bytes we wanted in the tail end of our XMM register, and this will never cause a segfault.
4723+ * Then, we shift the register contents down to get rid of the unwanted bytes
4724+ * Conveniently, the same shift also zero-fills the tail end of the XMM register
4725+ *
4726+ * The following `switch` looks useless, but it's not
4727+ * The PSRLDQ instruction used for the 128-bit byte shift (_mm_srli_si128; it moves bytes towards
4727+ * the low end of the register, which reads as a left shift in memory order) requires an
4727+ * immediate (literal)
4728+ * shift distance, so the compiler will choke on _mm_srli_si128(operand, shift_dist)
 *
 * In every case below, the load offset and the shift distance are both `15 - remaining_bytes`
 * NOTE(review): for strings shorter than 15 bytes the load starts up to 15 bytes before the
 * string content; the layout of zend_string is not visible here, so confirm that its header
 * is at least that large on every supported target
4729+ */
4730+ switch (remaining_bytes ) {
4731+ case 0 :
4732+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 15 )), 15 );
4733+ goto check_operand ;
4734+ case 1 :
4735+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 14 )), 14 );
4736+ goto check_operand ;
4737+ case 2 :
4738+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 13 )), 13 );
4739+ goto check_operand ;
4740+ case 3 :
4741+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 12 )), 12 );
4742+ goto check_operand ;
4743+ case 4 :
4744+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 11 )), 11 );
4745+ goto check_operand ;
4746+ case 5 :
4747+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 10 )), 10 );
4748+ goto check_operand ;
4749+ case 6 :
4750+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 9 )), 9 );
4751+ goto check_operand ;
4752+ case 7 :
4753+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 8 )), 8 );
4754+ goto check_operand ;
4755+ case 8 :
4756+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 7 )), 7 );
4757+ goto check_operand ;
4758+ case 9 :
4759+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 6 )), 6 );
4760+ goto check_operand ;
4761+ case 10 :
4762+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 5 )), 5 );
4763+ goto check_operand ;
4764+ case 11 :
4765+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 4 )), 4 );
4766+ goto check_operand ;
4767+ case 12 :
4768+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 3 )), 3 );
4769+ goto check_operand ;
4770+ case 13 :
4771+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 2 )), 2 );
4772+ goto check_operand ;
4773+ case 14 :
4774+ operand = _mm_srli_si128 (_mm_loadu_si128 ((__m128i * )(p - 1 )), 1 );
4775+ goto check_operand ;
4776+ case 15 :
4777+ /* No trailing bytes are left which need to be checked
4778+ * We get 15 because we did not include the terminating null when
4779+ * calculating `remaining_bytes`; with the null byte the length is an exact multiple
4779+ * of 16, so the null byte was already covered by the last full block */
4780+ return true;
4781+ }
4782+
4783+ ZEND_UNREACHABLE ();
4784+ }
4785+
4786+ return true;
4787+ #else
/* No SSE2 available: use the generic encoding checker */
4788+ return php_mb_check_encoding (ZSTR_VAL (str ), ZSTR_LEN (str ), & mbfl_encoding_utf8 );
4789+ #endif
4790+ }
4791+
45904792static bool mb_check_str_encoding (zend_string * str , const mbfl_encoding * encoding )
45914793{
45924794 if (encoding == & mbfl_encoding_utf8 ) {
45934795 if (GC_FLAGS (str ) & IS_STR_VALID_UTF8 ) {
45944796 return true;
45954797 }
4596- bool result = php_mb_check_encoding ( ZSTR_VAL ( str ), ZSTR_LEN ( str ), encoding );
4798+ bool result = mb_fast_check_utf8 ( str );
45974799 if (result && !ZSTR_IS_INTERNED (str )) {
45984800 GC_ADD_FLAGS (str , IS_STR_VALID_UTF8 );
45994801 }
0 commit comments