Skip to content

Commit 31c6db5

Browse files
authored
encoding.utf8: add more tests for UTF-8 strings (#24544)
1 parent a5c8b4f commit 31c6db5

2 files changed

Lines changed: 91 additions & 0 deletions

File tree

‎vlib/encoding/utf8/validate/encoding_utf8_test.v‎

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,85 @@ import encoding.utf8.validate
33
// test_validate_str feeds well-formed UTF-8 samples to validate.utf8_string
// and expects every one of them to be accepted.
fn test_validate_str() {
	// The same text, once as literal characters and once as escaped bytes.
	assert validate.utf8_string('añçá')
	assert validate.utf8_string('\x61\xC3\xB1\xC3\xA7\xC3\xA1')

	// 1-byte sequences (ASCII range).
	assert validate.utf8_string('\x01')
	assert validate.utf8_string('\x7e')
	assert validate.utf8_string('\x7f')

	// 2-byte sequences: lead byte C2..DF, continuation byte 80..BF.
	assert validate.utf8_string('\xc2\x80')
	assert validate.utf8_string('\xc2\x81')
	assert validate.utf8_string('\xc2\xbf')
	assert validate.utf8_string('\xc3\x80')
	assert validate.utf8_string('\xc3\x81')
	assert validate.utf8_string('\xc3\x88')
	assert validate.utf8_string('\xc3\x90')
	assert validate.utf8_string('\xc3\xa0')
	assert validate.utf8_string('\xc3\xb0')
	assert validate.utf8_string('\xc3\xb8')
	assert validate.utf8_string('\xc3\xbf')
	assert validate.utf8_string('\xc4\x80')
	assert validate.utf8_string('\xdf\xbf')
	assert validate.utf8_string('\xd0\x80')

	// 3-byte sequences: lead byte E0..EF. ED 9F BF is the last sequence
	// accepted for lead byte ED (second byte limited to 80..9F).
	assert validate.utf8_string('\xe0\xa0\x80')
	assert validate.utf8_string('\xe0\xa0\x81')
	assert validate.utf8_string('\xe1\x80\x80')
	assert validate.utf8_string('\xed\x80\x80')
	assert validate.utf8_string('\xed\x9f\xbf')
	assert validate.utf8_string('\xee\x80\x80')
	assert validate.utf8_string('\xef\xbf\xbe')
	assert validate.utf8_string('\xef\xbf\xbf')

	// 4-byte sequences: lead byte F0..F4, ending at F4 8F BF BF (U+10FFFF).
	assert validate.utf8_string('\xf0\x90\x80\x80')
	assert validate.utf8_string('\xf0\x90\x80\x81')
	assert validate.utf8_string('\xf1\x80\x80\x80')
	assert validate.utf8_string('\xf4\x8f\xbf\xbe')
	assert validate.utf8_string('\xf4\x8f\xbf\xbf')

	// EF BF BD encodes U+FFFD, the replacement character.
	assert validate.utf8_string('\xef\xbf\xbd')
}
39+
40+
// test_validate_invalid_str feeds ill-formed byte sequences to
// validate.utf8_string and expects every one of them to be rejected.
// Most groups place a byte just outside the range allowed at one position.
fn test_validate_invalid_str() {
	// Bytes that cannot start a sequence (C0, F5) and E0 followed by a
	// non-continuation byte.
	assert !validate.utf8_string('\xC0\xC1')
	assert !validate.utf8_string('\xF5\xFF')
	assert !validate.utf8_string('\xE0\xEF')

	// xx: input starts with a continuation byte instead of a lead byte.
	assert !validate.utf8_string('\x91\x80\x80\x80')

	// s1: lead byte C2 or DF, second byte just outside 80..BF.
	assert !validate.utf8_string('\xC2\x7F\x80\x80')
	assert !validate.utf8_string('\xC2\xC0\x80\x80')
	assert !validate.utf8_string('\xDF\x7F\x80\x80')
	assert !validate.utf8_string('\xDF\xC0\x80\x80')

	// s2: lead byte E0, second byte outside A0..BF or third outside 80..BF.
	assert !validate.utf8_string('\xE0\x9F\xBF\x80')
	assert !validate.utf8_string('\xE0\xA0\x7F\x80')
	assert !validate.utf8_string('\xE0\xBF\xC0\x80')
	assert !validate.utf8_string('\xE0\xC0\x80\x80')

	// s3: lead byte E1, second or third byte outside 80..BF.
	assert !validate.utf8_string('\xE1\x7F\xBF\x80')
	assert !validate.utf8_string('\xE1\x80\x7F\x80')
	assert !validate.utf8_string('\xE1\xBF\xC0\x80')
	assert !validate.utf8_string('\xE1\xC0\x80\x80')

	// s4: lead byte ED, second byte outside 80..9F or third outside 80..BF.
	assert !validate.utf8_string('\xED\x7F\xBF\x80')
	assert !validate.utf8_string('\xED\x80\x7F\x80')
	assert !validate.utf8_string('\xED\x9F\xC0\x80')
	assert !validate.utf8_string('\xED\xA0\x80\x80')

	// s5: lead byte F0, second byte outside 90..BF or a later byte
	// outside 80..BF.
	assert !validate.utf8_string('\xF0\x8F\xBF\xBF')
	assert !validate.utf8_string('\xF0\x90\x7F\xBF')
	assert !validate.utf8_string('\xF0\x90\x80\x7F')
	assert !validate.utf8_string('\xF0\xBF\xBF\xC0')
	assert !validate.utf8_string('\xF0\xBF\xC0\x80')
	assert !validate.utf8_string('\xF0\xC0\x80\x80')

	// s6: lead byte F1, any following byte outside 80..BF.
	assert !validate.utf8_string('\xF1\x7F\xBF\xBF')
	assert !validate.utf8_string('\xF1\x80\x7F\xBF')
	assert !validate.utf8_string('\xF1\x80\x80\x7F')
	assert !validate.utf8_string('\xF1\xBF\xBF\xC0')
	assert !validate.utf8_string('\xF1\xBF\xC0\x80')
	assert !validate.utf8_string('\xF1\xC0\x80\x80')
}

‎vlib/encoding/utf8/validate/validate_utf8.v‎

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,19 @@ fn (mut s Utf8State) seq(r0 bool, r1 bool, is_tail bool) bool {
4848
return false
4949
}
5050

51+
/* Check UTF-8 Byte sequences according to Unicode Standard
52+
* https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/
53+
* Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte
54+
* U+0000..U+007F 00..7F
55+
* U+0080..U+07FF C2..DF 80..BF
56+
* U+0800..U+0FFF E0 A0..BF 80..BF
57+
* U+1000..U+CFFF E1..EC 80..BF 80..BF
58+
* U+D000..U+D7FF ED 80..9F 80..BF
59+
* U+E000..U+FFFF EE..EF 80..BF 80..BF
60+
* U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
61+
* U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
62+
* U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
63+
*/
5164
fn (mut s Utf8State) next_state(c u8) {
5265
// sequence 1
5366
if s.index == 0 {

0 commit comments

Comments
 (0)