Skip to content

Commit 9dd8994

Browse files
authored
builtin: speed up, fix and test impl_utf8_to_utf32 (#26109)
1 parent 0aca807 commit 9dd8994

2 files changed

Lines changed: 85 additions & 12 deletions

File tree

‎vlib/builtin/utf8.v‎

Lines changed: 25 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -95,24 +95,37 @@ pub fn (_bytes []u8) utf8_to_utf32() !rune {
9595

9696
@[direct_array_access]
9797
fn impl_utf8_to_utf32(_bytes &u8, _bytes_len int) rune {
98-
if _bytes_len == 0 {
98+
if _bytes_len == 0 || _bytes_len > 4 {
9999
return 0
100100
}
101101
// return ASCII unchanged
102102
if _bytes_len == 1 {
103-
return unsafe { rune(_bytes[0]) }
103+
return rune(unsafe { _bytes[0] })
104104
}
105-
mut b := u8(int(unsafe { _bytes[0] }))
106-
b = b << _bytes_len
107-
mut res := rune(b)
108-
mut shift := 6 - _bytes_len
109-
for i := 1; i < _bytes_len; i++ {
110-
c := rune(unsafe { _bytes[i] })
111-
res = rune(res) << shift
112-
res |= c & 63 // 0x3f
113-
shift = 6
105+
106+
match _bytes_len {
107+
2 {
108+
b0 := rune(unsafe { _bytes[0] })
109+
b1 := rune(unsafe { _bytes[1] })
110+
return ((b0 & 0x1F) << 6) | (b1 & 0x3F)
111+
}
112+
3 {
113+
b0 := rune(unsafe { _bytes[0] })
114+
b1 := rune(unsafe { _bytes[1] })
115+
b2 := rune(unsafe { _bytes[2] })
116+
return ((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F)
117+
}
118+
4 {
119+
b0 := rune(unsafe { _bytes[0] })
120+
b1 := rune(unsafe { _bytes[1] })
121+
b2 := rune(unsafe { _bytes[2] })
122+
b3 := rune(unsafe { _bytes[3] })
123+
return ((b0 & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F)
124+
}
125+
else {
126+
return 0
127+
}
114128
}
115-
return res
116129
}
117130

118131
// Calculate string length for formatting, i.e. number of "characters"

‎vlib/builtin/utf8_test.v‎

Lines changed: 60 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -101,3 +101,63 @@ fn test_string_to_ansi_not_null_terminated() {
101101
fn test_utf8_str_visible_length() {
102102
assert utf8_str_visible_length('𝐀𝐁𝐂') == 3
103103
}
104+
105+
fn test_utf8_to_utf32_cases() {
106+
test_case1 := 'A'.bytes()
107+
assert impl_utf8_to_utf32(&u8(test_case1.data), test_case1.len) == rune(`A`)
108+
109+
test_case2 := 'é'.bytes()
110+
assert impl_utf8_to_utf32(&u8(test_case2.data), test_case2.len) == rune(`é`)
111+
112+
test_case3 := '€'.bytes()
113+
assert impl_utf8_to_utf32(&u8(test_case3.data), test_case3.len) == rune(`€`)
114+
115+
test_case4 := '𐍈'.bytes()
116+
assert impl_utf8_to_utf32(&u8(test_case4.data), test_case4.len) == rune(0x10348)
117+
assert impl_utf8_to_utf32(&u8(test_case4.data), test_case4.len) == rune(`𐍈`)
118+
119+
test_case5 := '中'.bytes()
120+
assert impl_utf8_to_utf32(&u8(test_case5.data), test_case5.len) == rune(0x4E2D)
121+
assert impl_utf8_to_utf32(&u8(test_case5.data), test_case5.len) == rune(`中`)
122+
123+
// emoji, 4-byte UTF-8
124+
test_case6 := '😀'.bytes()
125+
assert impl_utf8_to_utf32(&u8(test_case6.data), test_case6.len) == rune(0x1F600)
126+
assert impl_utf8_to_utf32(&u8(test_case6.data), test_case6.len) == `😀`
127+
128+
test_case7 := 'Ж'.bytes()
129+
assert impl_utf8_to_utf32(&u8(test_case7.data), test_case7.len) == rune(`Ж`)
130+
131+
test_case8 := 'م'.bytes()
132+
assert impl_utf8_to_utf32(&u8(test_case8.data), test_case8.len) == rune(`م`)
133+
134+
test_case9 := '߿'.bytes()
135+
assert impl_utf8_to_utf32(&u8(test_case9.data), test_case9.len) == rune(0x07FF)
136+
assert impl_utf8_to_utf32(&u8(test_case9.data), test_case9.len) == rune(`߿`)
137+
138+
test_case10 := 'ࠀ'.bytes()
139+
assert impl_utf8_to_utf32(&u8(test_case10.data), test_case10.len) == rune(0x0800)
140+
assert impl_utf8_to_utf32(&u8(test_case10.data), test_case10.len) == rune(`ࠀ`)
141+
142+
test_case11 := '￿'.bytes()
143+
assert impl_utf8_to_utf32(&u8(test_case11.data), test_case11.len) == rune(0xFFFF)
144+
assert impl_utf8_to_utf32(&u8(test_case11.data), test_case11.len) == rune(`￿`)
145+
146+
test_case12 := '𐀀'.bytes()
147+
assert impl_utf8_to_utf32(&u8(test_case12.data), test_case12.len) == rune(0x10000)
148+
assert impl_utf8_to_utf32(&u8(test_case12.data), test_case12.len) == rune(`𐀀`)
149+
150+
test_case13 := '􏿿'.bytes()
151+
assert impl_utf8_to_utf32(&u8(test_case13.data), test_case13.len) == rune(0x10FFFF)
152+
assert impl_utf8_to_utf32(&u8(test_case13.data), test_case13.len) == rune(`􏿿`)
153+
}
154+
155+
fn test_utf8_to_utf32_invalid_length() {
156+
// More than 4 bytes is invalid
157+
invalid := [u8(0xF0), 0x9F, 0x98, 0x80, 0x00]
158+
assert impl_utf8_to_utf32(&u8(invalid.data), invalid.len) == 0
159+
}
160+
161+
fn test_utf8_to_utf32_empty() {
162+
assert impl_utf8_to_utf32(&u8([]u8{}.data), 0) == 0
163+
}

0 commit comments

Comments
 (0)