More tests and micro-optimize switch to ASCII fast path

This commit is contained in:
Kovid Goyal
2024-01-11 12:13:59 +05:30
parent a63d62fb4e
commit b0dcdf74bd
2 changed files with 23 additions and 22 deletions

View File

@@ -313,10 +313,6 @@ scalar_decode_to_accept(UTF8Decoder *d, const uint8_t *src, size_t src_sz) {
}
}
// Return the index of the lowest set bit in mask.
// __builtin_ctz() is undefined for a zero argument, so a zero mask is handled
// separately and yields 0. Note this is the same value returned for a mask
// whose bit 0 is set, so callers cannot tell the two cases apart.
static inline unsigned short
count_trailing_zeros(int32_t mask) {
    if (mask == 0) return 0;
    return (unsigned short)__builtin_ctz((unsigned int)mask);
}
#endif
static inline bool
@@ -343,19 +339,17 @@ FUNC(utf8_decode_to_esc)(UTF8Decoder *d, const uint8_t *src, size_t src_sz) {
} else d->num_consumed = src_sz;
if (src_sz < sizeof(integer_t)) vec = zero_last_n_bytes(vec, sizeof(integer_t) - src_sz);
// Classify the bytes
// Check if we have pure ASCII and use fast path
print_register_as_bytes(vec);
if (!movemask_epi8(vec)) { // no bytes with high bit (0x80) set, so just plain ASCII
FUNC(output_plain_ascii)(d, vec, src_sz);
return sentinel_found;
}
// Classify the bytes
integer_t state = set1_epi8(0x80);
const integer_t vec_signed = add_epi8(vec, state); // needed because cmplt_epi8 works only on signed chars
const integer_t bytes_indicating_start_of_two_byte_sequence = cmplt_epi8(set1_epi8(0xc0 - 1 - 0x80), vec_signed);
if (
(unsigned)count_trailing_zeros(movemask_epi8(bytes_indicating_start_of_two_byte_sequence)) >= src_sz &&
(unsigned)count_trailing_zeros(movemask_epi8(vec)) >= src_sz)
{ // no bytes with high bit (0x80) set, so just plain ASCII
FUNC(output_plain_ascii)(d, vec, src_sz);
return sentinel_found;
}
state = blendv_epi8(state, set1_epi8(0xc2), bytes_indicating_start_of_two_byte_sequence);
// state now has 0xc2 on all bytes that start a 2 or more byte sequence and 0x80 on the rest
const integer_t bytes_indicating_start_of_three_byte_sequence = cmplt_epi8(set1_epi8(0xe0 - 1 - 0x80), vec_signed);

View File

@@ -181,19 +181,26 @@ class TestParser(BaseTest):
pb(b'"\xf0\x9f\x98"', '"\ufffd"')
def test_utf8_simd_decode(self):
def t(x, which=2, reset=True):
if reset:
test_utf8_decode_to_sentinel(b'', -1)
def reset_state():
test_utf8_decode_to_sentinel(b'', -1)
def t(x, which=2):
expected = test_utf8_decode_to_sentinel(x, 1)
actual = test_utf8_decode_to_sentinel(x, which)
self.ae(expected, actual, msg=f'Failed for {x!r} with {which=}\n{expected!r} !=\n{actual!r}')
for which in (2, 3):
x = partial(t, which=which)
x('2:α3')
x('2:α3:≤4:😸|')
x('abcd1234efgh5678')
x('abc\x1bd1234efgh5678')
x('abcd1234efgh5678ijklABCDmnopEFGH')
def double_test(x):
for which in (2, 3):
t(x)
t(x*2, which=3)
reset_state()
x = double_test
x('2:α3')
x('2:α3:≤4:😸|')
x('abcd1234efgh5678')
x('abc\x1bd1234efgh5678')
x('abcd1234efgh5678ijklABCDmnopEFGH')
def test_esc_codes(self):
s = self.create_screen()