From 5d9a28f7a168a45753aaa28dde508de1222b95a3 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Mon, 15 Sep 2025 08:53:03 +0530 Subject: [PATCH] Move utf8_parsing tests to simd_decode More robust as the tests are run against all three SIMD modes, no SIMD, 128bit and 256 bit --- kitty_tests/parser.py | 122 ++++++++++++++++++++---------------------- 1 file changed, 58 insertions(+), 64 deletions(-) diff --git a/kitty_tests/parser.py b/kitty_tests/parser.py index 5a7669473..e398e5feb 100644 --- a/kitty_tests/parser.py +++ b/kitty_tests/parser.py @@ -185,69 +185,6 @@ class TestParser(BaseTest): pb('😀'.encode()[:-1]) pb('\x1b\x1b%a', '\ufffd', ('Unknown char after ESC: 0x1b',), ('draw', '%a')) - def test_utf8_parsing(self): - s = self.create_screen() - pb = partial(self.parse_bytes_dump, s) - - # Kitty's UTF-8 decoding uses U+FFFD substitution of maximal subparts - # (The Unicode Standard, chapter 3, "U+FFFD Substitution of Maximal Subparts"), - # same as in the WHATWG Encoding Standard. - # This means that ill-formed sequences may be replaced by multiple - # U+FFFD REPLACEMENT CHARACTERs. 
- - # Lone continuation bytes with no leading starts - pb(b'"\xbf"', '"\ufffd"') - pb(b'"\x80"', '"\ufffd"') - - # Multiple lone continuation bytes - pb(b'"\x80\xbf"', '"\ufffd\ufffd"') - pb(b'"\x80\xbf\x80"', '"\ufffd\ufffd\ufffd"') - - # Lone starter byte of 2-byte sequence - pb(b'"\xc0 "', '"\ufffd "') - - # Single never-valid bytes - pb(b'"\xfe"', '"\ufffd"') - pb(b'"\xff"', '"\ufffd"') - - # Multiple never-valid bytes - pb(b'"\xff\xfe"', '"\ufffd\ufffd"') - pb(b'"\xfe\xfe\xff\xff"', '"\ufffd\ufffd\ufffd\ufffd"') - - # Truncated 2-byte sequence (only 1 byte) - pb(b'"\xc2"', '"\ufffd"') - - # Truncated 3-byte sequences (only 2 bytes) - pb(b'"\xef\xbf"', '"\ufffd"') - pb(b'"\xe0\xa0"', '"\ufffd"') - - # Truncated 4-byte sequence (only 2 or 3 bytes) - pb(b'"\xf0\x9f"', '"\ufffd"') - pb(b'"\xf0\x9f\x98"', '"\ufffd"') - - # Bad continuation byte (restored as ASCII) - pb(b'"\xe1\x28\xa1"', '"\ufffd(\ufffd"') # ) - - # The following all fail and need to be fixed in the SIMD parser - - # Overlong 2-byte sequence for U+0000 (should be `0x00`) - # pb(b'"\xc0\x80"', '"\ufffd\ufffd"') - - # Overlong 3-byte sequence for U+0000 (violates boundary) - # pb(b'"\xe0\x80\x80"', '"\ufffd\ufffd\ufffd"') - - # Overlong 4-byte sequence for U+0000 (violates boundary) - # pb(b'"\xf0\x80\x80\x80"', '"\ufffd\ufffd\ufffd\ufffd"') - - # High surrogate code point - # pb(b'"\xed\xa0\x80"', '"\ufffd\ufffd\ufffd"') - - # Low surrogate code point - # pb(b'"\xed\xb0\x80"', '"\ufffd\ufffd\ufffd"') - - # Too large first codepoint - # pb(b'"\xff\x80\x80\x80"', '"\ufffd\ufffd\ufffd\ufffd"') - def test_utf8_simd_decode(self): def unsupported(which): return (which == 2 and not has_sse4_2) or (which == 3 and not has_avx2) @@ -331,7 +268,11 @@ class TestParser(BaseTest): pb('\uffff', '\uffff') pb('\0', '\0') pb(chr(0x10ffff), chr(0x10ffff)) - # various invalid input + # Kitty's UTF-8 decoding uses U+FFFD substitution of maximal subparts + # (The Unicode Standard, chapter 3, "U+FFFD Substitution of Maximal Subparts"), + # same as in the WHATWG Encoding Standard. 
+ # This means that ill-formed sequences may be replaced by multiple + # U+FFFD REPLACEMENT CHARACTERs. pb(b'abcd\xf51234', 'abcd\ufffd1234') # bytes > 0xf4 pb(b'abcd\xff1234', 'abcd\ufffd1234') # bytes > 0xf4 pb(b'"\xbf"', '"\ufffd"') @@ -347,6 +288,59 @@ class TestParser(BaseTest): pb(b'"\xe0\xa0"', '"\ufffd"') pb(b'"\xf0\x9f\x98"', '"\ufffd"') pb(b'"\xef\x93\x94\x95"', '"\uf4d4\ufffd"') + # Lone continuation bytes with no leading starts + pb(b'"\xbf"', '"\ufffd"') + pb(b'"\x80"', '"\ufffd"') + + # Multiple lone continuation bytes + pb(b'"\x80\xbf"', '"\ufffd\ufffd"') + pb(b'"\x80\xbf\x80"', '"\ufffd\ufffd\ufffd"') + + # Lone starter byte of 2-byte sequence + pb(b'"\xc0 "', '"\ufffd "') + + # Single never-valid bytes + pb(b'"\xfe"', '"\ufffd"') + pb(b'"\xff"', '"\ufffd"') + + # Multiple never-valid bytes + pb(b'"\xff\xfe"', '"\ufffd\ufffd"') + pb(b'"\xfe\xfe\xff\xff"', '"\ufffd\ufffd\ufffd\ufffd"') + + # Truncated 2-byte sequence (only 1 byte) + pb(b'"\xc2"', '"\ufffd"') + + # Truncated 3-byte sequences (only 2 bytes) + pb(b'"\xef\xbf"', '"\ufffd"') + pb(b'"\xe0\xa0"', '"\ufffd"') + + # Truncated 4-byte sequence (only 2 or 3 bytes) + pb(b'"\xf0\x9f"', '"\ufffd"') + pb(b'"\xf0\x9f\x98"', '"\ufffd"') + + # Bad continuation byte (restored as ASCII) + pb(b'"\xe1\x28\xa1"', '"\ufffd(\ufffd"') # ) + + # The following all fail and need to be fixed in the SIMD parser + + # Overlong 2-byte sequence for U+0000 (should be `0x00`) + # pb(b'"\xc0\x80"', '"\ufffd\ufffd"') + + # Overlong 3-byte sequence for U+0000 (violates boundary) + # pb(b'"\xe0\x80\x80"', '"\ufffd\ufffd\ufffd"') + + # Overlong 4-byte sequence for U+0000 (violates boundary) + # pb(b'"\xf0\x80\x80\x80"', '"\ufffd\ufffd\ufffd\ufffd"') + + # High surrogate code point + # pb(b'"\xed\xa0\x80"', '"\ufffd\ufffd\ufffd"') + + # Low surrogate code point + # pb(b'"\xed\xb0\x80"', '"\ufffd\ufffd\ufffd"') + + # Too large first codepoint + # pb(b'"\xff\x80\x80\x80"', '"\ufffd\ufffd\ufffd\ufffd"') + def 
test_find_either_of_two_bytes(self): sizes = []