diff --git a/kitty/simd-string-impl.h b/kitty/simd-string-impl.h index 46bf7244d..8c3b13e38 100644 --- a/kitty/simd-string-impl.h +++ b/kitty/simd-string-impl.h @@ -541,7 +541,7 @@ FUNC(utf8_decode_to_esc)(UTF8Decoder *d, const uint8_t *src_data, size_t src_len src_data += d->num_consumed; src_len -= d->num_consumed; } const integer_t esc_vec = set1_epi8(0x1b); - const integer_t zero = create_zero_integer(), one = set1_epi8(1), two = set1_epi8(2), three = set1_epi8(3), numbered = numbered_bytes(); + const integer_t zero = create_zero_integer(), one = set1_epi8(1), two = set1_epi8(2), three = set1_epi8(3), four = set1_epi8(4), numbered = numbered_bytes(); const uint8_t *limit = src_data + src_len, *p = src_data, *start_of_current_chunk = src_data; bool sentinel_found = false; unsigned chunk_src_sz = 0; @@ -593,10 +593,14 @@ start_classification: handle_trailing_bytes(); continue; } - // Classify the bytes + // Classify the bytes by whether they may be the start of a 2-byte, 3-byte, or 4-byte sequence. + // This is only an initial, potential classification. + // 0xC0 and 0xC1 are initially classified as potential starter bytes of 2-byte sequences. + // And 0xF5..0xFF are initially classified as potential starter bytes of 4-byte sequences. + // They will be marked as actually invalid later in the chunk_is_invalid checks. 
integer_t state = set1_epi8(0x80); const integer_t vec_signed = add_epi8(vec, state); // needed because cmplt_epi8 works only on signed chars - + // state now has 0x80 on all bytes const integer_t bytes_indicating_start_of_two_byte_sequence = cmplt_epi8(set1_epi8(0xc0 - 1 - 0x80), vec_signed); state = blendv_epi8(state, set1_epi8(0xc2), bytes_indicating_start_of_two_byte_sequence); // state now has 0xc2 on all bytes that start a 2 or more byte sequence and 0x80 on the rest @@ -631,11 +635,61 @@ start_classification: vec = zero_last_n_bytes(vec, sizeof(integer_t) - chunk_src_sz); goto start_classification; } + + // The next section performs detailed validation of the chunk's byte sequences. + // It accumulates validation errors into a chunk_is_invalid vector. + // When chunk_is_invalid has any non-zero byte, then the chunk contains invalid UTF-8. + // chunk_is_invalid is a vector, and not a bitmask or boolean, + // because the or_si SIMD operation is empirically faster than movemask_epi8 with |= or ||=. + integer_t chunk_is_invalid; + // Only ASCII chars should have corresponding byte of counts == 0 if (ascii_mask != movemask_epi8(cmpgt_epi8(counts, zero))) { abort_with_invalid_utf8(); } + // The difference between a byte in counts and the next one should be negative, // zero, or one. Any other value means there is not enough continuation bytes. - if (!is_zero(cmpgt_epi8(subtract_epi8(shift_right_by_one_byte(counts), counts), one))) { abort_with_invalid_utf8(); } + chunk_is_invalid = cmpgt_epi8(subtract_epi8(shift_right_by_one_byte(counts), counts), one); + + // Validate 2-byte sequence starter bytes: 0xC0..0xC1 are invalid (overlong encodings for U+0000..U+007F). + // Without this, "\xc0\x80" would incorrectly be decoded as a "\x00". + chunk_is_invalid = or_si(chunk_is_invalid, and_si(bytes_indicating_start_of_two_byte_sequence, cmplt_epi8(vec, set1_epi8(0xc2)))); + + // Validate 4-byte sequence starter bytes: 0xF5..0xFF are invalid (out of Unicode codespace). 
+ // Without this, "\xff\x80\x80\x80" would incorrectly be decoded as an ill-formed "\U003C0000". + chunk_is_invalid = or_si(chunk_is_invalid, and_si(bytes_indicating_start_of_four_byte_sequence, cmpgt_epi8(vec, set1_epi8(0xf4)))); + + // Validate second bytes of E0-starting 3-byte sequences. + // 0xE0 must be followed by 0xA0..0xBF (not 0x80..0x9F) to avoid overlong encodings. + // Without this, "\xe0\x80\x80" would incorrectly be decoded as a "\x00". + const integer_t e0_starter_bytes = cmpeq_epi8(vec, set1_epi8(0xe0)); + const integer_t e0_first_follower_bytes = shift_right_by_one_byte(e0_starter_bytes); + chunk_is_invalid = or_si(chunk_is_invalid, and_si(e0_first_follower_bytes, cmplt_epi8(and_si(e0_first_follower_bytes, vec), set1_epi8(0xa0)))); + + // Validate second bytes of ED-starting 3-byte sequences. + // 0xED must be followed by 0x80..0x9F (not 0xA0..0xBF) to avoid UTF-16 surrogates. + // Without this, "\xed\xa0\x80" would incorrectly be decoded as an isolated surrogate "\uD800". + const integer_t ed_starter_bytes = cmpeq_epi8(vec, set1_epi8(0xed)); + const integer_t ed_first_follower_bytes = shift_right_by_one_byte(ed_starter_bytes); + chunk_is_invalid = or_si(chunk_is_invalid, and_si(ed_first_follower_bytes, cmpgt_epi8(and_si(ed_first_follower_bytes, vec), set1_epi8(0x9f)))); + + // Validate second bytes of F0-starting 4-byte sequences. + // 0xF0 must be followed by 0x90..0xBF (not 0x80..0x8F) to avoid overlong encodings. + // Without this, "\xf0\x80\x80\x80" would incorrectly be decoded as a "\x00". + const integer_t f0_starter_bytes = cmpeq_epi8(vec, set1_epi8(0xf0)); + const integer_t f0_first_follower_bytes = shift_right_by_one_byte(f0_starter_bytes); + chunk_is_invalid = or_si(chunk_is_invalid, and_si(f0_first_follower_bytes, cmplt_epi8(and_si(f0_first_follower_bytes, vec), set1_epi8(0x90)))); + + // Validate second bytes of F4-starting 4-byte sequences. + // 0xF4 must be followed by 0x80..0x8F (not 0x90..0xBF) to stay within the Unicode codespace. 
+ // Without this, "\xf4\x90\x80\x80" would incorrectly be decoded as an ill-formed "\U00110000". + const integer_t f4_starter_bytes = cmpeq_epi8(vec, set1_epi8(0xf4)); + const integer_t f4_first_follower_bytes = shift_right_by_one_byte(f4_starter_bytes); + chunk_is_invalid = or_si(chunk_is_invalid, and_si(f4_first_follower_bytes, cmpgt_epi8(and_si(f4_first_follower_bytes, vec), set1_epi8(0x8f)))); + + // Check for any accumulated validation errors and, if found, + // fall back to slow scalar decoding of this chunk, + // which handles replacement of invalid sequences with U+FFFD + if (!is_zero(chunk_is_invalid)) { abort_with_invalid_utf8(); } // Process the bytes storing the three resulting bytes that make up the unicode codepoint // mask all control bits so that we have only useful bits left @@ -673,7 +727,7 @@ start_classification: // The last byte is made up of bits 5 and 6 from count == 3 and 3 bits from count == 4 integer_t output3 = and_si(three, shift_right_by_bits32(vec, 4)); // bits 5 and 6 from count == 3 - const integer_t count4_locations = cmpeq_epi8(counts, set1_epi8(4)); + const integer_t count4_locations = cmpeq_epi8(counts, four); // 3 bits from count == 4 locations, placed at count == 3 locations shifted left by 2 bits output3 = or_si(output3, and_si(set1_epi8(0xfc), diff --git a/kitty_tests/parser.py b/kitty_tests/parser.py index b65e02b80..2e3752209 100644 --- a/kitty_tests/parser.py +++ b/kitty_tests/parser.py @@ -322,10 +322,6 @@ class TestParser(BaseTest): # Bad continuation byte (restored as ASCII) pb(b'"\xe1\x28\xa1"', '"\ufffd(\ufffd"') # ) - # The following all fail when using SIMD and need to be fixed in the SIMD parser - if which != 1: - continue - # Overlong 2-byte sequence for U+0000 (should be `0x00`) pb(b'"\xc0\x80"', '"\ufffd\ufffd"')