Merge branch 'utf-8-simd-parser-fix' of https://github.com/Wukuyon/kitty

2026-07-24 17:27:39 +02:00 · 2025-10-24 14:49:58 +05:30
parent c7117c2839 9d336be0d3
commit c2be527d3d
2 changed files with 376 additions and 16 deletions
--- a/kitty/simd-string-impl.h
+++ b/kitty/simd-string-impl.h
@@ -541,7 +541,7 @@ FUNC(utf8_decode_to_esc)(UTF8Decoder *d, const uint8_t *src_data, size_t src_len
        src_data += d->num_consumed; src_len -= d->num_consumed;
    }
    const integer_t esc_vec = set1_epi8(0x1b);
-    const integer_t zero = create_zero_integer(), one = set1_epi8(1), two = set1_epi8(2), three = set1_epi8(3), numbered = numbered_bytes();
+    const integer_t zero = create_zero_integer(), one = set1_epi8(1), two = set1_epi8(2), three = set1_epi8(3), four = set1_epi8(4), numbered = numbered_bytes();
    const uint8_t *limit = src_data + src_len, *p = src_data, *start_of_current_chunk = src_data;
    bool sentinel_found = false;
    unsigned chunk_src_sz = 0;
@@ -593,10 +593,14 @@ start_classification:
            handle_trailing_bytes();
            continue;
        }
-        // Classify the bytes
+        // Classify the bytes by whether they may be the start of a 2-byte, 3-byte, or 4-byte sequence.
+        // This is only an initial, potential classification.
+        // 0xC0 and 0xC1 are initially classified as potential starter bytes of 2-byte sequences.
+        // And 0xF5..0xFF are initially classified as potential starter bytes of 4-byte sequences.
+        // They will be marked as actually invalid later in the chunk_is_invalid checks.
        integer_t state = set1_epi8(0x80);
        const integer_t vec_signed = add_epi8(vec, state); // needed because cmplt_epi8 works only on signed chars
-
+        // state now has 0x80 on all bytes
        const integer_t bytes_indicating_start_of_two_byte_sequence = cmplt_epi8(set1_epi8(0xc0 - 1 - 0x80), vec_signed);
        state = blendv_epi8(state, set1_epi8(0xc2), bytes_indicating_start_of_two_byte_sequence);
        // state now has 0xc2 on all bytes that start a 2 or more byte sequence and 0x80 on the rest
@@ -631,11 +635,74 @@ start_classification:
            vec = zero_last_n_bytes(vec, sizeof(integer_t) - chunk_src_sz);
            goto start_classification;
        }
-        // Only ASCII chars should have corresponding byte of counts == 0
-        if (ascii_mask != movemask_epi8(cmpgt_epi8(counts, zero))) { abort_with_invalid_utf8(); }
-        // The difference between a byte in counts and the next one should be negative,
-        // zero, or one. Any other value means there is not enough continuation bytes.
-        if (!is_zero(cmpgt_epi8(subtract_epi8(shift_right_by_one_byte(counts), counts), one))) { abort_with_invalid_utf8(); }
+
+        // The next section performs detailed validation of the chunk's byte sequences.
+        // It accumulates validation errors into a chunk_is_invalid vector.
+        // When chunk_is_invalid has any non-zero byte, then the chunk contains invalid UTF-8.
+        // chunk_is_invalid is a vector, and not a bitmask or boolean,
+        // because the or_si SIMD operation is empirically faster than movemask_epi8 with |= or ||=.
+        integer_t chunk_is_invalid;
+
+        // Only bytes within the ASCII range should have counts[i] == 0, and vice versa.
+        // Detect any mismatch between the two conditions for each chunk byte.
+        // If there is any mismatch, then the chunk has invalid UTF-8, so set all bytes in chunk_is_invalid to 0xFF;
+        // otherwise the chunk might be valid, so set all bytes in chunk_is_invalid to 0x00.
+        // Without this, "\x80" would incorrectly be decoded as a "\x00".
+        // This also validates that continuation bytes' positions do not have ASCII bytes (< 0x80).
+        // Without this, "\xe0\xa0\x7f\x01" would incorrectly be decoded as "\x00\x01".
+        // In that example, 0x7F has an ascii_mask bit of 0 (i.e., it is within 0x00..0x7F),
+        // but it has a counts value of 1, not 0 (i.e., it is the last remaining byte of a multi-byte sequence).
+        // Therefore there is a count mismatch, indicating that the chunk is ill-formed UTF-8.
+        // (If the following "\x01" were absent, and the "\x7f" were the last byte of the chunk,
+        // then the `check_for_trailing_bytes` validation above detects the error as a trailing incomplete sequence.)
+        const int ascii_sequence_count_mismatches = ascii_mask ^ movemask_epi8(cmpgt_epi8(counts, zero));
+        chunk_is_invalid = set1_epi8(ascii_sequence_count_mismatches ? 0xff : 0x00);
+
+        // Validate 2-byte sequence starter bytes: 0xC0..0xC1 are invalid (overlong encodings for U+0000..U+007F).
+        // Without this, "\xc0\x80" would incorrectly be decoded as a "\x00".
+        chunk_is_invalid = or_si(chunk_is_invalid, and_si(bytes_indicating_start_of_two_byte_sequence, cmplt_epi8(vec, set1_epi8(0xc2))));
+
+        // Validate 4-byte sequence starter bytes: 0xF5..0xFF are invalid (out of Unicode codespace).
+        // Without this, "\xff\x80\x80\x80" would incorrectly be decoded as an ill-formed "\U003C0000".
+        chunk_is_invalid = or_si(chunk_is_invalid, and_si(bytes_indicating_start_of_four_byte_sequence, cmpgt_epi8(vec, set1_epi8(0xf4))));
+
+        // Validate that all continuation bytes' positions do not have non-ASCII starter bytes (>=0xC0).
+        // If counts[i] > count[i], the chunk byte at i is in the middle of a previous sequence but also classified as a starter byte.
+        // Without this, "\xf0\x90\xc2\x80" would have overlapping sequences, and it would be incorrectly decoded elsewhere as an empty string.
+        chunk_is_invalid = or_si(chunk_is_invalid, andnot_si(cmplt_epi8(vec, set1_epi8(0xc0)), cmpgt_epi8(counts, count)));
+
+        // Validate second bytes of E0-starting 3-byte sequences.
+        // 0xE0 must be followed by 0xA0..0xBF (not 0x80..0x9F) to avoid overlong encodings.
+        // Without this, "\xe0\x80\x80" would incorrectly be decoded as a "\x00".
+        const integer_t e0_starter_bytes = cmpeq_epi8(vec, set1_epi8(0xe0));
+        const integer_t e0_first_follower_bytes = shift_right_by_one_byte(e0_starter_bytes);
+        chunk_is_invalid = or_si(chunk_is_invalid, and_si(e0_first_follower_bytes, cmplt_epi8(and_si(e0_first_follower_bytes, vec), set1_epi8(0xa0))));
+
+        // Validate second bytes of ED-starting 3-byte sequences.
+        // 0xED must be followed by 0x80..0x9F (not 0xA0..0xBF) to avoid UTF-16 surrogates.
+        // Without this, "\xed\xa0\x80" would incorrectly be decoded as an isolated surrogate "\uD800".
+        const integer_t ed_starter_bytes = cmpeq_epi8(vec, set1_epi8(0xed));
+        const integer_t ed_first_follower_bytes = shift_right_by_one_byte(ed_starter_bytes);
+        chunk_is_invalid = or_si(chunk_is_invalid, and_si(ed_first_follower_bytes, cmpgt_epi8(and_si(ed_first_follower_bytes, vec), set1_epi8(0x9f))));
+
+        // Validate second bytes of F0-starting 4-byte sequences.
+        // F0 must be followed by 0x90..0xBF (not 0x80..0x8F) to avoid overlong encodings.
+        // Without this, "\xf0\x80\x80\x80" would incorrectly be decoded as a "\x00".
+        const integer_t f0_starter_bytes = cmpeq_epi8(vec, set1_epi8(0xf0));
+        const integer_t f0_first_follower_bytes = shift_right_by_one_byte(f0_starter_bytes);
+        chunk_is_invalid = or_si(chunk_is_invalid, and_si(f0_first_follower_bytes, cmplt_epi8(and_si(f0_first_follower_bytes, vec), set1_epi8(0x90))));
+
+        // Validate second bytes of F4-starting 4-byte sequences.
+        // F4 must be followed by 0x80..0x8F (not 0x90..0xBF) to stay within the Unicode codespace.
+        // Without this, "\xf4\x90\x80\x80" would incorrectly be decoded as an ill-formed "\U00110000".
+        const integer_t f4_starter_bytes = cmpeq_epi8(vec, set1_epi8(0xf4));
+        const integer_t f4_first_follower_bytes = shift_right_by_one_byte(f4_starter_bytes);
+        chunk_is_invalid = or_si(chunk_is_invalid, and_si(f4_first_follower_bytes, cmpgt_epi8(and_si(f4_first_follower_bytes, vec), set1_epi8(0x8f))));
+
+        // Check for any accumulated validation errors and, if found,
+        // fall back to slow scalar decoding of this chunk,
+        // which handles replacement of invalid sequences with U+FFFD
+        if (!is_zero(chunk_is_invalid)) { abort_with_invalid_utf8(); }

        // Process the bytes storing the three resulting bytes that make up the unicode codepoint
        // mask all control bits so that we have only useful bits left
@@ -673,7 +740,7 @@ start_classification:

        // The last byte is made up of bits 5 and 6 from count == 3 and 3 bits from count == 4
        integer_t output3 = and_si(three, shift_right_by_bits32(vec, 4));  // bits 5 and 6 from count == 3
-        const integer_t count4_locations = cmpeq_epi8(counts, set1_epi8(4));
+        const integer_t count4_locations = cmpeq_epi8(counts, four);
        // 3 bits from count == 4 locations, placed at count == 3 locations shifted left by 2 bits
        output3 = or_si(output3,
            and_si(set1_epi8(0xfc),
--- a/kitty_tests/parser.py
+++ b/kitty_tests/parser.py
@@ -214,9 +214,9 @@ class TestParser(BaseTest):
                return esc_found, ''.join(parts), total_consumed

            reset_state()
-            actual = parse_parts(1)
+            expected = parse_parts(1)
            reset_state()
-            expected = parse_parts(which)
+            actual = parse_parts(which)
            self.ae(expected, actual, msg=f'Failed for {a} with {which=}\n{expected!r} !=\n{actual!r}')
            return actual

@@ -288,6 +288,7 @@ class TestParser(BaseTest):
            pb(b'"\xe0\xa0"', '"\ufffd"')
            pb(b'"\xf0\x9f\x98"', '"\ufffd"')
            pb(b'"\xef\x93\x94\x95"', '"\uf4d4\ufffd"')
+
            # Lone continuation bytes with no leading starts
            pb(b'"\xbf"', '"\ufffd"')
            pb(b'"\x80"', '"\ufffd"')
@@ -321,10 +322,6 @@ class TestParser(BaseTest):
            # Bad continuation byte (restored as ASCII)
            pb(b'"\xe1\x28\xa1"', '"\ufffd(\ufffd"')  # )

-            # The following all fail when using SIMD and need to be fixed in the SIMD parser
-            if which != 1:
-                continue
-
            # Overlong 2-byte sequence for U+0000 (should be `0x00`)
            pb(b'"\xc0\x80"', '"\ufffd\ufffd"')

@@ -340,9 +337,305 @@ class TestParser(BaseTest):
            # Low surrogate code point
            pb(b'"\xed\xb0\x80"', '"\ufffd\ufffd\ufffd"')

-            # Too large first codepoint
+            # Too large starter byte
            pb(b'"\xff\x80\x80\x80"', '"\ufffd\ufffd\ufffd\ufffd"')

+            # The following boundary cases come from the table of well-formed UTF-8 byte sequences
+            # <https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506>.
+            # For continuation bytes, both 0xC0 and 0xC2 are tested as values that exceed the valid maximum.
+            # This is because 0xC0 is an invalid starter byte, but 0xC2 is also a starter byte for 2-byte sequences.
+            # simd-string-impl.h prefers classifying bytes as starter bytes when possible (e.g., in "\xf0\x90\xc2\x80").
+            # The tests need to check that simd-string-impl.h correctly detects
+            # starter bytes that are actually invalid continuation bytes, like 0xC2.
+
+            # Boundary cases: 2-byte sequences
+            pb(b'"\xc1\x7f"', '"\ufffd\x7f"')
+            pb(b'"\xc1\x80"', '"\ufffd\ufffd"')
+            pb(b'"\xc1\xbf"', '"\ufffd\ufffd"')
+            pb(b'"\xc1\xc0"', '"\ufffd\ufffd"')
+            pb(b'"\xc1\xc2"', '"\ufffd\ufffd"')
+            pb(b'"\xc2\x7f"', '"\ufffd\x7f"')
+            pb(b'"\xc2\x80"', '"\x80"')
+            pb(b'"\xc2\xbf"', '"\xbf"')
+            pb(b'"\xc2\xc0"', '"\ufffd\ufffd"')
+            pb(b'"\xc2\xc2"', '"\ufffd\ufffd"')
+            pb(b'"\xdf\x7f"', '"\ufffd\x7f"')
+            pb(b'"\xdf\x80"', '"\u07c0"')
+            pb(b'"\xdf\xbf"', '"\u07ff"')
+            pb(b'"\xdf\xc0"', '"\ufffd\ufffd"')
+            pb(b'"\xdf\xc2"', '"\ufffd\ufffd"')
+
+            # Boundary cases: 3-byte sequences starting with 0xE0
+            pb(b'"\xe0\x9f\x80"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xe0\xa0\x7f"', '"\ufffd\x7f"')
+            pb(b'"\xe0\xa0\x80"', '"\u0800"')
+            pb(b'"\xe0\xa0\xbf"', '"\u083f"')
+            pb(b'"\xe0\xa0\xc0"', '"\ufffd\ufffd"')
+            pb(b'"\xe0\xa0\xc2"', '"\ufffd\ufffd"')
+            pb(b'"\xe0\xbf\x7f"', '"\ufffd\x7f"')
+            pb(b'"\xe0\xbf\x80"', '"\u0fc0"')
+            pb(b'"\xe0\xbf\xbf"', '"\u0fff"')
+            pb(b'"\xe0\xbf\xc0"', '"\ufffd\ufffd"')
+            pb(b'"\xe0\xbf\xc2"', '"\ufffd\ufffd"')
+            pb(b'"\xe0\xc0\x80"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xe0\xc2\x80"', '"\ufffd\x80"')
+
+            # Boundary cases: 3-byte sequences starting with 0xE1..0xEC
+            pb(b'"\xe1\x7f\x80"', '"\ufffd\x7f\ufffd"')
+            pb(b'"\xe1\x80\x7f"', '"\ufffd\x7f"')
+            pb(b'"\xe1\x80\x80"', '"\u1000"')
+            pb(b'"\xe1\x80\xbf"', '"\u103f"')
+            pb(b'"\xe1\x80\xc0"', '"\ufffd\ufffd"')
+            pb(b'"\xe1\x80\xc2"', '"\ufffd\ufffd"')
+            pb(b'"\xe1\xbf\x7f"', '"\ufffd\x7f"')
+            pb(b'"\xe1\xbf\x80"', '"\u1fc0"')
+            pb(b'"\xe1\xbf\xbf"', '"\u1fff"')
+            pb(b'"\xe1\xbf\xc0"', '"\ufffd\ufffd"')
+            pb(b'"\xe1\xbf\xc2"', '"\ufffd\ufffd"')
+            pb(b'"\xe1\xc0\x80"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xe1\xc2\x80"', '"\ufffd\x80"')
+            pb(b'"\xec\x7f\x80"', '"\ufffd\x7f\ufffd"')
+            pb(b'"\xec\x80\x7f"', '"\ufffd\x7f"')
+            pb(b'"\xec\x80\x80"', '"\uc000"')
+            pb(b'"\xec\x80\xbf"', '"\uc03f"')
+            pb(b'"\xec\x80\xc0"', '"\ufffd\ufffd"')
+            pb(b'"\xec\x80\xc2"', '"\ufffd\ufffd"')
+            pb(b'"\xec\xbf\x7f"', '"\ufffd\x7f"')
+            pb(b'"\xec\xbf\x80"', '"\ucfc0"')
+            pb(b'"\xec\xbf\xbf"', '"\ucfff"')
+            pb(b'"\xec\xbf\xc0"', '"\ufffd\ufffd"')
+            pb(b'"\xec\xbf\xc2"', '"\ufffd\ufffd"')
+            pb(b'"\xec\xc0\x80"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xec\xc2\x80"', '"\ufffd\x80"')
+
+            # Boundary cases: 3-byte sequences starting with 0xED
+            pb(b'"\xed\x7f\x80"', '"\ufffd\x7f\ufffd"')
+            pb(b'"\xed\x80\x7f"', '"\ufffd\x7f"')
+            pb(b'"\xed\x80\x80"', '"\ud000"')
+            pb(b'"\xed\x80\xbf"', '"\ud03f"')
+            pb(b'"\xed\x80\xc0"', '"\ufffd\ufffd"')
+            pb(b'"\xed\x80\xc2"', '"\ufffd\ufffd"')
+            pb(b'"\xed\x9f\x7f"', '"\ufffd\x7f"')
+            pb(b'"\xed\x9f\x80"', '"\ud7c0"')
+            pb(b'"\xed\x9f\xbf"', '"\ud7ff"')
+            pb(b'"\xed\x9f\xc0"', '"\ufffd\ufffd"')
+            pb(b'"\xed\x9f\xc2"', '"\ufffd\ufffd"')
+
+            # Boundary cases: 3-byte sequences starting with 0xEE..0xEF
+            pb(b'"\xed\xa0\x80"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xee\x7f\x80"', '"\ufffd\x7f\ufffd"')
+            pb(b'"\xee\x80\x7f"', '"\ufffd\x7f"')
+            pb(b'"\xee\x80\x80"', '"\ue000"')
+            pb(b'"\xee\x80\xbf"', '"\ue03f"')
+            pb(b'"\xee\x80\xc0"', '"\ufffd\ufffd"')
+            pb(b'"\xee\x80\xc2"', '"\ufffd\ufffd"')
+            pb(b'"\xee\xbf\x7f"', '"\ufffd\x7f"')
+            pb(b'"\xee\xbf\x80"', '"\uefc0"')
+            pb(b'"\xee\xbf\xbf"', '"\uefff"')
+            pb(b'"\xee\xbf\xc0"', '"\ufffd\ufffd"')
+            pb(b'"\xee\xbf\xc2"', '"\ufffd\ufffd"')
+            pb(b'"\xee\xc0\x80"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xee\xc2\x80"', '"\ufffd\x80"')
+            pb(b'"\xef\x7f\x80"', '"\ufffd\x7f\ufffd"')
+            pb(b'"\xef\x80\x7f"', '"\ufffd\x7f"')
+            pb(b'"\xef\x80\x80"', '"\uf000"')
+            pb(b'"\xef\x80\xbf"', '"\uf03f"')
+            pb(b'"\xef\x80\xc0"', '"\ufffd\ufffd"')
+            pb(b'"\xef\x80\xc2"', '"\ufffd\ufffd"')
+            pb(b'"\xef\xbf\x7f"', '"\ufffd\x7f"')
+            pb(b'"\xef\xbf\x80"', '"\uffc0"')
+            pb(b'"\xef\xbf\xbf"', '"\uffff"')
+            pb(b'"\xef\xbf\xc0"', '"\ufffd\ufffd"')
+            pb(b'"\xef\xbf\xc2"', '"\ufffd\ufffd"')
+            pb(b'"\xef\xc0\x80"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xef\xc2\x80"', '"\ufffd\x80"')
+
+            # Boundary cases: 4-byte sequences starting with 0xF0
+            pb(b'"\xf0\x8f\x80\x80"', '"\ufffd\ufffd\ufffd\ufffd"')
+            pb(b'"\xf0\x90\x7f\x80"', '"\ufffd\x7f\ufffd"')
+            pb(b'"\xf0\x90\x80\x7f"', '"\ufffd\x7f"')
+            pb(b'"\xf0\x90\x80\x80"', '"\U00010000"')
+            pb(b'"\xf0\x90\x80\xbf"', '"\U0001003f"')
+            pb(b'"\xf0\x90\x80\xc0"', '"\ufffd\ufffd"')
+            pb(b'"\xf0\x90\x80\xc2"', '"\ufffd\ufffd"')
+            pb(b'"\xf0\x90\xbf\x7f"', '"\ufffd\x7f"')
+            pb(b'"\xf0\x90\xbf\x80"', '"\U00010fc0"')
+            pb(b'"\xf0\x90\xbf\xbf"', '"\U00010fff"')
+            pb(b'"\xf0\x90\xbf\xc0"', '"\ufffd\ufffd"')
+            pb(b'"\xf0\x90\xbf\xc2"', '"\ufffd\ufffd"')
+            pb(b'"\xf0\x90\xc0\x7f"', '"\ufffd\ufffd\x7f"')
+            pb(b'"\xf0\x90\xc0\x80"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf0\x90\xc0\xbf"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf0\x90\xc0\xc0"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf0\x90\xc0\xc2"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf0\x90\xc2\x7f"', '"\ufffd\ufffd\x7f"')
+            pb(b'"\xf0\x90\xc2\x80"', '"\ufffd\x80"')
+            pb(b'"\xf0\x90\xc2\xbf"', '"\ufffd\xbf"')
+            pb(b'"\xf0\x90\xc2\xc0"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf0\x90\xc2\xc2"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf0\xbf\x7f\x80"', '"\ufffd\x7f\ufffd"')
+            pb(b'"\xf0\xbf\x80\x7f"', '"\ufffd\x7f"')
+            pb(b'"\xf0\xbf\x80\x80"', '"\U0003f000"')
+            pb(b'"\xf0\xbf\x80\xbf"', '"\U0003f03f"')
+            pb(b'"\xf0\xbf\x80\xc0"', '"\ufffd\ufffd"')
+            pb(b'"\xf0\xbf\x80\xc2"', '"\ufffd\ufffd"')
+            pb(b'"\xf0\xbf\xbf\x7f"', '"\ufffd\x7f"')
+            pb(b'"\xf0\xbf\xbf\x80"', '"\U0003ffc0"')
+            pb(b'"\xf0\xbf\xbf\xbf"', '"\U0003ffff"')
+            pb(b'"\xf0\xbf\xbf\xc0"', '"\ufffd\ufffd"')
+            pb(b'"\xf0\xbf\xbf\xc2"', '"\ufffd\ufffd"')
+            pb(b'"\xf0\xbf\xc0\x7f"', '"\ufffd\ufffd\x7f"')
+            pb(b'"\xf0\xbf\xc0\x80"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf0\xbf\xc0\xbf"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf0\xbf\xc0\xc0"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf0\xbf\xc0\xc2"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf0\xbf\xc2\x7f"', '"\ufffd\ufffd\x7f"')
+            pb(b'"\xf0\xbf\xc2\x80"', '"\ufffd\x80"')
+            pb(b'"\xf0\xbf\xc2\xbf"', '"\ufffd\xbf"')
+            pb(b'"\xf0\xbf\xc2\xc0"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf0\xbf\xc2\xc2"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf0\xc0\x80\x80"', '"\ufffd\ufffd\ufffd\ufffd"')
+
+            # Boundary cases: 4-byte sequences starting with 0xF1..0xF3
+            pb(b'"\xf1\x7f\x80\x80"', '"\ufffd\x7f\ufffd\ufffd"')
+            pb(b'"\xf1\x80\x7f\x80"', '"\ufffd\x7f\ufffd"')
+            pb(b'"\xf1\x80\x80\x7f"', '"\ufffd\x7f"')
+            pb(b'"\xf1\x80\x80\x80"', '"\U00040000"')
+            pb(b'"\xf1\x80\x80\xbf"', '"\U0004003f"')
+            pb(b'"\xf1\x80\x80\xc0"', '"\ufffd\ufffd"')
+            pb(b'"\xf1\x80\x80\xc2"', '"\ufffd\ufffd"')
+            pb(b'"\xf1\x80\xbf\x7f"', '"\ufffd\x7f"')
+            pb(b'"\xf1\x80\xbf\x80"', '"\U00040fc0"')
+            pb(b'"\xf1\x80\xbf\xbf"', '"\U00040fff"')
+            pb(b'"\xf1\x80\xbf\xc0"', '"\ufffd\ufffd"')
+            pb(b'"\xf1\x80\xbf\xc2"', '"\ufffd\ufffd"')
+            pb(b'"\xf1\x80\xc0\x7f"', '"\ufffd\ufffd\x7f"')
+            pb(b'"\xf1\x80\xc0\x80"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf1\x80\xc0\xbf"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf1\x80\xc0\xc0"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf1\x80\xc2\x7f"', '"\ufffd\ufffd\x7f"')
+            pb(b'"\xf1\x80\xc2\x80"', '"\ufffd\x80"')
+            pb(b'"\xf1\x80\xc2\xbf"', '"\ufffd\xbf"')
+            pb(b'"\xf1\x80\xc2\xc0"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf1\xbf\x7f\x80"', '"\ufffd\x7f\ufffd"')
+            pb(b'"\xf1\xbf\x80\x7f"', '"\ufffd\x7f"')
+            pb(b'"\xf1\xbf\x80\x80"', '"\U0007f000"')
+            pb(b'"\xf1\xbf\x80\xbf"', '"\U0007f03f"')
+            pb(b'"\xf1\xbf\x80\xc0"', '"\ufffd\ufffd"')
+            pb(b'"\xf1\xbf\x80\xc2"', '"\ufffd\ufffd"')
+            pb(b'"\xf1\xbf\xbf\x7f"', '"\ufffd\x7f"')
+            pb(b'"\xf1\xbf\xbf\x80"', '"\U0007ffc0"')
+            pb(b'"\xf1\xbf\xbf\xbf"', '"\U0007ffff"')
+            pb(b'"\xf1\xbf\xbf\xc0"', '"\ufffd\ufffd"')
+            pb(b'"\xf1\xbf\xbf\xc2"', '"\ufffd\ufffd"')
+            pb(b'"\xf1\xbf\xc0\x7f"', '"\ufffd\ufffd\x7f"')
+            pb(b'"\xf1\xbf\xc0\x80"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf1\xbf\xc0\xbf"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf1\xbf\xc0\xc0"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf1\xbf\xc0\xc2"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf1\xbf\xc2\x7f"', '"\ufffd\ufffd\x7f"')
+            pb(b'"\xf1\xbf\xc2\x80"', '"\ufffd\x80"')
+            pb(b'"\xf1\xbf\xc2\xbf"', '"\ufffd\xbf"')
+            pb(b'"\xf1\xbf\xc2\xc0"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf1\xbf\xc2\xc2"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf1\xc0\x80\x80"', '"\ufffd\ufffd\ufffd\ufffd"')
+            pb(b'"\xf1\xc2\x80\x80"', '"\ufffd\x80\ufffd"')
+            pb(b'"\xf3\x7f\x80\x80"', '"\ufffd\x7f\ufffd\ufffd"')
+            pb(b'"\xf3\x80\x7f\x80"', '"\ufffd\x7f\ufffd"')
+            pb(b'"\xf3\x80\x80\x7f"', '"\ufffd\x7f"')
+            pb(b'"\xf3\x80\x80\x80"', '"\U000c0000"')
+            pb(b'"\xf3\x80\x80\xbf"', '"\U000c003f"')
+            pb(b'"\xf3\x80\x80\xc0"', '"\ufffd\ufffd"')
+            pb(b'"\xf3\x80\x80\xc2"', '"\ufffd\ufffd"')
+            pb(b'"\xf3\x80\xbf\x7f"', '"\ufffd\x7f"')
+            pb(b'"\xf3\x80\xbf\x80"', '"\U000c0fc0"')
+            pb(b'"\xf3\x80\xbf\xbf"', '"\U000c0fff"')
+            pb(b'"\xf3\x80\xbf\xc0"', '"\ufffd\ufffd"')
+            pb(b'"\xf3\x80\xbf\xc2"', '"\ufffd\ufffd"')
+            pb(b'"\xf3\x80\xc0\x7f"', '"\ufffd\ufffd\x7f"')
+            pb(b'"\xf3\x80\xc0\x80"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf3\x80\xc0\xbf"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf3\x80\xc0\xc0"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf3\x80\xc0\xc2"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf3\x80\xc2\x7f"', '"\ufffd\ufffd\x7f"')
+            pb(b'"\xf3\x80\xc2\x80"', '"\ufffd\x80"')
+            pb(b'"\xf3\x80\xc2\xbf"', '"\ufffd\xbf"')
+            pb(b'"\xf3\x80\xc2\xc0"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf3\x80\xc2\xc2"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf3\xbf\x7f\x80"', '"\ufffd\x7f\ufffd"')
+            pb(b'"\xf3\xbf\x80\x7f"', '"\ufffd\x7f"')
+            pb(b'"\xf3\xbf\x80\x80"', '"\U000ff000"')
+            pb(b'"\xf3\xbf\x80\xbf"', '"\U000ff03f"')
+            pb(b'"\xf3\xbf\x80\xc0"', '"\ufffd\ufffd"')
+            pb(b'"\xf3\xbf\x80\xc2"', '"\ufffd\ufffd"')
+            pb(b'"\xf3\xbf\xbf\x7f"', '"\ufffd\x7f"')
+            pb(b'"\xf3\xbf\xbf\x80"', '"\U000fffc0"')
+            pb(b'"\xf3\xbf\xbf\xbf"', '"\U000fffff"')
+            pb(b'"\xf3\xbf\xbf\xc0"', '"\ufffd\ufffd"')
+            pb(b'"\xf3\xbf\xbf\xc2"', '"\ufffd\ufffd"')
+            pb(b'"\xf3\xbf\xc0\x7f"', '"\ufffd\ufffd\x7f"')
+            pb(b'"\xf3\xbf\xc0\x80"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf3\xbf\xc0\xbf"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf3\xbf\xc0\xc0"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf3\xbf\xc0\xc2"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf3\xbf\xc2\x7f"', '"\ufffd\ufffd\x7f"')
+            pb(b'"\xf3\xbf\xc2\x80"', '"\ufffd\x80"')
+            pb(b'"\xf3\xbf\xc2\xbf"', '"\ufffd\xbf"')
+            pb(b'"\xf3\xbf\xc2\xc0"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf3\xbf\xc2\xc2"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf3\xc0\x80\x80"', '"\ufffd\ufffd\ufffd\ufffd"')
+            pb(b'"\xf3\xc2\x80\x80"', '"\ufffd\x80\ufffd"')
+
+            # Boundary cases: 4-byte sequences starting with 0xF4
+            pb(b'"\xf4\x7f\x80\x80"', '"\ufffd\x7f\ufffd\ufffd"')
+            pb(b'"\xf4\x80\x7f\x80"', '"\ufffd\x7f\ufffd"')
+            pb(b'"\xf4\x80\x80\x7f"', '"\ufffd\x7f"')
+            pb(b'"\xf4\x80\x80\x80"', '"\U00100000"')
+            pb(b'"\xf4\x80\x80\xbf"', '"\U0010003f"')
+            pb(b'"\xf4\x80\x80\xc0"', '"\ufffd\ufffd"')
+            pb(b'"\xf4\x80\x80\xc2"', '"\ufffd\ufffd"')
+            pb(b'"\xf4\x80\xbf\x7f"', '"\ufffd\x7f"')
+            pb(b'"\xf4\x80\xbf\x80"', '"\U00100fc0"')
+            pb(b'"\xf4\x80\xbf\xbf"', '"\U00100fff"')
+            pb(b'"\xf4\x80\xbf\xc0"', '"\ufffd\ufffd"')
+            pb(b'"\xf4\x80\xbf\xc2"', '"\ufffd\ufffd"')
+            pb(b'"\xf4\x80\xc0\x7f"', '"\ufffd\ufffd\x7f"')
+            pb(b'"\xf4\x80\xc0\x80"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf4\x80\xc0\xbf"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf4\x80\xc0\xc0"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf4\x80\xc0\xc2"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf4\x80\xc2\x7f"', '"\ufffd\ufffd\x7f"')
+            pb(b'"\xf4\x80\xc2\x80"', '"\ufffd\x80"')
+            pb(b'"\xf4\x80\xc2\xbf"', '"\ufffd\xbf"')
+            pb(b'"\xf4\x80\xc2\xc0"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf4\x80\xc2\xc2"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf4\x8f\x7f\x80"', '"\ufffd\x7f\ufffd"')
+            pb(b'"\xf4\x8f\x80\x7f"', '"\ufffd\x7f"')
+            pb(b'"\xf4\x8f\x80\x80"', '"\U0010f000"')
+            pb(b'"\xf4\x8f\x80\xbf"', '"\U0010f03f"')
+            pb(b'"\xf4\x8f\x80\xc0"', '"\ufffd\ufffd"')
+            pb(b'"\xf4\x8f\x80\xc2"', '"\ufffd\ufffd"')
+            pb(b'"\xf4\x8f\xbf\x7f"', '"\ufffd\x7f"')
+            pb(b'"\xf4\x8f\xbf\x80"', '"\U0010ffc0"')
+            pb(b'"\xf4\x8f\xbf\xbf"', '"\U0010ffff"')
+            pb(b'"\xf4\x8f\xbf\xc0"', '"\ufffd\ufffd"')
+            pb(b'"\xf4\x8f\xbf\xc2"', '"\ufffd\ufffd"')
+            pb(b'"\xf4\x8f\xc0\x7f"', '"\ufffd\ufffd\x7f"')
+            pb(b'"\xf4\x8f\xc0\x80"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf4\x8f\xc0\xbf"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf4\x8f\xc0\xc0"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf4\x8f\xc0\xc2"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf4\x8f\xc2\x7f"', '"\ufffd\ufffd\x7f"')
+            pb(b'"\xf4\x8f\xc2\x80"', '"\ufffd\x80"')
+            pb(b'"\xf4\x8f\xc2\xbf"', '"\ufffd\xbf"')
+            pb(b'"\xf4\x8f\xc2\xc0"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf4\x8f\xc2\xc2"', '"\ufffd\ufffd\ufffd"')
+            pb(b'"\xf4\x90\x80\x80"', '"\ufffd\ufffd\ufffd\ufffd"')
+            pb(b'"\xf5\x80\x80\x80"', '"\ufffd\ufffd\ufffd\ufffd"')
+
+            # Boundary case: too large codepoint (> U+10FFFF)
+            pb(b'"\xf5\x80\x80\x80"', '"\ufffd\ufffd\ufffd\ufffd"')
+

    def test_find_either_of_two_bytes(self):
        sizes = []