diff --git a/kitty/simd-string-impl.h b/kitty/simd-string-impl.h index 46bf7244d..89d462f66 100644 --- a/kitty/simd-string-impl.h +++ b/kitty/simd-string-impl.h @@ -541,7 +541,7 @@ FUNC(utf8_decode_to_esc)(UTF8Decoder *d, const uint8_t *src_data, size_t src_len src_data += d->num_consumed; src_len -= d->num_consumed; } const integer_t esc_vec = set1_epi8(0x1b); - const integer_t zero = create_zero_integer(), one = set1_epi8(1), two = set1_epi8(2), three = set1_epi8(3), numbered = numbered_bytes(); + const integer_t zero = create_zero_integer(), one = set1_epi8(1), two = set1_epi8(2), three = set1_epi8(3), four = set1_epi8(4), numbered = numbered_bytes(); const uint8_t *limit = src_data + src_len, *p = src_data, *start_of_current_chunk = src_data; bool sentinel_found = false; unsigned chunk_src_sz = 0; @@ -593,10 +593,14 @@ start_classification: handle_trailing_bytes(); continue; } - // Classify the bytes + // Classify the bytes by whether they may be the start of a 2-byte, 3-byte, or 4-byte sequence. + // This is only an initial, potential classification. + // 0xC0 and 0xC1 are initially classified as potential starter bytes of 2-byte sequences. + // And 0xF5..0xFF are initially classified as potential starter bytes of 4-byte sequences. + // They will be marked as actually invalid later in the chunk_is_invalid checks. 
integer_t state = set1_epi8(0x80); const integer_t vec_signed = add_epi8(vec, state); // needed because cmplt_epi8 works only on signed chars - + // state now has 0x80 on all bytes const integer_t bytes_indicating_start_of_two_byte_sequence = cmplt_epi8(set1_epi8(0xc0 - 1 - 0x80), vec_signed); state = blendv_epi8(state, set1_epi8(0xc2), bytes_indicating_start_of_two_byte_sequence); // state now has 0xc2 on all bytes that start a 2 or more byte sequence and 0x80 on the rest @@ -631,11 +635,74 @@ start_classification: vec = zero_last_n_bytes(vec, sizeof(integer_t) - chunk_src_sz); goto start_classification; } - // Only ASCII chars should have corresponding byte of counts == 0 - if (ascii_mask != movemask_epi8(cmpgt_epi8(counts, zero))) { abort_with_invalid_utf8(); } - // The difference between a byte in counts and the next one should be negative, - // zero, or one. Any other value means there is not enough continuation bytes. - if (!is_zero(cmpgt_epi8(subtract_epi8(shift_right_by_one_byte(counts), counts), one))) { abort_with_invalid_utf8(); } + + // The next section performs detailed validation of the chunk's byte sequences. + // It accumulates validation errors into a chunk_is_invalid vector. + // When chunk_is_invalid has any non-zero byte, then the chunk contains invalid UTF-8. + // chunk_is_invalid is a vector, and not a bitmask or boolean, + // because the or_si SIMD operation is empirically faster than movemask_epi8 with |= or ||=. + integer_t chunk_is_invalid; + + // Only bytes within the ASCII range should have counts[i] == 0, and vice versa. + // Detect any mismatch between the two conditions for each chunk byte. + // If there is any mismatch, then the chunk has invalid UTF-8, so set all bytes in chunk_is_invalid to 0xFF; + // otherwise the chunk might be valid, so set all bytes in chunk_is_invalid to 0x00. + // Without this, "\x80" would incorrectly be decoded as a "\x00". 
+ // This also validates that continuation bytes' positions do not have ASCII bytes (< 0x80). + // Without this, "\xe0\xa0\x7f\x01" would incorrectly be decoded as "\x00\x01". + // In that example, 0x7F has an ascii_mask bit of 0 (i.e., it is within 0x00..0x7F), + // but it has a counts value of 1, not 0 (i.e., it is the last remaining byte of a multi-byte sequence). + // Therefore there is a count mismatch, indicating that the chunk is ill-formed UTF-8. + // (If the following "\x01" were absent, and the "\x7f" were the last byte of the chunk, + // then the `check_for_trailing_bytes` validation above detects the error as a trailing incomplete sequence.) + const int ascii_sequence_count_mismatches = ascii_mask ^ movemask_epi8(cmpgt_epi8(counts, zero)); + chunk_is_invalid = set1_epi8(ascii_sequence_count_mismatches ? 0xff : 0x00); + + // Validate 2-byte sequence starter bytes: 0xC0..0xC1 are invalid (overlong encodings for U+0000..U+007F). + // Without this, "\xc0\x80" would incorrectly be decoded as a "\x00". + chunk_is_invalid = or_si(chunk_is_invalid, and_si(bytes_indicating_start_of_two_byte_sequence, cmplt_epi8(vec, set1_epi8(0xc2)))); + + // Validate 4-byte sequence starter bytes: 0xF5..0xFF are invalid (out of Unicode codespace). + // Without this, "\xff\x80\x80\x80" would incorrectly be decoded as an ill-formed "\U003C0000". + chunk_is_invalid = or_si(chunk_is_invalid, and_si(bytes_indicating_start_of_four_byte_sequence, cmpgt_epi8(vec, set1_epi8(0xf4)))); + + // Validate that all continuation bytes' positions do not have non-ASCII starter bytes (>=0xC0). + // If counts[i] > count[i], the chunk byte at i is in the middle of a previous sequence but also classified as a starter byte. + // Without this, "\xf0\x90\xc2\x80" would have overlapping sequences, and it would be incorrectly decoded elsewhere as an empty string. 
+ chunk_is_invalid = or_si(chunk_is_invalid, andnot_si(cmplt_epi8(vec, set1_epi8(0xc0)), cmpgt_epi8(counts, count))); + + // Validate second bytes of E0-starting 3-byte sequences. + // 0xE0 must be followed by 0xA0..0xBF (not 0x80..0x9F) to avoid overlong encodings. + // Without this, "\xe0\x80\x80" would incorrectly be decoded as a "\x00". + const integer_t e0_starter_bytes = cmpeq_epi8(vec, set1_epi8(0xe0)); + const integer_t e0_first_follower_bytes = shift_right_by_one_byte(e0_starter_bytes); + chunk_is_invalid = or_si(chunk_is_invalid, and_si(e0_first_follower_bytes, cmplt_epi8(and_si(e0_first_follower_bytes, vec), set1_epi8(0xa0)))); + + // Validate second bytes of ED-starting 3-byte sequences. + // 0xED must be followed by 0x80..0x9F (not 0xA0..0xBF) to avoid UTF-16 surrogates. + // Without this, "\xed\xa0\x80" would incorrectly be decoded as an isolated surrogate "\uD800". + const integer_t ed_starter_bytes = cmpeq_epi8(vec, set1_epi8(0xed)); + const integer_t ed_first_follower_bytes = shift_right_by_one_byte(ed_starter_bytes); + chunk_is_invalid = or_si(chunk_is_invalid, and_si(ed_first_follower_bytes, cmpgt_epi8(and_si(ed_first_follower_bytes, vec), set1_epi8(0x9f)))); + + // Validate second bytes of F0-starting 4-byte sequences. + // F0 must be followed by 0x90..0xBF (not 0x80..0x8F) to avoid overlong encodings. + // Without this, "\xf0\x80\x80\x80" would incorrectly be decoded as a "\x00". + const integer_t f0_starter_bytes = cmpeq_epi8(vec, set1_epi8(0xf0)); + const integer_t f0_first_follower_bytes = shift_right_by_one_byte(f0_starter_bytes); + chunk_is_invalid = or_si(chunk_is_invalid, and_si(f0_first_follower_bytes, cmplt_epi8(and_si(f0_first_follower_bytes, vec), set1_epi8(0x90)))); + + // Validate second bytes of F4-starting 4-byte sequences. + // F4 must be followed by 0x80..0x8F (not 0x90..0xBF) to stay within the Unicode codespace. + // Without this, "\xf4\x90\x80\x80" would incorrectly be decoded as an ill-formed "\U00110000". 
+ const integer_t f4_starter_bytes = cmpeq_epi8(vec, set1_epi8(0xf4)); + const integer_t f4_first_follower_bytes = shift_right_by_one_byte(f4_starter_bytes); + chunk_is_invalid = or_si(chunk_is_invalid, and_si(f4_first_follower_bytes, cmpgt_epi8(and_si(f4_first_follower_bytes, vec), set1_epi8(0x8f)))); + + // Check for any accumulated validation errors and, if found, + // fall back to slow scalar decoding of this chunk, + // which handles replacement of invalid sequences with U+FFFD + if (!is_zero(chunk_is_invalid)) { abort_with_invalid_utf8(); } // Process the bytes storing the three resulting bytes that make up the unicode codepoint // mask all control bits so that we have only useful bits left @@ -673,7 +740,7 @@ start_classification: // The last byte is made up of bits 5 and 6 from count == 3 and 3 bits from count == 4 integer_t output3 = and_si(three, shift_right_by_bits32(vec, 4)); // bits 5 and 6 from count == 3 - const integer_t count4_locations = cmpeq_epi8(counts, set1_epi8(4)); + const integer_t count4_locations = cmpeq_epi8(counts, four); // 3 bits from count == 4 locations, placed at count == 3 locations shifted left by 2 bits output3 = or_si(output3, and_si(set1_epi8(0xfc), diff --git a/kitty_tests/parser.py b/kitty_tests/parser.py index aa8352135..2e3752209 100644 --- a/kitty_tests/parser.py +++ b/kitty_tests/parser.py @@ -214,9 +214,9 @@ class TestParser(BaseTest): return esc_found, ''.join(parts), total_consumed reset_state() - actual = parse_parts(1) + expected = parse_parts(1) reset_state() - expected = parse_parts(which) + actual = parse_parts(which) self.ae(expected, actual, msg=f'Failed for {a} with {which=}\n{expected!r} !=\n{actual!r}') return actual @@ -288,6 +288,7 @@ class TestParser(BaseTest): pb(b'"\xe0\xa0"', '"\ufffd"') pb(b'"\xf0\x9f\x98"', '"\ufffd"') pb(b'"\xef\x93\x94\x95"', '"\uf4d4\ufffd"') + # Lone continuation bytes with no leading starts pb(b'"\xbf"', '"\ufffd"') pb(b'"\x80"', '"\ufffd"') @@ -321,10 +322,6 @@ class 
TestParser(BaseTest): # Bad continuation byte (restored as ASCII) pb(b'"\xe1\x28\xa1"', '"\ufffd(\ufffd"') # ) - # The following all fail when using SIMD and need to be fixed in the SIMD parser - if which != 1: - continue - # Overlong 2-byte sequence for U+0000 (should be `0x00`) pb(b'"\xc0\x80"', '"\ufffd\ufffd"') @@ -340,9 +337,305 @@ class TestParser(BaseTest): # Low surrogate code point pb(b'"\xed\xb0\x80"', '"\ufffd\ufffd\ufffd"') - # Too large first codepoint + # Too large starter byte pb(b'"\xff\x80\x80\x80"', '"\ufffd\ufffd\ufffd\ufffd"') + # The following boundary cases come from the table of well-formed UTF-8 byte sequences + # (Table 3-7, "Well-Formed UTF-8 Byte Sequences", in chapter 3 of the Unicode Standard). + # For continuation bytes, both 0xC0 and 0xC2 are tested as values that exceed the valid maximum. + # This is because 0xC0 is an invalid starter byte, but 0xC2 is also a starter byte for 2-byte sequences. + # simd-string-impl.h prefers classifying bytes as starter bytes when possible (e.g., in "\xf0\x90\xc2\x80"). + # The tests need to check that simd-string-impl.h correctly detects + # starter bytes that are actually invalid continuation bytes, like 0xC2. 
+ + # Boundary cases: 2-byte sequences + pb(b'"\xc1\x7f"', '"\ufffd\x7f"') + pb(b'"\xc1\x80"', '"\ufffd\ufffd"') + pb(b'"\xc1\xbf"', '"\ufffd\ufffd"') + pb(b'"\xc1\xc0"', '"\ufffd\ufffd"') + pb(b'"\xc1\xc2"', '"\ufffd\ufffd"') + pb(b'"\xc2\x7f"', '"\ufffd\x7f"') + pb(b'"\xc2\x80"', '"\x80"') + pb(b'"\xc2\xbf"', '"\xbf"') + pb(b'"\xc2\xc0"', '"\ufffd\ufffd"') + pb(b'"\xc2\xc2"', '"\ufffd\ufffd"') + pb(b'"\xdf\x7f"', '"\ufffd\x7f"') + pb(b'"\xdf\x80"', '"\u07c0"') + pb(b'"\xdf\xbf"', '"\u07ff"') + pb(b'"\xdf\xc0"', '"\ufffd\ufffd"') + pb(b'"\xdf\xc2"', '"\ufffd\ufffd"') + + # Boundary cases: 3-byte sequences starting with 0xE0 + pb(b'"\xe0\x9f\x80"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xe0\xa0\x7f"', '"\ufffd\x7f"') + pb(b'"\xe0\xa0\x80"', '"\u0800"') + pb(b'"\xe0\xa0\xbf"', '"\u083f"') + pb(b'"\xe0\xa0\xc0"', '"\ufffd\ufffd"') + pb(b'"\xe0\xa0\xc2"', '"\ufffd\ufffd"') + pb(b'"\xe0\xbf\x7f"', '"\ufffd\x7f"') + pb(b'"\xe0\xbf\x80"', '"\u0fc0"') + pb(b'"\xe0\xbf\xbf"', '"\u0fff"') + pb(b'"\xe0\xbf\xc0"', '"\ufffd\ufffd"') + pb(b'"\xe0\xbf\xc2"', '"\ufffd\ufffd"') + pb(b'"\xe0\xc0\x80"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xe0\xc2\x80"', '"\ufffd\x80"') + + # Boundary cases: 3-byte sequences starting with 0xE1..0xEC + pb(b'"\xe1\x7f\x80"', '"\ufffd\x7f\ufffd"') + pb(b'"\xe1\x80\x7f"', '"\ufffd\x7f"') + pb(b'"\xe1\x80\x80"', '"\u1000"') + pb(b'"\xe1\x80\xbf"', '"\u103f"') + pb(b'"\xe1\x80\xc0"', '"\ufffd\ufffd"') + pb(b'"\xe1\x80\xc2"', '"\ufffd\ufffd"') + pb(b'"\xe1\xbf\x7f"', '"\ufffd\x7f"') + pb(b'"\xe1\xbf\x80"', '"\u1fc0"') + pb(b'"\xe1\xbf\xbf"', '"\u1fff"') + pb(b'"\xe1\xbf\xc0"', '"\ufffd\ufffd"') + pb(b'"\xe1\xbf\xc2"', '"\ufffd\ufffd"') + pb(b'"\xe1\xc0\x80"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xe1\xc2\x80"', '"\ufffd\x80"') + pb(b'"\xec\x7f\x80"', '"\ufffd\x7f\ufffd"') + pb(b'"\xec\x80\x7f"', '"\ufffd\x7f"') + pb(b'"\xec\x80\x80"', '"\uc000"') + pb(b'"\xec\x80\xbf"', '"\uc03f"') + pb(b'"\xec\x80\xc0"', '"\ufffd\ufffd"') + pb(b'"\xec\x80\xc2"', '"\ufffd\ufffd"') + 
pb(b'"\xec\xbf\x7f"', '"\ufffd\x7f"') + pb(b'"\xec\xbf\x80"', '"\ucfc0"') + pb(b'"\xec\xbf\xbf"', '"\ucfff"') + pb(b'"\xec\xbf\xc0"', '"\ufffd\ufffd"') + pb(b'"\xec\xbf\xc2"', '"\ufffd\ufffd"') + pb(b'"\xec\xc0\x80"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xec\xc2\x80"', '"\ufffd\x80"') + + # Boundary cases: 3-byte sequences starting with 0xED + pb(b'"\xed\x7f\x80"', '"\ufffd\x7f\ufffd"') + pb(b'"\xed\x80\x7f"', '"\ufffd\x7f"') + pb(b'"\xed\x80\x80"', '"\ud000"') + pb(b'"\xed\x80\xbf"', '"\ud03f"') + pb(b'"\xed\x80\xc0"', '"\ufffd\ufffd"') + pb(b'"\xed\x80\xc2"', '"\ufffd\ufffd"') + pb(b'"\xed\x9f\x7f"', '"\ufffd\x7f"') + pb(b'"\xed\x9f\x80"', '"\ud7c0"') + pb(b'"\xed\x9f\xbf"', '"\ud7ff"') + pb(b'"\xed\x9f\xc0"', '"\ufffd\ufffd"') + pb(b'"\xed\x9f\xc2"', '"\ufffd\ufffd"') + + # Boundary cases: 3-byte sequences starting with 0xEE..0xEF + pb(b'"\xed\xa0\x80"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xee\x7f\x80"', '"\ufffd\x7f\ufffd"') + pb(b'"\xee\x80\x7f"', '"\ufffd\x7f"') + pb(b'"\xee\x80\x80"', '"\ue000"') + pb(b'"\xee\x80\xbf"', '"\ue03f"') + pb(b'"\xee\x80\xc0"', '"\ufffd\ufffd"') + pb(b'"\xee\x80\xc2"', '"\ufffd\ufffd"') + pb(b'"\xee\xbf\x7f"', '"\ufffd\x7f"') + pb(b'"\xee\xbf\x80"', '"\uefc0"') + pb(b'"\xee\xbf\xbf"', '"\uefff"') + pb(b'"\xee\xbf\xc0"', '"\ufffd\ufffd"') + pb(b'"\xee\xbf\xc2"', '"\ufffd\ufffd"') + pb(b'"\xee\xc0\x80"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xee\xc2\x80"', '"\ufffd\x80"') + pb(b'"\xef\x7f\x80"', '"\ufffd\x7f\ufffd"') + pb(b'"\xef\x80\x7f"', '"\ufffd\x7f"') + pb(b'"\xef\x80\x80"', '"\uf000"') + pb(b'"\xef\x80\xbf"', '"\uf03f"') + pb(b'"\xef\x80\xc0"', '"\ufffd\ufffd"') + pb(b'"\xef\x80\xc2"', '"\ufffd\ufffd"') + pb(b'"\xef\xbf\x7f"', '"\ufffd\x7f"') + pb(b'"\xef\xbf\x80"', '"\uffc0"') + pb(b'"\xef\xbf\xbf"', '"\uffff"') + pb(b'"\xef\xbf\xc0"', '"\ufffd\ufffd"') + pb(b'"\xef\xbf\xc2"', '"\ufffd\ufffd"') + pb(b'"\xef\xc0\x80"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xef\xc2\x80"', '"\ufffd\x80"') + + # Boundary cases: 4-byte sequences starting with 0xF0 
+ pb(b'"\xf0\x8f\x80\x80"', '"\ufffd\ufffd\ufffd\ufffd"') + pb(b'"\xf0\x90\x7f\x80"', '"\ufffd\x7f\ufffd"') + pb(b'"\xf0\x90\x80\x7f"', '"\ufffd\x7f"') + pb(b'"\xf0\x90\x80\x80"', '"\U00010000"') + pb(b'"\xf0\x90\x80\xbf"', '"\U0001003f"') + pb(b'"\xf0\x90\x80\xc0"', '"\ufffd\ufffd"') + pb(b'"\xf0\x90\x80\xc2"', '"\ufffd\ufffd"') + pb(b'"\xf0\x90\xbf\x7f"', '"\ufffd\x7f"') + pb(b'"\xf0\x90\xbf\x80"', '"\U00010fc0"') + pb(b'"\xf0\x90\xbf\xbf"', '"\U00010fff"') + pb(b'"\xf0\x90\xbf\xc0"', '"\ufffd\ufffd"') + pb(b'"\xf0\x90\xbf\xc2"', '"\ufffd\ufffd"') + pb(b'"\xf0\x90\xc0\x7f"', '"\ufffd\ufffd\x7f"') + pb(b'"\xf0\x90\xc0\x80"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf0\x90\xc0\xbf"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf0\x90\xc0\xc0"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf0\x90\xc0\xc2"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf0\x90\xc2\x7f"', '"\ufffd\ufffd\x7f"') + pb(b'"\xf0\x90\xc2\x80"', '"\ufffd\x80"') + pb(b'"\xf0\x90\xc2\xbf"', '"\ufffd\xbf"') + pb(b'"\xf0\x90\xc2\xc0"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf0\x90\xc2\xc2"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf0\xbf\x7f\x80"', '"\ufffd\x7f\ufffd"') + pb(b'"\xf0\xbf\x80\x7f"', '"\ufffd\x7f"') + pb(b'"\xf0\xbf\x80\x80"', '"\U0003f000"') + pb(b'"\xf0\xbf\x80\xbf"', '"\U0003f03f"') + pb(b'"\xf0\xbf\x80\xc0"', '"\ufffd\ufffd"') + pb(b'"\xf0\xbf\x80\xc2"', '"\ufffd\ufffd"') + pb(b'"\xf0\xbf\xbf\x7f"', '"\ufffd\x7f"') + pb(b'"\xf0\xbf\xbf\x80"', '"\U0003ffc0"') + pb(b'"\xf0\xbf\xbf\xbf"', '"\U0003ffff"') + pb(b'"\xf0\xbf\xbf\xc0"', '"\ufffd\ufffd"') + pb(b'"\xf0\xbf\xbf\xc2"', '"\ufffd\ufffd"') + pb(b'"\xf0\xbf\xc0\x7f"', '"\ufffd\ufffd\x7f"') + pb(b'"\xf0\xbf\xc0\x80"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf0\xbf\xc0\xbf"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf0\xbf\xc0\xc0"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf0\xbf\xc0\xc2"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf0\xbf\xc2\x7f"', '"\ufffd\ufffd\x7f"') + pb(b'"\xf0\xbf\xc2\x80"', '"\ufffd\x80"') + pb(b'"\xf0\xbf\xc2\xbf"', '"\ufffd\xbf"') + pb(b'"\xf0\xbf\xc2\xc0"', 
'"\ufffd\ufffd\ufffd"') + pb(b'"\xf0\xbf\xc2\xc2"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf0\xc0\x80\x80"', '"\ufffd\ufffd\ufffd\ufffd"') + + # Boundary cases: 4-byte sequences starting with 0xF1..0xF3 + pb(b'"\xf1\x7f\x80\x80"', '"\ufffd\x7f\ufffd\ufffd"') + pb(b'"\xf1\x80\x7f\x80"', '"\ufffd\x7f\ufffd"') + pb(b'"\xf1\x80\x80\x7f"', '"\ufffd\x7f"') + pb(b'"\xf1\x80\x80\x80"', '"\U00040000"') + pb(b'"\xf1\x80\x80\xbf"', '"\U0004003f"') + pb(b'"\xf1\x80\x80\xc0"', '"\ufffd\ufffd"') + pb(b'"\xf1\x80\x80\xc2"', '"\ufffd\ufffd"') + pb(b'"\xf1\x80\xbf\x7f"', '"\ufffd\x7f"') + pb(b'"\xf1\x80\xbf\x80"', '"\U00040fc0"') + pb(b'"\xf1\x80\xbf\xbf"', '"\U00040fff"') + pb(b'"\xf1\x80\xbf\xc0"', '"\ufffd\ufffd"') + pb(b'"\xf1\x80\xbf\xc2"', '"\ufffd\ufffd"') + pb(b'"\xf1\x80\xc0\x7f"', '"\ufffd\ufffd\x7f"') + pb(b'"\xf1\x80\xc0\x80"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf1\x80\xc0\xbf"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf1\x80\xc0\xc0"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf1\x80\xc2\x7f"', '"\ufffd\ufffd\x7f"') + pb(b'"\xf1\x80\xc2\x80"', '"\ufffd\x80"') + pb(b'"\xf1\x80\xc2\xbf"', '"\ufffd\xbf"') + pb(b'"\xf1\x80\xc2\xc0"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf1\xbf\x7f\x80"', '"\ufffd\x7f\ufffd"') + pb(b'"\xf1\xbf\x80\x7f"', '"\ufffd\x7f"') + pb(b'"\xf1\xbf\x80\x80"', '"\U0007f000"') + pb(b'"\xf1\xbf\x80\xbf"', '"\U0007f03f"') + pb(b'"\xf1\xbf\x80\xc0"', '"\ufffd\ufffd"') + pb(b'"\xf1\xbf\x80\xc2"', '"\ufffd\ufffd"') + pb(b'"\xf1\xbf\xbf\x7f"', '"\ufffd\x7f"') + pb(b'"\xf1\xbf\xbf\x80"', '"\U0007ffc0"') + pb(b'"\xf1\xbf\xbf\xbf"', '"\U0007ffff"') + pb(b'"\xf1\xbf\xbf\xc0"', '"\ufffd\ufffd"') + pb(b'"\xf1\xbf\xbf\xc2"', '"\ufffd\ufffd"') + pb(b'"\xf1\xbf\xc0\x7f"', '"\ufffd\ufffd\x7f"') + pb(b'"\xf1\xbf\xc0\x80"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf1\xbf\xc0\xbf"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf1\xbf\xc0\xc0"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf1\xbf\xc0\xc2"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf1\xbf\xc2\x7f"', '"\ufffd\ufffd\x7f"') + pb(b'"\xf1\xbf\xc2\x80"', '"\ufffd\x80"') + 
pb(b'"\xf1\xbf\xc2\xbf"', '"\ufffd\xbf"') + pb(b'"\xf1\xbf\xc2\xc0"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf1\xbf\xc2\xc2"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf1\xc0\x80\x80"', '"\ufffd\ufffd\ufffd\ufffd"') + pb(b'"\xf1\xc2\x80\x80"', '"\ufffd\x80\ufffd"') + pb(b'"\xf3\x7f\x80\x80"', '"\ufffd\x7f\ufffd\ufffd"') + pb(b'"\xf3\x80\x7f\x80"', '"\ufffd\x7f\ufffd"') + pb(b'"\xf3\x80\x80\x7f"', '"\ufffd\x7f"') + pb(b'"\xf3\x80\x80\x80"', '"\U000c0000"') + pb(b'"\xf3\x80\x80\xbf"', '"\U000c003f"') + pb(b'"\xf3\x80\x80\xc0"', '"\ufffd\ufffd"') + pb(b'"\xf3\x80\x80\xc2"', '"\ufffd\ufffd"') + pb(b'"\xf3\x80\xbf\x7f"', '"\ufffd\x7f"') + pb(b'"\xf3\x80\xbf\x80"', '"\U000c0fc0"') + pb(b'"\xf3\x80\xbf\xbf"', '"\U000c0fff"') + pb(b'"\xf3\x80\xbf\xc0"', '"\ufffd\ufffd"') + pb(b'"\xf3\x80\xbf\xc2"', '"\ufffd\ufffd"') + pb(b'"\xf3\x80\xc0\x7f"', '"\ufffd\ufffd\x7f"') + pb(b'"\xf3\x80\xc0\x80"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf3\x80\xc0\xbf"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf3\x80\xc0\xc0"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf3\x80\xc0\xc2"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf3\x80\xc2\x7f"', '"\ufffd\ufffd\x7f"') + pb(b'"\xf3\x80\xc2\x80"', '"\ufffd\x80"') + pb(b'"\xf3\x80\xc2\xbf"', '"\ufffd\xbf"') + pb(b'"\xf3\x80\xc2\xc0"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf3\x80\xc2\xc2"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf3\xbf\x7f\x80"', '"\ufffd\x7f\ufffd"') + pb(b'"\xf3\xbf\x80\x7f"', '"\ufffd\x7f"') + pb(b'"\xf3\xbf\x80\x80"', '"\U000ff000"') + pb(b'"\xf3\xbf\x80\xbf"', '"\U000ff03f"') + pb(b'"\xf3\xbf\x80\xc0"', '"\ufffd\ufffd"') + pb(b'"\xf3\xbf\x80\xc2"', '"\ufffd\ufffd"') + pb(b'"\xf3\xbf\xbf\x7f"', '"\ufffd\x7f"') + pb(b'"\xf3\xbf\xbf\x80"', '"\U000fffc0"') + pb(b'"\xf3\xbf\xbf\xbf"', '"\U000fffff"') + pb(b'"\xf3\xbf\xbf\xc0"', '"\ufffd\ufffd"') + pb(b'"\xf3\xbf\xbf\xc2"', '"\ufffd\ufffd"') + pb(b'"\xf3\xbf\xc0\x7f"', '"\ufffd\ufffd\x7f"') + pb(b'"\xf3\xbf\xc0\x80"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf3\xbf\xc0\xbf"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf3\xbf\xc0\xc0"', 
'"\ufffd\ufffd\ufffd"') + pb(b'"\xf3\xbf\xc0\xc2"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf3\xbf\xc2\x7f"', '"\ufffd\ufffd\x7f"') + pb(b'"\xf3\xbf\xc2\x80"', '"\ufffd\x80"') + pb(b'"\xf3\xbf\xc2\xbf"', '"\ufffd\xbf"') + pb(b'"\xf3\xbf\xc2\xc0"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf3\xbf\xc2\xc2"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf3\xc0\x80\x80"', '"\ufffd\ufffd\ufffd\ufffd"') + pb(b'"\xf3\xc2\x80\x80"', '"\ufffd\x80\ufffd"') + + # Boundary cases: 4-byte sequences starting with 0xF4 + pb(b'"\xf4\x7f\x80\x80"', '"\ufffd\x7f\ufffd\ufffd"') + pb(b'"\xf4\x80\x7f\x80"', '"\ufffd\x7f\ufffd"') + pb(b'"\xf4\x80\x80\x7f"', '"\ufffd\x7f"') + pb(b'"\xf4\x80\x80\x80"', '"\U00100000"') + pb(b'"\xf4\x80\x80\xbf"', '"\U0010003f"') + pb(b'"\xf4\x80\x80\xc0"', '"\ufffd\ufffd"') + pb(b'"\xf4\x80\x80\xc2"', '"\ufffd\ufffd"') + pb(b'"\xf4\x80\xbf\x7f"', '"\ufffd\x7f"') + pb(b'"\xf4\x80\xbf\x80"', '"\U00100fc0"') + pb(b'"\xf4\x80\xbf\xbf"', '"\U00100fff"') + pb(b'"\xf4\x80\xbf\xc0"', '"\ufffd\ufffd"') + pb(b'"\xf4\x80\xbf\xc2"', '"\ufffd\ufffd"') + pb(b'"\xf4\x80\xc0\x7f"', '"\ufffd\ufffd\x7f"') + pb(b'"\xf4\x80\xc0\x80"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf4\x80\xc0\xbf"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf4\x80\xc0\xc0"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf4\x80\xc0\xc2"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf4\x80\xc2\x7f"', '"\ufffd\ufffd\x7f"') + pb(b'"\xf4\x80\xc2\x80"', '"\ufffd\x80"') + pb(b'"\xf4\x80\xc2\xbf"', '"\ufffd\xbf"') + pb(b'"\xf4\x80\xc2\xc0"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf4\x80\xc2\xc2"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf4\x8f\x7f\x80"', '"\ufffd\x7f\ufffd"') + pb(b'"\xf4\x8f\x80\x7f"', '"\ufffd\x7f"') + pb(b'"\xf4\x8f\x80\x80"', '"\U0010f000"') + pb(b'"\xf4\x8f\x80\xbf"', '"\U0010f03f"') + pb(b'"\xf4\x8f\x80\xc0"', '"\ufffd\ufffd"') + pb(b'"\xf4\x8f\x80\xc2"', '"\ufffd\ufffd"') + pb(b'"\xf4\x8f\xbf\x7f"', '"\ufffd\x7f"') + pb(b'"\xf4\x8f\xbf\x80"', '"\U0010ffc0"') + pb(b'"\xf4\x8f\xbf\xbf"', '"\U0010ffff"') + pb(b'"\xf4\x8f\xbf\xc0"', '"\ufffd\ufffd"') + 
pb(b'"\xf4\x8f\xbf\xc2"', '"\ufffd\ufffd"') + pb(b'"\xf4\x8f\xc0\x7f"', '"\ufffd\ufffd\x7f"') + pb(b'"\xf4\x8f\xc0\x80"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf4\x8f\xc0\xbf"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf4\x8f\xc0\xc0"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf4\x8f\xc0\xc2"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf4\x8f\xc2\x7f"', '"\ufffd\ufffd\x7f"') + pb(b'"\xf4\x8f\xc2\x80"', '"\ufffd\x80"') + pb(b'"\xf4\x8f\xc2\xbf"', '"\ufffd\xbf"') + pb(b'"\xf4\x8f\xc2\xc0"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf4\x8f\xc2\xc2"', '"\ufffd\ufffd\ufffd"') + pb(b'"\xf4\x90\x80\x80"', '"\ufffd\ufffd\ufffd\ufffd"') + pb(b'"\xf5\x80\x80\x80"', '"\ufffd\ufffd\ufffd\ufffd"') + + # Boundary case: too large codepoint (> U+10FFFF) + pb(b'"\xf5\x80\x80\x80"', '"\ufffd\ufffd\ufffd\ufffd"') + def test_find_either_of_two_bytes(self): sizes = []