mirror of
https://github.com/kovidgoyal/kitty
synced 2026-06-08 22:28:24 +02:00
Merge branch 'utf-8-simd-parser-fix' of https://github.com/Wukuyon/kitty
This commit is contained in:
@@ -541,7 +541,7 @@ FUNC(utf8_decode_to_esc)(UTF8Decoder *d, const uint8_t *src_data, size_t src_len
|
||||
src_data += d->num_consumed; src_len -= d->num_consumed;
|
||||
}
|
||||
const integer_t esc_vec = set1_epi8(0x1b);
|
||||
const integer_t zero = create_zero_integer(), one = set1_epi8(1), two = set1_epi8(2), three = set1_epi8(3), numbered = numbered_bytes();
|
||||
const integer_t zero = create_zero_integer(), one = set1_epi8(1), two = set1_epi8(2), three = set1_epi8(3), four = set1_epi8(4), numbered = numbered_bytes();
|
||||
const uint8_t *limit = src_data + src_len, *p = src_data, *start_of_current_chunk = src_data;
|
||||
bool sentinel_found = false;
|
||||
unsigned chunk_src_sz = 0;
|
||||
@@ -593,10 +593,14 @@ start_classification:
|
||||
handle_trailing_bytes();
|
||||
continue;
|
||||
}
|
||||
// Classify the bytes
|
||||
// Classify the bytes by whether they may be the start of a 2-byte, 3-byte, or 4-byte sequence.
|
||||
// This is only an initial, potential classification.
|
||||
// 0xC0 and 0xC1 are initially classified as potential starter bytes of 2-byte sequences.
|
||||
// And 0xF5..0xFF are initially classified as potential starter bytes of 4-byte sequences.
|
||||
// They will be marked as actually invalid later in the chunk_is_invalid checks.
|
||||
integer_t state = set1_epi8(0x80);
|
||||
const integer_t vec_signed = add_epi8(vec, state); // needed because cmplt_epi8 works only on signed chars
|
||||
|
||||
// state now has 0x80 on all bytes
|
||||
const integer_t bytes_indicating_start_of_two_byte_sequence = cmplt_epi8(set1_epi8(0xc0 - 1 - 0x80), vec_signed);
|
||||
state = blendv_epi8(state, set1_epi8(0xc2), bytes_indicating_start_of_two_byte_sequence);
|
||||
// state now has 0xc2 on all bytes that start a 2 or more byte sequence and 0x80 on the rest
|
||||
@@ -631,11 +635,74 @@ start_classification:
|
||||
vec = zero_last_n_bytes(vec, sizeof(integer_t) - chunk_src_sz);
|
||||
goto start_classification;
|
||||
}
|
||||
// Only ASCII chars should have corresponding byte of counts == 0
|
||||
if (ascii_mask != movemask_epi8(cmpgt_epi8(counts, zero))) { abort_with_invalid_utf8(); }
|
||||
// The difference between a byte in counts and the next one should be negative,
|
||||
// zero, or one. Any other value means there is not enough continuation bytes.
|
||||
if (!is_zero(cmpgt_epi8(subtract_epi8(shift_right_by_one_byte(counts), counts), one))) { abort_with_invalid_utf8(); }
|
||||
|
||||
// The next section performs detailed validation of the chunk's byte sequences.
|
||||
// It accumulates validation errors into a chunk_is_invalid vector.
|
||||
// When chunk_is_invalid has any non-zero byte, then the chunk contains invalid UTF-8.
|
||||
// chunk_is_invalid is a vector, and not a bitmask or boolean,
|
||||
// because the or_si SIMD operation is empirically faster than movemask_epi8 with |= or ||=.
|
||||
integer_t chunk_is_invalid;
|
||||
|
||||
// Only bytes within the ASCII range should have counts[i] == 0, and vice versa.
|
||||
// Detect any mismatch between the two conditions for each chunk byte.
|
||||
// If there is any mismatch, then the chunk has invalid UTF-8, so set all bytes in chunk_is_invalid to 0xFF;
|
||||
// otherwise the chunk might be valid, so set all bytes in chunk_is_invalid to 0x00.
|
||||
// Without this, "\x80" would incorrectly be decoded as a "\x00".
|
||||
// This also validates that continuation bytes' positions do not have ASCII bytes (< 0x80).
|
||||
// Without this, "\xe0\xa0\x7f\x01" would incorrectly be decoded as "\x00\x01".
|
||||
// In that example, 0x7F has an ascii_mask bit of 0 (i.e., it is within 0x00..0x7F),
|
||||
// but it has a counts value of 1, not 0 (i.e., it is the last remaining byte of a multi-byte sequence).
|
||||
// Therefore there is a count mismatch, indicating that the chunk is ill-formed UTF-8.
|
||||
// (If the following "\x01" were absent, and the "\x7f" were the last byte of the chunk,
|
||||
// then the `check_for_trailing_bytes` validation above detects the error as a trailing incomplete sequence.)
|
||||
const int ascii_sequence_count_mismatches = ascii_mask ^ movemask_epi8(cmpgt_epi8(counts, zero));
|
||||
chunk_is_invalid = set1_epi8(ascii_sequence_count_mismatches ? 0xff : 0x00);
|
||||
|
||||
// Validate 2-byte sequence starter bytes: 0xC0..0xC1 are invalid (overlong encodings for U+0000..U+007F).
|
||||
// Without this, "\xc0\x80" would incorrectly be decoded as a "\x00".
|
||||
chunk_is_invalid = or_si(chunk_is_invalid, and_si(bytes_indicating_start_of_two_byte_sequence, cmplt_epi8(vec, set1_epi8(0xc2))));
|
||||
|
||||
// Validate 4-byte sequence starter bytes: 0xF5..0xFF are invalid (out of Unicode codespace).
|
||||
// Without this, "\xff\x80\x80\x80" would incorrectly be decoded as an ill-formed "\U003C0000".
|
||||
chunk_is_invalid = or_si(chunk_is_invalid, and_si(bytes_indicating_start_of_four_byte_sequence, cmpgt_epi8(vec, set1_epi8(0xf4))));
|
||||
|
||||
// Validate that all continuation bytes' positions do not have non-ASCII starter bytes (>=0xC0).
|
||||
// If counts[i] > count[i], the chunk byte at i is in the middle of a previous sequence but also classified as a starter byte.
|
||||
// Without this, "\xf0\x90\xc2\x80" would have overlapping sequences, and it would be incorrectly decoded elsewhere as an empty string.
|
||||
chunk_is_invalid = or_si(chunk_is_invalid, andnot_si(cmplt_epi8(vec, set1_epi8(0xc0)), cmpgt_epi8(counts, count)));
|
||||
|
||||
// Validate second bytes of E0-starting 3-byte sequences.
|
||||
// 0xE0 must be followed by 0xA0..0xBF (not 0x80..0x9F) to avoid overlong encodings.
|
||||
// Without this, "\xe0\x80\x80" would incorrectly be decoded as a "\x00".
|
||||
const integer_t e0_starter_bytes = cmpeq_epi8(vec, set1_epi8(0xe0));
|
||||
const integer_t e0_first_follower_bytes = shift_right_by_one_byte(e0_starter_bytes);
|
||||
chunk_is_invalid = or_si(chunk_is_invalid, and_si(e0_first_follower_bytes, cmplt_epi8(and_si(e0_first_follower_bytes, vec), set1_epi8(0xa0))));
|
||||
|
||||
// Validate second bytes of ED-starting 3-byte sequences.
|
||||
// 0xED must be followed by 0x80..0x9F (not 0xA0..0xBF) to avoid UTF-16 surrogates.
|
||||
// Without this, "\xed\xa0\x80" would incorrectly be decoded as an isolated surrogate "\uD800".
|
||||
const integer_t ed_starter_bytes = cmpeq_epi8(vec, set1_epi8(0xed));
|
||||
const integer_t ed_first_follower_bytes = shift_right_by_one_byte(ed_starter_bytes);
|
||||
chunk_is_invalid = or_si(chunk_is_invalid, and_si(ed_first_follower_bytes, cmpgt_epi8(and_si(ed_first_follower_bytes, vec), set1_epi8(0x9f))));
|
||||
|
||||
// Validate second bytes of F0-starting 4-byte sequences.
|
||||
// F0 must be followed by 0x90..0xBF (not 0x80..0x8F) to avoid overlong encodings.
|
||||
// Without this, "\xf0\x80\x80\x80" would incorrectly be decoded as a "\x0000".
|
||||
const integer_t f0_starter_bytes = cmpeq_epi8(vec, set1_epi8(0xf0));
|
||||
const integer_t f0_first_follower_bytes = shift_right_by_one_byte(f0_starter_bytes);
|
||||
chunk_is_invalid = or_si(chunk_is_invalid, and_si(f0_first_follower_bytes, cmplt_epi8(and_si(f0_first_follower_bytes, vec), set1_epi8(0x90))));
|
||||
|
||||
// Validate second bytes of F4-starting 4-byte sequences.
|
||||
// F4 must be followed by 0x80..0x8F (not 0x90..0xBF) to stay within the Unicode codespace.
|
||||
// Without this, "\xf4\x90\x80\x80" would incorrectly be decoded as an ill-formed "\U00110000".
|
||||
const integer_t f4_starter_bytes = cmpeq_epi8(vec, set1_epi8(0xf4));
|
||||
const integer_t f4_first_follower_bytes = shift_right_by_one_byte(f4_starter_bytes);
|
||||
chunk_is_invalid = or_si(chunk_is_invalid, and_si(f4_first_follower_bytes, cmpgt_epi8(and_si(f4_first_follower_bytes, vec), set1_epi8(0x8f))));
|
||||
|
||||
// Check for any accumulated validation errors and, if found,
|
||||
// fall back to slow scalar decoding of this chunk,
|
||||
// which handles replacement of invalid sequences with U+FFFD
|
||||
if (!is_zero(chunk_is_invalid)) { abort_with_invalid_utf8(); }
|
||||
|
||||
// Process the bytes storing the three resulting bytes that make up the unicode codepoint
|
||||
// mask all control bits so that we have only useful bits left
|
||||
@@ -673,7 +740,7 @@ start_classification:
|
||||
|
||||
// The last byte is made up of bits 5 and 6 from count == 3 and 3 bits from count == 4
|
||||
integer_t output3 = and_si(three, shift_right_by_bits32(vec, 4)); // bits 5 and 6 from count == 3
|
||||
const integer_t count4_locations = cmpeq_epi8(counts, set1_epi8(4));
|
||||
const integer_t count4_locations = cmpeq_epi8(counts, four);
|
||||
// 3 bits from count == 4 locations, placed at count == 3 locations shifted left by 2 bits
|
||||
output3 = or_si(output3,
|
||||
and_si(set1_epi8(0xfc),
|
||||
|
||||
@@ -214,9 +214,9 @@ class TestParser(BaseTest):
|
||||
return esc_found, ''.join(parts), total_consumed
|
||||
|
||||
reset_state()
|
||||
actual = parse_parts(1)
|
||||
expected = parse_parts(1)
|
||||
reset_state()
|
||||
expected = parse_parts(which)
|
||||
actual = parse_parts(which)
|
||||
self.ae(expected, actual, msg=f'Failed for {a} with {which=}\n{expected!r} !=\n{actual!r}')
|
||||
return actual
|
||||
|
||||
@@ -288,6 +288,7 @@ class TestParser(BaseTest):
|
||||
pb(b'"\xe0\xa0"', '"\ufffd"')
|
||||
pb(b'"\xf0\x9f\x98"', '"\ufffd"')
|
||||
pb(b'"\xef\x93\x94\x95"', '"\uf4d4\ufffd"')
|
||||
|
||||
# Lone continuation bytes with no leading starts
|
||||
pb(b'"\xbf"', '"\ufffd"')
|
||||
pb(b'"\x80"', '"\ufffd"')
|
||||
@@ -321,10 +322,6 @@ class TestParser(BaseTest):
|
||||
# Bad continuation byte (restored as ASCII)
|
||||
pb(b'"\xe1\x28\xa1"', '"\ufffd(\ufffd"') # )
|
||||
|
||||
# The following all fail when using SIMD and need to be fixed in the SIMD parser
|
||||
if which != 1:
|
||||
continue
|
||||
|
||||
# Overlong 2-byte sequence for U+0000 (should be `0x00`)
|
||||
pb(b'"\xc0\x80"', '"\ufffd\ufffd"')
|
||||
|
||||
@@ -340,9 +337,305 @@ class TestParser(BaseTest):
|
||||
# Low surrogate code point
|
||||
pb(b'"\xed\xb0\x80"', '"\ufffd\ufffd\ufffd"')
|
||||
|
||||
# Too large first codepoint
|
||||
# Too large starter byte
|
||||
pb(b'"\xff\x80\x80\x80"', '"\ufffd\ufffd\ufffd\ufffd"')
|
||||
|
||||
# The following boundary cases come from the table of well-formed UTF-8 byte sequences
|
||||
# <https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506>`_.
|
||||
# For continuation bytes, both 0xC0 and 0xC2 are tested as values that exceed the valid maximum.
|
||||
# This is because 0xC0 is an invalid starter byte, but 0xC2 is also a starter byte for 2-byte sequences.
|
||||
# simd-string-impl.h prefers classifying bytes as starter bytes when possible (e.g., in "\xf0\x90\xc2\x80").
|
||||
# The tests need to check that simd-string-impl.h correctly detects
|
||||
# starter bytes that are actually invalid continution bytes, like 0xC2.
|
||||
|
||||
# Boundary cases: 2-byte sequences
|
||||
pb(b'"\xc1\x7f"', '"\ufffd\x7f"')
|
||||
pb(b'"\xc1\x80"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xc1\xbf"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xc1\xc0"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xc1\xc2"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xc2\x7f"', '"\ufffd\x7f"')
|
||||
pb(b'"\xc2\x80"', '"\x80"')
|
||||
pb(b'"\xc2\xbf"', '"\xbf"')
|
||||
pb(b'"\xc2\xc0"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xc2\xc2"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xdf\x7f"', '"\ufffd\x7f"')
|
||||
pb(b'"\xdf\x80"', '"\u07c0"')
|
||||
pb(b'"\xdf\xbf"', '"\u07ff"')
|
||||
pb(b'"\xdf\xc0"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xdf\xc2"', '"\ufffd\ufffd"')
|
||||
|
||||
# Boundary cases: 3-byte sequences starting with 0xE0
|
||||
pb(b'"\xe0\x9f\x80"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xe0\xa0\x7f"', '"\ufffd\x7f"')
|
||||
pb(b'"\xe0\xa0\x80"', '"\u0800"')
|
||||
pb(b'"\xe0\xa0\xbf"', '"\u083f"')
|
||||
pb(b'"\xe0\xa0\xc0"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xe0\xa0\xc2"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xe0\xbf\x7f"', '"\ufffd\x7f"')
|
||||
pb(b'"\xe0\xbf\x80"', '"\u0fc0"')
|
||||
pb(b'"\xe0\xbf\xbf"', '"\u0fff"')
|
||||
pb(b'"\xe0\xbf\xc0"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xe0\xbf\xc2"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xe0\xc0\x80"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xe0\xc2\x80"', '"\ufffd\x80"')
|
||||
|
||||
# Boundary cases: 3-byte sequences starting with 0xE1..0xEC
|
||||
pb(b'"\xe1\x7f\x80"', '"\ufffd\x7f\ufffd"')
|
||||
pb(b'"\xe1\x80\x7f"', '"\ufffd\x7f"')
|
||||
pb(b'"\xe1\x80\x80"', '"\u1000"')
|
||||
pb(b'"\xe1\x80\xbf"', '"\u103f"')
|
||||
pb(b'"\xe1\x80\xc0"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xe1\x80\xc2"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xe1\xbf\x7f"', '"\ufffd\x7f"')
|
||||
pb(b'"\xe1\xbf\x80"', '"\u1fc0"')
|
||||
pb(b'"\xe1\xbf\xbf"', '"\u1fff"')
|
||||
pb(b'"\xe1\xbf\xc0"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xe1\xbf\xc2"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xe1\xc0\x80"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xe1\xc2\x80"', '"\ufffd\x80"')
|
||||
pb(b'"\xec\x7f\x80"', '"\ufffd\x7f\ufffd"')
|
||||
pb(b'"\xec\x80\x7f"', '"\ufffd\x7f"')
|
||||
pb(b'"\xec\x80\x80"', '"\uc000"')
|
||||
pb(b'"\xec\x80\xbf"', '"\uc03f"')
|
||||
pb(b'"\xec\x80\xc0"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xec\x80\xc2"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xec\xbf\x7f"', '"\ufffd\x7f"')
|
||||
pb(b'"\xec\xbf\x80"', '"\ucfc0"')
|
||||
pb(b'"\xec\xbf\xbf"', '"\ucfff"')
|
||||
pb(b'"\xec\xbf\xc0"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xec\xbf\xc2"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xec\xc0\x80"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xec\xc2\x80"', '"\ufffd\x80"')
|
||||
|
||||
# Boundary cases: 3-byte sequences starting with 0xED
|
||||
pb(b'"\xed\x7f\x80"', '"\ufffd\x7f\ufffd"')
|
||||
pb(b'"\xed\x80\x7f"', '"\ufffd\x7f"')
|
||||
pb(b'"\xed\x80\x80"', '"\ud000"')
|
||||
pb(b'"\xed\x80\xbf"', '"\ud03f"')
|
||||
pb(b'"\xed\x80\xc0"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xed\x80\xc2"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xed\x9f\x7f"', '"\ufffd\x7f"')
|
||||
pb(b'"\xed\x9f\x80"', '"\ud7c0"')
|
||||
pb(b'"\xed\x9f\xbf"', '"\ud7ff"')
|
||||
pb(b'"\xed\x9f\xc0"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xed\x9f\xc2"', '"\ufffd\ufffd"')
|
||||
|
||||
# Boundary cases: 3-byte sequences starting with 0xEE..0xEF
|
||||
pb(b'"\xed\xa0\x80"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xee\x7f\x80"', '"\ufffd\x7f\ufffd"')
|
||||
pb(b'"\xee\x80\x7f"', '"\ufffd\x7f"')
|
||||
pb(b'"\xee\x80\x80"', '"\ue000"')
|
||||
pb(b'"\xee\x80\xbf"', '"\ue03f"')
|
||||
pb(b'"\xee\x80\xc0"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xee\x80\xc2"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xee\xbf\x7f"', '"\ufffd\x7f"')
|
||||
pb(b'"\xee\xbf\x80"', '"\uefc0"')
|
||||
pb(b'"\xee\xbf\xbf"', '"\uefff"')
|
||||
pb(b'"\xee\xbf\xc0"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xee\xbf\xc2"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xee\xc0\x80"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xee\xc2\x80"', '"\ufffd\x80"')
|
||||
pb(b'"\xef\x7f\x80"', '"\ufffd\x7f\ufffd"')
|
||||
pb(b'"\xef\x80\x7f"', '"\ufffd\x7f"')
|
||||
pb(b'"\xef\x80\x80"', '"\uf000"')
|
||||
pb(b'"\xef\x80\xbf"', '"\uf03f"')
|
||||
pb(b'"\xef\x80\xc0"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xef\x80\xc2"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xef\xbf\x7f"', '"\ufffd\x7f"')
|
||||
pb(b'"\xef\xbf\x80"', '"\uffc0"')
|
||||
pb(b'"\xef\xbf\xbf"', '"\uffff"')
|
||||
pb(b'"\xef\xbf\xc0"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xef\xbf\xc2"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xef\xc0\x80"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xef\xc2\x80"', '"\ufffd\x80"')
|
||||
|
||||
# Boundary cases: 4-byte sequences starting with 0xF0
|
||||
pb(b'"\xf0\x8f\x80\x80"', '"\ufffd\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf0\x90\x7f\x80"', '"\ufffd\x7f\ufffd"')
|
||||
pb(b'"\xf0\x90\x80\x7f"', '"\ufffd\x7f"')
|
||||
pb(b'"\xf0\x90\x80\x80"', '"\U00010000"')
|
||||
pb(b'"\xf0\x90\x80\xbf"', '"\U0001003f"')
|
||||
pb(b'"\xf0\x90\x80\xc0"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xf0\x90\x80\xc2"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xf0\x90\xbf\x7f"', '"\ufffd\x7f"')
|
||||
pb(b'"\xf0\x90\xbf\x80"', '"\U00010fc0"')
|
||||
pb(b'"\xf0\x90\xbf\xbf"', '"\U00010fff"')
|
||||
pb(b'"\xf0\x90\xbf\xc0"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xf0\x90\xbf\xc2"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xf0\x90\xc0\x7f"', '"\ufffd\ufffd\x7f"')
|
||||
pb(b'"\xf0\x90\xc0\x80"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf0\x90\xc0\xbf"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf0\x90\xc0\xc0"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf0\x90\xc0\xc2"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf0\x90\xc2\x7f"', '"\ufffd\ufffd\x7f"')
|
||||
pb(b'"\xf0\x90\xc2\x80"', '"\ufffd\x80"')
|
||||
pb(b'"\xf0\x90\xc2\xbf"', '"\ufffd\xbf"')
|
||||
pb(b'"\xf0\x90\xc2\xc0"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf0\x90\xc2\xc2"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf0\xbf\x7f\x80"', '"\ufffd\x7f\ufffd"')
|
||||
pb(b'"\xf0\xbf\x80\x7f"', '"\ufffd\x7f"')
|
||||
pb(b'"\xf0\xbf\x80\x80"', '"\U0003f000"')
|
||||
pb(b'"\xf0\xbf\x80\xbf"', '"\U0003f03f"')
|
||||
pb(b'"\xf0\xbf\x80\xc0"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xf0\xbf\x80\xc2"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xf0\xbf\xbf\x7f"', '"\ufffd\x7f"')
|
||||
pb(b'"\xf0\xbf\xbf\x80"', '"\U0003ffc0"')
|
||||
pb(b'"\xf0\xbf\xbf\xbf"', '"\U0003ffff"')
|
||||
pb(b'"\xf0\xbf\xbf\xc0"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xf0\xbf\xbf\xc2"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xf0\xbf\xc0\x7f"', '"\ufffd\ufffd\x7f"')
|
||||
pb(b'"\xf0\xbf\xc0\x80"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf0\xbf\xc0\xbf"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf0\xbf\xc0\xc0"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf0\xbf\xc0\xc2"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf0\xbf\xc2\x7f"', '"\ufffd\ufffd\x7f"')
|
||||
pb(b'"\xf0\xbf\xc2\x80"', '"\ufffd\x80"')
|
||||
pb(b'"\xf0\xbf\xc2\xbf"', '"\ufffd\xbf"')
|
||||
pb(b'"\xf0\xbf\xc2\xc0"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf0\xbf\xc2\xc2"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf0\xc0\x80\x80"', '"\ufffd\ufffd\ufffd\ufffd"')
|
||||
|
||||
# Boundary cases: 4-byte sequences starting with 0xF1..0xF3
|
||||
pb(b'"\xf1\x7f\x80\x80"', '"\ufffd\x7f\ufffd\ufffd"')
|
||||
pb(b'"\xf1\x80\x7f\x80"', '"\ufffd\x7f\ufffd"')
|
||||
pb(b'"\xf1\x80\x80\x7f"', '"\ufffd\x7f"')
|
||||
pb(b'"\xf1\x80\x80\x80"', '"\U00040000"')
|
||||
pb(b'"\xf1\x80\x80\xbf"', '"\U0004003f"')
|
||||
pb(b'"\xf1\x80\x80\xc0"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xf1\x80\x80\xc2"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xf1\x80\xbf\x7f"', '"\ufffd\x7f"')
|
||||
pb(b'"\xf1\x80\xbf\x80"', '"\U00040fc0"')
|
||||
pb(b'"\xf1\x80\xbf\xbf"', '"\U00040fff"')
|
||||
pb(b'"\xf1\x80\xbf\xc0"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xf1\x80\xbf\xc2"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xf1\x80\xc0\x7f"', '"\ufffd\ufffd\x7f"')
|
||||
pb(b'"\xf1\x80\xc0\x80"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf1\x80\xc0\xbf"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf1\x80\xc0\xc0"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf1\x80\xc2\x7f"', '"\ufffd\ufffd\x7f"')
|
||||
pb(b'"\xf1\x80\xc2\x80"', '"\ufffd\x80"')
|
||||
pb(b'"\xf1\x80\xc2\xbf"', '"\ufffd\xbf"')
|
||||
pb(b'"\xf1\x80\xc2\xc0"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf1\xbf\x7f\x80"', '"\ufffd\x7f\ufffd"')
|
||||
pb(b'"\xf1\xbf\x80\x7f"', '"\ufffd\x7f"')
|
||||
pb(b'"\xf1\xbf\x80\x80"', '"\U0007f000"')
|
||||
pb(b'"\xf1\xbf\x80\xbf"', '"\U0007f03f"')
|
||||
pb(b'"\xf1\xbf\x80\xc0"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xf1\xbf\x80\xc2"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xf1\xbf\xbf\x7f"', '"\ufffd\x7f"')
|
||||
pb(b'"\xf1\xbf\xbf\x80"', '"\U0007ffc0"')
|
||||
pb(b'"\xf1\xbf\xbf\xbf"', '"\U0007ffff"')
|
||||
pb(b'"\xf1\xbf\xbf\xc0"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xf1\xbf\xbf\xc2"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xf1\xbf\xc0\x7f"', '"\ufffd\ufffd\x7f"')
|
||||
pb(b'"\xf1\xbf\xc0\x80"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf1\xbf\xc0\xbf"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf1\xbf\xc0\xc0"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf1\xbf\xc0\xc2"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf1\xbf\xc2\x7f"', '"\ufffd\ufffd\x7f"')
|
||||
pb(b'"\xf1\xbf\xc2\x80"', '"\ufffd\x80"')
|
||||
pb(b'"\xf1\xbf\xc2\xbf"', '"\ufffd\xbf"')
|
||||
pb(b'"\xf1\xbf\xc2\xc0"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf1\xbf\xc2\xc2"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf1\xc0\x80\x80"', '"\ufffd\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf1\xc2\x80\x80"', '"\ufffd\x80\ufffd"')
|
||||
pb(b'"\xf3\x7f\x80\x80"', '"\ufffd\x7f\ufffd\ufffd"')
|
||||
pb(b'"\xf3\x80\x7f\x80"', '"\ufffd\x7f\ufffd"')
|
||||
pb(b'"\xf3\x80\x80\x7f"', '"\ufffd\x7f"')
|
||||
pb(b'"\xf3\x80\x80\x80"', '"\U000c0000"')
|
||||
pb(b'"\xf3\x80\x80\xbf"', '"\U000c003f"')
|
||||
pb(b'"\xf3\x80\x80\xc0"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xf3\x80\x80\xc2"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xf3\x80\xbf\x7f"', '"\ufffd\x7f"')
|
||||
pb(b'"\xf3\x80\xbf\x80"', '"\U000c0fc0"')
|
||||
pb(b'"\xf3\x80\xbf\xbf"', '"\U000c0fff"')
|
||||
pb(b'"\xf3\x80\xbf\xc0"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xf3\x80\xbf\xc2"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xf3\x80\xc0\x7f"', '"\ufffd\ufffd\x7f"')
|
||||
pb(b'"\xf3\x80\xc0\x80"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf3\x80\xc0\xbf"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf3\x80\xc0\xc0"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf3\x80\xc0\xc2"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf3\x80\xc2\x7f"', '"\ufffd\ufffd\x7f"')
|
||||
pb(b'"\xf3\x80\xc2\x80"', '"\ufffd\x80"')
|
||||
pb(b'"\xf3\x80\xc2\xbf"', '"\ufffd\xbf"')
|
||||
pb(b'"\xf3\x80\xc2\xc0"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf3\x80\xc2\xc2"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf3\xbf\x7f\x80"', '"\ufffd\x7f\ufffd"')
|
||||
pb(b'"\xf3\xbf\x80\x7f"', '"\ufffd\x7f"')
|
||||
pb(b'"\xf3\xbf\x80\x80"', '"\U000ff000"')
|
||||
pb(b'"\xf3\xbf\x80\xbf"', '"\U000ff03f"')
|
||||
pb(b'"\xf3\xbf\x80\xc0"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xf3\xbf\x80\xc2"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xf3\xbf\xbf\x7f"', '"\ufffd\x7f"')
|
||||
pb(b'"\xf3\xbf\xbf\x80"', '"\U000fffc0"')
|
||||
pb(b'"\xf3\xbf\xbf\xbf"', '"\U000fffff"')
|
||||
pb(b'"\xf3\xbf\xbf\xc0"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xf3\xbf\xbf\xc2"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xf3\xbf\xc0\x7f"', '"\ufffd\ufffd\x7f"')
|
||||
pb(b'"\xf3\xbf\xc0\x80"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf3\xbf\xc0\xbf"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf3\xbf\xc0\xc0"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf3\xbf\xc0\xc2"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf3\xbf\xc2\x7f"', '"\ufffd\ufffd\x7f"')
|
||||
pb(b'"\xf3\xbf\xc2\x80"', '"\ufffd\x80"')
|
||||
pb(b'"\xf3\xbf\xc2\xbf"', '"\ufffd\xbf"')
|
||||
pb(b'"\xf3\xbf\xc2\xc0"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf3\xbf\xc2\xc2"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf3\xc0\x80\x80"', '"\ufffd\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf3\xc2\x80\x80"', '"\ufffd\x80\ufffd"')
|
||||
|
||||
# Boundary cases: 4-byte sequences starting with 0xF4
|
||||
pb(b'"\xf4\x7f\x80\x80"', '"\ufffd\x7f\ufffd\ufffd"')
|
||||
pb(b'"\xf4\x80\x7f\x80"', '"\ufffd\x7f\ufffd"')
|
||||
pb(b'"\xf4\x80\x80\x7f"', '"\ufffd\x7f"')
|
||||
pb(b'"\xf4\x80\x80\x80"', '"\U00100000"')
|
||||
pb(b'"\xf4\x80\x80\xbf"', '"\U0010003f"')
|
||||
pb(b'"\xf4\x80\x80\xc0"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xf4\x80\x80\xc2"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xf4\x80\xbf\x7f"', '"\ufffd\x7f"')
|
||||
pb(b'"\xf4\x80\xbf\x80"', '"\U00100fc0"')
|
||||
pb(b'"\xf4\x80\xbf\xbf"', '"\U00100fff"')
|
||||
pb(b'"\xf4\x80\xbf\xc0"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xf4\x80\xbf\xc2"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xf4\x80\xc0\x7f"', '"\ufffd\ufffd\x7f"')
|
||||
pb(b'"\xf4\x80\xc0\x80"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf4\x80\xc0\xbf"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf4\x80\xc0\xc0"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf4\x80\xc0\xc2"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf4\x80\xc2\x7f"', '"\ufffd\ufffd\x7f"')
|
||||
pb(b'"\xf4\x80\xc2\x80"', '"\ufffd\x80"')
|
||||
pb(b'"\xf4\x80\xc2\xbf"', '"\ufffd\xbf"')
|
||||
pb(b'"\xf4\x80\xc2\xc0"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf4\x80\xc2\xc2"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf4\x8f\x7f\x80"', '"\ufffd\x7f\ufffd"')
|
||||
pb(b'"\xf4\x8f\x80\x7f"', '"\ufffd\x7f"')
|
||||
pb(b'"\xf4\x8f\x80\x80"', '"\U0010f000"')
|
||||
pb(b'"\xf4\x8f\x80\xbf"', '"\U0010f03f"')
|
||||
pb(b'"\xf4\x8f\x80\xc0"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xf4\x8f\x80\xc2"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xf4\x8f\xbf\x7f"', '"\ufffd\x7f"')
|
||||
pb(b'"\xf4\x8f\xbf\x80"', '"\U0010ffc0"')
|
||||
pb(b'"\xf4\x8f\xbf\xbf"', '"\U0010ffff"')
|
||||
pb(b'"\xf4\x8f\xbf\xc0"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xf4\x8f\xbf\xc2"', '"\ufffd\ufffd"')
|
||||
pb(b'"\xf4\x8f\xc0\x7f"', '"\ufffd\ufffd\x7f"')
|
||||
pb(b'"\xf4\x8f\xc0\x80"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf4\x8f\xc0\xbf"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf4\x8f\xc0\xc0"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf4\x8f\xc0\xc2"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf4\x8f\xc2\x7f"', '"\ufffd\ufffd\x7f"')
|
||||
pb(b'"\xf4\x8f\xc2\x80"', '"\ufffd\x80"')
|
||||
pb(b'"\xf4\x8f\xc2\xbf"', '"\ufffd\xbf"')
|
||||
pb(b'"\xf4\x8f\xc2\xc0"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf4\x8f\xc2\xc2"', '"\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf4\x90\x80\x80"', '"\ufffd\ufffd\ufffd\ufffd"')
|
||||
pb(b'"\xf5\x80\x80\x80"', '"\ufffd\ufffd\ufffd\ufffd"')
|
||||
|
||||
# Boundary case: too large codepoint (> U+10FFFF)
|
||||
pb(b'"\xf5\x80\x80\x80"', '"\ufffd\ufffd\ufffd\ufffd"')
|
||||
|
||||
|
||||
def test_find_either_of_two_bytes(self):
|
||||
sizes = []
|
||||
|
||||
Reference in New Issue
Block a user