Fix UTF-8 overlong and special range checks in simd-string-impl.h

Modified `start_classification` in `utf8_decode_to_esc` in `simd-string-impl.h`, which now:

Rejects `0xC0`, `0xC1` and `0xF5..0xFF` lead bytes in UTF-8 subsequences.

Enforces the restricted ranges for the second byte of sequences that start with `0xE0`, `0xED`, `0xF0` and `0xF4`, rejecting overlong encodings, surrogates, and code points above U+10FFFF.

Accumulates UTF-8 validation errors in a single vector to avoid many conditional branches.

Worsens unicode benchmark performance by about 4%.
This commit is contained in:
Wukuyon
2025-10-18 17:11:58 -06:00
parent 295951348c
commit 65890de60d
2 changed files with 59 additions and 9 deletions

View File

@@ -541,7 +541,7 @@ FUNC(utf8_decode_to_esc)(UTF8Decoder *d, const uint8_t *src_data, size_t src_len
src_data += d->num_consumed; src_len -= d->num_consumed;
}
const integer_t esc_vec = set1_epi8(0x1b);
const integer_t zero = create_zero_integer(), one = set1_epi8(1), two = set1_epi8(2), three = set1_epi8(3), numbered = numbered_bytes();
const integer_t zero = create_zero_integer(), one = set1_epi8(1), two = set1_epi8(2), three = set1_epi8(3), four = set1_epi8(4), numbered = numbered_bytes();
const uint8_t *limit = src_data + src_len, *p = src_data, *start_of_current_chunk = src_data;
bool sentinel_found = false;
unsigned chunk_src_sz = 0;
@@ -593,10 +593,14 @@ start_classification:
handle_trailing_bytes();
continue;
}
// Classify the bytes
// Classify the bytes by whether they may be the start of a 2-byte, 3-byte, or 4-byte sequence.
// This is only an initial, potential classification.
// 0xC0 and 0xC1 are initially classified as potential starter bytes of 2-byte sequences.
// And 0xF5..0xFF are initially classified as potential starter bytes of 4-byte sequences.
// They will be marked as actually invalid later in the chunk_is_invalid checks.
integer_t state = set1_epi8(0x80);
const integer_t vec_signed = add_epi8(vec, state); // needed because cmplt_epi8 works only on signed chars
// state now has 0x80 on all bytes
const integer_t bytes_indicating_start_of_two_byte_sequence = cmplt_epi8(set1_epi8(0xc0 - 1 - 0x80), vec_signed);
state = blendv_epi8(state, set1_epi8(0xc2), bytes_indicating_start_of_two_byte_sequence);
// state now has 0xc2 on all bytes that start a 2 or more byte sequence and 0x80 on the rest
@@ -631,11 +635,61 @@ start_classification:
vec = zero_last_n_bytes(vec, sizeof(integer_t) - chunk_src_sz);
goto start_classification;
}
// The next section performs detailed validation of the chunk's byte sequences.
// It accumulates validation errors into a chunk_is_invalid vector.
// When chunk_is_invalid has any non-zero byte, then the chunk contains invalid UTF-8.
// chunk_is_invalid is a vector, and not a bitmask or boolean,
// because the or_si SIMD operation is empirically faster than movemask_epi8 with |= or ||.
integer_t chunk_is_invalid;
// Only ASCII chars should have corresponding byte of counts == 0
if (ascii_mask != movemask_epi8(cmpgt_epi8(counts, zero))) { abort_with_invalid_utf8(); }
// The difference between a byte in counts and the next one should be negative,
// zero, or one. Any other value means there are not enough continuation bytes.
if (!is_zero(cmpgt_epi8(subtract_epi8(shift_right_by_one_byte(counts), counts), one))) { abort_with_invalid_utf8(); }
chunk_is_invalid = cmpgt_epi8(subtract_epi8(shift_right_by_one_byte(counts), counts), one);
// Validate 2-byte sequence starter bytes: 0xC0..0xC1 are invalid (overlong encodings for U+0000..U+007F).
// Without this, "\xc0\x80" would incorrectly be decoded as a "\x00".
chunk_is_invalid = or_si(chunk_is_invalid, and_si(bytes_indicating_start_of_two_byte_sequence, cmplt_epi8(vec, set1_epi8(0xc2))));
// Validate 4-byte sequence starter bytes: 0xF5..0xFF are invalid (out of Unicode codespace).
// Without this, "\xff\x80\x80\x80" would incorrectly be decoded as an ill-formed "\U003C0000".
chunk_is_invalid = or_si(chunk_is_invalid, and_si(bytes_indicating_start_of_four_byte_sequence, cmpgt_epi8(vec, set1_epi8(0xf4))));
// Validate second bytes of E0-starting 3-byte sequences.
// 0xE0 must be followed by 0xA0..0xBF (not 0x80..0x9F) to avoid overlong encodings.
// Without this, "\xe0\x80\x80" would incorrectly be decoded as a "\x00".
const integer_t e0_starter_bytes = cmpeq_epi8(vec, set1_epi8(0xe0));
const integer_t e0_first_follower_bytes = shift_right_by_one_byte(e0_starter_bytes);
chunk_is_invalid = or_si(chunk_is_invalid, and_si(e0_first_follower_bytes, cmplt_epi8(and_si(e0_first_follower_bytes, vec), set1_epi8(0xa0))));
// Validate second bytes of ED-starting 3-byte sequences.
// 0xED must be followed by 0x80..0x9F (not 0xA0..0xBF) to avoid UTF-16 surrogates.
// Without this, "\xed\xa0\x80" would incorrectly be decoded as an isolated surrogate "\uD800".
const integer_t ed_starter_bytes = cmpeq_epi8(vec, set1_epi8(0xed));
const integer_t ed_first_follower_bytes = shift_right_by_one_byte(ed_starter_bytes);
chunk_is_invalid = or_si(chunk_is_invalid, and_si(ed_first_follower_bytes, cmpgt_epi8(and_si(ed_first_follower_bytes, vec), set1_epi8(0x9f))));
// Validate second bytes of F0-starting 4-byte sequences.
// F0 must be followed by 0x90..0xBF (not 0x80..0x8F) to avoid overlong encodings.
// Without this, "\xf0\x80\x80\x80" would incorrectly be decoded as a "\x00".
const integer_t f0_starter_bytes = cmpeq_epi8(vec, set1_epi8(0xf0));
const integer_t f0_first_follower_bytes = shift_right_by_one_byte(f0_starter_bytes);
chunk_is_invalid = or_si(chunk_is_invalid, and_si(f0_first_follower_bytes, cmplt_epi8(and_si(f0_first_follower_bytes, vec), set1_epi8(0x90))));
// Validate second bytes of F4-starting 4-byte sequences.
// F4 must be followed by 0x80..0x8F (not 0x90..0xBF) to stay within the Unicode codespace.
// Without this, "\xf4\x90\x80\x80" would incorrectly be decoded as an ill-formed "\U00110000".
const integer_t f4_starter_bytes = cmpeq_epi8(vec, set1_epi8(0xf4));
const integer_t f4_first_follower_bytes = shift_right_by_one_byte(f4_starter_bytes);
chunk_is_invalid = or_si(chunk_is_invalid, and_si(f4_first_follower_bytes, cmpgt_epi8(and_si(f4_first_follower_bytes, vec), set1_epi8(0x8f))));
// Check for any accumulated validation errors and, if found,
// fall back to slow scalar decoding of this chunk,
// which handles replacement of invalid sequences with U+FFFD
if (!is_zero(chunk_is_invalid)) { abort_with_invalid_utf8(); }
// Process the bytes storing the three resulting bytes that make up the unicode codepoint
// mask all control bits so that we have only useful bits left
@@ -673,7 +727,7 @@ start_classification:
// The last byte is made up of bits 5 and 6 from count == 3 and 3 bits from count == 4
integer_t output3 = and_si(three, shift_right_by_bits32(vec, 4)); // bits 5 and 6 from count == 3
const integer_t count4_locations = cmpeq_epi8(counts, set1_epi8(4));
const integer_t count4_locations = cmpeq_epi8(counts, four);
// 3 bits from count == 4 locations, placed at count == 3 locations shifted left by 2 bits
output3 = or_si(output3,
and_si(set1_epi8(0xfc),

View File

@@ -322,10 +322,6 @@ class TestParser(BaseTest):
# Bad continuation byte (restored as ASCII)
pb(b'"\xe1\x28\xa1"', '"\ufffd(\ufffd"') # )
# The following all fail when using SIMD and need to be fixed in the SIMD parser
if which != 1:
continue
# Overlong 2-byte sequence for U+0000 (should be `0x00`)
pb(b'"\xc0\x80"', '"\ufffd\ufffd"')