diff --git a/kitty/simd-string-impl.h b/kitty/simd-string-impl.h index 64ceb53fb..633e68f06 100644 --- a/kitty/simd-string-impl.h +++ b/kitty/simd-string-impl.h @@ -413,187 +413,210 @@ scalar_decode_all(UTF8Decoder *d, const uint8_t *src, size_t src_sz) { } return pos; } + #undef do_one_byte bool -FUNC(utf8_decode_to_esc)(UTF8Decoder *d, const uint8_t *src, size_t src_sz) { +FUNC(utf8_decode_to_esc)(UTF8Decoder *d, const uint8_t *src_data, size_t src_len) { // Based on the algorithm described in: https://woboq.com/blog/utf-8-processing-using-simd.html d->output.pos = 0; d->num_consumed = 0; - if (d->state.cur != UTF8_ACCEPT) { // Finish the trailing sequence only - d->num_consumed += scalar_decode_to_accept(d, src, src_sz); - src += d->num_consumed; src_sz -= d->num_consumed; + d->num_consumed = scalar_decode_to_accept(d, src_data, src_len); + src_data += d->num_consumed; src_len -= d->num_consumed; } - src_sz = MIN(src_sz, sizeof(integer_t)); - integer_t vec = load_unaligned((integer_t*)src); - const integer_t esc_vec = set1_epi8(0x1b); - const integer_t esc_cmp = cmpeq_epi8(vec, esc_vec); + const integer_t zero = create_zero_integer(), one = set1_epi8(1), two = set1_epi8(2), three = set1_epi8(3), four = set1_epi8(4); + const integer_t vec_c2 = set1_epi8(0xc2), vec_e3 = set1_epi8(0xe3), vec_f4 = set1_epi8(0xf4); + const uint8_t *limit = src_data + src_len, *p = src_data, *start_of_current_chunk = src_data; bool sentinel_found = false; - int num_of_bytes_to_first_esc = bytes_to_first_match(esc_cmp); - if (num_of_bytes_to_first_esc > -1 && (unsigned)num_of_bytes_to_first_esc < src_sz) { - sentinel_found = true; - src_sz = num_of_bytes_to_first_esc; - d->num_consumed += src_sz + 1; // esc is also consumed - } else d->num_consumed += src_sz; - - // use scalar decode for short input - if (src_sz < 4) { - scalar_decode_all(d, src, src_sz); return sentinel_found; - } - if (src_sz < sizeof(integer_t)) vec = zero_last_n_bytes(vec, sizeof(integer_t) - src_sz); - + unsigned 
chunk_src_sz = 0; unsigned num_of_trailing_bytes = 0; - bool check_for_trailing_bytes = true; - // Check if we have pure ASCII and use fast path - debug_register(vec); - int32_t ascii_mask; + while (p < limit && !sentinel_found) { + chunk_src_sz = MIN((size_t)(limit - p), sizeof(integer_t)); + integer_t vec = load_unaligned((integer_t*)p); + start_of_current_chunk = p; + p += chunk_src_sz; + + const integer_t esc_cmp = cmpeq_epi8(vec, esc_vec); + int num_of_bytes_to_first_esc = bytes_to_first_match(esc_cmp); + if (num_of_bytes_to_first_esc > -1 && (unsigned)num_of_bytes_to_first_esc < chunk_src_sz) { + sentinel_found = true; + chunk_src_sz = num_of_bytes_to_first_esc; + d->num_consumed += chunk_src_sz + 1; // esc is also consumed + if (!chunk_src_sz) continue; + } else d->num_consumed += chunk_src_sz; + + if (chunk_src_sz < sizeof(integer_t)) vec = zero_last_n_bytes(vec, sizeof(integer_t) - chunk_src_sz); + + num_of_trailing_bytes = 0; + bool check_for_trailing_bytes = !sentinel_found; + + debug_register(vec); + int32_t ascii_mask; + +#define abort_with_invalid_utf8() { \ + scalar_decode_all(d, start_of_current_chunk, chunk_src_sz + num_of_trailing_bytes); \ + d->num_consumed += num_of_trailing_bytes; \ + break; \ +} + +#define handle_trailing_bytes() if (num_of_trailing_bytes) { \ + if (p >= limit) { \ + scalar_decode_all(d, p - num_of_trailing_bytes, num_of_trailing_bytes); \ + d->num_consumed += num_of_trailing_bytes; \ + break; \ + } \ + p -= num_of_trailing_bytes; \ +} + start_classification: - ascii_mask = movemask_epi8(vec); - if (!ascii_mask) { // no bytes with high bit (0x80) set, so just plain ASCII - FUNC(output_plain_ascii)(d, vec, src_sz); - if (num_of_trailing_bytes) scalar_decode_all(d, src + src_sz, num_of_trailing_bytes); - return sentinel_found; - } - // Classify the bytes - integer_t state = set1_epi8(0x80); - const integer_t vec_signed = add_epi8(vec, state); // needed because cmplt_epi8 works only on signed chars + // Check if we have pure 
ASCII and use fast path + ascii_mask = movemask_epi8(vec); + if (!ascii_mask) { // no bytes with high bit (0x80) set, so just plain ASCII + FUNC(output_plain_ascii)(d, vec, chunk_src_sz); + handle_trailing_bytes(); + continue; + } + // Classify the bytes + integer_t state = set1_epi8(0x80); + const integer_t vec_signed = add_epi8(vec, state); // needed because cmplt_epi8 works only on signed chars - const integer_t bytes_indicating_start_of_two_byte_sequence = cmplt_epi8(set1_epi8(0xc0 - 1 - 0x80), vec_signed); - state = blendv_epi8(state, set1_epi8(0xc2), bytes_indicating_start_of_two_byte_sequence); - // state now has 0xc2 on all bytes that start a 2 or more byte sequence and 0x80 on the rest - const integer_t bytes_indicating_start_of_three_byte_sequence = cmplt_epi8(set1_epi8(0xe0 - 1 - 0x80), vec_signed); - state = blendv_epi8(state, set1_epi8(0xe3), bytes_indicating_start_of_three_byte_sequence); - const integer_t bytes_indicating_start_of_four_byte_sequence = cmplt_epi8(set1_epi8(0xf0 - 1 - 0x80), vec_signed); - state = blendv_epi8(state, set1_epi8(0xf4), bytes_indicating_start_of_four_byte_sequence); - // state now has 0xc2 on all bytes that start a 2 byte sequence, 0xe3 on start of 3-byte sequence, 0xf4 on 4-byte start and 0x80 on rest - debug_register(state); - integer_t mask = and_si(state, set1_epi8(0xf8)); // keep upper 5 bits of state - debug_register(mask); - integer_t count = and_si(state, set1_epi8(0x7)); // keep lower 3 bits of state - debug_register(count); - const integer_t zero = create_zero_integer(), one = set1_epi8(1), two = set1_epi8(2), three = set1_epi8(3); - // count contains the number of bytes in the sequence for the start byte of every sequence and zero elsewhere - // shift 02 bytes by 1 and subtract 1 - integer_t count_subs1 = subtract_saturate_epu8(count, one); - integer_t counts = add_epi8(count, shift_right_by_one_byte(count_subs1)); - // shift 03 and 04 bytes by 2 and subtract 2 - counts = add_epi8(counts, 
shift_right_by_two_bytes(subtract_saturate_epu8(counts, two))); - // counts now contains the number of bytes remaining in each utf-8 sequence of 2 or more bytes - debug_register(counts); - // check for an incomplete trailing utf8 sequence - if (check_for_trailing_bytes && !is_zero(cmplt_epi8(one, and_si(counts, cmpeq_epi8(numbered_bytes(), set1_epi8(src_sz - 1)))))) { - // The value of counts at the last byte is > 1 indicating we have a trailing incomplete sequence - check_for_trailing_bytes = false; - if (src[src_sz-1] >= 0xc0) num_of_trailing_bytes = 1; // 2-, 3- and 4-byte characters with only 1 byte left - else if (src_sz > 1 && src[src_sz-2] >= 0xe0) num_of_trailing_bytes = 2; // 3- and 4-byte characters with only 1 byte left - else if (src_sz > 2 && src[src_sz-3] >= 0xf0) num_of_trailing_bytes = 3; // 4-byte characters with only 3 bytes left - src_sz -= num_of_trailing_bytes; - vec = zero_last_n_bytes(vec, sizeof(integer_t) - src_sz); - goto start_classification; - } - // Only ASCII chars should have corresponding byte of counts == 0 - if (ascii_mask != movemask_epi8(cmpgt_epi8(counts, zero))) goto invalid_utf8; - // The difference between a byte in counts and the next one should be negative, - // zero, or one. Any other value means there is not enough continuation bytes. 
- if (!is_zero(cmpgt_epi8(subtract_epi8(shift_right_by_one_byte(counts), counts), one))) goto invalid_utf8; + const integer_t bytes_indicating_start_of_two_byte_sequence = cmplt_epi8(set1_epi8(0xc0 - 1 - 0x80), vec_signed); + state = blendv_epi8(state, vec_c2, bytes_indicating_start_of_two_byte_sequence); + // state now has 0xc2 on all bytes that start a 2 or more byte sequence and 0x80 on the rest + const integer_t bytes_indicating_start_of_three_byte_sequence = cmplt_epi8(set1_epi8(0xe0 - 1 - 0x80), vec_signed); + state = blendv_epi8(state, vec_e3, bytes_indicating_start_of_three_byte_sequence); + const integer_t bytes_indicating_start_of_four_byte_sequence = cmplt_epi8(set1_epi8(0xf0 - 1 - 0x80), vec_signed); + state = blendv_epi8(state, vec_f4, bytes_indicating_start_of_four_byte_sequence); + // state now has 0xc2 on all bytes that start a 2 byte sequence, 0xe3 on start of 3-byte, 0xf4 on 4-byte start and 0x80 on rest + debug_register(state); + integer_t mask = and_si(state, set1_epi8(0xf8)); // keep upper 5 bits of state + debug_register(mask); + integer_t count = and_si(state, set1_epi8(0x7)); // keep lower 3 bits of state + debug_register(count); + // count contains the number of bytes in the sequence for the start byte of every sequence and zero elsewhere + // shift 02 bytes by 1 and subtract 1 + integer_t count_subs1 = subtract_saturate_epu8(count, one); + integer_t counts = add_epi8(count, shift_right_by_one_byte(count_subs1)); + // shift 03 and 04 bytes by 2 and subtract 2 + counts = add_epi8(counts, shift_right_by_two_bytes(subtract_saturate_epu8(counts, two))); + // counts now contains the number of bytes remaining in each utf-8 sequence of 2 or more bytes + debug_register(counts); + // check for an incomplete trailing utf8 sequence + if (check_for_trailing_bytes && !is_zero(cmplt_epi8(one, and_si(counts, cmpeq_epi8(numbered_bytes(), set1_epi8(chunk_src_sz - 1)))))) { + // The value of counts at the last byte is > 1 indicating we have a trailing 
incomplete sequence + check_for_trailing_bytes = false; + if (start_of_current_chunk[chunk_src_sz-1] >= 0xc0) num_of_trailing_bytes = 1; // 2-, 3- and 4-byte characters with only 1 byte left + else if (chunk_src_sz > 1 && start_of_current_chunk[chunk_src_sz-2] >= 0xe0) num_of_trailing_bytes = 2; // 3- and 4-byte characters with only 2 bytes left + else if (chunk_src_sz > 2 && start_of_current_chunk[chunk_src_sz-3] >= 0xf0) num_of_trailing_bytes = 3; // 4-byte characters with only 3 bytes left + chunk_src_sz -= num_of_trailing_bytes; + d->num_consumed -= num_of_trailing_bytes; + if (!chunk_src_sz) { abort_with_invalid_utf8(); } + vec = zero_last_n_bytes(vec, sizeof(integer_t) - chunk_src_sz); + goto start_classification; + } + // Only ASCII chars should have corresponding byte of counts == 0 + if (ascii_mask != movemask_epi8(cmpgt_epi8(counts, zero))) { abort_with_invalid_utf8(); } + // The difference between a byte in counts and the next one should be negative, + // zero, or one. Any other value means there are not enough continuation bytes. 
+ if (!is_zero(cmpgt_epi8(subtract_epi8(shift_right_by_one_byte(counts), counts), one))) { abort_with_invalid_utf8(); } - // Process the bytes storing the three resulting bytes that make up the unicode codepoint - // mask all control bits so that we have only useful bits left - vec = andnot_si(mask, vec); - debug_register(vec); + // Process the bytes storing the three resulting bytes that make up the unicode codepoint + // mask all control bits so that we have only useful bits left + vec = andnot_si(mask, vec); + debug_register(vec); - // Now calculate the three output vectors + // Now calculate the three output vectors - // The lowest byte is made up of 6 bits from locations with counts == 1 and the lowest two bits from locations with count == 2 - // In addition, the ASCII bytes are copied unchanged from vec - integer_t vec_non_ascii = andnot_si(cmpeq_epi8(counts, zero), vec); - debug_register(vec_non_ascii); - integer_t output1 = blendv_epi8(vec, - or_si( - // there are no count == 1 locations without a count == 2 location to its left so we dont need to AND with count2_locations - vec, and_si(shift_left_by_bits16(shift_right_by_one_byte(vec_non_ascii), 6), set1_epi8(0xc0)) - ), - cmpeq_epi8(counts, one) - ); - debug_register(output1); + // The lowest byte is made up of 6 bits from locations with counts == 1 and the lowest two bits from locations with count == 2 + // In addition, the ASCII bytes are copied unchanged from vec + integer_t vec_non_ascii = andnot_si(cmpeq_epi8(counts, zero), vec); + debug_register(vec_non_ascii); + integer_t output1 = blendv_epi8(vec, + or_si( + // there are no count == 1 locations without a count == 2 location to its left so we dont need to AND with count2_locations + vec, and_si(shift_left_by_bits16(shift_right_by_one_byte(vec_non_ascii), 6), set1_epi8(0xc0)) + ), + cmpeq_epi8(counts, one) + ); + debug_register(output1); - // The next byte is made up of 4 bits (5, 4, 3, 2) from locations with count == 2 and the first 4 bits from 
locations with count == 3 - integer_t count2_locations = cmpeq_epi8(counts, two), count3_locations = cmpeq_epi8(counts, three); - integer_t output2 = and_si(vec, count2_locations); - output2 = shift_right_by_bits32(output2, 2); // selects the bits 5, 4, 3, 2 - // select the first 4 bits from locs with count == 3 by shifting count 3 locations right by one byte and left by 4 bits - output2 = or_si(output2, - and_si(set1_epi8(0xf0), - shift_left_by_bits16(shift_right_by_one_byte(and_si(count3_locations, vec_non_ascii)), 4) - ) - ); - output2 = and_si(output2, count2_locations); // keep only the count2 bytes - output2 = shift_right_by_one_byte(output2); - debug_register(output2); + // The next byte is made up of 4 bits (5, 4, 3, 2) from locations with count == 2 and the first 4 bits from locations with count == 3 + integer_t count2_locations = cmpeq_epi8(counts, two), count3_locations = cmpeq_epi8(counts, three); + integer_t output2 = and_si(vec, count2_locations); + output2 = shift_right_by_bits32(output2, 2); // selects the bits 5, 4, 3, 2 + // select the first 4 bits from locs with count == 3 by shifting count 3 locations right by one byte and left by 4 bits + output2 = or_si(output2, + and_si(set1_epi8(0xf0), + shift_left_by_bits16(shift_right_by_one_byte(and_si(count3_locations, vec_non_ascii)), 4) + ) + ); + output2 = and_si(output2, count2_locations); // keep only the count2 bytes + output2 = shift_right_by_one_byte(output2); + debug_register(output2); - // The last byte is made up of bits 5 and 6 from count == 3 and 3 bits from count == 4 - integer_t output3 = and_si(three, shift_right_by_bits32(vec, 4)); // bits 5 and 6 from count == 3 - integer_t count4_locations = cmpeq_epi8(counts, set1_epi8(4)); - // 3 bits from count == 4 locations, placed at count == 3 locations shifted left by 2 bits - output3 = or_si(output3, - and_si(set1_epi8(0xfc), - shift_left_by_bits16(shift_right_by_one_byte(and_si(count4_locations, vec_non_ascii)), 2) - ) - ); - output3 = 
and_si(output3, count3_locations); // keep only count3 bytes - output3 = shift_right_by_two_bytes(output3); - debug_register(output3); + // The last byte is made up of bits 5 and 6 from count == 3 and 3 bits from count == 4 + integer_t output3 = and_si(three, shift_right_by_bits32(vec, 4)); // bits 5 and 6 from count == 3 + integer_t count4_locations = cmpeq_epi8(counts, four); + // 3 bits from count == 4 locations, placed at count == 3 locations shifted left by 2 bits + output3 = or_si(output3, + and_si(set1_epi8(0xfc), + shift_left_by_bits16(shift_right_by_one_byte(and_si(count4_locations, vec_non_ascii)), 2) + ) + ); + output3 = and_si(output3, count3_locations); // keep only count3 bytes + output3 = shift_right_by_two_bytes(output3); + debug_register(output3); - // Shuffle bytes to remove continuation bytes - integer_t shifts = count_subs1; // number of bytes we need to skip for each UTF-8 sequence - // propagate the shifts to all subsequent bytes by shift and add - shifts = add_epi8(shifts, shift_right_by_one_byte(shifts)); - shifts = add_epi8(shifts, shift_right_by_two_bytes(shifts)); - shifts = add_epi8(shifts, shift_right_by_four_bytes(shifts)); - shifts = add_epi8(shifts, shift_right_by_eight_bytes(shifts)); + // Shuffle bytes to remove continuation bytes + integer_t shifts = count_subs1; // number of bytes we need to skip for each UTF-8 sequence + // propagate the shifts to all subsequent bytes by shift and add + shifts = add_epi8(shifts, shift_right_by_one_byte(shifts)); + shifts = add_epi8(shifts, shift_right_by_two_bytes(shifts)); + shifts = add_epi8(shifts, shift_right_by_four_bytes(shifts)); + shifts = add_epi8(shifts, shift_right_by_eight_bytes(shifts)); #if KITTY_SIMD_LEVEL == 256 - shifts = add_epi8(shifts, shift_right_by_sixteen_bytes(shifts)); + shifts = add_epi8(shifts, shift_right_by_sixteen_bytes(shifts)); #endif - // zero the shifts for discarded continuation bytes - shifts = and_si(shifts, cmplt_epi8(counts, two)); - // now we need to 
convert shifts into a mask for the shuffle. The mask has each byte of the - // form 0000xxxx the lower four bits indicating the destination location for the byte. For 256 bit shuffle we use lower 5 bits. - // First we move the numbers in shifts to discard the unwanted UTF-8 sequence bytes. We note that the numbers - // are bounded by sizeof(integer_t) and so we need at most 4 (for 128 bit) or 5 (for 256 bit) moves. The numbers are - // monotonic from left to right and change value only at the end of a UTF-8 sequence. We move them leftwards, accumulating the - // moves bit-by-bit. + // zero the shifts for discarded continuation bytes + shifts = and_si(shifts, cmplt_epi8(counts, two)); + // now we need to convert shifts into a mask for the shuffle. The mask has each byte of the + // form 0000xxxx the lower four bits indicating the destination location for the byte. For 256 bit shuffle we use lower 5 bits. + // First we move the numbers in shifts to discard the unwanted UTF-8 sequence bytes. We note that the numbers + // are bounded by sizeof(integer_t) and so we need at most 4 (for 128 bit) or 5 (for 256 bit) moves. The numbers are + // monotonic from left to right and change value only at the end of a UTF-8 sequence. We move them leftwards, accumulating the + // moves bit-by-bit. 
#define move(shifts, amt, which_bit) blendv_epi8(shifts, shift_left_by_##amt(shifts), shift_left_by_##amt(shift_left_by_bits16(shifts, 8 - which_bit))) - shifts = move(shifts, one_byte, 1); - shifts = move(shifts, two_bytes, 2); - shifts = move(shifts, four_bytes, 3); - shifts = move(shifts, eight_bytes, 4); + shifts = move(shifts, one_byte, 1); + shifts = move(shifts, two_bytes, 2); + shifts = move(shifts, four_bytes, 3); + shifts = move(shifts, eight_bytes, 4); #if KITTY_SIMD_LEVEL == 256 - shifts = move(shifts, sixteen_bytes, 5); + shifts = move(shifts, sixteen_bytes, 5); #endif #undef move - // convert the shifts into a suitable mask for shuffle by adding the byte number to each byte - shifts = add_epi8(shifts, numbered_bytes()); - debug_register(shifts); + // convert the shifts into a suitable mask for shuffle by adding the byte number to each byte + shifts = add_epi8(shifts, numbered_bytes()); + debug_register(shifts); - output1 = shuffle_epi8(output1, shifts); - output2 = shuffle_epi8(output2, shifts); - output3 = shuffle_epi8(output3, shifts); - debug_register(output1); - debug_register(output2); - debug_register(output3); + output1 = shuffle_epi8(output1, shifts); + output2 = shuffle_epi8(output2, shifts); + output3 = shuffle_epi8(output3, shifts); + debug_register(output1); + debug_register(output2); + debug_register(output3); - const unsigned num_of_discarded_bytes = sum_bytes(count_subs1); - const unsigned num_codepoints = src_sz - num_of_discarded_bytes; - debug("num_of_discarded_bytes: %u num_codepoints: %u\n", num_of_discarded_bytes, num_codepoints); - FUNC(output_unicode)(d, output1, output2, output3, num_codepoints); - if (num_of_trailing_bytes) scalar_decode_all(d, src + src_sz, num_of_trailing_bytes); - return sentinel_found; -invalid_utf8: - scalar_decode_all(d, src, src_sz + num_of_trailing_bytes); + const unsigned num_of_discarded_bytes = sum_bytes(count_subs1); + const unsigned num_codepoints = chunk_src_sz - num_of_discarded_bytes; + 
debug("num_of_discarded_bytes: %u num_codepoints: %u\n", num_of_discarded_bytes, num_codepoints); + FUNC(output_unicode)(d, output1, output2, output3, num_codepoints); + handle_trailing_bytes(); + } return sentinel_found; +#undef abort_with_invalid_utf8 +#undef handle_trailing_bytes } diff --git a/kitty/simd-string.c b/kitty/simd-string.c index 5bdc6630c..9c8d779c2 100644 --- a/kitty/simd-string.c +++ b/kitty/simd-string.c @@ -101,7 +101,7 @@ test_utf8_decode_to_sentinel(PyObject *self UNUSED, PyObject *args) { } } utf8_decoder_free(&d); - return Py_BuildValue("OO", found_sentinel ? Py_True : Py_False, ans); + return Py_BuildValue("OOi", found_sentinel ? Py_True : Py_False, ans, p); } // }}} diff --git a/kitty/simd-string.h b/kitty/simd-string.h index 4a9140d33..789e68b06 100644 --- a/kitty/simd-string.h +++ b/kitty/simd-string.h @@ -18,15 +18,20 @@ typedef struct UTF8Decoder { struct { uint32_t cur, prev, codep; } state; unsigned num_consumed; } UTF8Decoder; + static inline void utf8_decoder_reset(UTF8Decoder *self) { zero_at_ptr(&self->state); } + bool utf8_decode_to_esc(UTF8Decoder *d, const uint8_t *src, size_t src_sz); + static inline void utf8_decoder_ensure_capacity(UTF8Decoder *d, unsigned sz) { - if (d->output.capacity + d->output.pos < sz) { + if (d->output.pos + sz > d->output.capacity) { d->output.capacity = d->output.pos + sz + 4096; - d->output.storage = realloc(d->output.storage, d->output.capacity * sizeof(d->output.storage[0]) + 64); // allow for overwrite of upto 64 bytes - if (!d->output.storage) fatal("Output of memory for UTF8Decoder output buffer at capacity: %u", d->output.capacity); + // allow for overwrite of upto 64 bytes + d->output.storage = realloc(d->output.storage, d->output.capacity * sizeof(d->output.storage[0]) + 64); + if (!d->output.storage) fatal("Out of memory for UTF8Decoder output buffer at capacity: %u", d->output.capacity); } } + static inline void utf8_decoder_free(UTF8Decoder *d) { free(d->output.storage); 
zero_at_ptr(&(d->output)); diff --git a/kitty_tests/parser.py b/kitty_tests/parser.py index 4b34cddb1..e55503fd6 100644 --- a/kitty_tests/parser.py +++ b/kitty_tests/parser.py @@ -197,20 +197,23 @@ class TestParser(BaseTest): return def parse_parts(which): + total_consumed = 0 esc_found = False parts = [] for x in a: - found_sentinel, x = test_utf8_decode_to_sentinel(asbytes(x), which) + found_sentinel, x, num_consumed = test_utf8_decode_to_sentinel(asbytes(x), which) + total_consumed += num_consumed if found_sentinel: esc_found = found_sentinel parts.append(x) - return esc_found, ''.join(parts) + return esc_found, ''.join(parts), total_consumed reset_state() actual = parse_parts(1) reset_state() expected = parse_parts(which) self.ae(expected, actual, msg=f'Failed for {a} with {which=}\n{expected!r} !=\n{actual!r}') + return actual def double_test(x): for which in (2, 3): @@ -218,6 +221,9 @@ class TestParser(BaseTest): t(x*2, which=3) reset_state() + # incomplete trailer at end of vector + t("a"*10 + "😸😸" + "b"*15) + x = double_test x('2:α3') x('2:α\x1b3') @@ -231,18 +237,20 @@ class TestParser(BaseTest): x('abcdef', 'ghijk') x('2:α3', ':≤4:😸|') # trailing incomplete sequence - x(b'abcd\xf0\x9f', b'\x98\xb81234') - x(b'abcd\xf0\x9f\x9b', b'\xb81234') - x(b'abcd\xf0', b'\x9f\x98\xb81234') - x(b'abcd\xc3', b'\xa41234') - x(b'abcd\xe2', b'\x89\xa41234') - x(b'abcd\xe2\x89', b'\xa41234') + for prefix in (b'abcd', '😸'.encode()): + for suffix in (b'1234', '😸'.encode()): + x(prefix + b'\xf0\x9f', b'\x98\xb8' + suffix) + x(prefix + b'\xf0\x9f\x9b', b'\xb8' + suffix) + x(prefix + b'\xf0', b'\x9f\x98\xb8' + suffix) + x(prefix + b'\xc3', b'\xa4' + suffix) + x(prefix + b'\xe2', b'\x89\xa4' + suffix) + x(prefix + b'\xe2\x89', b'\xa4' + suffix) def test_expected(src, expected, which=2): if unsupported(which): return reset_state() - _, actual = test_utf8_decode_to_sentinel(b'filler' + asbytes(src), which) + _, actual, _ = t(b'filler' + asbytes(src), which=which) expected = 
'filler' + expected self.ae(expected, actual, f'Failed for: {src!r} with {which=}')