diff --git a/kitty/simd-string.c b/kitty/simd-string.c index 8e4c3c0d5..f4a536c6c 100644 --- a/kitty/simd-string.c +++ b/kitty/simd-string.c @@ -61,8 +61,8 @@ byte_loader_skip(ByteLoader *self) { #define prepare_for_hasvalue(n) (~0ULL/255 * (n)) #define hasvalue(x,n) (haszero((x) ^ (n))) -static const char* -find_either_of_two_bytes_simple(const char *haystack, const size_t sz, const uint8_t x, const uint8_t y) { +static const uint8_t* +find_either_of_two_bytes_simple(const uint8_t *haystack, const size_t sz, const uint8_t x, const uint8_t y) { ByteLoader it; byte_loader_init(&it, (uint8_t*)haystack, sz); // first align by testing the first few bytes one at a time @@ -74,7 +74,7 @@ find_either_of_two_bytes_simple(const char *haystack, const size_t sz, const uin const BYTE_LOADER_T a = prepare_for_hasvalue(x), b = prepare_for_hasvalue(y); while (it.num_left) { if (hasvalue(it.m, a) || hasvalue(it.m, b)) { - const char *ans = haystack + sz - it.num_left, q = hasvalue(it.m, a) ? x : y; + const uint8_t *ans = haystack + sz - it.num_left, q = hasvalue(it.m, a) ? x : y; while (it.num_left) { if (byte_loader_next(&it) == q) return ans; ans++; @@ -87,11 +87,11 @@ find_either_of_two_bytes_simple(const char *haystack, const size_t sz, const uin } #undef SHIFT_OP -static const char* -find_either_of_two_bytes_simd_impl(const char *haystack, const char* needle_, size_t sz) { +static const uint8_t* +find_either_of_two_bytes_simd_impl(const uint8_t *haystack, const uint8_t* needle_, size_t sz) { size_t extra = (uintptr_t)haystack % sizeof(__m128i); if (extra) { // need aligned loads for performance so search first few bytes by hand - const char *ans = find_either_of_two_bytes_simple(haystack, MIN(sz, extra), needle_[0], needle_[1]); + const uint8_t *ans = find_either_of_two_bytes_simple(haystack, MIN(sz, extra), needle_[0], needle_[1]); if (ans) return ans; extra = MIN(extra, sz); sz -= extra; @@ -99,7 +99,7 @@ find_either_of_two_bytes_simd_impl(const char *haystack, const char* needle_, si if (!sz) return NULL; } const __m128i needle = _mm_load_si128((const __m128i *)needle_); - for (const char* limit = haystack + sz; haystack < limit; haystack += 16) { + for (const uint8_t* limit = haystack + sz; haystack < limit; haystack += 16) { const __m128i h = _mm_load_si128((const __m128i *)haystack); int c = _mm_cmpistri(needle, h, _SIDD_CMP_EQUAL_ANY); if (c != 16) { @@ -109,32 +109,67 @@ find_either_of_two_bytes_simd_impl(const char *haystack, const char* needle_, si return NULL; } -static uint8_t* +static const uint8_t* find_either_of_two_bytes_simd(uint8_t *haystack, const size_t sz, const uint8_t x, const uint8_t y) { uint8_t before = haystack[sz]; haystack[sz] = 0; - char needle[16] = {x, y, 0,0,0,0,0,0,0,0,0,0,0,0,0,0}; - uint8_t *ans = (uint8_t*)find_either_of_two_bytes_simd_impl((char*)haystack, needle, sz); + uint8_t needle[16] = {x, y, 0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + const uint8_t *ans = find_either_of_two_bytes_simd_impl(haystack, needle, sz); haystack[sz] = before; return ans; } uint8_t* find_either_of_two_bytes(uint8_t *haystack, const size_t sz, const uint8_t a, const uint8_t b) { - return find_either_of_two_bytes_simd(haystack, sz, a, b); + return (uint8_t*)find_either_of_two_bytes_simd(haystack, sz, a, b); } static const uint8_t* -find_start_of_two_ranges_simple(const uint8_t *haystack, const size_t sz, const uint8_t a1, const uint8_t a2, const uint8_t a3, const uint8_t a4) { +find_byte_not_in_range_simple(const uint8_t *haystack, const size_t sz, const uint8_t a, const uint8_t b) { ByteLoader it; byte_loader_init(&it, haystack, sz); while (it.num_left) { const uint8_t ch = byte_loader_next(&it); - if ((a1 <= ch && ch <= a2) || (a3 <= ch && ch <= a4)) return haystack + sz - it.num_left - 1; + if (ch < a || ch > b) return haystack + sz - it.num_left - 1; } return NULL; } -uint8_t* -find_start_of_two_ranges(uint8_t *haystack, const size_t sz, const uint8_t a1, const uint8_t a2, const uint8_t a3, const uint8_t a4) { - return (uint8_t*)find_start_of_two_ranges_simple(haystack, sz, a1, a2, a3, a4); +static const uint8_t* +find_byte_not_in_range_simd_impl(const uint8_t *haystack, const uint8_t* needle_, size_t sz) { + size_t extra = (uintptr_t)haystack % sizeof(__m128i); + if (extra) { // need aligned loads for performance so search first few bytes by hand + const uint8_t *ans = find_byte_not_in_range_simple((const uint8_t*)haystack, MIN(sz, extra), needle_[0], needle_[1]); + if (ans) return (const uint8_t*)ans; + extra = MIN(extra, sz); + sz -= extra; + haystack += extra; + if (!sz) return NULL; + } + const __m128i needle = _mm_load_si128((const __m128i *)needle_); + for (const uint8_t* limit = haystack + sz; haystack < limit; haystack += 16) { + const __m128i h = _mm_load_si128((const __m128i *)haystack); + int c = _mm_cmpistri(needle, h, _SIDD_CMP_RANGES | _SIDD_NEGATIVE_POLARITY); + if (c != 16) { + return haystack + c; + } + } + return NULL; +} + + +static uint8_t* +find_byte_not_in_range_sse4_2(uint8_t *haystack, const size_t sz, const uint8_t a, const uint8_t b) { + uint8_t before = haystack[sz]; + haystack[sz] = 0; + uint8_t needle[16] = {a, b, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0}; + uint8_t *ans = (uint8_t*)find_byte_not_in_range_simd_impl((uint8_t*)haystack, needle, sz); + haystack[sz] = before; + return ans; + +} + + +uint8_t* +find_byte_not_in_range(uint8_t *haystack, const size_t sz, const uint8_t a, const uint8_t b) { + return (uint8_t*)find_byte_not_in_range_sse4_2(haystack, sz, a, b); } diff --git a/kitty/simd-string.h b/kitty/simd-string.h index 9da140137..580239874 100644 --- a/kitty/simd-string.h +++ b/kitty/simd-string.h @@ -28,6 +28,6 @@ uint8_t* find_either_of_two_bytes(uint8_t *haystack, const size_t sz, const uint // Requires haystack[sz] to be writable and 7 bytes to the left of haystack to // be readable. Returns pointer to first position in haystack that contains -// a char that is in [a1, a2] or [a2, a3] +// a char that is not in [a, b]. a must be <= b uint8_t* -find_start_of_two_ranges(uint8_t *haystack, const size_t sz, const uint8_t a1, const uint8_t a2, const uint8_t a3, const uint8_t a4); +find_byte_not_in_range(uint8_t *haystack, const size_t sz, const uint8_t a1, const uint8_t b); diff --git a/kitty/vt-parser.c b/kitty/vt-parser.c index af1c5abcf..4e15b3339 100644 --- a/kitty/vt-parser.c +++ b/kitty/vt-parser.c @@ -284,7 +284,7 @@ consume_normal(PS *self) { do { if (self->utf8.state == UTF8_ACCEPT) { size_t sz = self->read.sz - self->read.pos; - uint8_t *p = find_start_of_two_ranges(self->buf + self->read.pos, sz, 0, 31, 0x7f, 0xff); + uint8_t *p = find_byte_not_in_range(self->buf + self->read.pos, sz, 32, 126); if (p != NULL) sz = p - (self->buf + self->read.pos); if (sz) dispatch_printable_ascii(self, sz); else dispatch_normal_mode_byte(self, self->buf[self->read.pos++]);