From c01a941fe7defe70392a353ce1d447aa8882d14d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 10 Apr 2025 08:33:18 +0530 Subject: [PATCH] Get the unicode grapheme seg tests passing on screen, with minimal modification We ignore tests including ASCII control codes and we modify the results when there are grapheme breaks before zero width characters. --- kitty/screen.c | 19 ++++++------------- kitty_tests/datatypes.py | 36 +++++++++++++++++++++++++++++++++++- 2 files changed, 41 insertions(+), 14 deletions(-) diff --git a/kitty/screen.c b/kitty/screen.c index 55f3d3046..309a0b0cd 100644 --- a/kitty/screen.c +++ b/kitty/screen.c @@ -1101,11 +1101,6 @@ draw_control_char(Screen *self, text_loop_state *s, uint32_t ch) { } } -static bool -is_roundtripped_zero_width_char(char_type ch) { - return ch == 0xad || ch == 0x200b || ch == 0x2060; -} - static void draw_text_loop(Screen *self, const uint32_t *chars, size_t num_chars, text_loop_state *s) { init_text_loop_line(self, s); @@ -1124,13 +1119,11 @@ draw_text_loop(Screen *self, const uint32_t *chars, size_t num_chars, text_loop_ int char_width = wcwidth_std(cp); if (UNLIKELY(char_width < 1)) { if (char_width == 0) { - // check for some zero width chars that we want to preserve for - // round tripping that are not added to prev cell by grapheme - // segmentation. - if (s->prev.cc && is_roundtripped_zero_width_char(ch)) { // soft hyphen, zero width space, word joiner - draw_combining_char(self, s, ch); - } - continue; // we cannot represent zero width chars except as combining chars + // Preserve zero width chars as combining chars even though + // they were not added to the prev cell by grapheme segmentation. + // Zero width chars can only be represented as combining chars. 
+ if (s->prev.cc) draw_combining_char(self, s, ch); + continue; } char_width = 1; } @@ -1317,7 +1310,7 @@ screen_handle_multicell_command(Screen *self, const MultiCellCommand *cmd, const char_type ch = self->lc->chars[i]; CharProps cp = char_props_for(ch); if (cp.is_invalid) continue; - if ((s = grapheme_segmentation_step(s, cp)).add_to_current_cell || (wcwidth_std(cp) == 0 && is_roundtripped_zero_width_char(ch) && lc.count)) lc.chars[lc.count++] = ch; + if ((s = grapheme_segmentation_step(s, cp)).add_to_current_cell || (wcwidth_std(cp) == 0 && lc.count)) lc.chars[lc.count++] = ch; else { if (lc.count) handle_variable_width_multicell_command(self, mcd, &lc); if (wcwidth_std(cp) < 1) lc.count = 0; diff --git a/kitty_tests/datatypes.py b/kitty_tests/datatypes.py index 7b420f6ce..812135168 100644 --- a/kitty_tests/datatypes.py +++ b/kitty_tests/datatypes.py @@ -640,7 +640,41 @@ class TestDataTypes(BaseTest): def test_split_into_graphemes(self): self.assertEqual(char_props_for('\ue000')['category'], 'Co') self.ae(split_into_graphemes('ab'), ['a', 'b']) + s = self.create_screen(cols=12) + excluded_chars = set(range(32)) + + def is_excluded(text): + return bool(set(map(ord, text)) & excluded_chars) + + def adapt_cell_text(cells): + for cell in cells: + gp = split_into_graphemes(cell) + if len(gp) == 1: + yield cell + else: + for i, g in enumerate(gp[:-1]): + if wcswidth(gp[i+1][0]) != 0: + raise AssertionError( + f'cell {cell!r} contains grapheme break point at non zero width character for Test #{i}: {test["comment"]}') + yield from gp + for i, test in enumerate(json.loads(read_kitty_resource('GraphemeBreakTest.json', __name__.rpartition('.')[0]))): expected = test['data'] - actual = split_into_graphemes(''.join(expected)) + text = ''.join(expected) + actual = split_into_graphemes(text) self.ae(expected, actual, f'Test #{i} failed: {test["comment"]}') + if is_excluded(text): + continue + s.carriage_return(), s.erase_in_line() + s.draw(' ' + text) + actual = [] + for x in 
range(s.cursor.x): + cell = s.cpu_cells(0, x) + if cell['x'] > 0: + continue + ct = cell['text'] + if x == 0: + ct = ct[1:] + if ct: + actual.append(ct) + self.ae(expected, list(adapt_cell_text(actual)), f'Test #{i} failed: {test["comment"]}')