Get the unicode grapheme seg tests passing on screen, with minimal modification

We ignore tests including ASCII control codes and we modify the results
when there are grapheme breaks before zero width characters.
This commit is contained in:
Kovid Goyal
2025-04-10 08:33:18 +05:30
parent e976cf67fd
commit c01a941fe7
2 changed files with 41 additions and 14 deletions

View File

@@ -1101,11 +1101,6 @@ draw_control_char(Screen *self, text_loop_state *s, uint32_t ch) {
}
}
/* Report whether ch is one of the zero width characters singled out for
 * round tripping: soft hyphen (0xad), zero width space (0x200b) or word
 * joiner (0x2060). */
static bool
is_roundtripped_zero_width_char(char_type ch) {
    switch (ch) {
        case 0xad:
        case 0x200b:
        case 0x2060:
            return true;
        default:
            return false;
    }
}
static void
draw_text_loop(Screen *self, const uint32_t *chars, size_t num_chars, text_loop_state *s) {
init_text_loop_line(self, s);
@@ -1124,13 +1119,11 @@ draw_text_loop(Screen *self, const uint32_t *chars, size_t num_chars, text_loop_
int char_width = wcwidth_std(cp);
if (UNLIKELY(char_width < 1)) {
if (char_width == 0) {
// check for some zero width chars that we want to preserve for
// round tripping that are not added to prev cell by grapheme
// segmentation.
if (s->prev.cc && is_roundtripped_zero_width_char(ch)) { // soft hyphen, zero width space, word joiner
draw_combining_char(self, s, ch);
}
continue; // we cannot represent zero width chars except as combining chars
// Preserve zero width chars as combining chars even though
// they were not added to the prev cell by grapheme segmentation.
// Zero width chars can only be represented as combining chars.
if (s->prev.cc) draw_combining_char(self, s, ch);
continue;
}
char_width = 1;
}
@@ -1317,7 +1310,7 @@ screen_handle_multicell_command(Screen *self, const MultiCellCommand *cmd, const
char_type ch = self->lc->chars[i];
CharProps cp = char_props_for(ch);
if (cp.is_invalid) continue;
if ((s = grapheme_segmentation_step(s, cp)).add_to_current_cell || (wcwidth_std(cp) == 0 && is_roundtripped_zero_width_char(ch) && lc.count)) lc.chars[lc.count++] = ch;
if ((s = grapheme_segmentation_step(s, cp)).add_to_current_cell || (wcwidth_std(cp) == 0 && lc.count)) lc.chars[lc.count++] = ch;
else {
if (lc.count) handle_variable_width_multicell_command(self, mcd, &lc);
if (wcwidth_std(cp) < 1) lc.count = 0;

View File

@@ -640,7 +640,41 @@ class TestDataTypes(BaseTest):
def test_split_into_graphemes(self):
    """Check grapheme segmentation against GraphemeBreakTest.json.

    Every test case is checked with split_into_graphemes(). Cases that contain
    no code point below 32 are additionally drawn onto a Screen, and the text
    stored in the resulting cells is compared with the expected graphemes.
    """
    self.assertEqual(char_props_for('\ue000')['category'], 'Co')
    self.ae(split_into_graphemes('ab'), ['a', 'b'])
    s = self.create_screen(cols=12)
    # Code points below 32 disqualify a test case from the on-screen check.
    excluded_chars = set(range(32))

    def is_excluded(text):
        # True when text contains at least one excluded code point.
        return bool(set(map(ord, text)) & excluded_chars)

    def adapt_cell_text(cells, test_num, comment):
        # A single cell may hold several graphemes. That is only accepted when
        # every grapheme after the first starts with a zero width character;
        # in that case the cell text is re-split so it can be compared with
        # the expected data.
        for cell in cells:
            gp = split_into_graphemes(cell)
            if len(gp) == 1:
                yield cell
                continue
            for following in gp[1:]:
                if wcswidth(following[0]) != 0:
                    # test_num is passed in explicitly so the message names
                    # the failing test case, not a position inside the cell.
                    raise AssertionError(
                        f'cell {cell!r} contains grapheme break point at non zero width character for Test #{test_num}: {comment}')
            yield from gp

    for i, test in enumerate(json.loads(read_kitty_resource('GraphemeBreakTest.json', __name__.rpartition('.')[0]))):
        expected = test['data']
        text = ''.join(expected)
        actual = split_into_graphemes(text)
        self.ae(expected, actual, f'Test #{i} failed: {test["comment"]}')
        if is_excluded(text):
            continue
        s.carriage_return()
        s.erase_in_line()
        # A leading space is drawn before the test text; it is stripped again
        # from the first cell below.
        s.draw(' ' + text)
        actual = []
        for x in range(s.cursor.x):
            cell = s.cpu_cells(0, x)
            # NOTE(review): presumably x > 0 marks a trailing cell of a
            # multi-cell character - confirm against cpu_cells().
            if cell['x'] > 0:
                continue
            ct = cell['text']
            if x == 0:
                ct = ct[1:]
            if ct:
                actual.append(ct)
        self.ae(expected, list(adapt_cell_text(actual, i, test['comment'])), f'Test #{i} failed: {test["comment"]}')