Add tests for grapheme segmentation

Test data is provided by the Unicode Consortium.
This commit is contained in:
Kovid Goyal
2025-03-13 13:48:35 +05:30
parent 9c1c141775
commit 0d866b1f13
7 changed files with 7488 additions and 26 deletions

1
.gitattributes vendored
View File

@@ -1,5 +1,6 @@
kitty/wcwidth-std.h linguist-generated=true
kitty/grapheme-segmentation-data.h linguist-generated=true
kitty_tests/GraphemeBreakTest.json linguist-generated=true
kitty/emoji.h linguist-generated=true
kitty/charsets.c linguist-generated=true
kitty/key_encoding.py linguist-generated=true

View File

@@ -1,6 +1,7 @@
#!/usr/bin/env python
# License: GPL v3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
import json
import os
import re
import subprocess
@@ -655,6 +656,30 @@ def gen_rowcolumn_diacritics() -> None:
gofmt(go_file)
def gen_test_data() -> None:
    """Convert Unicode's GraphemeBreakTest.txt into kitty_tests/GraphemeBreakTest.json.

    Each input line has the form ``÷ 0020 × 0308 ÷ 0020 ÷  # comment`` where
    ``÷`` marks a break opportunity and ``×`` marks "no break" between the
    hexadecimal code points on either side.

    The output is a JSON list with one object per input line:
    ``data`` is the list of strings obtained by splitting at every ``÷`` and
    ``comment`` is the text following the ``#``.
    """
    tests = []
    for line in get_data('ucd/auxiliary/GraphemeBreakTest.txt'):
        # partition() instead of split(): a two-target unpack of split('#')
        # raises ValueError if the comment itself contains a '#' or if the
        # line carries no comment at all.
        t, _, comment = line.partition('#')
        # Drop the break markers that open and close every test line so that
        # they do not produce empty leading/trailing clusters.
        t = t.lstrip('÷').strip().rstrip('÷').strip()
        chars: list[list[str]] = [[]]
        # The capturing group keeps the ÷/× separators in the result so they
        # can be told apart from the code points.
        for x in re.split(r'([÷×])', t):
            x = x.strip()
            if x == '÷':
                # Break: start collecting a new cluster.
                chars.append([])
            elif x and x != '×':
                # A hexadecimal code point belonging to the current cluster.
                chars[-1].append(chr(int(x, 16)))
        clusters = [''.join(cluster) for cluster in chars]
        tests.append({'data': clusters, 'comment': comment.strip()})
    with open('kitty_tests/GraphemeBreakTest.json', 'wb') as f:
        f.write(json.dumps(tests, indent=2, ensure_ascii=False).encode())
def main(args: list[str]=sys.argv) -> None:
parse_ucd()
parse_prop_list()
@@ -667,6 +692,7 @@ def main(args: list[str]=sys.argv) -> None:
gen_names()
gen_rowcolumn_diacritics()
gen_grapheme_segmentation()
gen_test_data()
if __name__ == '__main__':

View File

@@ -12,7 +12,7 @@
#undef _DARWIN_C_SOURCE
#endif
#include "data-types.h"
#include "grapheme-segmentation.h"
#include "line.h"
#include "charsets.h"
#include "base64.h"
@@ -134,6 +134,28 @@ base64_decode_into(PyObject UNUSED *self, PyObject *args) {
return PyLong_FromSize_t(sz);
}
/* Python-callable: split the unicode string src into a list of substrings.
 * A new substring is started at every character for which
 * grapheme_segmentation_step() returns false. Returns a new reference to the
 * list, or NULL with a Python exception set on failure. */
static PyObject*
split_into_graphemes(PyObject UNUSED *self, PyObject *src) {
    if (!PyUnicode_Check(src)) { PyErr_SetString(PyExc_TypeError, "must provide a unicode string"); return NULL; }
    int kind = PyUnicode_KIND(src);
    char *data = PyUnicode_DATA(src);
    const Py_ssize_t src_len = PyUnicode_GET_LENGTH(src);
    RAII_PyObject(ans, PyList_New(0));
    if (!ans) return NULL;
    GraphemeSegmentationState state;
    grapheme_segmentation_reset(&state);
    /* Index of the first character of the segment currently being built. */
    Py_ssize_t seg_start = 0;
    for (Py_ssize_t idx = 0; idx < src_len; idx++) {
        char_type ch = PyUnicode_READ(kind, data, idx);
        if (!grapheme_segmentation_step(&state, ch)) {
            /* ch does not join the current segment: emit [seg_start, idx)
             * and begin a new segment at ch. kind is used as the per-character
             * byte stride into the raw buffer. */
            RAII_PyObject(seg, PyUnicode_FromKindAndData(kind, data + kind * seg_start, idx - seg_start));
            if (!seg || PyList_Append(ans, seg) != 0) return NULL;
            seg_start = idx;
        }
    }
    /* Emit whatever remains after the last break. */
    if (seg_start < src_len) {
        RAII_PyObject(seg, PyUnicode_FromKindAndData(kind, data + kind * seg_start, src_len - seg_start));
        if (!seg || PyList_Append(ans, seg) != 0) return NULL;
    }
    return Py_NewRef(ans);
}
typedef struct StreamingBase64Decoder {
PyObject_HEAD
@@ -627,6 +649,7 @@ static PyMethodDef module_methods[] = {
{"base64_encode_into", (PyCFunction)base64_encode_into, METH_VARARGS, ""},
{"base64_decode", (PyCFunction)(void (*) (void))(pybase64_decode), METH_O, ""},
{"base64_decode_into", (PyCFunction)base64_decode_into, METH_VARARGS, ""},
{"split_into_graphemes", (PyCFunction)split_into_graphemes, METH_O, ""},
{"thread_write", (PyCFunction)cm_thread_write, METH_VARARGS, ""},
{"redirect_std_streams", (PyCFunction)redirect_std_streams, METH_VARARGS, ""},
{"locale_is_valid", (PyCFunction)locale_is_valid, METH_VARARGS, ""},

View File

@@ -8,32 +8,11 @@
#include "text-cache.h"
#include "grapheme-segmentation-data.h"
/* Running state for grapheme cluster segmentation. An instance is passed by
pointer to grapheme_segmentation_reset() and grapheme_segmentation_step(). */
typedef struct GraphemeSegmentationState {
/* NOTE(review): presumably the break property of the most recently
processed character -- confirm against grapheme_segmentation_step(). */
GraphemeBreakProperty last_char_prop;
/* True if the last character ends a sequence of Indic_Conjunct_Break
values: consonant {extend|linker}* */
bool incb_consonant_extended;
/* True if the last character ends a sequence of Indic_Conjunct_Break
values: consonant {extend|linker}* linker */
bool incb_consonant_extended_linker;
/* True if the last character ends a sequence of Indic_Conjunct_Break
values: consonant {extend|linker}* linker {extend|linker}* */
bool incb_consonant_extended_linker_extended;
/* True if the last character ends an emoji modifier sequence
\p{Extended_Pictographic} Extend*. */
bool emoji_modifier_sequence;
/* True if the last character was immediately preceded by an
emoji modifier sequence \p{Extended_Pictographic} Extend*. */
bool emoji_modifier_sequence_before_last_char;
/* Number of consecutive regional indicator (RI) characters seen
immediately before the current point. */
size_t ri_count;
} GraphemeSegmentationState;
#define is_linker_or_extend(incb) ((incb) == ICB_Linker || (incb) == ICB_Extend)
#define GSS_IMPLEMENTATION
#include "grapheme-segmentation.h"
void
grapheme_segmentation_reset(GraphemeSegmentationState *s) {
*s = (GraphemeSegmentationState){0};
@@ -42,6 +21,8 @@ grapheme_segmentation_reset(GraphemeSegmentationState *s) {
bool
grapheme_segmentation_step(GraphemeSegmentationState *s, char_type ch) {
// Grapheme segmentation as per UAX29-C1-1 as defined in https://www.unicode.org/reports/tr29/
// Returns true iff ch should be added to the current cell based on s which
// must reflect the state of the current cell.
GraphemeBreakProperty prop = grapheme_break_property(ch);
IndicConjunctBreak incb = indic_conjunct_break(ch);
bool add_to_cell = false;
@@ -66,7 +47,10 @@ grapheme_segmentation_step(GraphemeSegmentationState *s, char_type ch) {
else if (s->incb_consonant_extended_linker_extended && incb == ICB_Consonant) add_to_cell = true;
/* No break within emoji modifier sequences or emoji zwj sequences (GB11). */
else if (s->last_char_prop == GBP_ZWJ && s->emoji_modifier_sequence_before_last_char && is_extended_pictographic(ch)) add_to_cell = true;
else {} // break everywhere else
/* No break between RI if there is an odd number of RI characters before (GB12, GB13). */
else if (prop == GBP_Regional_Indicator && (s->ri_count % 2) != 0) add_to_cell = true;
/* Break everywhere else */
else {}
}
s->incb_consonant_extended_linker = s->incb_consonant_extended && incb == ICB_Linker;

View File

@@ -0,0 +1,38 @@
/*
* grapheme-segmentation.h
* Copyright (C) 2025 Kovid Goyal <kovid at kovidgoyal.net>
*
* Distributed under terms of the GPL3 license.
*/
#pragma once
#include "data-types.h"
/* Running state for grapheme cluster segmentation. An instance is passed by
pointer to grapheme_segmentation_reset() and grapheme_segmentation_step(). */
typedef struct GraphemeSegmentationState {
/* Compared against GBP_* values by the implementation.
NOTE(review): presumably the break property of the most recently processed
character, declared as int so this header does not depend on the generated
data header -- confirm. */
int last_char_prop;
/* True if the last character ends a sequence of Indic_Conjunct_Break
values: consonant {extend|linker}* */
bool incb_consonant_extended;
/* True if the last character ends a sequence of Indic_Conjunct_Break
values: consonant {extend|linker}* linker */
bool incb_consonant_extended_linker;
/* True if the last character ends a sequence of Indic_Conjunct_Break
values: consonant {extend|linker}* linker {extend|linker}* */
bool incb_consonant_extended_linker_extended;
/* True if the last character ends an emoji modifier sequence
\p{Extended_Pictographic} Extend*. */
bool emoji_modifier_sequence;
/* True if the last character was immediately preceded by an
emoji modifier sequence \p{Extended_Pictographic} Extend*. */
bool emoji_modifier_sequence_before_last_char;
/* Number of consecutive regional indicator (RI) characters seen
immediately before the current point. */
size_t ri_count;
} GraphemeSegmentationState;
/* Reset *s to its initial state (every field zeroed). */
void grapheme_segmentation_reset(GraphemeSegmentationState *s);
/* Process the next character ch. Returns true iff ch should be added to the
current cell based on s, which must reflect the state of the current cell. */
bool grapheme_segmentation_step(GraphemeSegmentationState *s, char_type ch);

7381
kitty_tests/GraphemeBreakTest.json generated Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -1,10 +1,12 @@
#!/usr/bin/env python
# License: GPL v3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>
import json
import os
import sys
import tempfile
from kitty.constants import read_kitty_resource
from kitty.fast_data_types import (
Color,
HistoryBuf,
@@ -12,6 +14,7 @@ from kitty.fast_data_types import (
expand_ansi_c_escapes,
parse_input_from_terminal,
replace_c0_codes_except_nl_space_tab,
split_into_graphemes,
strip_csi,
truncate_point_for_length,
wcswidth,
@@ -631,3 +634,9 @@ class TestDataTypes(BaseTest):
}.items():
actual = tuple(shlex_split_with_positions(q, True))
self.ae(expected, actual, f'Failed for text: {q!r}')
def test_split_into_graphemes(self):
    """Check split_into_graphemes() against every case in GraphemeBreakTest.json.

    Each case's expected segments are concatenated into one string, which
    must split back into exactly those segments.
    """
    package = __name__.rpartition('.')[0]
    raw_cases = read_kitty_resource('GraphemeBreakTest.json', package)
    for i, test in enumerate(json.loads(raw_cases)):
        expected = test['data']
        joined = ''.join(expected)
        actual = split_into_graphemes(joined)
        self.ae(expected, actual, f'Test #{i} failed: {test["comment"]}')