Add tests for grapheme segmentation

Test data (GraphemeBreakTest.txt) is provided by the Unicode Consortium
2026-06-08 14:18:26 +02:00 · 2025-03-13 13:48:35 +05:30
parent 9c1c141775
commit 0d866b1f13
7 changed files with 7488 additions and 26 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,5 +1,6 @@
 kitty/wcwidth-std.h linguist-generated=true
 kitty/grapheme-segmentation-data.h linguist-generated=true
+kitty_tests/GraphemeBreakTest.json linguist-generated=true
 kitty/emoji.h linguist-generated=true
 kitty/charsets.c linguist-generated=true
 kitty/key_encoding.py linguist-generated=true
--- a/gen/wcwidth.py
+++ b/gen/wcwidth.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 # License: GPL v3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>

+import json
 import os
 import re
 import subprocess
@@ -655,6 +656,30 @@ def gen_rowcolumn_diacritics() -> None:
    gofmt(go_file)


+def gen_test_data() -> None:
+    """Convert the lines of ucd/auxiliary/GraphemeBreakTest.txt into kitty_tests/GraphemeBreakTest.json."""
+    tests = []
+    for line in get_data('ucd/auxiliary/GraphemeBreakTest.txt'):
+        # partition() cannot fail to unpack: a missing '#' gives an empty comment, extra '#' stay in the comment
+        t, _, comment = line.partition('#')
+        t = t.lstrip('÷').strip().rstrip('÷').strip()
+        chars: list[list[str]] = [[]]
+        for x in re.split(r'([÷×])', t):
+            x = x.strip()
+            match x:
+                case '÷':  # break marker: subsequent code points go into a new cluster
+                    chars.append([])
+                case '×' | '':  # no-break marker or empty piece left over by the split: nothing to record
+                    pass
+                case _:  # anything else is parsed as a hexadecimal code point of the current cluster
+                    ch = chr(int(x, 16))
+                    chars[-1].append(ch)
+        c = [''.join(c) for c in chars]
+        tests.append({'data': c, 'comment': comment.strip()})
+    with open('kitty_tests/GraphemeBreakTest.json', 'wb') as f:
+        f.write(json.dumps(tests, indent=2, ensure_ascii=False).encode())
+
+
 def main(args: list[str]=sys.argv) -> None:
    parse_ucd()
    parse_prop_list()
@@ -667,6 +692,7 @@ def main(args: list[str]=sys.argv) -> None:
    gen_names()
    gen_rowcolumn_diacritics()
    gen_grapheme_segmentation()
+    gen_test_data()


 if __name__ == '__main__':
--- a/kitty/data-types.c
+++ b/kitty/data-types.c
@@ -12,7 +12,7 @@
 #undef _DARWIN_C_SOURCE
 #endif

-#include "data-types.h"
+#include "grapheme-segmentation.h"
 #include "line.h"
 #include "charsets.h"
 #include "base64.h"
@@ -134,6 +134,28 @@ base64_decode_into(PyObject UNUSED *self, PyObject *args) {
    return PyLong_FromSize_t(sz);
 }

+static PyObject*  // Python-callable (METH_O): split a str into a list of str, one entry per grapheme segment
+split_into_graphemes(PyObject UNUSED *self, PyObject *src) {
+    if (!PyUnicode_Check(src)) { PyErr_SetString(PyExc_TypeError, "must provide a unicode string"); return NULL; }
+    int kind = PyUnicode_KIND(src); char *data = PyUnicode_DATA(src);  // kind doubles as the byte stride per code point in the pointer arithmetic below
+    RAII_PyObject(ans, PyList_New(0));  // NOTE(review): RAII_PyObject presumably drops the reference at scope exit -- confirm against its definition
+    if (!ans) return NULL;
+    GraphemeSegmentationState s; grapheme_segmentation_reset(&s);
+    Py_ssize_t pos = 0;  // index of the first code point of the segment currently being accumulated
+    for (Py_ssize_t i = 0; i < PyUnicode_GET_LENGTH(src); i++) {
+        char_type ch = PyUnicode_READ(kind, data, i);
+        if (!grapheme_segmentation_step(&s, ch)) {  // ch does not join the current segment: emit [pos, i) and start a new segment at i
+            RAII_PyObject(u, PyUnicode_FromKindAndData(kind, data + kind * pos, i - pos));  // NOTE(review): yields "" if step() rejects the very first code point (i == pos == 0) -- confirm it never does
+            if (!u || PyList_Append(ans, u) != 0) return NULL;
+            pos = i;
+        }
+    }
+    if (pos < PyUnicode_GET_LENGTH(src)) {  // flush the trailing segment, which no later break terminates
+        RAII_PyObject(u, PyUnicode_FromKindAndData(kind, data + kind * pos, PyUnicode_GET_LENGTH(src) - pos));
+        if (!u || PyList_Append(ans, u) != 0) return NULL;
+    }
+    return Py_NewRef(ans);  // return a new strong reference to the list, independent of the local ans handle
+}

 typedef struct StreamingBase64Decoder {
    PyObject_HEAD
@@ -627,6 +649,7 @@ static PyMethodDef module_methods[] = {
    {"base64_encode_into", (PyCFunction)base64_encode_into, METH_VARARGS, ""},
    {"base64_decode", (PyCFunction)(void (*) (void))(pybase64_decode), METH_O, ""},
    {"base64_decode_into", (PyCFunction)base64_decode_into, METH_VARARGS, ""},
+    {"split_into_graphemes", (PyCFunction)split_into_graphemes, METH_O, ""},
    {"thread_write", (PyCFunction)cm_thread_write, METH_VARARGS, ""},
    {"redirect_std_streams", (PyCFunction)redirect_std_streams, METH_VARARGS, ""},
    {"locale_is_valid", (PyCFunction)locale_is_valid, METH_VARARGS, ""},
--- a/kitty/grapheme-segmentation.c
+++ b/kitty/grapheme-segmentation.c
@@ -8,32 +8,11 @@
 #include "text-cache.h"
 #include "grapheme-segmentation-data.h"

-typedef struct GraphemeSegmentationState {
-    GraphemeBreakProperty last_char_prop;
-
-    /* True if the last character ends a sequence of Indic_Conjunct_Break
-values:  consonant {extend|linker}*  */
-    bool incb_consonant_extended;
-    /* True if the last character ends a sequence of Indic_Conjunct_Break
-values:  consonant {extend|linker}* linker  */
-    bool incb_consonant_extended_linker;
-    /* True if the last character ends a sequence of Indic_Conjunct_Break
-values:  consonant {extend|linker}* linker {extend|linker}*  */
-    bool incb_consonant_extended_linker_extended;
-
-    /* True if the last character ends an emoji modifier sequence
-       \p{Extended_Pictographic} Extend*.  */
-    bool emoji_modifier_sequence;
-    /* True if the last character was immediately preceded by an
-       emoji modifier sequence   \p{Extended_Pictographic} Extend*.  */
-    bool emoji_modifier_sequence_before_last_char;
-
-    /* Number of consecutive regional indicator (RI) characters seen
-       immediately before the current point.  */
-    size_t ri_count;
-} GraphemeSegmentationState;
 #define is_linker_or_extend(incb) ((incb) == ICB_Linker || (incb) == ICB_Extend)

+#define GSS_IMPLEMENTATION
+#include "grapheme-segmentation.h"
+
 void
 grapheme_segmentation_reset(GraphemeSegmentationState *s) {
    *s = (GraphemeSegmentationState){0};
@@ -42,6 +21,8 @@ grapheme_segmentation_reset(GraphemeSegmentationState *s) {
 bool
 grapheme_segmentation_step(GraphemeSegmentationState *s, char_type ch) {
    // Grapheme segmentation as per UAX29-C1-1 as defined in https://www.unicode.org/reports/tr29/
+    // Returns true iff ch should be added to the current cell based on s which
+    // must reflect the state of the current cell.
    GraphemeBreakProperty prop = grapheme_break_property(ch);
    IndicConjunctBreak incb = indic_conjunct_break(ch);
    bool add_to_cell = false;
@@ -66,7 +47,10 @@ grapheme_segmentation_step(GraphemeSegmentationState *s, char_type ch) {
        else if (s->incb_consonant_extended_linker_extended && incb == ICB_Consonant) add_to_cell = true;
        /* No break within emoji modifier sequences or emoji zwj sequences (GB11).  */
        else if (s->last_char_prop == GBP_ZWJ && s->emoji_modifier_sequence_before_last_char && is_extended_pictographic(ch)) add_to_cell = true;
-        else {} // break everywhere else
+        /* No break between RI if there is an odd number of RI characters before (GB12, GB13).  */
+        else if (prop == GBP_Regional_Indicator && (s->ri_count % 2) != 0) add_to_cell = true;
+        /* Break everywhere else */
+        else {}
    }

    s->incb_consonant_extended_linker = s->incb_consonant_extended && incb == ICB_Linker;
--- a/kitty/grapheme-segmentation.h
+++ b/kitty/grapheme-segmentation.h
@@ -0,0 +1,38 @@
+/*
+ * grapheme-segmentation.h
+ * Copyright (C) 2025 Kovid Goyal <kovid at kovidgoyal.net>
+ *
+ * Distributed under terms of the GPL3 license.
+ */
+
+#pragma once
+
+#include "data-types.h"
+
+typedef struct GraphemeSegmentationState {  /* State carried between successive grapheme_segmentation_step() calls */
+    int last_char_prop;  /* Break property of the previous character; the implementation compares it against GBP_* values */
+
+    /* True if the last character ends a sequence of Indic_Conjunct_Break
+       values:  consonant {extend|linker}*  */
+    bool incb_consonant_extended;
+    /* True if the last character ends a sequence of Indic_Conjunct_Break
+       values:  consonant {extend|linker}* linker  */
+    bool incb_consonant_extended_linker;
+    /* True if the last character ends a sequence of Indic_Conjunct_Break
+       values:  consonant {extend|linker}* linker {extend|linker}*  */
+    bool incb_consonant_extended_linker_extended;
+
+    /* True if the last character ends an emoji modifier sequence
+       \p{Extended_Pictographic} Extend*.  */
+    bool emoji_modifier_sequence;
+    /* True if the last character was immediately preceded by an
+       emoji modifier sequence   \p{Extended_Pictographic} Extend*.  */
+    bool emoji_modifier_sequence_before_last_char;
+
+    /* Number of consecutive regional indicator (RI) characters seen
+       immediately before the current point.  */
+    size_t ri_count;
+} GraphemeSegmentationState;
+
+void grapheme_segmentation_reset(GraphemeSegmentationState *s);  /* Zero-initializes every field of *s */
+bool grapheme_segmentation_step(GraphemeSegmentationState *s, char_type ch);  /* Returns true iff ch should be added to the current cell whose state is *s */
--- a/kitty_tests/GraphemeBreakTest.json
+++ b/kitty_tests/GraphemeBreakTest.json
--- a/kitty_tests/datatypes.py
+++ b/kitty_tests/datatypes.py
@@ -1,10 +1,12 @@
 #!/usr/bin/env python
 # License: GPL v3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>

+import json
 import os
 import sys
 import tempfile

+from kitty.constants import read_kitty_resource
 from kitty.fast_data_types import (
    Color,
    HistoryBuf,
@@ -12,6 +14,7 @@ from kitty.fast_data_types import (
    expand_ansi_c_escapes,
    parse_input_from_terminal,
    replace_c0_codes_except_nl_space_tab,
+    split_into_graphemes,
    strip_csi,
    truncate_point_for_length,
    wcswidth,
@@ -631,3 +634,9 @@ class TestDataTypes(BaseTest):
        }.items():
            actual = tuple(shlex_split_with_positions(q, True))
            self.ae(expected, actual, f'Failed for text: {q!r}')
+
+    def test_split_into_graphemes(self):  # replays every case in GraphemeBreakTest.json through split_into_graphemes()
+        raw = read_kitty_resource('GraphemeBreakTest.json', __name__.rpartition('.')[0])
+        for idx, case in enumerate(json.loads(raw)):
+            want = case['data']  # expected clusters; concatenated, they form the input string
+            self.ae(want, split_into_graphemes(''.join(want)), f'Test #{idx} failed: {case["comment"]}')