mirror of
https://github.com/kovidgoyal/kitty
synced 2026-06-08 14:18:26 +02:00
Add tests for grapheme segmentation
Test data provided by Unicode organisation
This commit is contained in:
1
.gitattributes
vendored
1
.gitattributes
vendored
@@ -1,5 +1,6 @@
|
||||
kitty/wcwidth-std.h linguist-generated=true
|
||||
kitty/grapheme-segmentation-data.h linguist-generated=true
|
||||
kitty_tests/GraphemeBreakTest.json linguist-generated=true
|
||||
kitty/emoji.h linguist-generated=true
|
||||
kitty/charsets.c linguist-generated=true
|
||||
kitty/key_encoding.py linguist-generated=true
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#!/usr/bin/env python
|
||||
# License: GPL v3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
@@ -655,6 +656,30 @@ def gen_rowcolumn_diacritics() -> None:
|
||||
gofmt(go_file)
|
||||
|
||||
|
||||
def gen_test_data() -> None:
    """Convert Unicode's GraphemeBreakTest.txt into a JSON test fixture.

    Every data line is split at the first ``#`` into a break specification
    and a trailing comment. In the specification ``÷`` separates grapheme
    clusters, ``×`` joins code points inside one cluster, and the remaining
    fields are hexadecimal code points.

    The result is written to ``kitty_tests/GraphemeBreakTest.json`` as a list
    of ``{'data': [cluster, ...], 'comment': str}`` objects, UTF-8 encoded and
    with non-ASCII characters left unescaped.
    """
    tests = []
    for line in get_data('ucd/auxiliary/GraphemeBreakTest.txt'):
        # partition() instead of split(): only the first '#' starts the
        # comment, so a '#' inside the comment text (or a line with no comment
        # at all) no longer raises ValueError on unpacking.
        spec, _, comment = line.partition('#')
        # Drop surrounding whitespace first, then the outer '÷' markers, so
        # the result does not depend on whether the line was pre-stripped.
        spec = spec.strip().strip('÷').strip()
        if not spec:
            # No code points on this line, so there is nothing to test.
            continue
        clusters = [
            # int() tolerates the whitespace around each hex field.
            ''.join(chr(int(cp, 16)) for cp in cluster.split('×') if cp.strip())
            for cluster in spec.split('÷')
        ]
        tests.append({'data': clusters, 'comment': comment.strip()})
    with open('kitty_tests/GraphemeBreakTest.json', 'wb') as f:
        f.write(json.dumps(tests, indent=2, ensure_ascii=False).encode('utf-8'))
|
||||
|
||||
|
||||
def main(args: list[str]=sys.argv) -> None:
|
||||
parse_ucd()
|
||||
parse_prop_list()
|
||||
@@ -667,6 +692,7 @@ def main(args: list[str]=sys.argv) -> None:
|
||||
gen_names()
|
||||
gen_rowcolumn_diacritics()
|
||||
gen_grapheme_segmentation()
|
||||
gen_test_data()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
#undef _DARWIN_C_SOURCE
|
||||
#endif
|
||||
|
||||
#include "data-types.h"
|
||||
#include "grapheme-segmentation.h"
|
||||
#include "line.h"
|
||||
#include "charsets.h"
|
||||
#include "base64.h"
|
||||
@@ -134,6 +134,28 @@ base64_decode_into(PyObject UNUSED *self, PyObject *args) {
|
||||
return PyLong_FromSize_t(sz);
|
||||
}
|
||||
|
||||
static PyObject*
split_into_graphemes(PyObject UNUSED *self, PyObject *src) {
    // Split the unicode string src into a list of substrings, starting a new
    // substring every time grapheme_segmentation_step() reports that the next
    // character does not belong to the current cell.
    // Returns a new reference to the list, or NULL with an exception set.
    if (!PyUnicode_Check(src)) {
        PyErr_SetString(PyExc_TypeError, "must provide a unicode string");
        return NULL;
    }
    int kind = PyUnicode_KIND(src);
    char *data = PyUnicode_DATA(src);
    RAII_PyObject(ans, PyList_New(0));
    if (!ans) return NULL;

    GraphemeSegmentationState state;
    grapheme_segmentation_reset(&state);

    const Py_ssize_t total = PyUnicode_GET_LENGTH(src);
    Py_ssize_t start = 0;  // index of the first character of the pending segment
    for (Py_ssize_t idx = 0; idx < total; idx++) {
        char_type ch = PyUnicode_READ(kind, data, idx);
        if (grapheme_segmentation_step(&state, ch)) continue;  // ch extends the pending segment
        // kind is the width in bytes of one character, hence the byte offset kind * start
        RAII_PyObject(piece, PyUnicode_FromKindAndData(kind, data + kind * start, idx - start));
        if (!piece || PyList_Append(ans, piece) != 0) return NULL;
        start = idx;
    }
    // Flush whatever is left after the last detected break
    if (start < total) {
        RAII_PyObject(tail, PyUnicode_FromKindAndData(kind, data + kind * start, total - start));
        if (!tail || PyList_Append(ans, tail) != 0) return NULL;
    }
    return Py_NewRef(ans);
}
|
||||
|
||||
typedef struct StreamingBase64Decoder {
|
||||
PyObject_HEAD
|
||||
@@ -627,6 +649,7 @@ static PyMethodDef module_methods[] = {
|
||||
{"base64_encode_into", (PyCFunction)base64_encode_into, METH_VARARGS, ""},
|
||||
{"base64_decode", (PyCFunction)(void (*) (void))(pybase64_decode), METH_O, ""},
|
||||
{"base64_decode_into", (PyCFunction)base64_decode_into, METH_VARARGS, ""},
|
||||
{"split_into_graphemes", (PyCFunction)split_into_graphemes, METH_O, ""},
|
||||
{"thread_write", (PyCFunction)cm_thread_write, METH_VARARGS, ""},
|
||||
{"redirect_std_streams", (PyCFunction)redirect_std_streams, METH_VARARGS, ""},
|
||||
{"locale_is_valid", (PyCFunction)locale_is_valid, METH_VARARGS, ""},
|
||||
|
||||
@@ -8,32 +8,11 @@
|
||||
#include "text-cache.h"
|
||||
#include "grapheme-segmentation-data.h"
|
||||
|
||||
/* State carried between successive grapheme_segmentation_step() calls. */
typedef struct GraphemeSegmentationState {
|
||||
/* Grapheme break property of the previously processed character. */
GraphemeBreakProperty last_char_prop;
|
||||
|
||||
/* True if the last character ends a sequence of Indic_Conjunct_Break
|
||||
values: consonant {extend|linker}* */
|
||||
bool incb_consonant_extended;
|
||||
/* True if the last character ends a sequence of Indic_Conjunct_Break
|
||||
values: consonant {extend|linker}* linker */
|
||||
bool incb_consonant_extended_linker;
|
||||
/* True if the last character ends a sequence of Indic_Conjunct_Break
|
||||
values: consonant {extend|linker}* linker {extend|linker}* */
|
||||
bool incb_consonant_extended_linker_extended;
|
||||
|
||||
/* True if the last character ends an emoji modifier sequence
|
||||
\p{Extended_Pictographic} Extend*. */
|
||||
bool emoji_modifier_sequence;
|
||||
/* True if the last character was immediately preceded by an
|
||||
emoji modifier sequence \p{Extended_Pictographic} Extend*. */
|
||||
bool emoji_modifier_sequence_before_last_char;
|
||||
|
||||
/* Number of consecutive regional indicator (RI) characters seen
|
||||
immediately before the current point. */
|
||||
size_t ri_count;
|
||||
} GraphemeSegmentationState;
|
||||
#define is_linker_or_extend(incb) ((incb) == ICB_Linker || (incb) == ICB_Extend)
|
||||
|
||||
#define GSS_IMPLEMENTATION
|
||||
#include "grapheme-segmentation.h"
|
||||
|
||||
void
|
||||
grapheme_segmentation_reset(GraphemeSegmentationState *s) {
|
||||
*s = (GraphemeSegmentationState){0};
|
||||
@@ -42,6 +21,8 @@ grapheme_segmentation_reset(GraphemeSegmentationState *s) {
|
||||
bool
|
||||
grapheme_segmentation_step(GraphemeSegmentationState *s, char_type ch) {
|
||||
// Grapheme segmentation as per UAX29-C1-1 as defined in https://www.unicode.org/reports/tr29/
|
||||
// Returns true iff ch should be added to the current cell based on s which
|
||||
// must reflect the state of the current cell.
|
||||
GraphemeBreakProperty prop = grapheme_break_property(ch);
|
||||
IndicConjunctBreak incb = indic_conjunct_break(ch);
|
||||
bool add_to_cell = false;
|
||||
@@ -66,7 +47,10 @@ grapheme_segmentation_step(GraphemeSegmentationState *s, char_type ch) {
|
||||
else if (s->incb_consonant_extended_linker_extended && incb == ICB_Consonant) add_to_cell = true;
|
||||
/* No break within emoji modifier sequences or emoji zwj sequences (GB11). */
|
||||
else if (s->last_char_prop == GBP_ZWJ && s->emoji_modifier_sequence_before_last_char && is_extended_pictographic(ch)) add_to_cell = true;
|
||||
else {} // break everywhere else
|
||||
/* No break between RI if there is an odd number of RI characters before (GB12, GB13). */
|
||||
else if (prop == GBP_Regional_Indicator && (s->ri_count % 2) != 0) add_to_cell = true;
|
||||
/* Break everywhere else */
|
||||
else {}
|
||||
}
|
||||
|
||||
s->incb_consonant_extended_linker = s->incb_consonant_extended && incb == ICB_Linker;
|
||||
|
||||
38
kitty/grapheme-segmentation.h
Normal file
38
kitty/grapheme-segmentation.h
Normal file
@@ -0,0 +1,38 @@
|
||||
/*
|
||||
* grapheme-segmentation.h
|
||||
* Copyright (C) 2025 Kovid Goyal <kovid at kovidgoyal.net>
|
||||
*
|
||||
* Distributed under terms of the GPL3 license.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "data-types.h"
|
||||
|
||||
/* State carried between successive grapheme_segmentation_step() calls;
   initialise it with grapheme_segmentation_reset() before the first step. */
typedef struct GraphemeSegmentationState {
|
||||
/* Grapheme break property of the previously processed character.
   NOTE(review): presumably a plain int so that this header does not need the
   generated GraphemeBreakProperty enum -- confirm. */
int last_char_prop;
|
||||
|
||||
/* True if the last character ends a sequence of Indic_Conjunct_Break
|
||||
values: consonant {extend|linker}* */
|
||||
bool incb_consonant_extended;
|
||||
/* True if the last character ends a sequence of Indic_Conjunct_Break
|
||||
values: consonant {extend|linker}* linker */
|
||||
bool incb_consonant_extended_linker;
|
||||
/* True if the last character ends a sequence of Indic_Conjunct_Break
|
||||
values: consonant {extend|linker}* linker {extend|linker}* */
|
||||
bool incb_consonant_extended_linker_extended;
|
||||
|
||||
/* True if the last character ends an emoji modifier sequence
|
||||
\p{Extended_Pictographic} Extend*. */
|
||||
bool emoji_modifier_sequence;
|
||||
/* True if the last character was immediately preceded by an
|
||||
emoji modifier sequence \p{Extended_Pictographic} Extend*. */
|
||||
bool emoji_modifier_sequence_before_last_char;
|
||||
|
||||
/* Number of consecutive regional indicator (RI) characters seen
|
||||
immediately before the current point. */
|
||||
size_t ri_count;
|
||||
} GraphemeSegmentationState;
|
||||
|
||||
void grapheme_segmentation_reset(GraphemeSegmentationState *s);
|
||||
bool grapheme_segmentation_step(GraphemeSegmentationState *s, char_type ch);
|
||||
7381
kitty_tests/GraphemeBreakTest.json
generated
Normal file
7381
kitty_tests/GraphemeBreakTest.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,10 +1,12 @@
|
||||
#!/usr/bin/env python
|
||||
# License: GPL v3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
from kitty.constants import read_kitty_resource
|
||||
from kitty.fast_data_types import (
|
||||
Color,
|
||||
HistoryBuf,
|
||||
@@ -12,6 +14,7 @@ from kitty.fast_data_types import (
|
||||
expand_ansi_c_escapes,
|
||||
parse_input_from_terminal,
|
||||
replace_c0_codes_except_nl_space_tab,
|
||||
split_into_graphemes,
|
||||
strip_csi,
|
||||
truncate_point_for_length,
|
||||
wcswidth,
|
||||
@@ -631,3 +634,9 @@ class TestDataTypes(BaseTest):
|
||||
}.items():
|
||||
actual = tuple(shlex_split_with_positions(q, True))
|
||||
self.ae(expected, actual, f'Failed for text: {q!r}')
|
||||
|
||||
def test_split_into_graphemes(self):
    """Check split_into_graphemes() against every case in GraphemeBreakTest.json."""
    # The fixture is loaded as a resource of the package containing this module.
    package = __name__.rpartition('.')[0]
    cases = json.loads(read_kitty_resource('GraphemeBreakTest.json', package))
    for index, case in enumerate(cases):
        expected = case['data']
        # Joining the expected clusters rebuilds the unsegmented input string.
        actual = split_into_graphemes(''.join(expected))
        self.ae(expected, actual, f'Test #{index} failed: {case["comment"]}')
|
||||
|
||||
Reference in New Issue
Block a user