Add a couple more grapheme segmentation (gseg) tests

This commit is contained in:
Kovid Goyal
2025-04-11 13:34:16 +05:30
parent 357481900d
commit 82e2fe82d6
2 changed files with 22 additions and 6 deletions

View File

@@ -334,6 +334,14 @@ def parse_test_data() -> None:
chars[-1].append(ch)
c = tuple(''.join(c) for c in chars)
grapheme_segmentation_tests.append({'data': c, 'comment': comment.strip()})
grapheme_segmentation_tests.append({
'data': (' ', '\xad', ' '),
'comment': '÷ [0.2] SPACE (Other) ÷ [0.4] SOFT HYPHEN ÷ [999.0] SPACE (Other) ÷ [0.3]'
})
grapheme_segmentation_tests.append({
'data': ('\U0001f468\u200d\U0001f469\u200d\U0001f467\u200d\U0001f466',),
'comment': '÷ [0.2] MAN × [9.0] ZERO WIDTH JOINER × [11.0] WOMAN × [9.0] ZERO WIDTH JOINER × [11.0] GIRL × [9.0] ZERO WIDTH JOINER × [11.0] BOY ÷ [0.3]'
})
# }}}
@@ -1164,12 +1172,6 @@ def gen_char_props() -> None:
is_extended_pictographic=x.is_extended_pictographic) for x in prop_array)
test_grapheme_segmentation(partial(split_into_graphemes, gsprops))
gseg_results = tuple(GraphemeSegmentationKey.from_int(i).result() for i in range(1 << 16))
s = GraphemeSegmentationResult.make()
for ch in range(32, 127):
k = int(GraphemeSegmentationKey(s.new_state, gsprops[ch]))
s = gseg_results[k]
print(111111, chr(ch), s)
test_grapheme_segmentation(partial(split_into_graphemes_with_table, gsprops, gseg_results))
t1, t2, t3, t_shift = splitbins(prop_array, CharProps.bitsize() // 8)

View File

@@ -7377,5 +7377,19 @@
"क््त"
],
"comment": "÷ [0.2] DEVANAGARI LETTER KA (ConjunctLinkingScripts_LinkingConsonant) × [9.0] DEVANAGARI SIGN VIRAMA (Extend_ConjunctLinkingScripts_ConjunctLinker_ExtCccZwj) × [9.0] DEVANAGARI SIGN VIRAMA (Extend_ConjunctLinkingScripts_ConjunctLinker_ExtCccZwj) × [9.3] DEVANAGARI LETTER TA (ConjunctLinkingScripts_LinkingConsonant) ÷ [0.3]"
},
{
"data": [
" ",
"­",
" "
],
"comment": "÷ [0.2] SPACE (Other) ÷ [0.4] SOFT HYPHEN ÷ [999.0] SPACE (Other) ÷ [0.3]"
},
{
"data": [
"👨‍👩‍👧‍👦"
],
"comment": "÷ [0.2] MAN × [9.0] ZERO WIDTH JOINER × [11.0] WOMAN × [9.0] ZERO WIDTH JOINER × [11.0] GIRL × [9.0] ZERO WIDTH JOINER × [11.0] BOY ÷ [0.3]"
}
]