From 6ecd78d9db77cd13089c593925022824e87407d1 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 1 Apr 2025 10:41:17 +0530 Subject: [PATCH] Remove bounds checking for unicode table access in Go --- gen/wcwidth.py | 30 +++++++++++++++++++++++++----- kitty/char-props.c | 1 + kitty/char-props.h | 1 + tools/wcswidth/char-props-data.go | 20 ++++++++++++++++++++ tools/wcswidth/char-props.go | 10 +--------- 5 files changed, 48 insertions(+), 14 deletions(-) diff --git a/gen/wcwidth.py b/gen/wcwidth.py index 874cd6912..945099feb 100755 --- a/gen/wcwidth.py +++ b/gen/wcwidth.py @@ -506,6 +506,10 @@ class Property(Protocol): def as_go(self) -> str: return '' + @classmethod + def bitsize(cls) -> int: + return 0 + def get_types(sz: int) -> tuple[str, str]: sz *= 8 @@ -514,11 +518,15 @@ def get_types(sz: int) -> tuple[str, str]: def gen_multistage_table( c: Callable[..., None], g: Callable[..., None], t1: Sequence[int], t2: Sequence[int], t3: Sequence[Property], shift: int, + for_go_type: str, maxval: int = 0 ) -> None: - ctype_t1, gotype_t1 = get_types(getsize(t1)) + t1_type_sz = getsize(t1) + ctype_t1, gotype_t1 = get_types(t1_type_sz) mask = mask_for(shift) name = t3[0].__class__.__name__ - ctype_t2, gotype_t2 = get_types(getsize(tuple(range(len(t3))))) + t2_type_sz = getsize(tuple(range(len(t3)))) + ctype_t2, gotype_t2 = get_types(t2_type_sz) + t3_type_sz = t3[0].bitsize() // 8 c(f'static const char_type {name}_mask = {mask}u;') c(f'static const char_type {name}_shift = {shift}u;') c(f'static const {ctype_t1} {name}_t1[{len(t1)}] = ''{') @@ -546,6 +554,17 @@ def gen_multistage_table( g(f'\t{items}') g('}') + check = f'x = max(0, min(x, {maxval}))' if maxval else '' + g(f''' +// Array accessor function that avoids bounds checking +func {name}For(x {for_go_type}) {name} {{ + {check} + t1 := uintptr(*(*{gotype_t1})(unsafe.Pointer(uintptr(unsafe.Pointer(&{lname}_t1[0])) + uintptr(x>>{lname}_shift)*{t1_type_sz}))) + t1_shifted := (t1 << {lname}_shift) + (uintptr(x) & {lname}_mask) + t2 := uintptr(*(*{gotype_t2})(unsafe.Pointer(uintptr(unsafe.Pointer(&{lname}_t2[0])) + t1_shifted*{t2_type_sz}))) + return *(*{name})(unsafe.Pointer(uintptr(unsafe.Pointer(&{lname}_t3[0])) + t2*{t3_type_sz})) +}} +''') width_shift = 4 @@ -919,7 +938,6 @@ func (r GraphemeSegmentationResult) State() (ans {base_type}) {{ return bitfield_declaration_as_c('GraphemeSegmentationResult', fields, {'state': bits}) - class CharProps(NamedTuple): width: int = 3 @@ -1127,17 +1145,19 @@ def gen_char_props() -> None: with create_header('kitty/char-props-data.h', include_data_types=False) as c, open('tools/wcswidth/char-props-data.go', 'w') as gof: gp = partial(print, file=gof) gp('package wcswidth') + gp('import "unsafe"') generate_enum(c, gp, 'GraphemeBreakProperty', *grapheme_segmentation_maps, prefix='GBP_') generate_enum(c, gp, 'IndicConjunctBreak', *incb_map, prefix='ICB_') cen('// UCBDeclaration {{''{') + cen(f'#define MAX_UNICODE ({sys.maxunicode}u)') generate_enum(cen, gp, 'UnicodeCategory', 'Cn', *class_maps, prefix='UC_') cen('// EndUCBDeclaration }}''}') gp(make_bitfield('tools/wcswidth', 'CharProps', *CharProps.go_fields(), add_package=False)[1]) gp(make_bitfield('tools/wcswidth', 'GraphemeSegmentationResult', *GraphemeSegmentationResult.go_fields(), add_package=False)[1]) gp(CharProps.go_extra()) gp(GraphemeSegmentationResult.go_extra()) - gen_multistage_table(c, gp, t1, t2, t3, t_shift) - gen_multistage_table(c, gp, g1, g2, g3, g_shift) + gen_multistage_table(c, gp, t1, t2, t3, t_shift, 'rune', sys.maxunicode) + gen_multistage_table(c, gp, g1, g2, g3, g_shift, 'uint16') c(GraphemeSegmentationKey.code_to_convert_to_int()) c(GraphemeSegmentationState.c_declaration()) gp(GraphemeSegmentationKey.code_to_convert_to_int(for_go=True)) diff --git a/kitty/char-props.c b/kitty/char-props.c index 65fb1981e..dcf31a92e 100644 --- a/kitty/char-props.c +++ b/kitty/char-props.c @@ -11,6 +11,7 @@ CharProps char_props_for(char_type ch) { + if (ch > MAX_UNICODE) ch = 0; return CharProps_t3[CharProps_t2[(CharProps_t1[ch >> CharProps_shift] << CharProps_shift) + (ch & CharProps_mask)]]; } diff --git a/kitty/char-props.h b/kitty/char-props.h index 562c79a0f..62a0e0acd 100644 --- a/kitty/char-props.h +++ b/kitty/char-props.h @@ -104,6 +104,7 @@ static_assert(sizeof(GraphemeSegmentationResult) == sizeof(uint16_t), "Fix the o // EndGraphemeSegmentationResultDeclaration }}} // UCBDeclaration {{{ +#define MAX_UNICODE (1114111u) typedef enum UnicodeCategory { UC_Cn, UC_Cc, diff --git a/tools/wcswidth/char-props-data.go b/tools/wcswidth/char-props-data.go index 135ef426a..310261dcc 100644 --- a/tools/wcswidth/char-props-data.go +++ b/tools/wcswidth/char-props-data.go @@ -1,5 +1,7 @@ package wcswidth +import "unsafe" + type GraphemeBreakProperty uint8 const ( @@ -384,6 +386,15 @@ var charprops_t3 = [109]CharProps{ ((0 & 0b1) << 0) | ((CharProps(ICB_Extend) & 0b11) << 1) | ((CharProps(GBP_Extend) & 0b1111) << 3) | ((0 & 0b1) << 7) | ((0 & 0b1) << 8) | ((1 & 0b1) << 9) | ((0 & 0b1) << 10) | ((1 & 0b1) << 11) | ((0 & 0b1) << 12) | ((0 & 0b1) << 13) | ((CharProps(UC_Cf) & 0b11111) << 14) | ((0 & 0b1) << 19) | ((4 & 0b111) << 20), // 108 } +// Array accessor function that avoids bounds checking +func CharPropsFor(x rune) CharProps { + x = max(0, min(x, 1114111)) + t1 := uintptr(*(*uint8)(unsafe.Pointer(uintptr(unsafe.Pointer(&charprops_t1[0])) + uintptr(x>>charprops_shift)*1))) + t1_shifted := (t1 << charprops_shift) + (uintptr(x) & charprops_mask) + t2 := uintptr(*(*uint8)(unsafe.Pointer(uintptr(unsafe.Pointer(&charprops_t2[0])) + t1_shifted*1))) + return *(*CharProps)(unsafe.Pointer(uintptr(unsafe.Pointer(&charprops_t3[0])) + t2*4)) +} + const graphemesegmentationresult_mask = 15 const graphemesegmentationresult_shift = 4 @@ -1026,6 +1037,15 @@ var graphemesegmentationresult_t3 = [630]GraphemeSegmentationResult{ ((GraphemeSegmentationResult(GBP_ZWJ) & 0b1111) << 0) | ((1 & 0b1) << 4) | ((0 & 0b1) << 5) | ((1 & 0b1) << 6) | ((1 & 0b1) << 7) | ((1 & 0b1) << 8) | ((0 & 0b1) << 9), // 629 } +// Array accessor function that avoids bounds checking +func GraphemeSegmentationResultFor(x uint16) GraphemeSegmentationResult { + + t1 := uintptr(*(*uint8)(unsafe.Pointer(uintptr(unsafe.Pointer(&graphemesegmentationresult_t1[0])) + uintptr(x>>graphemesegmentationresult_shift)*1))) + t1_shifted := (t1 << graphemesegmentationresult_shift) + (uintptr(x) & graphemesegmentationresult_mask) + t2 := uintptr(*(*uint16)(unsafe.Pointer(uintptr(unsafe.Pointer(&graphemesegmentationresult_t2[0])) + t1_shifted*2))) + return *(*GraphemeSegmentationResult)(unsafe.Pointer(uintptr(unsafe.Pointer(&graphemesegmentationresult_t3[0])) + t2*2)) +} + func grapheme_segmentation_key(r GraphemeSegmentationResult, ch CharProps) uint16 { return (r.State() << 7) | ch.GraphemeSegmentationProperty() } diff --git a/tools/wcswidth/char-props.go b/tools/wcswidth/char-props.go index e6cea4297..cf664f96b 100644 --- a/tools/wcswidth/char-props.go +++ b/tools/wcswidth/char-props.go @@ -7,10 +7,6 @@ import ( var _ = fmt.Print -func CharPropsFor(ch rune) CharProps { - return charprops_t3[charprops_t2[(rune(charprops_t1[ch>>charprops_shift])<>graphemesegmentationresult_shift]) << graphemesegmentationresult_shift - t2 := graphemesegmentationresult_t2[t1+key&graphemesegmentationresult_mask] - ans := graphemesegmentationresult_t3[t2] - // fmt.Printf("state: %d gsp: %d -> key: %d t1: %d -> add_to_cell: %d\n", s.State(), ch.GraphemeSegmentationProperty(), key, t1, ans.Add_to_current_cell()) - return ans + return GraphemeSegmentationResultFor(key) } func Runewidth(code rune) int {