Remove bounds checking for Unicode table access in Go

This commit is contained in:
Kovid Goyal
2025-04-01 10:41:17 +05:30
parent de1adeee5e
commit 6ecd78d9db
5 changed files with 48 additions and 14 deletions

View File

@@ -506,6 +506,10 @@ class Property(Protocol):
def as_go(self) -> str:
return ''
@classmethod
def bitsize(cls) -> int:
return 0
def get_types(sz: int) -> tuple[str, str]:
sz *= 8
@@ -514,11 +518,15 @@ def get_types(sz: int) -> tuple[str, str]:
def gen_multistage_table(
c: Callable[..., None], g: Callable[..., None], t1: Sequence[int], t2: Sequence[int], t3: Sequence[Property], shift: int,
for_go_type: str, maxval: int = 0
) -> None:
ctype_t1, gotype_t1 = get_types(getsize(t1))
t1_type_sz = getsize(t1)
ctype_t1, gotype_t1 = get_types(t1_type_sz)
mask = mask_for(shift)
name = t3[0].__class__.__name__
ctype_t2, gotype_t2 = get_types(getsize(tuple(range(len(t3)))))
t2_type_sz = getsize(tuple(range(len(t3))))
ctype_t2, gotype_t2 = get_types(t2_type_sz)
t3_type_sz = t3[0].bitsize() // 8
c(f'static const char_type {name}_mask = {mask}u;')
c(f'static const char_type {name}_shift = {shift}u;')
c(f'static const {ctype_t1} {name}_t1[{len(t1)}] = ''{')
@@ -546,6 +554,17 @@ def gen_multistage_table(
g(f'\t{items}')
g('}')
check = f'x = max(0, min(x, {maxval}))' if maxval else ''
g(f'''
// Array accessor function that avoids bounds checking
func {name}For(x {for_go_type}) {name} {{
{check}
t1 := uintptr(*(*{gotype_t1})(unsafe.Pointer(uintptr(unsafe.Pointer(&{lname}_t1[0])) + uintptr(x>>{lname}_shift)*{t1_type_sz})))
t1_shifted := (t1 << {lname}_shift) + (uintptr(x) & {lname}_mask)
t2 := uintptr(*(*{gotype_t2})(unsafe.Pointer(uintptr(unsafe.Pointer(&{lname}_t2[0])) + t1_shifted*{t2_type_sz})))
return *(*{name})(unsafe.Pointer(uintptr(unsafe.Pointer(&{lname}_t3[0])) + t2*{t3_type_sz}))
}}
''')
width_shift = 4
@@ -919,7 +938,6 @@ func (r GraphemeSegmentationResult) State() (ans {base_type}) {{
return bitfield_declaration_as_c('GraphemeSegmentationResult', fields, {'state': bits})
class CharProps(NamedTuple):
width: int = 3
@@ -1127,17 +1145,19 @@ def gen_char_props() -> None:
with create_header('kitty/char-props-data.h', include_data_types=False) as c, open('tools/wcswidth/char-props-data.go', 'w') as gof:
gp = partial(print, file=gof)
gp('package wcswidth')
gp('import "unsafe"')
generate_enum(c, gp, 'GraphemeBreakProperty', *grapheme_segmentation_maps, prefix='GBP_')
generate_enum(c, gp, 'IndicConjunctBreak', *incb_map, prefix='ICB_')
cen('// UCBDeclaration {{''{')
cen(f'#define MAX_UNICODE ({sys.maxunicode}u)')
generate_enum(cen, gp, 'UnicodeCategory', 'Cn', *class_maps, prefix='UC_')
cen('// EndUCBDeclaration }}''}')
gp(make_bitfield('tools/wcswidth', 'CharProps', *CharProps.go_fields(), add_package=False)[1])
gp(make_bitfield('tools/wcswidth', 'GraphemeSegmentationResult', *GraphemeSegmentationResult.go_fields(), add_package=False)[1])
gp(CharProps.go_extra())
gp(GraphemeSegmentationResult.go_extra())
gen_multistage_table(c, gp, t1, t2, t3, t_shift)
gen_multistage_table(c, gp, g1, g2, g3, g_shift)
gen_multistage_table(c, gp, t1, t2, t3, t_shift, 'rune', sys.maxunicode)
gen_multistage_table(c, gp, g1, g2, g3, g_shift, 'uint16')
c(GraphemeSegmentationKey.code_to_convert_to_int())
c(GraphemeSegmentationState.c_declaration())
gp(GraphemeSegmentationKey.code_to_convert_to_int(for_go=True))

View File

@@ -11,6 +11,7 @@
CharProps
char_props_for(char_type ch) {
    // Values above MAX_UNICODE are looked up as code point 0 so that the
    // stage 1 index below always stays inside the table.
    if (ch > MAX_UNICODE) ch = 0;
    // Stage 1: the high bits of ch select a block.
    const char_type block = CharProps_t1[ch >> CharProps_shift];
    // Stage 2: the block combined with the low bits of ch selects a slot.
    const char_type slot = (block << CharProps_shift) + (ch & CharProps_mask);
    // Stage 3: the slot holds the index of the properties entry.
    return CharProps_t3[CharProps_t2[slot]];
}

View File

@@ -104,6 +104,7 @@ static_assert(sizeof(GraphemeSegmentationResult) == sizeof(uint16_t), "Fix the o
// EndGraphemeSegmentationResultDeclaration }}}
// UCBDeclaration {{{
#define MAX_UNICODE (1114111u)
typedef enum UnicodeCategory {
UC_Cn,
UC_Cc,

View File

@@ -1,5 +1,7 @@
package wcswidth
import "unsafe"
type GraphemeBreakProperty uint8
const (
@@ -384,6 +386,15 @@ var charprops_t3 = [109]CharProps{
((0 & 0b1) << 0) | ((CharProps(ICB_Extend) & 0b11) << 1) | ((CharProps(GBP_Extend) & 0b1111) << 3) | ((0 & 0b1) << 7) | ((0 & 0b1) << 8) | ((1 & 0b1) << 9) | ((0 & 0b1) << 10) | ((1 & 0b1) << 11) | ((0 & 0b1) << 12) | ((0 & 0b1) << 13) | ((CharProps(UC_Cf) & 0b11111) << 14) | ((0 & 0b1) << 19) | ((4 & 0b111) << 20), // 108
}
// CharPropsFor returns the CharProps entry for the rune x from the three-stage
// lookup table. Entries are read through pointer arithmetic instead of index
// expressions, so the reads are not bounds checked; x is first clamped to the
// range [0, 1114111].
func CharPropsFor(x rune) CharProps {
	x = min(max(x, 0), 1114111)
	// Stage 1: one-byte entries selected by the high bits of x.
	stage1 := unsafe.Add(unsafe.Pointer(&charprops_t1[0]), uintptr(x>>charprops_shift))
	block := uintptr(*(*uint8)(stage1))
	// Stage 2: one-byte entries selected by the stage 1 value and the low bits of x.
	slot := (block << charprops_shift) + (uintptr(x) & charprops_mask)
	stage2 := unsafe.Add(unsafe.Pointer(&charprops_t2[0]), slot)
	idx := uintptr(*(*uint8)(stage2))
	// Stage 3: four-byte CharProps entries.
	return *(*CharProps)(unsafe.Add(unsafe.Pointer(&charprops_t3[0]), idx*4))
}
const graphemesegmentationresult_mask = 15
const graphemesegmentationresult_shift = 4
@@ -1026,6 +1037,15 @@ var graphemesegmentationresult_t3 = [630]GraphemeSegmentationResult{
((GraphemeSegmentationResult(GBP_ZWJ) & 0b1111) << 0) | ((1 & 0b1) << 4) | ((0 & 0b1) << 5) | ((1 & 0b1) << 6) | ((1 & 0b1) << 7) | ((1 & 0b1) << 8) | ((0 & 0b1) << 9), // 629
}
// GraphemeSegmentationResultFor returns the GraphemeSegmentationResult entry for
// the key x from the three-stage lookup table. Entries are read through pointer
// arithmetic instead of index expressions, so the reads are not bounds checked
// and x is used as given.
func GraphemeSegmentationResultFor(x uint16) GraphemeSegmentationResult {
	// Stage 1: one-byte entries selected by the high bits of x.
	stage1 := unsafe.Add(unsafe.Pointer(&graphemesegmentationresult_t1[0]), uintptr(x>>graphemesegmentationresult_shift))
	block := uintptr(*(*uint8)(stage1))
	// Stage 2: two-byte entries selected by the stage 1 value and the low bits of x.
	slot := (block << graphemesegmentationresult_shift) + (uintptr(x) & graphemesegmentationresult_mask)
	stage2 := unsafe.Add(unsafe.Pointer(&graphemesegmentationresult_t2[0]), slot*2)
	idx := uintptr(*(*uint16)(stage2))
	// Stage 3: two-byte GraphemeSegmentationResult entries.
	return *(*GraphemeSegmentationResult)(unsafe.Add(unsafe.Pointer(&graphemesegmentationresult_t3[0]), idx*2))
}
// grapheme_segmentation_key builds the uint16 lookup key for a segmentation
// step: the state of r shifted left by 7 bits, OR-ed with the grapheme
// segmentation property of ch.
func grapheme_segmentation_key(r GraphemeSegmentationResult, ch CharProps) uint16 {
	state := r.State() << 7
	return state | ch.GraphemeSegmentationProperty()
}

View File

@@ -7,10 +7,6 @@ import (
var _ = fmt.Print
// CharPropsFor returns the CharProps entry for ch, resolved through the three
// lookup stages charprops_t1, charprops_t2 and charprops_t3 with ordinary
// index expressions.
func CharPropsFor(ch rune) CharProps {
	// Stage 1: the high bits of ch select a block.
	block := rune(charprops_t1[ch>>charprops_shift]) << charprops_shift
	// Stage 2: the block plus the low bits of ch select the stage 3 index.
	idx := charprops_t2[block+(ch&charprops_mask)]
	return charprops_t3[idx]
}
func IteratorOverGraphemes(text string) iter.Seq[string] {
var s GraphemeSegmentationResult
start_pos := 0
@@ -43,11 +39,7 @@ func (s *GraphemeSegmentationResult) Reset() {
// Step returns the GraphemeSegmentationResult stored in the lookup table for
// the key that grapheme_segmentation_key forms from s and ch.
//
// The lookup is delegated to GraphemeSegmentationResultFor rather than being
// repeated inline, so there is a single return path. That accessor reads the
// tables without bounds checks.
// NOTE(review): this relies on every key produced by grapheme_segmentation_key
// being inside the generated tables; confirm against the generator.
func (s GraphemeSegmentationResult) Step(ch CharProps) GraphemeSegmentationResult {
	return GraphemeSegmentationResultFor(grapheme_segmentation_key(s, ch))
}
func Runewidth(code rune) int {