From 203e9f6c5831c7c80d7da616597b919ba87cdf37 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Fri, 4 Apr 2025 11:09:35 +0530 Subject: [PATCH] Port wcswidth to use grapheme segmentation --- kitty/wcswidth.c | 86 ++++++++++++++------------------------ kitty/wcswidth.h | 6 ++- kitty_tests/datatypes.py | 1 + tools/wcswidth/wcswidth.go | 80 ++++++++++++++++------------------- 4 files changed, 73 insertions(+), 100 deletions(-) diff --git a/kitty/wcswidth.c b/kitty/wcswidth.c index abbd68eb4..04952304d 100644 --- a/kitty/wcswidth.c +++ b/kitty/wcswidth.c @@ -7,76 +7,56 @@ #include "char-props.h" #include "wcswidth.h" -#include "unicode-data.h" void initialize_wcs_state(WCSState *state) { zero_at_ptr(state); } -static inline bool -is_flag_pair(char_type a, char_type b) { - return is_flag_codepoint(a) && is_flag_codepoint(b); -} - -static inline bool -is_emoji_presentation_base(char_type ch) { - return char_props_for(ch).is_emoji_presentation_base == 1; -} - int wcswidth_step(WCSState *state, const char_type ch) { int ans = 0; switch (state->parser_state) { case IN_CSI: { state->prev_width = 0; - if (0x40 <= ch && ch <= 0x7e) state->parser_state = NORMAL; + if (0x40 <= ch && ch <= 0x7e) { state->parser_state = NORMAL; state->can_combine = false; } } break; case IN_ST_TERMINATED: { state->prev_width = 0; - if (ch == 0x9c || (ch == '\\' && state->prev_ch == 0x1b)) state->parser_state = NORMAL; + if (ch == '\a' || (ch == '\\' && state->prev_ch == 0x1b)) { state->parser_state = NORMAL; state->can_combine = false; } } break; - case FLAG_PAIR_STARTED: { - state->parser_state = NORMAL; - if (is_flag_pair(state->prev_ch, ch)) break; - } /* fallthrough */ - case NORMAL: { - switch(ch) { - case 0x1b: { + CharProps cp = char_props_for(ch); + state->seg = grapheme_segmentation_step(state->seg, cp); + if (state->seg.add_to_current_cell && state->can_combine) { + switch(ch) { + case 0xfe0f: + if (char_props_for(state->prev_ch).is_emoji_presentation_base && state->prev_width == 1) { + ans = 1; state->prev_width = 2; + } else state->prev_width = 0; + break; + case 0xfe0e: + if (char_props_for(state->prev_ch).is_emoji_presentation_base && state->prev_width == 2) { + ans = -1; state->prev_width = 1; + } else state->prev_width = 0; + break; + } + break; + } + int width = wcwidth_std(cp); + switch (width) { + case -1: case 0: state->prev_width = 0; - state->parser_state = IN_ESC; - } break; - case 0xfe0f: { - if (is_emoji_presentation_base(state->prev_ch) && state->prev_width == 1) { - ans += 1; - state->prev_width = 2; - } else state->prev_width = 0; - } break; - - case 0xfe0e: { - if (is_emoji_presentation_base(state->prev_ch) && state->prev_width == 2) { - ans -= 1; - state->prev_width = 1; - } else state->prev_width = 0; - } break; - - default: { - if (is_flag_codepoint(ch)) state->parser_state = FLAG_PAIR_STARTED; - int w = wcwidth_std(char_props_for(ch)); - switch(w) { - case -1: - case 0: - state->prev_width = 0; break; - case 2: - state->prev_width = 2; break; - default: - state->prev_width = 1; break; - } - ans += state->prev_width; - } break; - } break; // switch(ch) + if (ch == 0x1b) state->parser_state = IN_ESC; + break; + case 2: + state->prev_width = 2; break; + default: + state->prev_width = 1; break; + } + ans = state->prev_width; + state->can_combine = true; } break; // case NORMAL case IN_ESC: @@ -113,9 +93,7 @@ wcswidth_step(WCSState *state, const char_type ch) { case '~': break; default: - state->prev_ch = 0x1b; - state->prev_width = 0; - state->parser_state = NORMAL; + zero_at_ptr(state); return wcswidth_step(state, ch); } break; } diff --git a/kitty/wcswidth.h b/kitty/wcswidth.h index 016574daa..231e9ffb3 100644 --- a/kitty/wcswidth.h +++ b/kitty/wcswidth.h @@ -6,14 +6,16 @@ #pragma once -#include "data-types.h" +#include "char-props.h" -typedef enum {NORMAL, IN_ESC, IN_CSI, FLAG_PAIR_STARTED, IN_ST_TERMINATED} WCSParserState; +typedef enum {NORMAL, IN_ESC, IN_CSI, IN_ST_TERMINATED} WCSParserState; typedef struct { char_type prev_ch; int prev_width; WCSParserState parser_state; + bool can_combine; + GraphemeSegmentationResult seg; } WCSState; diff --git a/kitty_tests/datatypes.py b/kitty_tests/datatypes.py index 62d233eef..7b420f6ce 100644 --- a/kitty_tests/datatypes.py +++ b/kitty_tests/datatypes.py @@ -379,6 +379,7 @@ class TestDataTypes(BaseTest): def test_utils(self): def w(x): return wcwidth(ord(x)) + self.ae(wcswidth('\x9c'), 0) self.ae(wcswidth('a\033[2mb'), 2) self.ae(wcswidth('\033a\033[2mb'), 2) self.ae(wcswidth('a\033]8;id=moo;https://foo\033\\a'), 2) diff --git a/tools/wcswidth/wcswidth.go b/tools/wcswidth/wcswidth.go index d152d9c85..8b052da34 100644 --- a/tools/wcswidth/wcswidth.go +++ b/tools/wcswidth/wcswidth.go @@ -12,21 +12,12 @@ import ( var _ = fmt.Print -func IsFlagCodepoint(ch rune) bool { - return 0x1F1E6 <= ch && ch <= 0x1F1FF -} - -func IsFlagPair(a rune, b rune) bool { - return IsFlagCodepoint(a) && IsFlagCodepoint(b) -} - -type ecparser_state uint8 - type WCWidthIterator struct { prev_ch rune prev_width, current_width int + seg GraphemeSegmentationResult + can_combine bool parser EscapeCodeParser - state ecparser_state rune_count uint } @@ -34,6 +25,12 @@ func CreateWCWidthIterator() *WCWidthIterator { var ans WCWidthIterator ans.parser.HandleRune = ans.handle_rune ans.parser.HandleCSI = ans.handle_csi + ans.parser.HandleOSC = ans.handle_st_terminated + ans.parser.HandleDCS = ans.handle_st_terminated + ans.parser.HandlePM = ans.handle_st_terminated + ans.parser.HandleSOS = ans.handle_st_terminated + ans.parser.HandleAPC = ans.handle_st_terminated + return &ans } @@ -42,6 +39,8 @@ func (self *WCWidthIterator) Reset() { self.prev_width = 0 self.current_width = 0 self.rune_count = 0 + self.can_combine = false + self.seg = 0 self.parser.Reset() } @@ -58,54 +57,47 @@ func (self *WCWidthIterator) handle_csi(csi []byte) error { } } } + self.can_combine = false + self.seg = 0 + return nil +} + +func (self *WCWidthIterator) handle_st_terminated(data []byte) error { + self.can_combine = false + self.seg = 0 return nil } func (self *WCWidthIterator) handle_rune(ch rune) error { self.rune_count += 1 - const ( - normal ecparser_state = 0 - flag_pair_started ecparser_state = 3 - ) - switch self.state { - case flag_pair_started: - self.state = normal - if IsFlagPair(self.prev_ch, ch) { - break - } - fallthrough - case normal: + cp := CharPropsFor(ch) + self.seg = self.seg.Step(cp) + if self.can_combine && self.seg.Add_to_current_cell() == 1 { switch ch { case 0xfe0f: - if IsEmojiPresentationBase(self.prev_ch) && self.prev_width == 1 { + if CharPropsFor(self.prev_ch).Is_emoji_presentation_base() == 1 && self.prev_width == 1 { self.current_width += 1 self.prev_width = 2 - } else { - self.prev_width = 0 } case 0xfe0e: - if IsEmojiPresentationBase(self.prev_ch) && self.prev_width == 2 { + if CharPropsFor(self.prev_ch).Is_emoji_presentation_base() == 1 && self.prev_width == 2 { self.current_width -= 1 self.prev_width = 1 - } else { - self.prev_width = 0 } - default: - if IsFlagCodepoint(ch) { - self.state = flag_pair_started - } - w := Runewidth(ch) - switch w { - case -1: - case 0: - self.prev_width = 0 - case 2: - self.prev_width = 2 - default: - self.prev_width = 1 - } - self.current_width += self.prev_width } + } else { + width := cp.Width() + switch width { + case -1: + case 0: + self.prev_width = 0 + case 2: + self.prev_width = 2 + default: + self.prev_width = 1 + } + self.current_width += self.prev_width + self.can_combine = true } self.prev_ch = ch return nil