Port wcswidth to use grapheme segmentation

This commit is contained in:
Kovid Goyal
2025-04-04 11:09:35 +05:30
parent 2cad589f1c
commit 203e9f6c58
4 changed files with 73 additions and 100 deletions

View File

@@ -7,76 +7,56 @@
#include "char-props.h" #include "char-props.h"
#include "wcswidth.h" #include "wcswidth.h"
#include "unicode-data.h"
void void
initialize_wcs_state(WCSState *state) { initialize_wcs_state(WCSState *state) {
zero_at_ptr(state); zero_at_ptr(state);
} }
static inline bool
is_flag_pair(char_type a, char_type b) {
return is_flag_codepoint(a) && is_flag_codepoint(b);
}
static inline bool
is_emoji_presentation_base(char_type ch) {
return char_props_for(ch).is_emoji_presentation_base == 1;
}
int int
wcswidth_step(WCSState *state, const char_type ch) { wcswidth_step(WCSState *state, const char_type ch) {
int ans = 0; int ans = 0;
switch (state->parser_state) { switch (state->parser_state) {
case IN_CSI: { case IN_CSI: {
state->prev_width = 0; state->prev_width = 0;
if (0x40 <= ch && ch <= 0x7e) state->parser_state = NORMAL; if (0x40 <= ch && ch <= 0x7e) { state->parser_state = NORMAL; state->can_combine = false; }
} break; } break;
case IN_ST_TERMINATED: { case IN_ST_TERMINATED: {
state->prev_width = 0; state->prev_width = 0;
if (ch == 0x9c || (ch == '\\' && state->prev_ch == 0x1b)) state->parser_state = NORMAL; if (ch == '\a' || (ch == '\\' && state->prev_ch == 0x1b)) { state->parser_state = NORMAL; state->can_combine = false; }
} break; } break;
case FLAG_PAIR_STARTED: {
state->parser_state = NORMAL;
if (is_flag_pair(state->prev_ch, ch)) break;
} /* fallthrough */
case NORMAL: { case NORMAL: {
switch(ch) { CharProps cp = char_props_for(ch);
case 0x1b: { state->seg = grapheme_segmentation_step(state->seg, cp);
if (state->seg.add_to_current_cell && state->can_combine) {
switch(ch) {
case 0xfe0f:
if (char_props_for(state->prev_ch).is_emoji_presentation_base && state->prev_width == 1) {
ans = 1; state->prev_width = 2;
} else state->prev_width = 0;
break;
case 0xfe0e:
if (char_props_for(state->prev_ch).is_emoji_presentation_base && state->prev_width == 2) {
ans = -1; state->prev_width = 1;
} else state->prev_width = 0;
break;
}
break;
}
int width = wcwidth_std(cp);
switch (width) {
case -1: case 0:
state->prev_width = 0; state->prev_width = 0;
state->parser_state = IN_ESC; if (ch == 0x1b) state->parser_state = IN_ESC;
} break; break;
case 0xfe0f: { case 2:
if (is_emoji_presentation_base(state->prev_ch) && state->prev_width == 1) { state->prev_width = 2; break;
ans += 1; default:
state->prev_width = 2; state->prev_width = 1; break;
} else state->prev_width = 0; }
} break; ans = state->prev_width;
state->can_combine = true;
case 0xfe0e: {
if (is_emoji_presentation_base(state->prev_ch) && state->prev_width == 2) {
ans -= 1;
state->prev_width = 1;
} else state->prev_width = 0;
} break;
default: {
if (is_flag_codepoint(ch)) state->parser_state = FLAG_PAIR_STARTED;
int w = wcwidth_std(char_props_for(ch));
switch(w) {
case -1:
case 0:
state->prev_width = 0; break;
case 2:
state->prev_width = 2; break;
default:
state->prev_width = 1; break;
}
ans += state->prev_width;
} break;
} break; // switch(ch)
} break; // case NORMAL } break; // case NORMAL
case IN_ESC: case IN_ESC:
@@ -113,9 +93,7 @@ wcswidth_step(WCSState *state, const char_type ch) {
case '~': case '~':
break; break;
default: default:
state->prev_ch = 0x1b; zero_at_ptr(state);
state->prev_width = 0;
state->parser_state = NORMAL;
return wcswidth_step(state, ch); return wcswidth_step(state, ch);
} break; } break;
} }

View File

@@ -6,14 +6,16 @@
#pragma once #pragma once
#include "data-types.h" #include "char-props.h"
typedef enum {NORMAL, IN_ESC, IN_CSI, FLAG_PAIR_STARTED, IN_ST_TERMINATED} WCSParserState; typedef enum {NORMAL, IN_ESC, IN_CSI, IN_ST_TERMINATED} WCSParserState;
typedef struct { typedef struct {
char_type prev_ch; char_type prev_ch;
int prev_width; int prev_width;
WCSParserState parser_state; WCSParserState parser_state;
bool can_combine;
GraphemeSegmentationResult seg;
} WCSState; } WCSState;

View File

@@ -379,6 +379,7 @@ class TestDataTypes(BaseTest):
def test_utils(self): def test_utils(self):
def w(x): def w(x):
return wcwidth(ord(x)) return wcwidth(ord(x))
self.ae(wcswidth('\x9c'), 0)
self.ae(wcswidth('a\033[2mb'), 2) self.ae(wcswidth('a\033[2mb'), 2)
self.ae(wcswidth('\033a\033[2mb'), 2) self.ae(wcswidth('\033a\033[2mb'), 2)
self.ae(wcswidth('a\033]8;id=moo;https://foo\033\\a'), 2) self.ae(wcswidth('a\033]8;id=moo;https://foo\033\\a'), 2)

View File

@@ -12,21 +12,12 @@ import (
var _ = fmt.Print var _ = fmt.Print
func IsFlagCodepoint(ch rune) bool {
return 0x1F1E6 <= ch && ch <= 0x1F1FF
}
func IsFlagPair(a rune, b rune) bool {
return IsFlagCodepoint(a) && IsFlagCodepoint(b)
}
type ecparser_state uint8
type WCWidthIterator struct { type WCWidthIterator struct {
prev_ch rune prev_ch rune
prev_width, current_width int prev_width, current_width int
seg GraphemeSegmentationResult
can_combine bool
parser EscapeCodeParser parser EscapeCodeParser
state ecparser_state
rune_count uint rune_count uint
} }
@@ -34,6 +25,12 @@ func CreateWCWidthIterator() *WCWidthIterator {
var ans WCWidthIterator var ans WCWidthIterator
ans.parser.HandleRune = ans.handle_rune ans.parser.HandleRune = ans.handle_rune
ans.parser.HandleCSI = ans.handle_csi ans.parser.HandleCSI = ans.handle_csi
ans.parser.HandleOSC = ans.handle_st_terminated
ans.parser.HandleDCS = ans.handle_st_terminated
ans.parser.HandlePM = ans.handle_st_terminated
ans.parser.HandleSOS = ans.handle_st_terminated
ans.parser.HandleAPC = ans.handle_st_terminated
return &ans return &ans
} }
@@ -42,6 +39,8 @@ func (self *WCWidthIterator) Reset() {
self.prev_width = 0 self.prev_width = 0
self.current_width = 0 self.current_width = 0
self.rune_count = 0 self.rune_count = 0
self.can_combine = false
self.seg = 0
self.parser.Reset() self.parser.Reset()
} }
@@ -58,54 +57,47 @@ func (self *WCWidthIterator) handle_csi(csi []byte) error {
} }
} }
} }
self.can_combine = false
self.seg = 0
return nil
}
func (self *WCWidthIterator) handle_st_terminated(data []byte) error {
self.can_combine = false
self.seg = 0
return nil return nil
} }
func (self *WCWidthIterator) handle_rune(ch rune) error { func (self *WCWidthIterator) handle_rune(ch rune) error {
self.rune_count += 1 self.rune_count += 1
const ( cp := CharPropsFor(ch)
normal ecparser_state = 0 self.seg = self.seg.Step(cp)
flag_pair_started ecparser_state = 3 if self.can_combine && self.seg.Add_to_current_cell() == 1 {
)
switch self.state {
case flag_pair_started:
self.state = normal
if IsFlagPair(self.prev_ch, ch) {
break
}
fallthrough
case normal:
switch ch { switch ch {
case 0xfe0f: case 0xfe0f:
if IsEmojiPresentationBase(self.prev_ch) && self.prev_width == 1 { if CharPropsFor(self.prev_ch).Is_emoji_presentation_base() == 1 && self.prev_width == 1 {
self.current_width += 1 self.current_width += 1
self.prev_width = 2 self.prev_width = 2
} else {
self.prev_width = 0
} }
case 0xfe0e: case 0xfe0e:
if IsEmojiPresentationBase(self.prev_ch) && self.prev_width == 2 { if CharPropsFor(self.prev_ch).Is_emoji_presentation_base() == 1 && self.prev_width == 2 {
self.current_width -= 1 self.current_width -= 1
self.prev_width = 1 self.prev_width = 1
} else {
self.prev_width = 0
} }
default:
if IsFlagCodepoint(ch) {
self.state = flag_pair_started
}
w := Runewidth(ch)
switch w {
case -1:
case 0:
self.prev_width = 0
case 2:
self.prev_width = 2
default:
self.prev_width = 1
}
self.current_width += self.prev_width
} }
} else {
width := cp.Width()
switch width {
case -1:
case 0:
self.prev_width = 0
case 2:
self.prev_width = 2
default:
self.prev_width = 1
}
self.current_width += self.prev_width
self.can_combine = true
} }
self.prev_ch = ch self.prev_ch = ch
return nil return nil