Implement UTF-8 decoding for screen_draw()

This commit is contained in:
Kovid Goyal
2023-10-29 13:36:44 +05:30
parent 5f809bf249
commit e4bb00d942
2 changed files with 47 additions and 27 deletions

View File

@@ -5,7 +5,6 @@
* Distributed under terms of the GPL3 license. * Distributed under terms of the GPL3 license.
*/ */
// TODO: Implement utf-8 parsing for screen_draw with reset
// TODO: Fix dump_commands for OSC and DCS commands that used to take strings but now take memoryview // TODO: Fix dump_commands for OSC and DCS commands that used to take strings but now take memoryview
// TODO: Test clipboard kitten with 52 and 5522 // TODO: Test clipboard kitten with 52 and 5522
// TODO: Test shell integration with secondary prompts // TODO: Test shell integration with secondary prompts
@@ -13,6 +12,7 @@
// TODO: Test that C1 characters are ignored by screen_draw() // TODO: Test that C1 characters are ignored by screen_draw()
#include "vt-parser.h" #include "vt-parser.h"
#include "charsets.h"
#include "screen.h" #include "screen.h"
#include "base64.h" #include "base64.h"
#include "control-codes.h" #include "control-codes.h"
@@ -26,7 +26,7 @@
#define RESTORE_INPUT_DATA self->input_data = orig_input_data; self->input_sz = orig_input_sz; self->input_pos = orig_input_pos #define RESTORE_INPUT_DATA self->input_data = orig_input_data; self->input_sz = orig_input_sz; self->input_pos = orig_input_pos
#define SET_STATE(state) self->vte_state = state; self->parser_buf_pos = 0; #define SET_STATE(state) self->vte_state = state; self->parser_buf_pos = 0; self->utf8_state = UTF8_ACCEPT;
#define IS_DIGIT \ #define IS_DIGIT \
case '0': \ case '0': \
@@ -149,9 +149,14 @@ typedef enum VTEState {
typedef struct PS { typedef struct PS {
id_type window_id; id_type window_id;
unsigned parser_buf_pos; unsigned parser_buf_pos;
bool extended_osc_code; UTF8State utf8_state;
VTEState vte_state; VTEState vte_state;
// this is used only during dispatch of a single byte; it's present here just to avoid adding an extra parameter to accumulate_osc()
bool extended_osc_code;
struct { struct {
monotonic_t activated_at, wait_time; monotonic_t activated_at, wait_time;
unsigned stop_escape_code_type; unsigned stop_escape_code_type;
@@ -170,6 +175,20 @@ typedef struct PS {
// Normal mode {{{ // Normal mode {{{
/*
 * Feed one input byte to the UTF-8 decoder and draw the result if a code
 * point has been completed.
 *
 * self: parser state; self->utf8_state carries the decoder state between
 *       calls and self->screen is the screen that receives decoded characters.
 * b:    the next raw input byte.
 *
 * The value returned by decode_utf8() decides what happens:
 *   - UTF8_ACCEPT: the decoded code point in ch is reported via REPORT_DRAW
 *     and passed to screen_draw().
 *   - UTF8_REJECT: the decoder state is reset to UTF8_ACCEPT and nothing is
 *     drawn for this byte.
 *   - anything else: nothing is drawn and self->utf8_state is left as
 *     decode_utf8() set it.
 */
static void
draw_byte(PS *self, uint8_t b) {
    uint32_t ch;
    switch (decode_utf8(&self->utf8_state, &ch, b)) {
        case UTF8_ACCEPT:
            REPORT_DRAW(ch);
            screen_draw(self->screen, ch, true);
            break;
        case UTF8_REJECT:
            // Restart decoding from a clean state; the rejected byte is dropped.
            self->utf8_state = UTF8_ACCEPT;
            break;
        default:
            // NOTE(review): presumably an intermediate state meaning more bytes
            // of a multi-byte sequence are needed; confirm against charsets.h.
            break;
    }
}
static void static void
dispatch_normal_mode_byte(PS *self) { dispatch_normal_mode_byte(PS *self) {
#define CALL_SCREEN_HANDLER(name) REPORT_COMMAND(name); name(self->screen); break; #define CALL_SCREEN_HANDLER(name) REPORT_COMMAND(name); name(self->screen); break;
@@ -197,8 +216,7 @@ dispatch_normal_mode_byte(PS *self) {
case DEL: case DEL:
break; // no-op break; // no-op
default: default:
REPORT_DRAW(ch); draw_byte(self, ch);
screen_draw(self->screen, ch, true);
break; break;
} }
#undef CALL_SCREEN_HANDLER #undef CALL_SCREEN_HANDLER
@@ -224,9 +242,9 @@ screen_nel(Screen *screen) { screen_carriage_return(screen); screen_linefeed(scr
static void static void
dispatch_esc_mode_byte(PS *self) { dispatch_esc_mode_byte(PS *self) {
#define CALL_ED(name) REPORT_COMMAND(name); name(self->screen); SET_STATE(0); #define CALL_ED(name) REPORT_COMMAND(name); name(self->screen); SET_STATE(VTE_NORMAL);
#define CALL_ED1(name, ch) REPORT_COMMAND(name, ch); name(self->screen, ch); SET_STATE(0); #define CALL_ED1(name, ch) REPORT_COMMAND(name, ch); name(self->screen, ch); SET_STATE(VTE_NORMAL);
#define CALL_ED2(name, a, b) REPORT_COMMAND(name, a, b); name(self->screen, a, b); SET_STATE(0); #define CALL_ED2(name, a, b) REPORT_COMMAND(name, a, b); name(self->screen, a, b); SET_STATE(VTE_NORMAL);
uint8_t ch = self->input_data[self->input_pos++]; uint8_t ch = self->input_data[self->input_pos++];
switch(self->parser_buf_pos) { switch(self->parser_buf_pos) {
case 0: case 0:
@@ -264,7 +282,7 @@ dispatch_esc_mode_byte(PS *self) {
break; break;
default: default:
REPORT_ERROR("%s0x%x", "Unknown char after ESC: ", ch); REPORT_ERROR("%s0x%x", "Unknown char after ESC: ", ch);
SET_STATE(0); break; SET_STATE(VTE_NORMAL); break;
} }
break; break;
default: default:
@@ -311,7 +329,7 @@ dispatch_esc_mode_byte(PS *self) {
default: default:
REPORT_ERROR("Unhandled charset related escape code: 0x%x 0x%x", self->parser_buf[0], ch); break; REPORT_ERROR("Unhandled charset related escape code: 0x%x 0x%x", self->parser_buf[0], ch); break;
} }
SET_STATE(0); SET_STATE(VTE_NORMAL);
break; break;
} }
#undef CALL_ED #undef CALL_ED
@@ -530,7 +548,7 @@ END_ALLOW_CASE_RANGE
if (self->parser_buf_pos > 0 && self->parser_buf[self->parser_buf_pos-1] == ESC) { if (self->parser_buf_pos > 0 && self->parser_buf[self->parser_buf_pos-1] == ESC) {
if (ch == '\\') { self->parser_buf_pos--; return true; } if (ch == '\\') { self->parser_buf_pos--; return true; }
REPORT_ERROR("DCS sequence contained ESC without trailing \\ at pos: %u ignoring the sequence", self->parser_buf_pos); REPORT_ERROR("DCS sequence contained ESC without trailing \\ at pos: %u ignoring the sequence", self->parser_buf_pos);
SET_STATE(ESC); return false; SET_STATE(VTE_ESC); return false;
} }
if (self->parser_buf_pos >= PARSER_BUF_SZ - 1) { if (self->parser_buf_pos >= PARSER_BUF_SZ - 1) {
REPORT_ERROR("DCS sequence too long, truncating."); REPORT_ERROR("DCS sequence too long, truncating.");
@@ -660,7 +678,7 @@ accumulate_csi(PS *self) {
#define ENSURE_SPACE \ #define ENSURE_SPACE \
if (self->parser_buf_pos > PARSER_BUF_SZ - 1) { \ if (self->parser_buf_pos > PARSER_BUF_SZ - 1) { \
REPORT_ERROR("CSI sequence too long, ignoring"); \ REPORT_ERROR("CSI sequence too long, ignoring"); \
SET_STATE(0); \ SET_STATE(VTE_NORMAL); \
return false; \ return false; \
} }
@@ -679,7 +697,7 @@ accumulate_csi(PS *self) {
case '=': case '=':
if (self->parser_buf_pos != 0) { if (self->parser_buf_pos != 0) {
REPORT_ERROR("Invalid character in CSI: 0x%x, ignoring the sequence", ch); REPORT_ERROR("Invalid character in CSI: 0x%x, ignoring the sequence", ch);
SET_STATE(0); SET_STATE(VTE_NORMAL);
return false; return false;
} }
ENSURE_SPACE; ENSURE_SPACE;
@@ -711,11 +729,11 @@ END_ALLOW_CASE_RANGE
break; break;
case NUL: case NUL:
case DEL: case DEL:
SET_STATE(0); SET_STATE(VTE_NORMAL);
break; // no-op break; // no-op
default: default:
REPORT_ERROR("Invalid character in CSI: 0x%x, ignoring the sequence", ch); REPORT_ERROR("Invalid character in CSI: 0x%x, ignoring the sequence", ch);
SET_STATE(0); SET_STATE(VTE_NORMAL);
return false; return false;
} }
@@ -1319,26 +1337,26 @@ accumulate_oth(PS *self) {
dispatch##_esc_mode_byte(self); \ dispatch##_esc_mode_byte(self); \
break; \ break; \
case VTE_CSI: \ case VTE_CSI: \
if (accumulate_csi(self)) { dispatch##_csi(self); SET_STATE(0); watch_for_pending; } \ if (accumulate_csi(self)) { dispatch##_csi(self); SET_STATE(VTE_NORMAL); watch_for_pending; } \
break; \ break; \
case VTE_OSC: \ case VTE_OSC: \
{ \ { \
if (accumulate_osc(self)) { \ if (accumulate_osc(self)) { \
dispatch##_osc(self); \ dispatch##_osc(self); \
if (self->extended_osc_code) { \ if (self->extended_osc_code) { \
if (accumulate_osc(self)) { dispatch##_osc(self); SET_STATE(0); } \ if (accumulate_osc(self)) { dispatch##_osc(self); SET_STATE(VTE_NORMAL); } \
} else { SET_STATE(0); } \ } else { SET_STATE(VTE_NORMAL); } \
} \ } \
} \ } \
break; \ break; \
case VTE_APC: \ case VTE_APC: \
if (accumulate_oth(self)) { dispatch##_apc(self); SET_STATE(0); } \ if (accumulate_oth(self)) { dispatch##_apc(self); SET_STATE(VTE_NORMAL); } \
break; \ break; \
case VTE_PM: \ case VTE_PM: \
if (accumulate_oth(self)) { dispatch##_pm(self); SET_STATE(0); } \ if (accumulate_oth(self)) { dispatch##_pm(self); SET_STATE(VTE_NORMAL); } \
break; \ break; \
case VTE_DCS: \ case VTE_DCS: \
if (accumulate_dcs(self)) { dispatch##_dcs(self); SET_STATE(0); watch_for_pending; } \ if (accumulate_dcs(self)) { dispatch##_dcs(self); SET_STATE(VTE_NORMAL); watch_for_pending; } \
if (self->vte_state == ESC) { self->input_pos--; dispatch##_esc_mode_byte(self); } \ if (self->vte_state == ESC) { self->input_pos--; dispatch##_esc_mode_byte(self); } \
break; \ break; \
case VTE_NORMAL: \ case VTE_NORMAL: \
@@ -1606,10 +1624,12 @@ free_vt_parser(Parser* self) {
static void static void
reset(PS *self) { reset(PS *self) {
self->parser_buf_pos = 0;
self->extended_osc_code = false;
self->vte_state = VTE_NORMAL; self->vte_state = VTE_NORMAL;
self->utf8_state = UTF8_ACCEPT;
self->parser_buf_pos = 0;
self->pending_mode.activated_at = 0; self->pending_mode.activated_at = 0;
self->pending_mode.stop_escape_code_type = 0;
} }
void void

View File

@@ -6,7 +6,7 @@ from base64 import standard_b64encode
from binascii import hexlify from binascii import hexlify
from functools import partial from functools import partial
from kitty.fast_data_types import CURSOR_BLOCK, base64_decode, base64_encode, parse_bytes, parse_bytes_dump from kitty.fast_data_types import CURSOR_BLOCK, base64_decode, base64_encode
from kitty.notify import NotificationCommand, handle_notification_cmd, notification_activated, reset_registry from kitty.notify import NotificationCommand, handle_notification_cmd, notification_activated, reset_registry
from . import BaseTest from . import BaseTest
@@ -25,7 +25,7 @@ class TestParser(BaseTest):
if isinstance(x, str): if isinstance(x, str):
x = x.encode('utf-8') x = x.encode('utf-8')
cmds = tuple(('draw', x) if isinstance(x, str) else x for x in cmds) cmds = tuple(('draw', x) if isinstance(x, str) else x for x in cmds)
parse_bytes_dump(cd, s, x) s.vt_parser.parse_bytes(s, x, cd)
current = '' current = ''
q = [] q = []
for args in cd: for args in cd:
@@ -65,7 +65,7 @@ class TestParser(BaseTest):
self.ae(str(s.line(1)), '6') self.ae(str(s.line(1)), '6')
self.ae(str(s.line(2)), ' 123') self.ae(str(s.line(2)), ' 123')
self.ae(str(s.line(3)), '45') self.ae(str(s.line(3)), '45')
parse_bytes(s, b'\rabcde') s.vt_parser.parse_bytes(s, b'\rabcde')
self.ae(str(s.line(3)), 'abcde') self.ae(str(s.line(3)), 'abcde')
pb('\rßxyz1', ('screen_carriage_return',), 'ßxyz1') pb('\rßxyz1', ('screen_carriage_return',), 'ßxyz1')
self.ae(str(s.line(3)), 'ßxyz1') self.ae(str(s.line(3)), 'ßxyz1')
@@ -331,7 +331,7 @@ class TestParser(BaseTest):
for sgr in '0;34;102;1;2;3;4 0;38:5:200;58:2:10:11:12'.split(): for sgr in '0;34;102;1;2;3;4 0;38:5:200;58:2:10:11:12'.split():
expected = set(sgr.split(';')) - {'0'} expected = set(sgr.split(';')) - {'0'}
c.clear() c.clear()
parse_bytes(s, f'\033[{sgr}m\033P$qm\033\\'.encode('ascii')) s.vte_parser.parse_bytes(s, f'\033[{sgr}m\033P$qm\033\\'.encode('ascii'))
r = c.wtcbuf.decode('ascii').partition('r')[2].partition('m')[0] r = c.wtcbuf.decode('ascii').partition('r')[2].partition('m')[0]
self.ae(expected, set(r.split(';'))) self.ae(expected, set(r.split(';')))
c.clear() c.clear()