Implement UTF-8 decoding for screen_draw()

This commit is contained in:
Kovid Goyal
2023-10-29 13:36:44 +05:30
parent 5f809bf249
commit e4bb00d942
2 changed files with 47 additions and 27 deletions

View File

@@ -5,7 +5,6 @@
* Distributed under terms of the GPL3 license.
*/
// TODO: Implement utf-8 parsing for screen_draw with reset
// TODO: Fix dump_commands for OSC and DCS commands that used to take strings but now take memoryview
// TODO: Test clipboard kitten with 52 and 5522
// TODO: Test shell integration with secondary prompts
@@ -13,6 +12,7 @@
// TODO: Test that C1 characters are ignored by screen_draw()
#include "vt-parser.h"
#include "charsets.h"
#include "screen.h"
#include "base64.h"
#include "control-codes.h"
@@ -26,7 +26,7 @@
#define RESTORE_INPUT_DATA self->input_data = orig_input_data; self->input_sz = orig_input_sz; self->input_pos = orig_input_pos
#define SET_STATE(state) self->vte_state = state; self->parser_buf_pos = 0;
#define SET_STATE(state) self->vte_state = state; self->parser_buf_pos = 0; self->utf8_state = UTF8_ACCEPT;
#define IS_DIGIT \
case '0': \
@@ -149,9 +149,14 @@ typedef enum VTEState {
typedef struct PS {
id_type window_id;
unsigned parser_buf_pos;
bool extended_osc_code;
UTF8State utf8_state;
VTEState vte_state;
// this is used only during dispatch of a single byte, its present here just to avoid adding an extra parameter to accumulate_osc()
bool extended_osc_code;
struct {
monotonic_t activated_at, wait_time;
unsigned stop_escape_code_type;
@@ -170,6 +175,20 @@ typedef struct PS {
// Normal mode {{{
static void
draw_byte(PS *self, uint8_t b) {
uint32_t ch;
switch (decode_utf8(&self->utf8_state, &ch, b)) {
case UTF8_ACCEPT:
REPORT_DRAW(ch);
screen_draw(self->screen, ch, true);
break;
case UTF8_REJECT:
self->utf8_state = UTF8_ACCEPT;
break;
}
}
static void
dispatch_normal_mode_byte(PS *self) {
#define CALL_SCREEN_HANDLER(name) REPORT_COMMAND(name); name(self->screen); break;
@@ -197,8 +216,7 @@ dispatch_normal_mode_byte(PS *self) {
case DEL:
break; // no-op
default:
REPORT_DRAW(ch);
screen_draw(self->screen, ch, true);
draw_byte(self, ch);
break;
}
#undef CALL_SCREEN_HANDLER
@@ -224,9 +242,9 @@ screen_nel(Screen *screen) { screen_carriage_return(screen); screen_linefeed(scr
static void
dispatch_esc_mode_byte(PS *self) {
#define CALL_ED(name) REPORT_COMMAND(name); name(self->screen); SET_STATE(0);
#define CALL_ED1(name, ch) REPORT_COMMAND(name, ch); name(self->screen, ch); SET_STATE(0);
#define CALL_ED2(name, a, b) REPORT_COMMAND(name, a, b); name(self->screen, a, b); SET_STATE(0);
#define CALL_ED(name) REPORT_COMMAND(name); name(self->screen); SET_STATE(VTE_NORMAL);
#define CALL_ED1(name, ch) REPORT_COMMAND(name, ch); name(self->screen, ch); SET_STATE(VTE_NORMAL);
#define CALL_ED2(name, a, b) REPORT_COMMAND(name, a, b); name(self->screen, a, b); SET_STATE(VTE_NORMAL);
uint8_t ch = self->input_data[self->input_pos++];
switch(self->parser_buf_pos) {
case 0:
@@ -264,7 +282,7 @@ dispatch_esc_mode_byte(PS *self) {
break;
default:
REPORT_ERROR("%s0x%x", "Unknown char after ESC: ", ch);
SET_STATE(0); break;
SET_STATE(VTE_NORMAL); break;
}
break;
default:
@@ -311,7 +329,7 @@ dispatch_esc_mode_byte(PS *self) {
default:
REPORT_ERROR("Unhandled charset related escape code: 0x%x 0x%x", self->parser_buf[0], ch); break;
}
SET_STATE(0);
SET_STATE(VTE_NORMAL);
break;
}
#undef CALL_ED
@@ -530,7 +548,7 @@ END_ALLOW_CASE_RANGE
if (self->parser_buf_pos > 0 && self->parser_buf[self->parser_buf_pos-1] == ESC) {
if (ch == '\\') { self->parser_buf_pos--; return true; }
REPORT_ERROR("DCS sequence contained ESC without trailing \\ at pos: %u ignoring the sequence", self->parser_buf_pos);
SET_STATE(ESC); return false;
SET_STATE(VTE_ESC); return false;
}
if (self->parser_buf_pos >= PARSER_BUF_SZ - 1) {
REPORT_ERROR("DCS sequence too long, truncating.");
@@ -660,7 +678,7 @@ accumulate_csi(PS *self) {
#define ENSURE_SPACE \
if (self->parser_buf_pos > PARSER_BUF_SZ - 1) { \
REPORT_ERROR("CSI sequence too long, ignoring"); \
SET_STATE(0); \
SET_STATE(VTE_NORMAL); \
return false; \
}
@@ -679,7 +697,7 @@ accumulate_csi(PS *self) {
case '=':
if (self->parser_buf_pos != 0) {
REPORT_ERROR("Invalid character in CSI: 0x%x, ignoring the sequence", ch);
SET_STATE(0);
SET_STATE(VTE_NORMAL);
return false;
}
ENSURE_SPACE;
@@ -711,11 +729,11 @@ END_ALLOW_CASE_RANGE
break;
case NUL:
case DEL:
SET_STATE(0);
SET_STATE(VTE_NORMAL);
break; // no-op
default:
REPORT_ERROR("Invalid character in CSI: 0x%x, ignoring the sequence", ch);
SET_STATE(0);
SET_STATE(VTE_NORMAL);
return false;
}
@@ -1319,26 +1337,26 @@ accumulate_oth(PS *self) {
dispatch##_esc_mode_byte(self); \
break; \
case VTE_CSI: \
if (accumulate_csi(self)) { dispatch##_csi(self); SET_STATE(0); watch_for_pending; } \
if (accumulate_csi(self)) { dispatch##_csi(self); SET_STATE(VTE_NORMAL); watch_for_pending; } \
break; \
case VTE_OSC: \
{ \
if (accumulate_osc(self)) { \
dispatch##_osc(self); \
if (self->extended_osc_code) { \
if (accumulate_osc(self)) { dispatch##_osc(self); SET_STATE(0); } \
} else { SET_STATE(0); } \
if (accumulate_osc(self)) { dispatch##_osc(self); SET_STATE(VTE_NORMAL); } \
} else { SET_STATE(VTE_NORMAL); } \
} \
} \
break; \
case VTE_APC: \
if (accumulate_oth(self)) { dispatch##_apc(self); SET_STATE(0); } \
if (accumulate_oth(self)) { dispatch##_apc(self); SET_STATE(VTE_NORMAL); } \
break; \
case VTE_PM: \
if (accumulate_oth(self)) { dispatch##_pm(self); SET_STATE(0); } \
if (accumulate_oth(self)) { dispatch##_pm(self); SET_STATE(VTE_NORMAL); } \
break; \
case VTE_DCS: \
if (accumulate_dcs(self)) { dispatch##_dcs(self); SET_STATE(0); watch_for_pending; } \
if (accumulate_dcs(self)) { dispatch##_dcs(self); SET_STATE(VTE_NORMAL); watch_for_pending; } \
if (self->vte_state == ESC) { self->input_pos--; dispatch##_esc_mode_byte(self); } \
break; \
case VTE_NORMAL: \
@@ -1606,10 +1624,12 @@ free_vt_parser(Parser* self) {
static void
reset(PS *self) {
self->parser_buf_pos = 0;
self->extended_osc_code = false;
self->vte_state = VTE_NORMAL;
self->utf8_state = UTF8_ACCEPT;
self->parser_buf_pos = 0;
self->pending_mode.activated_at = 0;
self->pending_mode.stop_escape_code_type = 0;
}
void

View File

@@ -6,7 +6,7 @@ from base64 import standard_b64encode
from binascii import hexlify
from functools import partial
from kitty.fast_data_types import CURSOR_BLOCK, base64_decode, base64_encode, parse_bytes, parse_bytes_dump
from kitty.fast_data_types import CURSOR_BLOCK, base64_decode, base64_encode
from kitty.notify import NotificationCommand, handle_notification_cmd, notification_activated, reset_registry
from . import BaseTest
@@ -25,7 +25,7 @@ class TestParser(BaseTest):
if isinstance(x, str):
x = x.encode('utf-8')
cmds = tuple(('draw', x) if isinstance(x, str) else x for x in cmds)
parse_bytes_dump(cd, s, x)
s.vt_parser.parse_bytes(s, x, cd)
current = ''
q = []
for args in cd:
@@ -65,7 +65,7 @@ class TestParser(BaseTest):
self.ae(str(s.line(1)), '6')
self.ae(str(s.line(2)), ' 123')
self.ae(str(s.line(3)), '45')
parse_bytes(s, b'\rabcde')
s.vt_parser.parse_bytes(s, b'\rabcde')
self.ae(str(s.line(3)), 'abcde')
pb('\rßxyz1', ('screen_carriage_return',), 'ßxyz1')
self.ae(str(s.line(3)), 'ßxyz1')
@@ -331,7 +331,7 @@ class TestParser(BaseTest):
for sgr in '0;34;102;1;2;3;4 0;38:5:200;58:2:10:11:12'.split():
expected = set(sgr.split(';')) - {'0'}
c.clear()
parse_bytes(s, f'\033[{sgr}m\033P$qm\033\\'.encode('ascii'))
s.vte_parser.parse_bytes(s, f'\033[{sgr}m\033P$qm\033\\'.encode('ascii'))
r = c.wtcbuf.decode('ascii').partition('r')[2].partition('m')[0]
self.ae(expected, set(r.split(';')))
c.clear()