Add tests for the subseq matcher

This commit is contained in:
Kovid Goyal
2019-01-24 20:52:18 +05:30
parent edb25314c5
commit 60b64dadfe
6 changed files with 155 additions and 27 deletions

View File

@@ -34,7 +34,6 @@ typedef uint8_t len_t;
typedef uint32_t text_t;
#define LEN_MAX UINT8_MAX
#define UNUSED(x) (void)(x)
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1
/* Non-zero when x is an ASCII lowercase letter ('a'..'z').
 * The whole expansion is parenthesized so it behaves as a single operand
 * under !, arithmetic or comparison operators at the call site.
 * Note: x is evaluated twice, so do not pass expressions with side effects. */
#define IS_LOWERCASE(x) ((x) >= 'a' && (x) <= 'z')
@@ -66,8 +65,8 @@ typedef struct {
bool output_positions;
size_t limit;
int num_threads;
text_t mark_before[128], mark_after[128];
size_t mark_before_sz, mark_after_sz;
text_t mark_before[128], mark_after[128], delimiter[128];
size_t mark_before_sz, mark_after_sz, delimiter_sz;
} Options;
VECTOR_OF(len_t, Positions)
@@ -75,7 +74,7 @@ VECTOR_OF(text_t, Chars)
VECTOR_OF(Candidate, Candidates)
void output_results(GlobalData *, Candidate *haystack, size_t count, Options *opts, len_t needle_len, text_t delim);
void output_results(GlobalData *, Candidate *haystack, size_t count, Options *opts, len_t needle_len);
void* alloc_workspace(len_t max_haystack_len, GlobalData*);
void* free_workspace(void *v);
double score_item(void *v, text_t *haystack, len_t haystack_len, len_t *match_positions);

View File

@@ -5,6 +5,7 @@
* Distributed under terms of the GPL3 license.
*/
#include "data-types.h"
#include "choose-data-types.h"
#include "charsets.h"
@@ -160,7 +161,7 @@ run_search(Options *opts, GlobalData *global, const char * const *lines, const s
global->haystack = haystack;
global->haystack_count = SIZE(candidates);
ret = run_threaded(opts->num_threads, global);
if (ret == 0) output_results(global, haystack, SIZE(candidates), opts, global->needle_len, '\n');
if (ret == 0) output_results(global, haystack, SIZE(candidates), opts, global->needle_len);
else { REPORT_OOM; }
} else { ret = 1; REPORT_OOM; }
@@ -185,13 +186,13 @@ match(PyObject *self, PyObject *args) {
(void)(self);
int output_positions;
unsigned long limit;
PyObject *lines, *levels, *needle, *mark_before, *mark_after;
PyObject *lines, *levels, *needle, *mark_before, *mark_after, *delimiter;
Options opts = {0};
GlobalData global = {0};
if (!PyArg_ParseTuple(args, "O!O!O!pkiO!O!",
&lines, &PyList_Type, &levels, &PyTuple_Type, &needle, &PyUnicode_Type,
if (!PyArg_ParseTuple(args, "O!O!UpkiUUU",
&PyList_Type, &lines, &PyTuple_Type, &levels, &needle,
&output_positions, &limit, &opts.num_threads,
&mark_before, &PyUnicode_Type, &mark_after, &PyUnicode_Type
&mark_before, &mark_after, &delimiter
)) return NULL;
opts.output_positions = output_positions ? true : false;
opts.limit = limit;
@@ -201,13 +202,14 @@ match(PyObject *self, PyObject *args) {
global.needle_len = copy_unicode_object(needle, global.needle, arraysz(global.needle));
opts.mark_before_sz = copy_unicode_object(mark_before, opts.mark_before, arraysz(opts.mark_before));
opts.mark_after_sz = copy_unicode_object(mark_after, opts.mark_after, arraysz(opts.mark_after));
opts.delimiter_sz = copy_unicode_object(delimiter, opts.delimiter, arraysz(opts.delimiter));
size_t num_lines = PyList_GET_SIZE(lines);
char **clines = malloc(sizeof(char*) * num_lines);
size_t *sizes = malloc(sizeof(size_t) * num_lines);
if (!lines || !sizes) { PyErr_NoMemory(); return NULL; }
if (!lines || !sizes) { return PyErr_NoMemory(); }
for (size_t i = 0; i < num_lines; i++) {
clines[i] = PyBytes_AS_STRING(PyTuple_GET_ITEM(lines, i));
sizes[i] = PyBytes_GET_SIZE(PyTuple_GET_ITEM(lines, i));
clines[i] = PyBytes_AS_STRING(PyList_GET_ITEM(lines, i));
sizes[i] = PyBytes_GET_SIZE(PyList_GET_ITEM(lines, i));
}
Py_BEGIN_ALLOW_THREADS;
run_search(&opts, &global, (const char* const *)clines, sizes, num_lines);
@@ -235,11 +237,7 @@ static struct PyModuleDef module = {
.m_methods = module_methods
};
/* Module initialization entry point for subseq_matcher: creates the module
 * from the `module` definition and returns the result of PyModule_Create()
 * unchanged (so a NULL result is passed straight back to the caller). */
PyMODINIT_FUNC
EXPORTED PyMODINIT_FUNC
PyInit_subseq_matcher(void) {
PyObject *m;
m = PyModule_Create(&module);
if (m == NULL) return NULL;
return m;
return PyModule_Create(&module);
}

View File

@@ -8,6 +8,40 @@ import sys
from ..tui.handler import Handler
from ..tui.loop import Loop
from . import subseq_matcher
def match(
    input_data,
    query,
    threads=0,
    positions=False,
    level1='/',
    level2='-_0123456789',
    level3='.',
    limit=0,
    mark_before='',
    mark_after='',
    delimiter='\n'
):
    '''
    Run subseq_matcher.match() over input_data and return the results as a list.

    input_data may be a str or bytes blob, which is split on delimiter, or an
    iterable of str/bytes items. str values are encoded as UTF-8 before being
    handed to the matcher. query and the three level strings are lower-cased
    first. positions, limit, threads, mark_before and mark_after are passed
    through unchanged.

    An empty delimiter is treated as a newline. Returns the non-empty pieces
    of the matcher output split on the delimiter, or [] when the matcher
    returns None.
    '''
    # Normalize once: an empty separator cannot be used to split the input
    # (bytes.split(b'') raises ValueError) and would leave the matcher's
    # results run together with nothing to split them on afterwards.
    delimiter = delimiter or '\n'
    if isinstance(input_data, str):
        input_data = input_data.encode('utf-8')
    if isinstance(input_data, bytes):
        input_data = input_data.split(delimiter.encode('utf-8'))
    else:
        input_data = [x.encode('utf-8') if isinstance(x, str) else x for x in input_data]
    levels = (level1.lower(), level2.lower(), level3.lower())
    data = subseq_matcher.match(
        input_data, levels, query.lower(),
        positions, limit, threads,
        mark_before, mark_after, delimiter)
    if data is None:
        return []
    return list(filter(None, data.split(delimiter)))
class ChooseHandler(Handler):

View File

@@ -75,31 +75,31 @@ output_positions(GlobalData *global, len_t *positions, len_t num) {
int num = swprintf(buf, sizeof(buf)/sizeof(buf[0]), L"%u", positions[i]);
if (num > 0 && ensure_space(global, num + 1)) {
for (int i = 0; i < num; i++) global->output[global->output_pos++] = buf[i];
global->output[global->output_pos++] = (i == num - 1) ? ':' : ',';
global->output[global->output_pos++] = (i == num - 1) ? ',' : ':';
}
}
}
/* Append a single candidate to the output: first its match positions when
 * opts->output_positions is set, then its source text (through
 * output_with_marks() when either mark string is non-empty, otherwise
 * through output_text()), and finally the delimiter. */
static void
output_result(GlobalData *global, Candidate *c, Options *opts, len_t needle_len, text_t delim) {
output_result(GlobalData *global, Candidate *c, Options *opts, len_t needle_len) {
if (opts->output_positions) output_positions(global, c->positions, needle_len);
if (opts->mark_before_sz > 0 || opts->mark_after_sz > 0) {
output_with_marks(global, opts, c->src, c->src_sz, c->positions, needle_len);
} else {
output_text(global, c->src, c->src_sz);
}
output_text(global, &delim, 1);
output_text(global, opts->delimiter, opts->delimiter_sz);
}
/*
 * Sort the candidates in place with cmpscore and write out the leading ones
 * that have a positive score.
 *
 * global     - output state handed on to output_result()
 * haystack   - array of `count` candidates; reordered by the sort
 * count      - number of entries in haystack
 * opts       - opts->limit > 0 caps how many leading entries are examined;
 *              0 means examine all of them
 * needle_len - passed through to output_result()
 */
void
output_results(GlobalData *global, Candidate *haystack, size_t count, Options *opts, len_t needle_len) {
    qsort(haystack, count, sizeof(*haystack), cmpscore);
    // Clamp to count: a limit larger than the number of candidates must not
    // make the loop read past the end of haystack.
    const size_t left = (opts->limit > 0 && opts->limit < count) ? opts->limit : count;
    for (size_t i = 0; i < left; i++) {
        Candidate *c = haystack + i;
        if (c->score > 0) output_result(global, c, opts, needle_len);
    }
}

View File

@@ -6,10 +6,7 @@
#pragma once
#include <Python.h>
#define MAX(x, y) ((x) > (y) ? (x) : (y))
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#include "data-types.h"
#define REPORT_OOM global->oom = 1;