mirror of
https://github.com/kovidgoyal/kitty
synced 2026-06-08 14:18:26 +02:00
374 lines
9.7 KiB
Go
374 lines
9.7 KiB
Go
package fzf
|
|
|
|
import (
|
|
"bytes"
|
|
"fmt"
|
|
"slices"
|
|
"strings"
|
|
"unicode"
|
|
"unicode/utf8"
|
|
|
|
"github.com/kovidgoyal/go-parallel"
|
|
"golang.org/x/text/unicode/norm"
|
|
)
|
|
|
|
var _ = fmt.Print
|
|
|
|
/*

Algorithm
---------

Based on code from fzf (MIT licensed):
https://github.com/junegunn/fzf

FuzzyMatch implements a modified version of the Smith-Waterman algorithm to
find the optimal solution (highest score) according to the scoring criteria.
Unlike the original algorithm, omission or mismatch of a character in the
pattern is not allowed.

Scoring criteria
----------------

- We prefer matches at special positions, such as the start of a word, or
  uppercase character in camelCase words.

- That is, we prefer an occurrence of the pattern with more characters
  matching at special positions, even if the total match length is longer.
    e.g. "fuzzyfinder" vs. "fuzzy-finder" on "ff"
                            ````````````
- Also, if the first character in the pattern appears at one of the special
  positions, the bonus point for the position is multiplied by a constant
  as it is extremely likely that the first character in the typed pattern
  has more significance than the rest.
    e.g. "fo-bar" vs. "foob-r" on "br"
          ``````
- But since fzf is still a fuzzy finder, not an acronym finder, we should also
  consider the total length of the matched substring. This is why we have the
  gap penalty. The gap penalty increases as the length of the gap (distance
  between the matching characters) increases, so the effect of the bonus is
  eventually cancelled at some point.
    e.g. "fuzzyfinder" vs. "fuzzy-blurry-finder" on "ff"
          ```````````
- Consequently, it is crucial to find the right balance between the bonus
  and the gap penalty. The parameters were chosen so that the bonus is
  cancelled when the gap size increases beyond 8 characters.

- The bonus mechanism can have the undesirable side effect where consecutive
  matches are ranked lower than the ones with gaps.
    e.g. "foobar" vs. "foo-bar" on "foob"
                       ```````
- To correct this anomaly, we also give extra bonus point to each character
  in a consecutive matching chunk.
    e.g. "foobar" vs. "foo-bar" on "foob"
          ``````
- The amount of consecutive bonus is primarily determined by the bonus of the
  first character in the chunk.
    e.g. "foobar" vs. "out-of-bound" on "oob"
                       ````````````
*/
|
|
|
|
func try_skip(input *Chars, case_sensitive bool, b byte, from int) int {
|
|
byteArray := input.Bytes()[from:]
|
|
idx := bytes.IndexByte(byteArray, b)
|
|
if idx == 0 {
|
|
// Can't skip any further
|
|
return from
|
|
}
|
|
// We may need to search for the uppercase letter again. We don't have to
|
|
// consider normalization as we can be sure that this is an ASCII string.
|
|
if !case_sensitive && b >= 'a' && b <= 'z' {
|
|
if idx > 0 {
|
|
byteArray = byteArray[:idx]
|
|
}
|
|
uidx := bytes.IndexByte(byteArray, b-32)
|
|
if uidx >= 0 {
|
|
idx = uidx
|
|
}
|
|
}
|
|
if idx < 0 {
|
|
return -1
|
|
}
|
|
return from + idx
|
|
}
|
|
|
|
func ascii_fuzzy_index(input *Chars, pattern []rune, pattern_is_ascii bool, case_sensitive bool) (int, int) {
|
|
// Can't determine
|
|
if !input.Is_ASCII() {
|
|
return 0, input.Length()
|
|
}
|
|
// Can't match
|
|
if !pattern_is_ascii {
|
|
return -1, -1
|
|
}
|
|
|
|
firstIdx, idx, lastIdx := 0, 0, 0
|
|
var b byte
|
|
for pidx := range len(pattern) {
|
|
b = byte(pattern[pidx])
|
|
idx = try_skip(input, case_sensitive, b, idx)
|
|
if idx < 0 {
|
|
return -1, -1
|
|
}
|
|
if pidx == 0 && idx > 0 {
|
|
// Step back to find the right bonus point
|
|
firstIdx = idx - 1
|
|
}
|
|
lastIdx = idx
|
|
idx++
|
|
}
|
|
|
|
// Find the last appearance of the last character of the pattern to limit the search scope
|
|
bu := b
|
|
if !case_sensitive && b >= 'a' && b <= 'z' {
|
|
bu = b - 32
|
|
}
|
|
scope := input.Bytes()[lastIdx:]
|
|
for offset := len(scope) - 1; offset > 0; offset-- {
|
|
if scope[offset] == b || scope[offset] == bu {
|
|
return firstIdx, lastIdx + offset + 1
|
|
}
|
|
}
|
|
return firstIdx, lastIdx + 1
|
|
}
|
|
|
|
func (m *FuzzyMatcher) charClassOfNonAscii(char rune) charClass {
|
|
if unicode.IsLower(char) {
|
|
return charLower
|
|
} else if unicode.IsUpper(char) {
|
|
return charUpper
|
|
} else if unicode.IsNumber(char) {
|
|
return charNumber
|
|
} else if unicode.IsLetter(char) {
|
|
return charLetter
|
|
} else if unicode.IsSpace(char) {
|
|
return charWhite
|
|
} else if strings.ContainsRune(m.delimiterChars, char) {
|
|
return charDelimiter
|
|
}
|
|
return charNonWord
|
|
}
|
|
|
|
// Score the input against pattern. If !m.Case_sensitive pattern must be
|
|
// lowercased already. pattern must be non-empty. When m.Ignore_accents
|
|
// accents must already be removed from both pattern and input.
|
|
func (m *FuzzyMatcher) score_one(input *Chars, pattern []rune, pattern_is_ascii bool, slab *slab) (ans Result) {
|
|
M := len(pattern)
|
|
N := input.Length()
|
|
if M > N {
|
|
return
|
|
}
|
|
|
|
// Phase 1. Optimized search for ASCII string
|
|
minIdx, maxIdx := ascii_fuzzy_index(input, pattern, pattern_is_ascii, m.Case_sensitive)
|
|
if minIdx < 0 {
|
|
return
|
|
}
|
|
// fmt.Println(N, maxIdx, idx, maxIdx-idx, input.ToString())
|
|
N = maxIdx - minIdx
|
|
|
|
slab.reset()
|
|
|
|
H0 := slab.alloc16(N)
|
|
C0 := slab.alloc16(N)
|
|
// Bonus point for each position
|
|
B := slab.alloc16(N)
|
|
// The first occurrence of each character in the pattern
|
|
F := slab.alloc32(M)
|
|
// Rune array
|
|
T := slab.alloc32(N)
|
|
input.CopyRunes(T, minIdx)
|
|
|
|
// Phase 2. Calculate bonus for each point
|
|
maxScore, maxScorePos := int16(0), 0
|
|
pidx, lastIdx := 0, 0
|
|
pchar0, pchar, prevH0, prevClass, inGap := pattern[0], pattern[0], int16(0), m.initialCharClass, false
|
|
for off, char := range T {
|
|
var class charClass
|
|
if char <= unicode.MaxASCII {
|
|
class = m.asciiCharClasses[char]
|
|
if !m.Case_sensitive && class == charUpper {
|
|
char += 32
|
|
T[off] = char
|
|
}
|
|
} else {
|
|
class = m.charClassOfNonAscii(char)
|
|
if !m.Case_sensitive && class == charUpper {
|
|
char = unicode.To(unicode.LowerCase, char)
|
|
}
|
|
T[off] = char
|
|
}
|
|
|
|
bonus := m.bonusMatrix[prevClass][class]
|
|
B[off] = bonus
|
|
prevClass = class
|
|
|
|
if char == pchar {
|
|
if pidx < M {
|
|
F[pidx] = int32(off)
|
|
pidx++
|
|
pchar = pattern[min(pidx, M-1)]
|
|
}
|
|
lastIdx = off
|
|
}
|
|
|
|
if char == pchar0 {
|
|
score := scoreMatch + bonus*bonusFirstCharMultiplier
|
|
H0[off] = score
|
|
C0[off] = 1
|
|
if M == 1 && (!m.Backwards && score > maxScore || m.Backwards && score >= maxScore) {
|
|
maxScore, maxScorePos = score, off
|
|
if !m.Backwards && bonus >= bonusBoundary {
|
|
break
|
|
}
|
|
}
|
|
inGap = false
|
|
} else {
|
|
if inGap {
|
|
H0[off] = max(prevH0+scoreGapExtension, 0)
|
|
} else {
|
|
H0[off] = max(prevH0+scoreGapStart, 0)
|
|
}
|
|
C0[off] = 0
|
|
inGap = true
|
|
}
|
|
prevH0 = H0[off]
|
|
}
|
|
if pidx != M {
|
|
return
|
|
}
|
|
if M == 1 {
|
|
if m.Without_positions {
|
|
return Result{Score: uint(maxScore)}
|
|
}
|
|
return Result{Score: uint(maxScore), Positions: []int{minIdx + maxScorePos}}
|
|
}
|
|
|
|
// Phase 3. Fill in score matrix (H)
|
|
// Unlike the original algorithm, we do not allow omission.
|
|
f0 := int(F[0])
|
|
width := lastIdx - f0 + 1
|
|
H := slab.alloc16(width * M)
|
|
copy(H, H0[f0:lastIdx+1])
|
|
|
|
// Possible length of consecutive chunk at each position.
|
|
C := slab.alloc16(width * M)
|
|
copy(C, C0[f0:lastIdx+1])
|
|
|
|
Fsub := F[1:]
|
|
Psub := pattern[1:][:len(Fsub)]
|
|
for off, f := range Fsub {
|
|
f := int(f)
|
|
pchar := Psub[off]
|
|
pidx := off + 1
|
|
row := pidx * width
|
|
inGap := false
|
|
Tsub := T[f : lastIdx+1]
|
|
Bsub := B[f:][:len(Tsub)]
|
|
Csub := C[row+f-f0:][:len(Tsub)]
|
|
Cdiag := C[row+f-f0-1-width:][:len(Tsub)]
|
|
Hsub := H[row+f-f0:][:len(Tsub)]
|
|
Hdiag := H[row+f-f0-1-width:][:len(Tsub)]
|
|
Hleft := H[row+f-f0-1:][:len(Tsub)]
|
|
Hleft[0] = 0
|
|
for off, char := range Tsub {
|
|
col := off + f
|
|
var s1, s2, consecutive int16
|
|
|
|
if inGap {
|
|
s2 = Hleft[off] + scoreGapExtension
|
|
} else {
|
|
s2 = Hleft[off] + scoreGapStart
|
|
}
|
|
|
|
if pchar == char {
|
|
s1 = Hdiag[off] + scoreMatch
|
|
b := Bsub[off]
|
|
consecutive = Cdiag[off] + 1
|
|
if consecutive > 1 {
|
|
fb := B[col-int(consecutive)+1]
|
|
// Break consecutive chunk
|
|
if b >= bonusBoundary && b > fb {
|
|
consecutive = 1
|
|
} else {
|
|
b = max(b, max(bonusConsecutive, fb))
|
|
}
|
|
}
|
|
if s1+b < s2 {
|
|
s1 += Bsub[off]
|
|
consecutive = 0
|
|
} else {
|
|
s1 += b
|
|
}
|
|
}
|
|
Csub[off] = consecutive
|
|
|
|
inGap = s1 < s2
|
|
score := max(max(s1, s2), 0)
|
|
if pidx == M-1 && (!m.Backwards && score > maxScore || m.Backwards && score >= maxScore) {
|
|
maxScore, maxScorePos = score, col
|
|
}
|
|
Hsub[off] = score
|
|
}
|
|
}
|
|
// Phase 4. (Optional) Backtrace to find character positions
|
|
var pos []int
|
|
j := f0
|
|
if !m.Without_positions {
|
|
pos = make([]int, 0, M)
|
|
i := M - 1
|
|
j = maxScorePos
|
|
preferMatch := true
|
|
for {
|
|
I := i * width
|
|
j0 := j - f0
|
|
s := H[I+j0]
|
|
|
|
var s1, s2 int16
|
|
if i > 0 && j >= int(F[i]) {
|
|
s1 = H[I-width+j0-1]
|
|
}
|
|
if j > int(F[i]) {
|
|
s2 = H[I+j0-1]
|
|
}
|
|
|
|
if s > s1 && (s > s2 || s == s2 && preferMatch) {
|
|
pos = append(pos, j+minIdx)
|
|
if i == 0 {
|
|
break
|
|
}
|
|
i--
|
|
}
|
|
preferMatch = C[I+j0] > 1 || I+width+j0+1 < len(C) && C[I+width+j0+1] > 0
|
|
j--
|
|
}
|
|
}
|
|
return Result{Score: uint(maxScore), Positions: pos}
|
|
}
|
|
|
|
func (m *FuzzyMatcher) score(items []string, pattern string, scoring_func func(string, []rune, bool, *slab, func(string) Chars) Result) (ans []Result, err error) {
|
|
if pattern == "" || len(items) < 1 {
|
|
return make([]Result, len(items)), nil
|
|
}
|
|
as_chars := CharsFromString
|
|
if m.Ignore_accents {
|
|
pattern = string(CharsFromStringWithoutAccents(pattern).runes)
|
|
as_chars = CharsFromStringWithoutAccents
|
|
}
|
|
pattern = norm.NFC.String(pattern)
|
|
if !m.Case_sensitive {
|
|
pattern = strings.ToLower(pattern)
|
|
}
|
|
pat := []rune(pattern)
|
|
pattern_is_ascii := !slices.ContainsFunc(pat, func(r rune) bool { return r >= utf8.RuneSelf })
|
|
ans = make([]Result, len(items))
|
|
err = parallel.Run_in_parallel_over_range(0, func(start, end int) {
|
|
s := slab{}
|
|
for i := start; i < end; i++ {
|
|
ans[i] = scoring_func(items[i], pat, pattern_is_ascii, &s, as_chars)
|
|
}
|
|
}, 0, len(items))
|
|
return
|
|
|
|
}
|