// Files
// kitty/tools/fzf/algo.go
// 2025-10-12 13:51:16 +05:30
//
// 374 lines
// 9.7 KiB
// Go

package fzf
import (
"bytes"
"fmt"
"slices"
"strings"
"unicode"
"unicode/utf8"
"github.com/kovidgoyal/go-parallel"
"golang.org/x/text/unicode/norm"
)
// Blank reference to fmt.Print so that the "fmt" import counts as used even
// if nothing else in this file calls into fmt (presumably kept so that debug
// prints can be added without touching the import list).
var _ = fmt.Print
/*
Algorithm
---------
Based on code from fzf (MIT licensed):
https://github.com/junegunn/fzf
FuzzyMatch implements a modified version of Smith-Waterman algorithm to find
the optimal solution (highest score) according to the scoring criteria. Unlike
the original algorithm, omission or mismatch of a character in the pattern is
not allowed.
Scoring criteria
----------------
- We prefer matches at special positions, such as the start of a word, or
uppercase character in camelCase words.
- That is, we prefer an occurrence of the pattern with more characters
matching at special positions, even if the total match length is longer.
e.g. "fuzzyfinder" vs. "fuzzy-finder" on "ff"
                        ````````````
- Also, if the first character in the pattern appears at one of the special
positions, the bonus point for the position is multiplied by a constant
as it is extremely likely that the first character in the typed pattern
has more significance than the rest.
e.g. "fo-bar" vs. "foob-r" on "br"
      ``````
- But since fzf is still a fuzzy finder, not an acronym finder, we should also
consider the total length of the matched substring. This is why we have the
gap penalty. The gap penalty increases as the length of the gap (distance
between the matching characters) increases, so the effect of the bonus is
eventually cancelled at some point.
e.g. "fuzzyfinder" vs. "fuzzy-blurry-finder" on "ff"
      ```````````
- Consequently, it is crucial to find the right balance between the bonus
and the gap penalty. The parameters were chosen so that the bonus is cancelled
when the gap size increases beyond 8 characters.
- The bonus mechanism can have the undesirable side effect where consecutive
matches are ranked lower than the ones with gaps.
e.g. "foobar" vs. "foo-bar" on "foob"
                   ```````
- To correct this anomaly, we also give extra bonus point to each character
in a consecutive matching chunk.
e.g. "foobar" vs. "foo-bar" on "foob"
      ``````
- The amount of consecutive bonus is primarily determined by the bonus of the
first character in the chunk.
e.g. "foobar" vs. "out-of-bound" on "oob"
                   ````````````
*/
// try_skip looks for byte b in input starting at byte offset from and returns
// the absolute offset of the earliest hit, or -1 when there is none.
//
// When case_sensitive is false and b is a lowercase ASCII letter, the
// corresponding uppercase letter (b-32) is accepted as well, and whichever of
// the two occurs first is reported.
func try_skip(input *Chars, case_sensitive bool, b byte, from int) int {
	haystack := input.Bytes()[from:]
	pos := bytes.IndexByte(haystack, b)

	switch {
	case pos == 0:
		// Already sitting on the byte: nothing can be skipped.
		return from
	case !case_sensitive && 'a' <= b && b <= 'z':
		// An uppercase hit only matters when it precedes the lowercase one,
		// so the second search is limited to the bytes before pos (or to the
		// whole remainder when the lowercase form was not found at all).
		// Plain byte arithmetic is enough here; no normalization is applied.
		if pos > 0 {
			haystack = haystack[:pos]
		}
		if upper := bytes.IndexByte(haystack, b-32); upper >= 0 {
			pos = upper
		}
	}

	if pos < 0 {
		return -1
	}
	return from + pos
}
// ascii_fuzzy_index is a cheap pre-filter that narrows the part of input in
// which pattern can match.
//
// It returns a (start, end) pair:
//   - (0, input.Length()) when input is not ASCII, i.e. nothing could be
//     narrowed down;
//   - (-1, -1) when input is ASCII but the pattern is not, or when the
//     pattern bytes cannot all be found in order via try_skip;
//   - otherwise start is one position before the first hit of pattern[0]
//     (or 0 when that hit is at offset 0) and end is one past the last
//     occurrence of the final pattern byte, in either case when
//     case_sensitive is false and that byte is a lowercase letter.
func ascii_fuzzy_index(input *Chars, pattern []rune, pattern_is_ascii bool, case_sensitive bool) (int, int) {
	if !input.Is_ASCII() {
		// Nothing can be concluded from a byte scan.
		return 0, input.Length()
	}
	if !pattern_is_ascii {
		// A non-ASCII pattern cannot occur in an ASCII input.
		return -1, -1
	}

	var (
		start  int  // where the scoring window begins
		last   int  // offset of the most recently located pattern byte
		cursor int  // where the next search begins
		ch     byte // pattern byte currently being searched for
	)
	for i, r := range pattern {
		ch = byte(r)
		cursor = try_skip(input, case_sensitive, ch, cursor)
		if cursor < 0 {
			return -1, -1
		}
		if i == 0 && cursor > 0 {
			// Include the preceding character so that the bonus for the
			// first matched position can be computed correctly.
			start = cursor - 1
		}
		last = cursor
		cursor++
	}

	// Extend the window to the final occurrence of the last pattern byte so
	// that later, possibly better, matches stay inside the search scope.
	alt := ch
	if !case_sensitive && 'a' <= ch && ch <= 'z' {
		alt = ch - 32
	}
	tail := input.Bytes()[last:]
	end := last + 1
	for k := len(tail) - 1; k > 0; k-- {
		if c := tail[k]; c == ch || c == alt {
			end = last + k + 1
			break
		}
	}
	return start, end
}
// charClassOfNonAscii classifies char using the unicode package predicates
// and the matcher's delimiter set. The checks run in a fixed order and the
// first one that succeeds decides the class: lower, upper, number, letter,
// whitespace, delimiter. Anything else is charNonWord.
func (m *FuzzyMatcher) charClassOfNonAscii(char rune) charClass {
	switch {
	case unicode.IsLower(char):
		return charLower
	case unicode.IsUpper(char):
		return charUpper
	case unicode.IsNumber(char):
		return charNumber
	case unicode.IsLetter(char):
		return charLetter
	case unicode.IsSpace(char):
		return charWhite
	case strings.ContainsRune(m.delimiterChars, char):
		return charDelimiter
	}
	return charNonWord
}
// score_one scores input against pattern and returns the best score found
// together with, unless m.Without_positions is set, the input positions of
// the matched pattern characters.
//
// Preconditions (not checked here): pattern must be non-empty. If
// !m.Case_sensitive pattern must be lowercased already. When m.Ignore_accents
// accents must already be removed from both pattern and input.
//
// The zero Result is returned when the pattern is longer than the input or
// when its characters do not all occur, in order, in the input. slab supplies
// the working arrays and is reset on every call that gets past the ASCII
// pre-filter. For multi-character patterns the positions are collected while
// walking back from the last pattern character to the first, so they are in
// descending order.
func (m *FuzzyMatcher) score_one(input *Chars, pattern []rune, pattern_is_ascii bool, slab *slab) (ans Result) {
M := len(pattern)
N := input.Length()
// A pattern longer than the input cannot match; the zero Result is returned.
if M > N {
return
}
// Phase 1. Optimized search for ASCII string
// A negative minIdx means the pre-filter ruled out any match.
minIdx, maxIdx := ascii_fuzzy_index(input, pattern, pattern_is_ascii, m.Case_sensitive)
if minIdx < 0 {
return
}
// fmt.Println(N, maxIdx, idx, maxIdx-idx, input.ToString())
// From here on N is the size of the window [minIdx, maxIdx). Offsets into
// the arrays below get minIdx added back when they are reported as positions.
N = maxIdx - minIdx
slab.reset()
// H0 and C0 are the first row (for pattern[0]) of the score matrix and of
// the consecutive-match-length matrix; phase 3 copies them into H and C.
H0 := slab.alloc16(N)
C0 := slab.alloc16(N)
// Bonus point for each position
B := slab.alloc16(N)
// The first occurrence of each character in the pattern
F := slab.alloc32(M)
// Rune array
T := slab.alloc32(N)
input.CopyRunes(T, minIdx)
// Phase 2. Calculate bonus for each point
// pchar0 is the first pattern character; pchar is the pattern character
// whose first in-order occurrence is currently being looked for.
maxScore, maxScorePos := int16(0), 0
pidx, lastIdx := 0, 0
pchar0, pchar, prevH0, prevClass, inGap := pattern[0], pattern[0], int16(0), m.initialCharClass, false
for off, char := range T {
var class charClass
if char <= unicode.MaxASCII {
class = m.asciiCharClasses[char]
// Case-insensitive matching: fold ASCII uppercase to lowercase in T.
if !m.Case_sensitive && class == charUpper {
char += 32
T[off] = char
}
} else {
class = m.charClassOfNonAscii(char)
if !m.Case_sensitive && class == charUpper {
char = unicode.To(unicode.LowerCase, char)
}
T[off] = char
}
// The bonus depends on the class of the previous and the current character.
bonus := m.bonusMatrix[prevClass][class]
B[off] = bonus
prevClass = class
if char == pchar {
if pidx < M {
F[pidx] = int32(off)
pidx++
// The index is clamped so that pchar stays the last pattern character
// once every pattern character has been located.
pchar = pattern[min(pidx, M-1)]
}
// Updated on every hit of the character currently being looked for; after
// the whole pattern has been located that is the last pattern character.
lastIdx = off
}
if char == pchar0 {
score := scoreMatch + bonus*bonusFirstCharMultiplier
H0[off] = score
C0[off] = 1
// Single-character patterns are fully scored in this loop. With
// m.Backwards a tie moves the best position to the later offset.
if M == 1 && (!m.Backwards && score > maxScore || m.Backwards && score >= maxScore) {
maxScore, maxScorePos = score, off
// Forward search stops at the first hit whose bonus reaches bonusBoundary.
if !m.Backwards && bonus >= bonusBoundary {
break
}
}
inGap = false
} else {
// No match for pattern[0] here: decay the running score, never below 0.
if inGap {
H0[off] = max(prevH0+scoreGapExtension, 0)
} else {
H0[off] = max(prevH0+scoreGapStart, 0)
}
C0[off] = 0
inGap = true
}
prevH0 = H0[off]
}
// Not every pattern character was found in order: no match.
if pidx != M {
return
}
// A single-character pattern needs no matrix; phase 2 already has the answer.
if M == 1 {
if m.Without_positions {
return Result{Score: uint(maxScore)}
}
return Result{Score: uint(maxScore), Positions: []int{minIdx + maxScorePos}}
}
// Phase 3. Fill in score matrix (H)
// Unlike the original algorithm, we do not allow omission.
// Only the columns from f0 (first hit of pattern[0]) to lastIdx are stored;
// H and C hold M rows of that width, one row per pattern character.
f0 := int(F[0])
width := lastIdx - f0 + 1
H := slab.alloc16(width * M)
copy(H, H0[f0:lastIdx+1])
// Possible length of consecutive chunk at each position.
C := slab.alloc16(width * M)
copy(C, C0[f0:lastIdx+1])
// Rows 1..M-1: row pidx starts at column F[pidx].
Fsub := F[1:]
Psub := pattern[1:][:len(Fsub)]
for off, f := range Fsub {
f := int(f)
pchar := Psub[off]
pidx := off + 1
row := pidx * width
inGap := false
// Views into the current row (Csub, Hsub), the cell one row up and one
// column left (Cdiag, Hdiag) and the cell one column left (Hleft), all
// starting at column f.
Tsub := T[f : lastIdx+1]
Bsub := B[f:][:len(Tsub)]
Csub := C[row+f-f0:][:len(Tsub)]
Cdiag := C[row+f-f0-1-width:][:len(Tsub)]
Hsub := H[row+f-f0:][:len(Tsub)]
Hdiag := H[row+f-f0-1-width:][:len(Tsub)]
Hleft := H[row+f-f0-1:][:len(Tsub)]
// Treat the cell just left of this row's first column as score 0.
Hleft[0] = 0
for off, char := range Tsub {
col := off + f
// s1 is the score when this column matches pchar (stays 0 otherwise);
// s2 is the score when this column is part of a gap.
var s1, s2, consecutive int16
if inGap {
s2 = Hleft[off] + scoreGapExtension
} else {
s2 = Hleft[off] + scoreGapStart
}
if pchar == char {
s1 = Hdiag[off] + scoreMatch
b := Bsub[off]
consecutive = Cdiag[off] + 1
if consecutive > 1 {
// fb is the bonus of the first character of the current chunk.
fb := B[col-int(consecutive)+1]
// Break consecutive chunk
if b >= bonusBoundary && b > fb {
consecutive = 1
} else {
b = max(b, max(bonusConsecutive, fb))
}
}
// If the gap score beats the match even with the chunk bonus, fall back
// to this position's own bonus and reset the chunk length.
if s1+b < s2 {
s1 += Bsub[off]
consecutive = 0
} else {
s1 += b
}
}
Csub[off] = consecutive
inGap = s1 < s2
score := max(max(s1, s2), 0)
// Only the last row determines the overall best score and its column.
if pidx == M-1 && (!m.Backwards && score > maxScore || m.Backwards && score >= maxScore) {
maxScore, maxScorePos = score, col
}
Hsub[off] = score
}
}
// Phase 4. (Optional) Backtrace to find character positions
var pos []int
// j is overwritten with maxScorePos below when positions are requested.
j := f0
if !m.Without_positions {
pos = make([]int, 0, M)
// Start at the best cell of the last row and walk left/up-left.
i := M - 1
j = maxScorePos
preferMatch := true
for {
I := i * width
j0 := j - f0
s := H[I+j0]
// s1: score of the cell one row up and one column left; s2: score of the
// cell one column left. Each stays 0 when that cell lies outside the row.
var s1, s2 int16
if i > 0 && j >= int(F[i]) {
s1 = H[I-width+j0-1]
}
if j > int(F[i]) {
s2 = H[I+j0-1]
}
// Record column j as the match for pattern[i] when its score exceeds the
// diagonal one and is not beaten by the left one; ties with the left cell
// are decided by preferMatch.
if s > s1 && (s > s2 || s == s2 && preferMatch) {
pos = append(pos, j+minIdx)
if i == 0 {
break
}
i--
}
// Computed from I as it was before any decrement of i above: true when
// that cell's chunk length exceeds 1, or when the cell one row down and
// one column right exists in C and has a non-zero chunk length.
preferMatch = C[I+j0] > 1 || I+width+j0+1 < len(C) && C[I+width+j0+1] > 0
j--
}
}
return Result{Score: uint(maxScore), Positions: pos}
}
// score prepares pattern once and then computes one Result per item by
// calling scoring_func, returning the results in the same order as items.
//
// Pattern preparation: when m.Ignore_accents is set the pattern is passed
// through CharsFromStringWithoutAccents and that same converter is handed to
// scoring_func for the items; the pattern is then NFC-normalized and, unless
// m.Case_sensitive is set, lowercased.
//
// An empty pattern or an empty items slice yields a slice of zero Results of
// length len(items) and a nil error. The same is returned when the pattern is
// empty after preparation, because the scoring code requires a non-empty
// pattern.
//
// scoring_func receives the item, the prepared pattern runes, whether all of
// those runes are ASCII, a slab to use as working memory, and the
// string-to-Chars converter. The work is dispatched through
// parallel.Run_in_parallel_over_range; every invocation of the range callback
// creates its own slab, so working memory is never shared between
// invocations. err is whatever Run_in_parallel_over_range returns.
func (m *FuzzyMatcher) score(items []string, pattern string, scoring_func func(string, []rune, bool, *slab, func(string) Chars) Result) (ans []Result, err error) {
	if pattern == "" || len(items) < 1 {
		return make([]Result, len(items)), nil
	}
	as_chars := CharsFromString
	if m.Ignore_accents {
		pattern = string(CharsFromStringWithoutAccents(pattern).runes)
		as_chars = CharsFromStringWithoutAccents
	}
	pattern = norm.NFC.String(pattern)
	if !m.Case_sensitive {
		pattern = strings.ToLower(pattern)
	}
	pat := []rune(pattern)
	if len(pat) == 0 {
		// NOTE(review): accent stripping can presumably leave nothing behind
		// (e.g. a pattern made only of combining marks) — confirm against
		// CharsFromStringWithoutAccents. The scorer indexes pattern[0]
		// unconditionally, so treat this exactly like an empty pattern.
		return make([]Result, len(items)), nil
	}
	pattern_is_ascii := !slices.ContainsFunc(pat, func(r rune) bool { return r >= utf8.RuneSelf })
	ans = make([]Result, len(items))
	err = parallel.Run_in_parallel_over_range(0, func(start, end int) {
		// One slab per callback invocation; it is reused for every item in
		// the [start, end) chunk.
		s := slab{}
		for i := start; i < end; i++ {
			ans[i] = scoring_func(items[i], pat, pattern_is_ascii, &s, as_chars)
		}
	}, 0, len(items))
	return
}