From f2879677a00653fe659ddcbadb48765877b3ef58 Mon Sep 17 00:00:00 2001 From: Likith B Date: Wed, 31 Jul 2024 11:55:50 +0530 Subject: [PATCH 1/4] MB-61640: Fuzzy Dynamic Scoring - Allow edit distances for terms to be read --- automaton.go | 5 +++++ fst_iterator.go | 14 ++++++++++++++ levenshtein/dfa.go | 14 +++++++++----- 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/automaton.go b/automaton.go index 70398f2..4014797 100644 --- a/automaton.go +++ b/automaton.go @@ -83,3 +83,8 @@ func (m *AlwaysMatch) Accept(int, byte) int { // creating an alwaysMatchAutomaton to avoid unnecessary repeated allocations. var alwaysMatchAutomaton = &AlwaysMatch{} + +type FuzzyAutomaton interface { + Automaton + EditDistance(int) uint8 +} diff --git a/fst_iterator.go b/fst_iterator.go index 2c6b0d6..948d1e6 100644 --- a/fst_iterator.go +++ b/fst_iterator.go @@ -44,6 +44,11 @@ type Iterator interface { Close() error } +type FuzzyIterator interface { + Iterator + Distance() uint8 +} + // FSTIterator is a structure for iterating key/value pairs in this FST in // lexicographic order. Iterators should be constructed with the FSTIterator // method on the parent FST structure. @@ -61,6 +66,8 @@ type FSTIterator struct { autStatesStack []int nextStart []byte + + keysDistance uint8 } func newIterator(f *FST, startKeyInclusive, endKeyExclusive []byte, @@ -74,6 +81,10 @@ func newIterator(f *FST, startKeyInclusive, endKeyExclusive []byte, return rv, nil } +func (i *FSTIterator) Distance() uint8 { + return i.keysDistance +} + // Reset resets the Iterator' internal state to allow for iterator // reuse (e.g. pooling). func (i *FSTIterator) Reset(f *FST, @@ -206,6 +217,9 @@ OUTER: cmp := bytes.Compare(i.keysStack, i.nextStart) if cmp > 0 { + if fa, ok := i.aut.(FuzzyAutomaton); ok { + i.keysDistance = fa.EditDistance(autCurr) + } // in final state greater than start key return nil } diff --git a/levenshtein/dfa.go b/levenshtein/dfa.go index d0e43ca..3fc26ed 100644 --- a/levenshtein/dfa.go +++ b/levenshtein/dfa.go @@ -28,23 +28,27 @@ type DFA struct { ed uint8 } -/// Returns the initial state +// Returns the initial state func (d *DFA) initialState() int { return d.initState } -/// Returns the Levenshtein distance associated to the -/// current state. +// Returns the Levenshtein distance associated to the +// current state. func (d *DFA) distance(stateId int) Distance { return d.distances[stateId] } -/// Returns the number of states in the `DFA`. +func (d *DFA) EditDistance(stateId int) uint8 { + return d.distances[stateId].distance() +} + +// Returns the number of states in the `DFA`. func (d *DFA) numStates() int { return len(d.transitions) } -/// Returns the destination state reached after consuming a given byte. +// Returns the destination state reached after consuming a given byte. func (d *DFA) transition(fromState int, b uint8) int { return int(d.transitions[fromState][b]) } From 1b5a4566c06055be693014f487eb261804885d29 Mon Sep 17 00:00:00 2001 From: Likith B Date: Tue, 19 Nov 2024 13:33:11 +0530 Subject: [PATCH 2/4] Minor Name Changes --- fst_iterator.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fst_iterator.go b/fst_iterator.go index 948d1e6..3511063 100644 --- a/fst_iterator.go +++ b/fst_iterator.go @@ -46,7 +46,7 @@ type Iterator interface { type FuzzyIterator interface { Iterator - Distance() uint8 + EditDistance() uint8 } // FSTIterator is a structure for iterating key/value pairs in this FST in @@ -81,7 +81,7 @@ func newIterator(f *FST, startKeyInclusive, endKeyExclusive []byte, return rv, nil } -func (i *FSTIterator) Distance() uint8 { +func (i *FSTIterator) EditDistance() uint8 { return i.keysDistance } From 6d8c9a48cd406cee0da593f9ceb4db5b568d200a Mon Sep 17 00:00:00 2001 From: Abhinav Dangeti Date: Wed, 20 Nov 2024 09:59:35 -0700 Subject: [PATCH 3/4] Refactor keysDistance -> editDistance --- fst_iterator.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fst_iterator.go b/fst_iterator.go index 3511063..f5c374e 100644 --- a/fst_iterator.go +++ b/fst_iterator.go @@ -67,7 +67,7 @@ type FSTIterator struct { nextStart []byte - keysDistance uint8 + editDistance uint8 } func newIterator(f *FST, startKeyInclusive, endKeyExclusive []byte, @@ -82,7 +82,7 @@ func newIterator(f *FST, startKeyInclusive, endKeyExclusive []byte, } func (i *FSTIterator) EditDistance() uint8 { - return i.keysDistance + return i.editDistance } // Reset resets the Iterator' internal state to allow for iterator @@ -218,7 +218,7 @@ OUTER: cmp := bytes.Compare(i.keysStack, i.nextStart) if cmp > 0 { if fa, ok := i.aut.(FuzzyAutomaton); ok { - i.keysDistance = fa.EditDistance(autCurr) + i.editDistance = fa.EditDistance(autCurr) } // in final state greater than start key return nil From 93094cba23e5dac6f466a0b69f335356c4bca86d Mon Sep 17 00:00:00 2001 From: Abhinav Dangeti Date: Wed, 20 Nov 2024 10:01:00 -0700 Subject: [PATCH 4/4] Update go ver and workflows --- .github/workflows/tests.yml | 2 +- go.mod | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 1acdd98..e0db47e 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -8,7 +8,7 @@ jobs: test: strategy: matrix: - go-version: [1.17.x, 1.18.x, 1.19.x] + go-version: [1.20.x, 1.21.x, 1.22.x] platform: [ubuntu-latest, macos-latest, windows-latest] runs-on: ${{ matrix.platform }} steps: diff --git a/go.mod b/go.mod index d1059a8..1e97f08 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/blevesearch/vellum -go 1.18 +go 1.21 require ( github.com/bits-and-blooms/bitset v1.2.0