-
Notifications
You must be signed in to change notification settings - Fork 697
Description
(Full reproducible example at the end.)
I am having trouble setting up bleve to achieve the specific use case described below; it's not clear to me whether it's actually feasible — you tell me :)
I'm in a situation where I need to support:
1. case-insensitive queries: match if some word is present regardless of text case
2. exact match queries: match if, and only if, the original document (before analysis) has some word verbatim
3. a mix of the above combined with boolean operators (e.g. in pseudo-code: `case_insensitive("foo") AND exact("bar")`)
I expect this is a matter of properly setting analyzers, but I'm not sure. I have created the following analyzers:
// LowercaseWords describes a custom analyzer that tokenizes on whitespace and
// then lowercases every token; it is the analyzer intended for
// case-insensitive matching.
var LowercaseWords = MyAnalyzer{
	Name: "lowercase_words",
	Settings: map[string]interface{}{
		"type":      custom.Name,
		"tokenizer": whitespace.Name,
		"token_filters": []string{
			lowercase.Name,
		},
	},
}
var ExactWords = MyAnalyzer{
Name: "exact_words",
Settings: map[string]interface{}{
"type": custom.Name,
"tokenizer": whitespace.Name,
},
}

My understanding is that analyzers can be assigned to:
- document fields
- match queries
I can implement (1) alone by setting the lowercase_words analyzer to both the field and the match query.
Similarly, I can implement (2) alone by setting the exact_words analyzer to both the field and the match query.
However, I can't find a way to implement (3): if I set both analyzers on the same field, bleve will match the exact match query against tokens to which the lowercase analyzer has been applied, which violates (2).
In other words: I'm trying to have bleve match queries and tokens only if they went through the same analyzer. And I haven't found how to express that requirement using the current API.
Can this use case be achieved with bleve? If so, how? If not, are there any hints on how to patch bleve to support it?
Full reproducible example
The expected values in the tests below show the behavior I'm trying to achieve.
Details
package minimal_test
import (
"testing"
"github.com/blevesearch/bleve/v2"
"github.com/blevesearch/bleve/v2/analysis/analyzer/custom"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/whitespace"
"github.com/blevesearch/bleve/v2/mapping"
"github.com/blevesearch/bleve/v2/search/query"
"github.com/stretchr/testify/require"
)
// MyAnalyzer pairs the name under which a custom analyzer is registered with
// the settings map that describes it.
type MyAnalyzer struct {
	// Name is the identifier used when registering the analyzer and when
	// referring to it from field mappings and match queries.
	Name string
	// Settings is the configuration map handed to AddCustomAnalyzer.
	Settings map[string]interface{}
}
// ExactWords describes a custom analyzer that only tokenizes on whitespace.
// No token filters are configured, so tokens keep their original case.
var ExactWords = MyAnalyzer{
	Name: "exact_words",
	Settings: map[string]interface{}{
		"type":      custom.Name,
		"tokenizer": whitespace.Name,
	},
}
// LowercaseWords describes a custom analyzer that tokenizes on whitespace and
// then applies the lowercase token filter to every token.
var LowercaseWords = MyAnalyzer{
	Name: "lowercase_words",
	Settings: map[string]interface{}{
		"type":      custom.Name,
		"tokenizer": whitespace.Name,
		"token_filters": []string{
			lowercase.Name,
		},
	},
}
// documentType is the type name reported by MyDocument.Type and the key under
// which newIndex registers the document mapping.
const documentType = "documentType"
// MyDocument is the document indexed by the test. Its only field is exposed
// as "my_field", the path the field mappings are attached to.
type MyDocument struct {
	MyField string `json:"my_field"`
}
// Type returns the document type name used to select the document mapping.
//
// The receiver MUST NOT be a pointer for type resolution to work in bleve:
// the test indexes MyDocument by value, and a method declared on *MyDocument
// would not be in the method set of that value.
func (d MyDocument) Type() string {
	return documentType
}
// exactFieldMapping is a text field mapping whose analyzer is ExactWords. It
// is built once, at package initialization, and shared by the test cases.
var exactFieldMapping = func() *mapping.FieldMapping {
	fm := mapping.NewTextFieldMapping()
	fm.Analyzer = ExactWords.Name
	return fm
}()
// lowercaseFieldMapping is a text field mapping whose analyzer is
// LowercaseWords. It is built once, at package initialization, and shared by
// the test cases.
var lowercaseFieldMapping = func() *mapping.FieldMapping {
	fm := mapping.NewTextFieldMapping()
	fm.Analyzer = LowercaseWords.Name
	return fm
}()
// newDocumentMapping returns a fresh document mapping in which every given
// field mapping is attached to the "my_field" path.
func newDocumentMapping(fieldMappings []*mapping.FieldMapping) *mapping.DocumentMapping {
	dm := mapping.NewDocumentMapping()
	dm.AddFieldMappingsAt("my_field", fieldMappings...)
	return dm
}
// newIndex builds an in-memory index whose mapping has both custom analyzers
// registered and documentMapping installed for documentType.
//
// It returns the first error reported while registering an analyzer, or the
// result of creating the in-memory index.
func newIndex(documentMapping *mapping.DocumentMapping) (bleve.Index, error) {
	im := bleve.NewIndexMapping()

	// Register the analyzers in a fixed order: exact first, then lowercase.
	for _, a := range []MyAnalyzer{ExactWords, LowercaseWords} {
		if err := im.AddCustomAnalyzer(a.Name, a.Settings); err != nil {
			return nil, err
		}
	}

	im.AddDocumentMapping(documentType, documentMapping)
	return bleve.NewMemOnly(im)
}
// TestExact indexes a single document under different combinations of field
// mappings and checks whether a match query analyzed with a given analyzer
// hits it. The ExpectedMatch values encode the behavior the author wants, not
// necessarily what bleve does today; cases marked "Fails" currently fail.
//
// NOTE(review): the failing cases suggest that MatchQuery.Analyzer only
// controls how the query text is analyzed, and that all mappings attached to
// "my_field" feed the same set of indexed terms — confirm against the bleve
// documentation before relying on this.
func TestExact(t *testing.T) {
	t.Parallel()
	cases := map[string]struct {
		FieldValue    string
		FieldMappings []*mapping.FieldMapping
		Query         query.Query
		ExpectedMatch bool
	}{
		"exact_mapping_exact_query": {
			FieldValue:    "foo foobar baz",
			FieldMappings: []*mapping.FieldMapping{exactFieldMapping},
			Query:         &query.MatchQuery{Match: "foobar", Analyzer: ExactWords.Name},
			ExpectedMatch: true,
		},
		// Fails
		"lowercase_mapping_exact_query": {
			FieldValue:    "Foo FooBar Baz",
			FieldMappings: []*mapping.FieldMapping{lowercaseFieldMapping},
			Query:         &query.MatchQuery{Match: "foobar", Analyzer: ExactWords.Name},
			ExpectedMatch: false,
		},
		// Fails
		"both_mappings_exact_query": {
			FieldValue:    "Foo FooBar Baz",
			FieldMappings: []*mapping.FieldMapping{exactFieldMapping, lowercaseFieldMapping},
			Query:         &query.MatchQuery{Match: "foobar", Analyzer: ExactWords.Name},
			ExpectedMatch: false,
		},
		// Fails
		"exact_mapping_lowercase_query": {
			FieldValue:    "foo foobar baz",
			FieldMappings: []*mapping.FieldMapping{exactFieldMapping},
			Query:         &query.MatchQuery{Match: "FooBar", Analyzer: LowercaseWords.Name},
			ExpectedMatch: false,
		},
		"lowercase_mapping_lowercase_query": {
			FieldValue:    "Foo FooBar Baz",
			FieldMappings: []*mapping.FieldMapping{lowercaseFieldMapping},
			Query:         &query.MatchQuery{Match: "FooBar", Analyzer: LowercaseWords.Name},
			ExpectedMatch: true,
		},
		"both_mappings_lowercase_query": {
			FieldValue:    "Foo FooBar Baz",
			FieldMappings: []*mapping.FieldMapping{exactFieldMapping, lowercaseFieldMapping},
			Query:         &query.MatchQuery{Match: "FooBar", Analyzer: LowercaseWords.Name},
			ExpectedMatch: true,
		},
	}
	for name, c := range cases {
		// Parallel subtests only resume after this loop has finished. Before
		// Go 1.22 the range variable is shared across iterations, so without
		// this copy every subtest would observe the same case.
		c := c
		t.Run(name, func(t *testing.T) {
			t.Parallel()
			index, err := newIndex(newDocumentMapping(c.FieldMappings))
			require.NoError(t, err)
			// Release the in-memory index once the subtest is done.
			t.Cleanup(func() {
				if err := index.Close(); err != nil {
					t.Errorf("closing index: %v", err)
				}
			})
			err = index.Index("document_id", MyDocument{MyField: c.FieldValue})
			require.NoError(t, err)
			search := bleve.NewSearchRequest(c.Query)
			search.Highlight = bleve.NewHighlight()
			search.IncludeLocations = true
			result, err := index.Search(search)
			require.NoError(t, err)
			require.Equal(t, 1, result.Status.Successful)
			if c.ExpectedMatch {
				require.Len(t, result.Hits, 1)
				require.Equal(t, "document_id", result.Hits[0].ID)
			} else {
				require.Empty(t, result.Hits)
			}
		})
	}
}