Skip to content

Match queries and field values only if both went through the same analyzer #2108

@k0ral

Description

@k0ral

(Full reproducible example at the end.)

I'm having trouble setting up bleve for the specific use case described below; it's not clear to me whether it's actually feasible — you tell me :)

I'm in a situation where I need to support:

  1. case-insensitive queries: match if some word is present regardless of text case
  2. exact match queries: match if, and only if, the original document (before analysis) has some word verbatim
  3. a mix of the above combined with boolean operators (e.g. in pseudo-code: case_insensitive("foo") AND exact("bar"))

I expect this is a matter of properly setting analyzers, but I'm not sure. I have created the following analyzers:

// LowercaseWords describes the "lowercase_words" custom analyzer: a whitespace
// tokenizer followed by the lowercase token filter. It is the analyzer used for
// the case-insensitive queries of use case (1).
var LowercaseWords = MyAnalyzer{
	Name: "lowercase_words",
	Settings: map[string]interface{}{
		"type":      custom.Name,
		"tokenizer": whitespace.Name,
		"token_filters": []string{
			lowercase.Name,
		},
	},
}

// ExactWords describes the "exact_words" custom analyzer: a whitespace
// tokenizer with no token filters configured. It is the analyzer used for the
// exact-match queries of use case (2).
var ExactWords = MyAnalyzer{
	Name: "exact_words",
	Settings: map[string]interface{}{
		"type":      custom.Name,
		"tokenizer": whitespace.Name,
	},
}

My understanding is that analyzers can be assigned to:

  • document fields
  • match queries

I can implement (1) alone by setting the lowercase_words analyzer to both the field and the match query.
Similarly, I can implement (2) alone by setting the exact_words analyzer to both the field and the match query.
However, I can't find a way to implement (3): if I set both analyzers on the same field, bleve will match the exact-match query against tokens to which the lowercase analyzer has been applied, which violates (2).

In other words: I'm trying to have bleve match queries and tokens only if they went through the same analyzer. And I haven't found how to express that requirement using the current API.

Can this use case be achieved with bleve? If so, how? If not, any hints on how to patch bleve to support it?

Full reproducible example

The expected values in the tests below show the behavior I'm trying to achieve.

Details
package minimal_test

import (
	"testing"

	"github.com/blevesearch/bleve/v2"
	"github.com/blevesearch/bleve/v2/analysis/analyzer/custom"
	"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
	"github.com/blevesearch/bleve/v2/analysis/tokenizer/whitespace"
	"github.com/blevesearch/bleve/v2/mapping"
	"github.com/blevesearch/bleve/v2/search/query"
	"github.com/stretchr/testify/require"
)

// MyAnalyzer pairs an analyzer name with the settings map used to register it
// as a custom analyzer on an index mapping (see newIndex).
type MyAnalyzer struct {
	// Name is the identifier the analyzer is registered under and referenced
	// by from field mappings and match queries.
	Name     string
	// Settings is the configuration passed to AddCustomAnalyzer.
	Settings map[string]interface{}
}

// ExactWords describes the "exact_words" custom analyzer: a whitespace
// tokenizer with no token filters configured. It is the analyzer the tests use
// for exact-match field mappings and queries.
var ExactWords = MyAnalyzer{
	Name: "exact_words",
	Settings: map[string]interface{}{
		"type":      custom.Name,
		"tokenizer": whitespace.Name,
	},
}

// LowercaseWords describes the "lowercase_words" custom analyzer: a whitespace
// tokenizer followed by the lowercase token filter. It is the analyzer the
// tests use for case-insensitive field mappings and queries.
var LowercaseWords = MyAnalyzer{
	Name: "lowercase_words",
	Settings: map[string]interface{}{
		"type":      custom.Name,
		"tokenizer": whitespace.Name,
		"token_filters": []string{
			lowercase.Name,
		},
	},
}

// documentType is the type name reported by MyDocument and used as the key
// when its document mapping is added to the index mapping.
const documentType = "documentType"

// MyDocument is the document indexed by the tests. Its only field is tagged
// `json:"my_field"`.
type MyDocument struct {
	MyField string `json:"my_field"`
}

// Type reports the document's type name, documentType.
//
// Keep the value receiver: the tests index MyDocument values, and per the
// original author's note bleve's type resolution does not work when the
// receiver is a pointer.
func (MyDocument) Type() string { return documentType }

// exactFieldMapping is a text field mapping whose analyzer is set to the
// ExactWords analyzer. It is built once at package initialization and shared
// by the test cases.
var exactFieldMapping = func() *mapping.FieldMapping {
	fm := mapping.NewTextFieldMapping()
	fm.Analyzer = ExactWords.Name
	return fm
}()

// lowercaseFieldMapping is a text field mapping whose analyzer is set to the
// LowercaseWords analyzer. It is built once at package initialization and
// shared by the test cases.
var lowercaseFieldMapping = func() *mapping.FieldMapping {
	fm := mapping.NewTextFieldMapping()
	fm.Analyzer = LowercaseWords.Name
	return fm
}()

// newDocumentMapping returns a fresh document mapping in which every given
// field mapping is attached to the "my_field" property.
func newDocumentMapping(fieldMappings []*mapping.FieldMapping) *mapping.DocumentMapping {
	dm := mapping.NewDocumentMapping()
	dm.AddFieldMappingsAt("my_field", fieldMappings...)

	return dm
}

// newIndex builds a memory-only index whose mapping has both custom analyzers
// (ExactWords, then LowercaseWords) registered and documentMapping installed
// for documentType.
//
// It returns the first analyzer-registration error unchanged, or whatever
// bleve.NewMemOnly returns.
func newIndex(documentMapping *mapping.DocumentMapping) (bleve.Index, error) {
	im := bleve.NewIndexMapping()

	// Register in the same order as before so the first failure wins.
	for _, a := range []MyAnalyzer{ExactWords, LowercaseWords} {
		if err := im.AddCustomAnalyzer(a.Name, a.Settings); err != nil {
			return nil, err
		}
	}

	im.AddDocumentMapping(documentType, documentMapping)

	return bleve.NewMemOnly(im)
}

// TestExact checks, case by case, whether a match query hits a single indexed
// document. Each case picks the field mappings applied to "my_field", the
// field value, the query (with its own analyzer), and whether a hit is
// expected. The expected values encode the desired behavior: a query should
// only match tokens produced by the same analyzer. Cases marked "Fails" do not
// currently meet that expectation.
func TestExact(t *testing.T) {
	t.Parallel()

	cases := map[string]struct {
		FieldValue    string
		FieldMappings []*mapping.FieldMapping
		Query         query.Query
		ExpectedMatch bool
	}{
		"exact_mapping_exact_query": {
			FieldValue:    "foo foobar baz",
			FieldMappings: []*mapping.FieldMapping{exactFieldMapping},
			Query:         &query.MatchQuery{Match: "foobar", Analyzer: ExactWords.Name},
			ExpectedMatch: true,
		},

		// Fails
		"lowercase_mapping_exact_query": {
			FieldValue:    "Foo FooBar Baz",
			FieldMappings: []*mapping.FieldMapping{lowercaseFieldMapping},
			Query:         &query.MatchQuery{Match: "foobar", Analyzer: ExactWords.Name},
			ExpectedMatch: false,
		},

		// Fails
		"both_mappings_exact_query": {
			FieldValue:    "Foo FooBar Baz",
			FieldMappings: []*mapping.FieldMapping{exactFieldMapping, lowercaseFieldMapping},
			Query:         &query.MatchQuery{Match: "foobar", Analyzer: ExactWords.Name},
			ExpectedMatch: false,
		},

		// Fails
		"exact_mapping_lowercase_query": {
			FieldValue:    "foo foobar baz",
			FieldMappings: []*mapping.FieldMapping{exactFieldMapping},
			Query:         &query.MatchQuery{Match: "FooBar", Analyzer: LowercaseWords.Name},
			ExpectedMatch: false,
		},

		"lowercase_mapping_lowercase_query": {
			FieldValue:    "Foo FooBar Baz",
			FieldMappings: []*mapping.FieldMapping{lowercaseFieldMapping},
			Query:         &query.MatchQuery{Match: "FooBar", Analyzer: LowercaseWords.Name},
			ExpectedMatch: true,
		},

		"both_mappings_lowercase_query": {
			FieldValue:    "Foo FooBar Baz",
			FieldMappings: []*mapping.FieldMapping{exactFieldMapping, lowercaseFieldMapping},
			Query:         &query.MatchQuery{Match: "FooBar", Analyzer: LowercaseWords.Name},
			ExpectedMatch: true,
		},
	}

	for name, c := range cases {
		// Pin a per-iteration copy. The subtest is parallel, so its body runs
		// after this loop has advanced; before Go 1.22 the closure would
		// otherwise share one loop variable across all subtests.
		c := c

		t.Run(name, func(t *testing.T) {
			t.Parallel()
			index, err := newIndex(newDocumentMapping(c.FieldMappings))
			require.NoError(t, err)

			// Release the in-memory index once the subtest finishes.
			t.Cleanup(func() {
				if err := index.Close(); err != nil {
					t.Errorf("closing index: %v", err)
				}
			})

			err = index.Index("document_id", MyDocument{MyField: c.FieldValue})
			require.NoError(t, err)

			search := bleve.NewSearchRequest(c.Query)
			search.Highlight = bleve.NewHighlight()
			search.IncludeLocations = true
			result, err := index.Search(search)
			require.NoError(t, err)

			require.Equal(t, 1, result.Status.Successful)
			if c.ExpectedMatch {
				require.Len(t, result.Hits, 1)
				require.Equal(t, "document_id", result.Hits[0].ID)
			} else {
				require.Empty(t, result.Hits)
			}
		})
	}
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions