From 25e3764aef32cfd1cee16744324547da6475a957 Mon Sep 17 00:00:00 2001 From: Darkheir Date: Tue, 2 Dec 2025 17:35:02 +0100 Subject: [PATCH 1/2] feat: Add case_insensitive param to some Elastic DSL queries Signed-off-by: Darkheir --- docs/reference/es_compatible_api.md | 52 +++++++++++++---- .../src/elastic_query_dsl/prefix_query.rs | 8 ++- .../src/elastic_query_dsl/regex_query.rs | 10 +++- .../src/elastic_query_dsl/term_query.rs | 23 +++++++- .../src/elastic_query_dsl/wildcard_query.rs | 9 ++- .../src/query_ast/user_input_query.rs | 1 + .../src/query_ast/wildcard_query.rs | 58 +++++++++++++++++++ .../es_compatibility/0006-term_query.yaml | 2 - .../es_compatibility/0029-wildcard.yaml | 22 +++++++ .../es_compatibility/0030-prefix.yaml | 22 +++++++ .../scenarii/es_compatibility/0031-regex.yaml | 47 +++++++++++++++ .../es_compatibility/_setup.quickwit.yaml | 1 + 12 files changed, 237 insertions(+), 18 deletions(-) create mode 100644 quickwit/rest-api-tests/scenarii/es_compatibility/0031-regex.yaml diff --git a/docs/reference/es_compatible_api.md b/docs/reference/es_compatible_api.md index c9ffa4f463b..32cbdafd761 100644 --- a/docs/reference/es_compatible_api.md +++ b/docs/reference/es_compatible_api.md @@ -695,10 +695,11 @@ When working on text, it is recommended to only use `term` queries on fields con #### Supported Parameters -| Variable | Type | Description | Default | -| -------- | -------- | ---------------------------------------------------------------------------- | ------- | -| `value` | String | Term value. This is the string representation of a token after tokenization. | - | -| `boost` | `Number` | Multiplier boost for score computation | 1.0 | +| Variable | Type | Description | Default | +| ------------------ | ------- | ---------------------------------------------------------------------------- | ------- | +| `value` | String | Term value. This is the string representation of a token after tokenization. 
| - | +| `boost` | Number | Multiplier boost for score computation | 1.0 | +| `case_insensitive` | Boolean | Allows ASCII case insensitive matching of the value. | false | @@ -763,9 +764,10 @@ Returns documents that contain a specific prefix in a provided field. #### Supported Parameters -| Variable | Type | Description | Default | -| -------- | ------ | ----------------------------------------------- | ------- | -| `value` | String | Beginning characters of terms you wish to find. | - | +| Variable | Type | Description | Default | +| ------------------ | ------- | ---------------------------------------------------- | ------- | +| `value` | String | Beginning characters of terms you wish to find. | - | +| `case_insensitive` | Boolean | Allows ASCII case insensitive matching of the value. | false | ### `wildcard` @@ -791,9 +793,39 @@ Returns documents that contain terms matching a wildcard pattern: #### Supported Parameters -| Variable | Type | Description | Default | -| -------- | ------ | -------------------------------------------- | ------- | -| `value` | String | Wildcard pattern for terms you wish to find. | - | +| Variable | Type | Description | Default | +| ------------------ | ------- | ---------------------------------------------------- | ------- | +| `value` | String | Wildcard pattern for terms you wish to find. | - | +| `boost` | Number | Multiplier boost for score computation. | 1.0 | +| `case_insensitive` | Boolean | Allows ASCII case insensitive matching of the value. | false | + + +### `regexp` + +[Elasticsearch reference documentation](https://www.elastic.co/guide/en/elasticsearch/reference/8.8/query-dsl-regexp-query.html) + +Returns documents that contain terms matching a regular expression. 
+ +#### Example + +```json +{ + "query": { + "regexp": { + "author.login": { + "value": "adm.*n" + } + } + } +} +``` + +#### Supported Parameters + +| Variable | Type | Description | Default | +| ------------------ | ------- | ---------------------------------------------------- | ------- | +| `value` | String | Regular expression for terms you wish to find. | - | +| `case_insensitive` | Boolean | Allows ASCII case insensitive matching of the value. | false | ### About the `lenient` argument diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/prefix_query.rs b/quickwit/quickwit-query/src/elastic_query_dsl/prefix_query.rs index 2baa9a499c3..f19fad61037 100644 --- a/quickwit/quickwit-query/src/elastic_query_dsl/prefix_query.rs +++ b/quickwit/quickwit-query/src/elastic_query_dsl/prefix_query.rs @@ -29,6 +29,8 @@ pub(crate) struct PrefixQuery { #[serde(deny_unknown_fields)] pub struct PrefixQueryParams { value: String, + #[serde(default)] + case_insensitive: bool, } impl ConvertibleToQueryAst for PrefixQuery { @@ -45,6 +47,7 @@ impl ConvertibleToQueryAst for PrefixQuery { field: self.field, value: wildcard, lenient: true, + case_insensitive: self.params.case_insensitive, } .into()) } @@ -64,7 +67,10 @@ impl From>> for Pr impl From for PrefixQueryParams { fn from(value: String) -> PrefixQueryParams { - PrefixQueryParams { value } + PrefixQueryParams { + value, + case_insensitive: false, + } } } diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/regex_query.rs b/quickwit/quickwit-query/src/elastic_query_dsl/regex_query.rs index 747e1321811..2e37c6940bc 100644 --- a/quickwit/quickwit-query/src/elastic_query_dsl/regex_query.rs +++ b/quickwit/quickwit-query/src/elastic_query_dsl/regex_query.rs @@ -22,16 +22,22 @@ use crate::query_ast::{QueryAst, RegexQuery as AstRegexQuery}; #[serde(deny_unknown_fields)] pub struct RegexQueryParams { value: String, - // we could probably add case_insensitive + #[serde(default)] + case_insensitive: bool, } pub type RegexQuery = 
OneFieldMap; impl ConvertibleToQueryAst for RegexQuery { fn convert_to_query_ast(self) -> anyhow::Result { + let regex = if self.value.case_insensitive { + format!("(?i){}", self.value.value) + } else { + self.value.value.clone() + }; Ok(AstRegexQuery { field: self.field, - regex: self.value.value, + regex, } .into()) } diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/term_query.rs b/quickwit/quickwit-query/src/elastic_query_dsl/term_query.rs index 9f9d32ba685..4fa0cd4957f 100644 --- a/quickwit/quickwit-query/src/elastic_query_dsl/term_query.rs +++ b/quickwit/quickwit-query/src/elastic_query_dsl/term_query.rs @@ -41,6 +41,7 @@ impl From for TermQueryParams { TermQueryParams { value: query, boost: None, + case_insensitive: false, } } } @@ -54,7 +55,9 @@ enum TermValue { } fn deserialize_term_value<'de, D>(deserializer: D) -> Result -where D: Deserializer<'de> { +where + D: Deserializer<'de>, +{ let term_value = TermValue::deserialize(deserializer)?; match term_value { TermValue::I64(i64) => Ok(i64.to_string()), @@ -70,6 +73,8 @@ pub struct TermQueryParams { pub value: String, #[serde(default)] pub boost: Option, + #[serde(default)] + case_insensitive: bool, } pub fn term_query_from_field_value(field: impl ToString, value: impl ToString) -> TermQuery { @@ -78,6 +83,7 @@ pub fn term_query_from_field_value(field: impl ToString, value: impl ToString) - value: TermQueryParams { value: value.to_string(), boost: None, + case_insensitive: false, }, } } @@ -90,7 +96,20 @@ impl From for ElasticQueryDslInner { impl ConvertibleToQueryAst for TermQuery { fn convert_to_query_ast(self) -> anyhow::Result { - let TermQueryParams { value, boost } = self.value; + let TermQueryParams { + value, + boost, + case_insensitive, + } = self.value; + if case_insensitive { + let ci_value = format!("(?i){}", regex::escape(&value)); + let term_ast: QueryAst = query_ast::RegexQuery { + field: self.field, + regex: ci_value, + } + .into(); + return Ok(term_ast.boost(boost)); + } let 
term_ast: QueryAst = query_ast::TermQuery { field: self.field, value, diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/wildcard_query.rs b/quickwit/quickwit-query/src/elastic_query_dsl/wildcard_query.rs index 973c4d31cf1..3b975e896e9 100644 --- a/quickwit/quickwit-query/src/elastic_query_dsl/wildcard_query.rs +++ b/quickwit/quickwit-query/src/elastic_query_dsl/wildcard_query.rs @@ -32,6 +32,8 @@ pub struct WildcardQueryParams { value: String, #[serde(default)] pub boost: Option, + #[serde(default)] + case_insensitive: bool, } impl ConvertibleToQueryAst for WildcardQuery { @@ -40,6 +42,7 @@ impl ConvertibleToQueryAst for WildcardQuery { field: self.field, value: self.params.value, lenient: true, + case_insensitive: self.params.case_insensitive, } .into(); Ok(wildcard_ast.boost(self.params.boost)) @@ -60,7 +63,11 @@ impl From>> for impl From for WildcardQueryParams { fn from(value: String) -> WildcardQueryParams { - WildcardQueryParams { value, boost: None } + WildcardQueryParams { + value, + boost: None, + case_insensitive: false, + } } } diff --git a/quickwit/quickwit-query/src/query_ast/user_input_query.rs b/quickwit/quickwit-query/src/query_ast/user_input_query.rs index 80d3724d0e0..623a7943c38 100644 --- a/quickwit/quickwit-query/src/query_ast/user_input_query.rs +++ b/quickwit/quickwit-query/src/query_ast/user_input_query.rs @@ -293,6 +293,7 @@ fn convert_user_input_literal( field: field_name, value: phrase.clone(), lenient, + case_insensitive: false, } .into() } else { diff --git a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs index 78253076b29..bed0f949111 100644 --- a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs +++ b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs @@ -32,6 +32,7 @@ pub struct WildcardQuery { pub value: String, /// Support missing fields pub lenient: bool, + pub case_insensitive: bool, } impl From for QueryAst { @@ -133,6 +134,11 @@ impl 
WildcardQuery { let tokenizer_name = text_field_indexing.tokenizer(); let regex = sub_query_parts_to_regex(sub_query_parts, tokenizer_name, tokenizer_manager)?; + let regex = if self.case_insensitive { + format!("(?i){}", regex) + } else { + regex + }; Ok((field, None, regex)) } @@ -147,6 +153,11 @@ impl WildcardQuery { let tokenizer_name = text_field_indexing.tokenizer(); let regex = sub_query_parts_to_regex(sub_query_parts, tokenizer_name, tokenizer_manager)?; + let regex = if self.case_insensitive { + format!("(?i){}", regex) + } else { + regex + }; let mut term_for_path = Term::from_field_json_path( field, @@ -219,6 +230,7 @@ mod tests { field: "text_field".to_string(), value: "MyString Wh1ch?a.nOrMal Tokenizer would*cut".to_string(), lenient: false, + case_insensitive: false, }; let tokenizer_manager = create_default_quickwit_tokenizer_manager(); @@ -261,6 +273,7 @@ mod tests { field: "text_field".to_string(), value: "MyString Wh1ch\\?a.nOrMal Tokenizer would\\*cut".to_string(), lenient: false, + case_insensitive: false, }; let tokenizer_manager = create_default_quickwit_tokenizer_manager(); @@ -305,6 +318,7 @@ mod tests { field: "json_field.Inner.Fie*ld".to_string(), value: "MyString Wh1ch?a.nOrMal Tokenizer would*cut".to_string(), lenient: false, + case_insensitive: false, }; let tokenizer_manager = create_default_quickwit_tokenizer_manager(); @@ -347,6 +361,7 @@ mod tests { field: "my_missing_field".to_string(), value: "My query value*".to_string(), lenient: false, + case_insensitive: false, }; let tokenizer_manager = create_default_quickwit_tokenizer_manager(); let schema = single_text_field_schema("my_field", "whitespace"); @@ -359,4 +374,47 @@ mod tests { }; assert_eq!(missing_field_full_path, "my_missing_field"); } + + #[test] + fn test_wildcard_query_to_regex_on_text_case_insensitive() { + let query = WildcardQuery { + field: "text_field".to_string(), + value: "MyString Wh1ch?a.nOrMal Tokenizer would*cut".to_string(), + lenient: false, + 
case_insensitive: true, + }; + + let tokenizer_manager = create_default_quickwit_tokenizer_manager(); + for tokenizer in ["raw", "whitespace"] { + let mut schema_builder = TantivySchema::builder(); + let text_options = TextOptions::default() + .set_indexing_options(TextFieldIndexing::default().set_tokenizer(tokenizer)); + schema_builder.add_text_field("text_field", text_options); + let schema = schema_builder.build(); + + let (_field, path, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap(); + assert_eq!(regex, "(?i)MyString Wh1ch.a\\.nOrMal Tokenizer would.*cut"); + assert!(path.is_none()); + } + + for tokenizer in [ + "raw_lowercase", + "lowercase", + "default", + "en_stem", + "chinese_compatible", + "source_code_default", + "source_code_with_hex", + ] { + let mut schema_builder = TantivySchema::builder(); + let text_options = TextOptions::default() + .set_indexing_options(TextFieldIndexing::default().set_tokenizer(tokenizer)); + schema_builder.add_text_field("text_field", text_options); + let schema = schema_builder.build(); + + let (_field, path, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap(); + assert_eq!(regex, "(?i)mystring wh1ch.a\\.normal tokenizer would.*cut"); + assert!(path.is_none()); + } + } } diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/0006-term_query.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/0006-term_query.yaml index 320ea03b47b..3fe75d61973 100644 --- a/quickwit/rest-api-tests/scenarii/es_compatibility/0006-term_query.yaml +++ b/quickwit/rest-api-tests/scenarii/es_compatibility/0006-term_query.yaml @@ -1,5 +1,3 @@ -# case_insensitive not supported. 
-engines: ["elasticsearch"] params: # this overrides the query sent in body apparently size: 3 diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/0029-wildcard.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/0029-wildcard.yaml index 4bf35c0e18c..1dd8d9c5658 100644 --- a/quickwit/rest-api-tests/scenarii/es_compatibility/0029-wildcard.yaml +++ b/quickwit/rest-api-tests/scenarii/es_compatibility/0029-wildcard.yaml @@ -26,3 +26,25 @@ expected: hits: total: value: 2 +--- +json: + query: + wildcard: + repo.name: + value: RUS* + case_insensitive: true +expected: + hits: + total: + value: 1 +--- +json: + query: + wildcard: + repo.name: + value: RUS* + case_insensitive: false +expected: + hits: + total: + value: 0 diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/0030-prefix.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/0030-prefix.yaml index d239be8f69c..9f5d9c80420 100644 --- a/quickwit/rest-api-tests/scenarii/es_compatibility/0030-prefix.yaml +++ b/quickwit/rest-api-tests/scenarii/es_compatibility/0030-prefix.yaml @@ -26,3 +26,25 @@ expected: hits: total: value: 2 +--- +json: + query: + prefix: + repo.name: + value: RUST + case_insensitive: true +expected: + hits: + total: + value: 1 +--- +json: + query: + prefix: + repo.name: + value: RUST + case_insensitive: false +expected: + hits: + total: + value: 0 diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/0031-regex.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/0031-regex.yaml new file mode 100644 index 00000000000..99ed21169ee --- /dev/null +++ b/quickwit/rest-api-tests/scenarii/es_compatibility/0031-regex.yaml @@ -0,0 +1,47 @@ +# Basic regex match +params: + size: 0 +json: + track_total_hits: true + query: + regexp: + type: + value: ".*event" +expected: + hits: + total: + value: 100 + relation: "eq" +--- +# Regex with case_insensitive flag +params: + size: 3 +json: + track_total_hits: true + query: + regexp: + repo.name: + # lowercased by the tokenizer 
+ value: "RUST.*" + case_insensitive: true +expected: + hits: + total: + value: 1 + relation: "eq" +--- +params: + size: 3 +json: + track_total_hits: true + query: + regexp: + type: + # lowercased by the tokenizer + value: "RUST.*" + case_insensitive: false +expected: + hits: + total: + value: 0 + relation: "eq" diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/_setup.quickwit.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/_setup.quickwit.yaml index 9ffe02bcfe0..60709ec7dcd 100644 --- a/quickwit/rest-api-tests/scenarii/es_compatibility/_setup.quickwit.yaml +++ b/quickwit/rest-api-tests/scenarii/es_compatibility/_setup.quickwit.yaml @@ -43,6 +43,7 @@ json: type: text fast: true indexed: true + tokenizer: raw - name: public type: bool fast: false From 2f68622468ee7c1653d75e6873ed8d405744b580 Mon Sep 17 00:00:00 2001 From: Darkheir Date: Wed, 3 Dec 2025 15:21:08 +0100 Subject: [PATCH 2/2] Apply PR review suggestion Signed-off-by: Darkheir --- quickwit/quickwit-query/src/elastic_query_dsl/term_query.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/term_query.rs b/quickwit/quickwit-query/src/elastic_query_dsl/term_query.rs index 4fa0cd4957f..d7d1ea72d92 100644 --- a/quickwit/quickwit-query/src/elastic_query_dsl/term_query.rs +++ b/quickwit/quickwit-query/src/elastic_query_dsl/term_query.rs @@ -55,9 +55,7 @@ enum TermValue { } fn deserialize_term_value<'de, D>(deserializer: D) -> Result -where - D: Deserializer<'de>, -{ +where D: Deserializer<'de> { let term_value = TermValue::deserialize(deserializer)?; match term_value { TermValue::I64(i64) => Ok(i64.to_string()),