From 5f63d2014f1e5b62e2b3977e43851d957c64f0ca Mon Sep 17 00:00:00 2001 From: trinity Pointard Date: Wed, 3 Dec 2025 17:47:14 +0100 Subject: [PATCH 1/8] introduce BuildTantivyAstContext --- .../src/doc_mapper/doc_mapper_impl.rs | 12 +- .../quickwit-doc-mapper/src/query_builder.rs | 48 ++---- .../quickwit-doc-mapper/src/tag_pruning.rs | 1 + .../src/query_ast/bool_query.rs | 38 +---- .../src/query_ast/cache_node.rs | 46 +++++ .../src/query_ast/field_presence.rs | 21 +-- .../src/query_ast/full_text_query.rs | 43 ++--- quickwit/quickwit-query/src/query_ast/mod.rs | 161 +++++++----------- .../src/query_ast/phrase_prefix_query.rs | 9 +- .../src/query_ast/range_query.rs | 77 +++------ .../src/query_ast/regex_query.rs | 10 +- .../src/query_ast/term_query.rs | 51 ++---- .../src/query_ast/term_set_query.rs | 20 +-- .../src/query_ast/user_input_query.rs | 28 +-- .../quickwit-query/src/query_ast/visitor.rs | 17 +- .../src/query_ast/wildcard_query.rs | 9 +- 16 files changed, 230 insertions(+), 361 deletions(-) create mode 100644 quickwit/quickwit-query/src/query_ast/cache_node.rs diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs index 9530c9bb31c..04379de3631 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs @@ -19,7 +19,7 @@ use anyhow::{Context, bail}; use fnv::FnvHashSet; use quickwit_proto::types::DocMappingUid; use quickwit_query::create_default_quickwit_tokenizer_manager; -use quickwit_query::query_ast::QueryAst; +use quickwit_query::query_ast::{BuildTantivyAstContext, QueryAst}; use quickwit_query::tokenizers::TokenizerManager; use serde::{Deserialize, Serialize}; use serde_json::{self, Value as JsonValue}; @@ -641,10 +641,12 @@ impl DocMapper { ) -> Result<(Box, WarmupInfo), QueryParserError> { build_query( query_ast, - split_schema, - self.tokenizer_manager(), - &self.default_search_field_names[..], - with_validation, + &BuildTantivyAstContext { + schema: &split_schema, + tokenizer_manager: self.tokenizer_manager(), + search_fields: &self.default_search_field_names[..], + with_validation, + }, ) } diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs index 10c0a8a8397..03c71a856ad 100644 --- a/quickwit/quickwit-doc-mapper/src/query_builder.rs +++ b/quickwit/quickwit-doc-mapper/src/query_builder.rs @@ -17,8 +17,8 @@ use std::convert::Infallible; use std::ops::Bound; use quickwit_query::query_ast::{ - FieldPresenceQuery, FullTextQuery, PhrasePrefixQuery, QueryAst, QueryAstVisitor, RangeQuery, - RegexQuery, TermSetQuery, WildcardQuery, + BuildTantivyAstContext, FieldPresenceQuery, FullTextQuery, PhrasePrefixQuery, QueryAst, + QueryAstVisitor, RangeQuery, RegexQuery, TermSetQuery, WildcardQuery, }; use quickwit_query::tokenizers::TokenizerManager; use quickwit_query::{InvalidQuery, find_field_or_hit_dynamic}; @@ -155,10 +155,7 @@ impl<'a, 'f> QueryAstVisitor<'a> for ExistsQueryFastFields<'f> { /// Build a `Query` with field resolution & forbidding range clauses. pub(crate) fn build_query( query_ast: &QueryAst, - schema: Schema, - tokenizer_manager: &TokenizerManager, - search_fields: &[String], - with_validation: bool, + context: &BuildTantivyAstContext, ) -> Result<(Box, WarmupInfo), QueryParserError> { let mut fast_fields: HashSet = HashSet::new(); @@ -177,31 +174,30 @@ pub(crate) fn build_query( let Ok(_) = TermSearchOnColumnar { fields: &mut fast_fields, - schema: schema.clone(), + schema: context.schema.clone(), } .visit(query_ast); let Ok(_) = ExistsQueryFastFields { fields: &mut fast_fields, - schema: schema.clone(), + schema: context.schema.clone(), } .visit(query_ast); - let query = query_ast.build_tantivy_query( - &schema, - tokenizer_manager, - search_fields, - with_validation, - )?; + let query = query_ast.build_tantivy_query(context)?; - let term_set_query_fields = extract_term_set_query_fields(query_ast, &schema)?; + let term_set_query_fields = extract_term_set_query_fields(query_ast, context.schema)?; let (term_ranges_grouped_by_field, automatons_grouped_by_field) = - extract_prefix_term_ranges_and_automaton(query_ast, &schema, tokenizer_manager)?; + extract_prefix_term_ranges_and_automaton( + query_ast, + context.schema, + context.tokenizer_manager, + )?; let mut terms_grouped_by_field: HashMap> = Default::default(); query.query_terms(&mut |term, need_position| { let field = term.field(); - if !schema.get_field_entry(field).is_indexed() { + if !context.schema.get_field_entry(field).is_indexed() { return; } *terms_grouped_by_field @@ -506,13 +502,7 @@ mod test { .parse_user_query(&[]) .map_err(|err| err.to_string())?; let schema = make_schema(dynamic_mode); - let query_result = build_query( - &query_ast, - schema, - &create_default_quickwit_tokenizer_manager(), - &[], - true, - ); + let query_result = build_query(&query_ast, quickwit_query::test_context!(schema)); query_result .map(|query| format!("{query:?}")) .map_err(|err| err.to_string()) @@ -888,10 +878,7 @@ mod test { let (_, warmup_info) = build_query( &query_with_set, - make_schema(true), - &create_default_quickwit_tokenizer_manager(), - &[], - true, + quickwit_query::test_context!(make_schema(true)), ) .unwrap(); assert_eq!(warmup_info.term_dict_fields.len(), 1); @@ -903,10 +890,7 @@ mod test { let (_, warmup_info) = build_query( &query_without_set, - make_schema(true), - &create_default_quickwit_tokenizer_manager(), - &[], - true, + quickwit_query::test_context!(make_schema(true)), ) .unwrap(); assert!(warmup_info.term_dict_fields.is_empty()); diff --git a/quickwit/quickwit-doc-mapper/src/tag_pruning.rs b/quickwit/quickwit-doc-mapper/src/tag_pruning.rs index e75e5e2e2dc..ad6ded9444c 100644 --- a/quickwit/quickwit-doc-mapper/src/tag_pruning.rs +++ b/quickwit/quickwit-doc-mapper/src/tag_pruning.rs @@ -114,6 +114,7 @@ fn extract_unsimplified_tags_filter_ast(query_ast: QueryAst) -> UnsimplifiedTagF } QueryAst::FieldPresence(_) => UnsimplifiedTagFilterAst::Uninformative, QueryAst::Regex(_) => UnsimplifiedTagFilterAst::Uninformative, + QueryAst::Cache(cache_node) => extract_unsimplified_tags_filter_ast(*cache_node.inner), } } diff --git a/quickwit/quickwit-query/src/query_ast/bool_query.rs b/quickwit/quickwit-query/src/query_ast/bool_query.rs index ee9a46ea97f..b85fbacb4d4 100644 --- a/quickwit/quickwit-query/src/query_ast/bool_query.rs +++ b/quickwit/quickwit-query/src/query_ast/bool_query.rs @@ -13,15 +13,12 @@ // limitations under the License. use serde::{Deserialize, Serialize}; -use tantivy::schema::Schema as TantivySchema; -use super::{BuildTantivyAst, TantivyQueryAst}; +use super::{BuildTantivyAst, BuildTantivyAstContext, TantivyQueryAst}; use crate::InvalidQuery; use crate::query_ast::QueryAst; -use crate::tokenizers::TokenizerManager; /// # Unsupported features -/// - minimum_should_match /// - named queries /// /// Edge cases of BooleanQuery are not obvious, @@ -56,49 +53,26 @@ impl From for QueryAst { impl BuildTantivyAst for BoolQuery { fn build_tantivy_ast_impl( &self, - schema: &TantivySchema, - tokenizer_manager: &TokenizerManager, - search_fields: &[String], - with_validation: bool, + context: &BuildTantivyAstContext, ) -> Result { let mut boolean_query = super::tantivy_query_ast::TantivyBoolQuery { minimum_should_match: self.minimum_should_match, ..Default::default() }; for must in &self.must { - let must_leaf = must.build_tantivy_ast_call( - schema, - tokenizer_manager, - search_fields, - with_validation, - )?; + let must_leaf = must.build_tantivy_ast_call(context)?; boolean_query.must.push(must_leaf); } for must_not in &self.must_not { - let must_not_leaf = must_not.build_tantivy_ast_call( - schema, - tokenizer_manager, - search_fields, - with_validation, - )?; + let must_not_leaf = must_not.build_tantivy_ast_call(context)?; boolean_query.must_not.push(must_not_leaf); } for should in &self.should { - let should_leaf = should.build_tantivy_ast_call( - schema, - tokenizer_manager, - search_fields, - with_validation, - )?; + let should_leaf = should.build_tantivy_ast_call(context)?; boolean_query.should.push(should_leaf); } for filter in &self.filter { - let filter_leaf = filter.build_tantivy_ast_call( - schema, - tokenizer_manager, - search_fields, - with_validation, - )?; + let filter_leaf = filter.build_tantivy_ast_call(context)?; boolean_query.filter.push(filter_leaf); } Ok(TantivyQueryAst::Bool(boolean_query)) diff --git a/quickwit/quickwit-query/src/query_ast/cache_node.rs b/quickwit/quickwit-query/src/query_ast/cache_node.rs new file mode 100644 index 00000000000..68bcf41a413 --- /dev/null +++ b/quickwit/quickwit-query/src/query_ast/cache_node.rs @@ -0,0 +1,46 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use serde::{Deserialize, Serialize}; + +use super::{BuildTantivyAst, BuildTantivyAstContext, TantivyQueryAst}; +use crate::InvalidQuery; +use crate::query_ast::QueryAst; + +/// A node caching the posting list of the inner query. +/// +/// This can be used when it's known that some sub-ast might appear in many queries, +/// or that the same query might be run, with various aggregations. +/// +/// /!\ Sprinkling this everywhere can lead to performance degradations: the whole posting +/// list of the underlying query will need to be evaluated to build the cache. +#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Clone)] +pub struct CacheNode { + pub inner: Box, +} + +impl From for QueryAst { + fn from(cache_node: CacheNode) -> Self { + QueryAst::Cache(cache_node) + } +} + +impl BuildTantivyAst for CacheNode { + fn build_tantivy_ast_impl( + &self, + context: &BuildTantivyAstContext, + ) -> Result { + self.inner.build_tantivy_ast_impl(context) + } +} diff --git a/quickwit/quickwit-query/src/query_ast/field_presence.rs b/quickwit/quickwit-query/src/query_ast/field_presence.rs index 844c31d0ff7..89395a9beec 100644 --- a/quickwit/quickwit-query/src/query_ast/field_presence.rs +++ b/quickwit/quickwit-query/src/query_ast/field_presence.rs @@ -21,8 +21,7 @@ use tantivy::schema::{Field, FieldEntry, IndexRecordOption, Schema as TantivySch use super::tantivy_query_ast::TantivyBoolQuery; use super::utils::{DYNAMIC_FIELD_NAME, find_subfields}; use crate::query_ast::tantivy_query_ast::TantivyQueryAst; -use crate::query_ast::{BuildTantivyAst, QueryAst}; -use crate::tokenizers::TokenizerManager; +use crate::query_ast::{BuildTantivyAst, BuildTantivyAstContext, QueryAst}; use crate::{BooleanOperand, InvalidQuery, find_field_or_hit_dynamic}; #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] @@ -121,15 +120,17 @@ impl FieldPresenceQuery { impl BuildTantivyAst for FieldPresenceQuery { fn build_tantivy_ast_impl( &self, - schema: &TantivySchema, - _tokenizer_manager: &TokenizerManager, - _search_fields: &[String], - _with_validation: bool, + context: &BuildTantivyAstContext, ) -> Result { - let field_presence_field = schema.get_field(FIELD_PRESENCE_FIELD_NAME).map_err(|_| { - InvalidQuery::SchemaError("field presence is not available for this split".to_string()) - })?; - let fields = self.find_field_and_subfields(schema); + let field_presence_field = context + .schema + .get_field(FIELD_PRESENCE_FIELD_NAME) + .map_err(|_| { + InvalidQuery::SchemaError( + "field presence is not available for this split".to_string(), + ) + })?; + let fields = self.find_field_and_subfields(context.schema); if fields.is_empty() { // the schema is not dynamic and no subfields are defined return Err(InvalidQuery::FieldDoesNotExist { diff --git a/quickwit/quickwit-query/src/query_ast/full_text_query.rs b/quickwit/quickwit-query/src/query_ast/full_text_query.rs index 1df3e9fd6a4..158782b1dea 100644 --- a/quickwit/quickwit-query/src/query_ast/full_text_query.rs +++ b/quickwit/quickwit-query/src/query_ast/full_text_query.rs @@ -27,7 +27,7 @@ use tantivy::tokenizer::{TextAnalyzer, TokenStream}; use crate::query_ast::tantivy_query_ast::{TantivyBoolQuery, TantivyQueryAst}; use crate::query_ast::utils::full_text_query; -use crate::query_ast::{BuildTantivyAst, QueryAst}; +use crate::query_ast::{BuildTantivyAst, BuildTantivyAstContext, QueryAst}; use crate::tokenizers::TokenizerManager; use crate::{BooleanOperand, InvalidQuery, MatchAllOrNone, find_field_or_hit_dynamic}; @@ -235,17 +235,14 @@ impl From for QueryAst { impl BuildTantivyAst for FullTextQuery { fn build_tantivy_ast_impl( &self, - schema: &TantivySchema, - tokenizer_manager: &TokenizerManager, - _search_fields: &[String], - _with_validation: bool, + context: &BuildTantivyAstContext, ) -> Result { full_text_query( &self.field, &self.text, &self.params, - schema, - tokenizer_manager, + context.schema, + context.tokenizer_manager, self.lenient, ) } @@ -306,9 +303,9 @@ impl FullTextQuery { mod tests { use tantivy::schema::{Schema, TEXT}; + use crate::BooleanOperand; use crate::query_ast::tantivy_query_ast::TantivyQueryAst; - use crate::query_ast::{BuildTantivyAst, FullTextMode, FullTextQuery}; - use crate::{BooleanOperand, create_default_quickwit_tokenizer_manager}; + use crate::query_ast::{BuildTantivyAst, FullTextMode, FullTextQuery, test_context}; #[test] fn test_zero_terms() { @@ -326,12 +323,7 @@ mod tests { schema_builder.add_text_field("body", TEXT); let schema = schema_builder.build(); let ast: TantivyQueryAst = full_text_query - .build_tantivy_ast_call( - &schema, - &create_default_quickwit_tokenizer_manager(), - &[], - true, - ) + .build_tantivy_ast_call(test_context!(schema)) .unwrap(); assert_eq!(ast.const_predicate(), Some(crate::MatchAllOrNone::MatchAll)); } @@ -352,12 +344,7 @@ mod tests { schema_builder.add_text_field("body", TEXT); let schema = schema_builder.build(); let ast: TantivyQueryAst = full_text_query - .build_tantivy_ast_call( - &schema, - &create_default_quickwit_tokenizer_manager(), - &[], - true, - ) + .build_tantivy_ast_call(test_context!(schema)) .unwrap(); let leaf = ast.as_leaf().unwrap(); assert_eq!( @@ -383,12 +370,7 @@ mod tests { schema_builder.add_text_field("body", TEXT); let schema = schema_builder.build(); let ast: TantivyQueryAst = full_text_query - .build_tantivy_ast_call( - &schema, - &create_default_quickwit_tokenizer_manager(), - &[], - true, - ) + .build_tantivy_ast_call(test_context!(schema)) .unwrap(); let leaf = ast.as_leaf().unwrap(); assert_eq!( @@ -413,12 +395,7 @@ mod tests { schema_builder.add_text_field("body", TEXT); let schema = schema_builder.build(); let ast: TantivyQueryAst = full_text_query - .build_tantivy_ast_call( - &schema, - &create_default_quickwit_tokenizer_manager(), - &[], - true, - ) + .build_tantivy_ast_call(test_context!(schema)) .unwrap(); let bool_query = ast.as_bool_query().unwrap(); assert_eq!(bool_query.must.len(), 2); diff --git a/quickwit/quickwit-query/src/query_ast/mod.rs b/quickwit/quickwit-query/src/query_ast/mod.rs index c7487f1ed31..918e04ecd60 100644 --- a/quickwit/quickwit-query/src/query_ast/mod.rs +++ b/quickwit/quickwit-query/src/query_ast/mod.rs @@ -19,6 +19,7 @@ use tantivy::schema::Schema as TantivySchema; use crate::tokenizers::TokenizerManager; mod bool_query; +mod cache_node; mod field_presence; mod full_text_query; mod phrase_prefix_query; @@ -33,6 +34,7 @@ mod visitor; mod wildcard_query; pub use bool_query::BoolQuery; +pub use cache_node::CacheNode; pub use field_presence::FieldPresenceQuery; pub use full_text_query::{FullTextMode, FullTextParams, FullTextQuery}; pub use phrase_prefix_query::PhrasePrefixQuery; @@ -67,6 +69,7 @@ pub enum QueryAst { underlying: Box, boost: NotNaNf32, }, + Cache(CacheNode), } impl QueryAst { @@ -115,6 +118,13 @@ impl QueryAst { boost, }) } + QueryAst::Cache(cache_node) => { + let inner = cache_node.inner.parse_user_query(default_search_fields)?; + Ok(CacheNode { + inner: Box::new(inner), + } + .into()) + } } } @@ -144,6 +154,28 @@ impl QueryAst { } } +/// Context used when building a tantivy ast. +pub struct BuildTantivyAstContext<'a> { + pub schema: &'a TantivySchema, + pub tokenizer_manager: &'a TokenizerManager, + pub search_fields: &'a [String], + pub with_validation: bool, +} + +/// Shorthand to build a context from a schema in tests. +#[macro_export] +macro_rules! test_context { + ($schema:expr) => { + &$crate::query_ast::BuildTantivyAstContext { + schema: &$schema, + tokenizer_manager: &$crate::create_default_quickwit_tokenizer_manager(), + search_fields: &[], + with_validation: true, + } + }; +} +pub use test_context; + trait BuildTantivyAst { /// Transforms a query Ast node into a TantivyQueryAst. /// @@ -151,23 +183,16 @@ trait BuildTantivyAst { /// It can call `into_tantivy_ast_call_me` but should never call `into_tantivy_ast_impl`. fn build_tantivy_ast_impl( &self, - schema: &TantivySchema, - tokenizer_manager: &TokenizerManager, - search_fields: &[String], - with_validation: bool, + context: &BuildTantivyAstContext, ) -> Result; /// This method is meant to be called, but should never be overloaded. fn build_tantivy_ast_call( &self, - schema: &TantivySchema, - tokenizer_manager: &TokenizerManager, - search_fields: &[String], - with_validation: bool, + context: &BuildTantivyAstContext, ) -> Result { - let tantivy_ast_res = - self.build_tantivy_ast_impl(schema, tokenizer_manager, search_fields, with_validation); - if !with_validation && tantivy_ast_res.is_err() { + let tantivy_ast_res = self.build_tantivy_ast_impl(context); + if !context.with_validation && tantivy_ast_res.is_err() { return match tantivy_ast_res { res @ Ok(_) | res @ Err(InvalidQuery::UserQueryNotParsed) => res, Err(_) => Ok(TantivyQueryAst::match_none()), @@ -180,80 +205,31 @@ trait BuildTantivyAst { impl BuildTantivyAst for QueryAst { fn build_tantivy_ast_impl( &self, - schema: &TantivySchema, - tokenizer_manager: &TokenizerManager, - search_fields: &[String], - with_validation: bool, + context: &BuildTantivyAstContext, ) -> Result { match self { - QueryAst::Bool(bool_query) => bool_query.build_tantivy_ast_call( - schema, - tokenizer_manager, - search_fields, - with_validation, - ), - QueryAst::Term(term_query) => term_query.build_tantivy_ast_call( - schema, - tokenizer_manager, - search_fields, - with_validation, - ), - QueryAst::Range(range_query) => range_query.build_tantivy_ast_call( - schema, - tokenizer_manager, - search_fields, - with_validation, - ), + QueryAst::Bool(bool_query) => bool_query.build_tantivy_ast_call(context), + QueryAst::Term(term_query) => term_query.build_tantivy_ast_call(context), + QueryAst::Range(range_query) => range_query.build_tantivy_ast_call(context), QueryAst::MatchAll => Ok(TantivyQueryAst::match_all()), QueryAst::MatchNone => Ok(TantivyQueryAst::match_none()), QueryAst::Boost { boost, underlying } => { - let underlying = underlying.build_tantivy_ast_call( - schema, - tokenizer_manager, - search_fields, - with_validation, - )?; + let underlying = underlying.build_tantivy_ast_call(context)?; let boost_query = TantivyBoostQuery::new(underlying.into(), (*boost).into()); Ok(boost_query.into()) } - QueryAst::TermSet(term_set) => term_set.build_tantivy_ast_call( - schema, - tokenizer_manager, - search_fields, - with_validation, - ), - QueryAst::FullText(full_text_query) => full_text_query.build_tantivy_ast_call( - schema, - tokenizer_manager, - search_fields, - with_validation, - ), - QueryAst::PhrasePrefix(phrase_prefix_query) => phrase_prefix_query - .build_tantivy_ast_call(schema, tokenizer_manager, search_fields, with_validation), - QueryAst::UserInput(user_text_query) => user_text_query.build_tantivy_ast_call( - schema, - tokenizer_manager, - search_fields, - with_validation, - ), - QueryAst::FieldPresence(field_presence) => field_presence.build_tantivy_ast_call( - schema, - tokenizer_manager, - search_fields, - with_validation, - ), - QueryAst::Wildcard(wildcard) => wildcard.build_tantivy_ast_call( - schema, - tokenizer_manager, - search_fields, - with_validation, - ), - QueryAst::Regex(regex) => regex.build_tantivy_ast_call( - schema, - tokenizer_manager, - search_fields, - with_validation, - ), + QueryAst::TermSet(term_set) => term_set.build_tantivy_ast_call(context), + QueryAst::FullText(full_text_query) => full_text_query.build_tantivy_ast_call(context), + QueryAst::PhrasePrefix(phrase_prefix_query) => { + phrase_prefix_query.build_tantivy_ast_call(context) + } + QueryAst::UserInput(user_text_query) => user_text_query.build_tantivy_ast_call(context), + QueryAst::FieldPresence(field_presence) => { + field_presence.build_tantivy_ast_call(context) + } + QueryAst::Wildcard(wildcard) => wildcard.build_tantivy_ast_call(context), + QueryAst::Regex(regex) => regex.build_tantivy_ast_call(context), + QueryAst::Cache(cache_node) => cache_node.build_tantivy_ast_call(context), } } } @@ -261,13 +237,9 @@ impl BuildTantivyAst for QueryAst { impl QueryAst { pub fn build_tantivy_query( &self, - schema: &TantivySchema, - tokenizer_manager: &TokenizerManager, - search_fields: &[String], - with_validation: bool, + context: &BuildTantivyAstContext, ) -> Result, InvalidQuery> { - let tantivy_query_ast = - self.build_tantivy_ast_call(schema, tokenizer_manager, search_fields, with_validation)?; + let tantivy_query_ast = self.build_tantivy_ast_call(context)?; Ok(tantivy_query_ast.simplify().into()) } } @@ -334,7 +306,7 @@ mod tests { use crate::query_ast::{ BoolQuery, BuildTantivyAst, QueryAst, UserInputQuery, query_ast_from_user_text, }; - use crate::{BooleanOperand, InvalidQuery, create_default_quickwit_tokenizer_manager}; + use crate::{BooleanOperand, InvalidQuery}; #[test] fn test_user_query_not_parsed() { @@ -347,12 +319,7 @@ mod tests { .into(); let schema = tantivy::schema::Schema::builder().build(); let build_tantivy_ast_err: InvalidQuery = query_ast - .build_tantivy_ast_call( - &schema, - &create_default_quickwit_tokenizer_manager(), - &[], - true, - ) + .build_tantivy_ast_call(test_context!(schema)) .unwrap_err(); assert!(matches!( build_tantivy_ast_err, @@ -372,12 +339,7 @@ mod tests { let query_ast_with_parsed_user_query: QueryAst = query_ast.parse_user_query(&[]).unwrap(); let schema = tantivy::schema::Schema::builder().build(); let tantivy_query_ast = query_ast_with_parsed_user_query - .build_tantivy_ast_call( - &schema, - &create_default_quickwit_tokenizer_manager(), - &[], - true, - ) + .build_tantivy_ast_call(test_context!(schema)) .unwrap(); assert_eq!(&tantivy_query_ast, &TantivyQueryAst::match_all(),); } @@ -400,12 +362,7 @@ mod tests { bool_query_ast.parse_user_query(&[]).unwrap(); let schema = tantivy::schema::Schema::builder().build(); let tantivy_query_ast = query_ast_with_parsed_user_query - .build_tantivy_ast_call( - &schema, - &create_default_quickwit_tokenizer_manager(), - &[], - true, - ) + .build_tantivy_ast_call(test_context!(schema)) .unwrap(); let tantivy_query_ast_simplified = tantivy_query_ast.simplify(); // This does not get more simplified than this, because we need the boost 0 score. diff --git a/quickwit/quickwit-query/src/query_ast/phrase_prefix_query.rs b/quickwit/quickwit-query/src/query_ast/phrase_prefix_query.rs index 3c5b07e6f64..75c3f73b2e8 100644 --- a/quickwit/quickwit-query/src/query_ast/phrase_prefix_query.rs +++ b/quickwit/quickwit-query/src/query_ast/phrase_prefix_query.rs @@ -18,7 +18,7 @@ use tantivy::query::PhrasePrefixQuery as TantivyPhrasePrefixQuery; use tantivy::schema::{Field, FieldType, Schema as TantivySchema}; use crate::query_ast::tantivy_query_ast::TantivyQueryAst; -use crate::query_ast::{BuildTantivyAst, FullTextParams, QueryAst}; +use crate::query_ast::{BuildTantivyAst, BuildTantivyAstContext, FullTextParams, QueryAst}; use crate::tokenizers::TokenizerManager; use crate::{InvalidQuery, find_field_or_hit_dynamic}; @@ -112,12 +112,9 @@ impl From for QueryAst { impl BuildTantivyAst for PhrasePrefixQuery { fn build_tantivy_ast_impl( &self, - schema: &TantivySchema, - tokenizer_manager: &TokenizerManager, - _search_fields: &[String], - _with_validation: bool, + context: &BuildTantivyAstContext, ) -> Result { - let (_, terms) = match self.get_terms(schema, tokenizer_manager) { + let (_, terms) = match self.get_terms(context.schema, context.tokenizer_manager) { Ok(res) => res, Err(InvalidQuery::FieldDoesNotExist { .. }) if self.lenient => { return Ok(TantivyQueryAst::match_none()); diff --git a/quickwit/quickwit-query/src/query_ast/range_query.rs b/quickwit/quickwit-query/src/query_ast/range_query.rs index 3cacf697cf9..0854fa44cf0 100644 --- a/quickwit/quickwit-query/src/query_ast/range_query.rs +++ b/quickwit/quickwit-query/src/query_ast/range_query.rs @@ -17,16 +17,13 @@ use std::ops::Bound; use serde::{Deserialize, Serialize}; use tantivy::fastfield::FastValue; use tantivy::query::FastFieldRangeQuery; -use tantivy::schema::Schema as TantivySchema; use tantivy::tokenizer::TextAnalyzer; use tantivy::{DateTime, Term}; use super::QueryAst; use super::tantivy_query_ast::TantivyBoolQuery; use crate::json_literal::InterpretUserInput; -use crate::query_ast::BuildTantivyAst; -use crate::query_ast::tantivy_query_ast::TantivyQueryAst; -use crate::tokenizers::TokenizerManager; +use crate::query_ast::{BuildTantivyAst, BuildTantivyAstContext, TantivyQueryAst}; use crate::{InvalidQuery, JsonLiteral}; #[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] @@ -110,17 +107,14 @@ fn get_normalized_text(normalizer: &mut Option, text: &str) -> Str impl BuildTantivyAst for RangeQuery { fn build_tantivy_ast_impl( &self, - schema: &TantivySchema, - tokenizer_manager: &TokenizerManager, - _search_fields: &[String], - _with_validation: bool, + context: &BuildTantivyAstContext, ) -> Result { let (field, field_entry, json_path) = - super::utils::find_field_or_hit_dynamic(&self.field, schema).ok_or_else(|| { - InvalidQuery::FieldDoesNotExist { + super::utils::find_field_or_hit_dynamic(&self.field, context.schema).ok_or_else( + || InvalidQuery::FieldDoesNotExist { full_path: self.field.clone(), - } - })?; + }, + )?; if !field_entry.is_fast() { return Err(InvalidQuery::SchemaError(format!( "range queries are only supported for fast fields. (`{}` is not a fast field)", @@ -129,9 +123,12 @@ impl BuildTantivyAst for RangeQuery { } Ok(match field_entry.field_type() { tantivy::schema::FieldType::Str(options) => { - let mut normalizer = options - .get_fast_field_tokenizer_name() - .and_then(|tokenizer_name| tokenizer_manager.get_normalizer(tokenizer_name)); + let mut normalizer = + options + .get_fast_field_tokenizer_name() + .and_then(|tokenizer_name| { + context.tokenizer_manager.get_normalizer(tokenizer_name) + }); let (lower_bound, upper_bound) = convert_bounds(&self.lower_bound, &self.upper_bound, field_entry.name())?; @@ -220,9 +217,12 @@ impl BuildTantivyAst for RangeQuery { sub_queries.push(query_from_fast_val_range(&empty_term, range).into()); } - let mut normalizer = options - .get_fast_field_tokenizer_name() - .and_then(|tokenizer_name| tokenizer_manager.get_normalizer(tokenizer_name)); + let mut normalizer = + options + .get_fast_field_tokenizer_name() + .and_then(|tokenizer_name| { + context.tokenizer_manager.get_normalizer(tokenizer_name) + }); let bounds_range_str: Option<(Bound<&str>, Bound<&str>)> = convert_bound(&self.lower_bound).zip(convert_bound(&self.upper_bound)); @@ -288,10 +288,8 @@ mod tests { use tantivy::schema::{DateOptions, DateTimePrecision, FAST, STORED, Schema, TEXT}; use super::RangeQuery; - use crate::query_ast::BuildTantivyAst; - use crate::{ - InvalidQuery, JsonLiteral, MatchAllOrNone, create_default_quickwit_tokenizer_manager, - }; + use crate::query_ast::{BuildTantivyAst, test_context}; + use crate::{InvalidQuery, JsonLiteral, MatchAllOrNone}; fn make_schema(dynamic_mode: bool) -> Schema { let mut schema_builder = Schema::builder(); @@ -323,12 +321,7 @@ mod tests { upper_bound: Bound::Included(upper_value), }; let tantivy_ast = range_query - .build_tantivy_ast_call( - &schema, - &create_default_quickwit_tokenizer_manager(), - &[], - true, - ) + .build_tantivy_ast_call(test_context!(schema)) .unwrap() .simplify(); let leaf = tantivy_ast.as_leaf().unwrap(); @@ -371,12 +364,7 @@ mod tests { }; // with validation let invalid_query: InvalidQuery = range_query - .build_tantivy_ast_call( - &schema, - &create_default_quickwit_tokenizer_manager(), - &[], - true, - ) + .build_tantivy_ast_call(test_context!(schema)) .unwrap_err(); assert!( matches!(invalid_query, InvalidQuery::FieldDoesNotExist { full_path } if full_path == "missing_field.toto") @@ -384,12 +372,7 @@ mod tests { // without validation assert_eq!( range_query - .build_tantivy_ast_call( - &schema, - &create_default_quickwit_tokenizer_manager(), - &[], - false - ) + .build_tantivy_ast_call(test_context!(schema),) .unwrap() .const_predicate(), Some(MatchAllOrNone::MatchNone) @@ -405,12 +388,7 @@ mod tests { }; let schema = make_schema(true); let tantivy_ast = range_query - .build_tantivy_ast_call( - &schema, - &create_default_quickwit_tokenizer_manager(), - &[], - true, - ) + .build_tantivy_ast_call(test_context!(schema)) .unwrap(); assert_eq!( format!("{tantivy_ast:?}"), @@ -433,12 +411,7 @@ mod tests { }; let schema = make_schema(false); let err = range_query - .build_tantivy_ast_call( - &schema, - &create_default_quickwit_tokenizer_manager(), - &[], - true, - ) + .build_tantivy_ast_call(test_context!(schema)) .unwrap_err(); assert!(matches!(err, InvalidQuery::SchemaError { .. })); } diff --git a/quickwit/quickwit-query/src/query_ast/regex_query.rs b/quickwit/quickwit-query/src/query_ast/regex_query.rs index 17a6419e3c6..851151e839a 100644 --- a/quickwit/quickwit-query/src/query_ast/regex_query.rs +++ b/quickwit/quickwit-query/src/query_ast/regex_query.rs @@ -20,9 +20,8 @@ use serde::{Deserialize, Serialize}; use tantivy::Term; use tantivy::schema::{Field, FieldType, Schema as TantivySchema}; -use super::{BuildTantivyAst, QueryAst}; +use super::{BuildTantivyAst, BuildTantivyAstContext, QueryAst}; use crate::query_ast::TantivyQueryAst; -use crate::tokenizers::TokenizerManager; use crate::{InvalidQuery, find_field_or_hit_dynamic}; /// A Regex query @@ -103,12 +102,9 @@ impl RegexQuery { impl BuildTantivyAst for RegexQuery { fn build_tantivy_ast_impl( &self, - schema: &TantivySchema, - _tokenizer_manager: &TokenizerManager, - _search_fields: &[String], - _with_validation: bool, + context: &BuildTantivyAstContext, ) -> Result { - let (field, path, regex) = self.to_field_and_regex(schema)?; + let (field, path, regex) = self.to_field_and_regex(context.schema)?; let regex = tantivy_fst::Regex::new(®ex).context("failed to parse regex")?; let regex_automaton_with_path = JsonPathPrefix { prefix: path.unwrap_or_default(), diff --git a/quickwit/quickwit-query/src/query_ast/term_query.rs b/quickwit/quickwit-query/src/query_ast/term_query.rs index 6a4c9511cf7..e2c9fbfc8c0 100644 --- a/quickwit/quickwit-query/src/query_ast/term_query.rs +++ b/quickwit/quickwit-query/src/query_ast/term_query.rs @@ -15,11 +15,9 @@ use std::collections::HashMap; use serde::{Deserialize, Serialize}; -use tantivy::schema::Schema as TantivySchema; use super::{BuildTantivyAst, QueryAst}; -use crate::query_ast::{FullTextParams, TantivyQueryAst}; -use crate::tokenizers::TokenizerManager; +use crate::query_ast::{BuildTantivyAstContext, FullTextParams, TantivyQueryAst}; use crate::{BooleanOperand, InvalidQuery}; /// The TermQuery acts exactly like a FullTextQuery with @@ -49,10 +47,7 @@ impl TermQuery { impl BuildTantivyAst for TermQuery { fn build_tantivy_ast_impl( &self, - schema: &TantivySchema, - tokenizer_manager: &TokenizerManager, - _search_fields: &[String], - _with_validation: bool, + context: &BuildTantivyAstContext, ) -> Result { let full_text_params = FullTextParams { tokenizer: Some("raw".to_string()), @@ -64,8 +59,8 @@ impl BuildTantivyAst for TermQuery { &self.field, &self.value, &full_text_params, - schema, - tokenizer_manager, + context.schema, + context.tokenizer_manager, false, ) } @@ -123,8 +118,7 @@ impl From for HashMap { mod tests { use tantivy::schema::{INDEXED, Schema}; - use crate::create_default_quickwit_tokenizer_manager; - use crate::query_ast::{BuildTantivyAst, TermQuery}; + use crate::query_ast::{BuildTantivyAst, TermQuery, test_context}; #[test] fn test_term_query_with_ipaddr_ipv4() { @@ -136,12 +130,7 @@ mod tests { schema_builder.add_ip_addr_field("ip", INDEXED); let schema = schema_builder.build(); let tantivy_query_ast = term_query - .build_tantivy_ast_call( - &schema, - &create_default_quickwit_tokenizer_manager(), - &[], - true, - ) + .build_tantivy_ast_call(test_context!(schema)) .unwrap(); let leaf = tantivy_query_ast.as_leaf().unwrap(); assert_eq!( @@ -160,12 +149,7 @@ mod tests { schema_builder.add_ip_addr_field("ip", INDEXED); let schema = schema_builder.build(); let tantivy_query_ast = term_query - .build_tantivy_ast_call( - &schema, - &create_default_quickwit_tokenizer_manager(), - &[], - true, - ) + .build_tantivy_ast_call(test_context!(schema)) .unwrap(); let leaf = tantivy_query_ast.as_leaf().unwrap(); assert_eq!( @@ -184,12 +168,7 @@ mod tests { schema_builder.add_bytes_field("bytes", INDEXED); let schema = schema_builder.build(); let tantivy_query_ast = term_query - .build_tantivy_ast_call( - &schema, - &create_default_quickwit_tokenizer_manager(), - &[], - true, - ) + .build_tantivy_ast_call(test_context!(schema)) .unwrap(); let leaf = tantivy_query_ast.as_leaf().unwrap(); assert_eq!( @@ -208,12 +187,7 @@ mod tests { schema_builder.add_bytes_field("bytes", INDEXED); let schema = schema_builder.build(); let tantivy_query_ast = term_query - .build_tantivy_ast_call( - &schema, - &create_default_quickwit_tokenizer_manager(), - &[], - true, - ) + .build_tantivy_ast_call(test_context!(schema)) .unwrap(); let leaf = tantivy_query_ast.as_leaf().unwrap(); assert_eq!( @@ -232,12 +206,7 @@ mod tests { schema_builder.add_date_field("timestamp", INDEXED); let schema = schema_builder.build(); let tantivy_query_ast = term_query - .build_tantivy_ast_call( - &schema, - &create_default_quickwit_tokenizer_manager(), - &[], - true, - ) + .build_tantivy_ast_call(test_context!(schema)) .unwrap(); let leaf = tantivy_query_ast.as_leaf().unwrap(); // The date should have been truncated to seconds precision. diff --git a/quickwit/quickwit-query/src/query_ast/term_set_query.rs b/quickwit/quickwit-query/src/query_ast/term_set_query.rs index c70d71cd94c..d25d01a4703 100644 --- a/quickwit/quickwit-query/src/query_ast/term_set_query.rs +++ b/quickwit/quickwit-query/src/query_ast/term_set_query.rs @@ -16,11 +16,11 @@ use std::collections::{BTreeSet, HashMap, HashSet}; use serde::{Deserialize, Serialize}; use tantivy::Term; -use tantivy::schema::Schema as TantivySchema; use crate::InvalidQuery; -use crate::query_ast::{BuildTantivyAst, QueryAst, TantivyQueryAst, TermQuery}; -use crate::tokenizers::TokenizerManager; +use crate::query_ast::{ + BuildTantivyAst, BuildTantivyAstContext, QueryAst, TantivyQueryAst, TermQuery, +}; /// TermSetQuery matches the same document set as if it was a union of /// the equivalent set of TermQueries. @@ -34,10 +34,10 @@ pub struct TermSetQuery { impl TermSetQuery { fn make_term_iterator( &self, - schema: &TantivySchema, - tokenizer_manager: &TokenizerManager, + context: &BuildTantivyAstContext, ) -> Result, InvalidQuery> { let mut terms: HashSet = HashSet::default(); + for (full_path, values) in &self.terms_per_field { for value in values { // Mapping a text (field, value) is non-trivial: @@ -51,8 +51,7 @@ impl TermSetQuery { field: full_path.to_string(), value: value.to_string(), }; - let ast = - term_query.build_tantivy_ast_call(schema, tokenizer_manager, &[], false)?; + let ast = term_query.build_tantivy_ast_call(context)?; let tantivy_query: Box = ast.simplify().into(); tantivy_query.query_terms(&mut |term, _| { terms.insert(term.clone()); @@ -66,12 +65,9 @@ impl TermSetQuery { impl BuildTantivyAst for TermSetQuery { fn build_tantivy_ast_impl( &self, - schema: &TantivySchema, - tokenizer_manager: &TokenizerManager, - _search_fields: &[String], - _with_validation: bool, + context: &BuildTantivyAstContext, ) -> Result { - let terms_it = self.make_term_iterator(schema, tokenizer_manager)?; + let terms_it = self.make_term_iterator(context)?; let term_set_query = tantivy::query::TermSetQuery::new(terms_it); Ok(term_set_query.into()) } diff --git a/quickwit/quickwit-query/src/query_ast/user_input_query.rs b/quickwit/quickwit-query/src/query_ast/user_input_query.rs index 623a7943c38..1a30b89d483 100644 --- a/quickwit/quickwit-query/src/query_ast/user_input_query.rs +++ b/quickwit/quickwit-query/src/query_ast/user_input_query.rs @@ -20,14 +20,12 @@ use serde::{Deserialize, Serialize}; use tantivy::query_grammar::{ Delimiter, Occur, UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral, }; -use tantivy::schema::Schema as TantivySchema; use crate::not_nan_f32::NotNaNf32; -use crate::query_ast::tantivy_query_ast::TantivyQueryAst; use crate::query_ast::{ - self, BuildTantivyAst, FieldPresenceQuery, FullTextMode, FullTextParams, QueryAst, + self, BuildTantivyAst, BuildTantivyAstContext, FieldPresenceQuery, FullTextMode, + FullTextParams, QueryAst, TantivyQueryAst, }; -use crate::tokenizers::TokenizerManager; use crate::{BooleanOperand, InvalidQuery, JsonLiteral}; const DEFAULT_PHRASE_QUERY_MAX_EXPANSION: u32 = 50; @@ -88,10 +86,7 @@ impl From for QueryAst { impl BuildTantivyAst for UserInputQuery { fn build_tantivy_ast_impl( &self, - _schema: &TantivySchema, - _tokenizer_manager: &TokenizerManager, - _default_search_fields: &[String], - _with_validation: bool, + _context: &BuildTantivyAstContext, ) -> Result { Err(InvalidQuery::UserQueryNotParsed) } @@ -324,8 +319,9 @@ fn convert_user_input_literal( mod tests { use crate::query_ast::{ BoolQuery, BuildTantivyAst, FullTextMode, FullTextQuery, QueryAst, UserInputQuery, + test_context, }; - use crate::{BooleanOperand, InvalidQuery, create_default_quickwit_tokenizer_manager}; + use crate::{BooleanOperand, InvalidQuery}; #[test] fn test_user_input_query_not_parsed_error() { @@ -338,23 +334,13 @@ mod tests { let schema = tantivy::schema::Schema::builder().build(); { let invalid_query = user_input_query - .build_tantivy_ast_call( - &schema, - &create_default_quickwit_tokenizer_manager(), - &[], - true, - ) + .build_tantivy_ast_call(test_context!(schema)) .unwrap_err(); assert!(matches!(invalid_query, InvalidQuery::UserQueryNotParsed)); } { let invalid_query = user_input_query - .build_tantivy_ast_call( - &schema, - &create_default_quickwit_tokenizer_manager(), - &[], - false, - ) + .build_tantivy_ast_call(test_context!(schema)) .unwrap_err(); assert!(matches!(invalid_query, InvalidQuery::UserQueryNotParsed)); } diff --git a/quickwit/quickwit-query/src/query_ast/visitor.rs b/quickwit/quickwit-query/src/query_ast/visitor.rs index b94893fb54d..0211205c02a 100644 --- a/quickwit/quickwit-query/src/query_ast/visitor.rs +++ b/quickwit/quickwit-query/src/query_ast/visitor.rs @@ -16,8 +16,8 @@ use crate::not_nan_f32::NotNaNf32; use crate::query_ast::field_presence::FieldPresenceQuery; use crate::query_ast::user_input_query::UserInputQuery; use crate::query_ast::{ - BoolQuery, FullTextQuery, PhrasePrefixQuery, QueryAst, RangeQuery, RegexQuery, TermQuery, - TermSetQuery, WildcardQuery, + BoolQuery, CacheNode, FullTextQuery, PhrasePrefixQuery, QueryAst, RangeQuery, RegexQuery, + TermQuery, TermSetQuery, WildcardQuery, }; /// Simple trait to implement a Visitor over the QueryAst. @@ -41,6 +41,7 @@ pub trait QueryAstVisitor<'a> { QueryAst::FieldPresence(exists) => self.visit_exists(exists), QueryAst::Wildcard(wildcard) => self.visit_wildcard(wildcard), QueryAst::Regex(regex) => self.visit_regex(regex), + QueryAst::Cache(cache_node) => self.visit_cache_node(cache_node), } } @@ -111,6 +112,10 @@ pub trait QueryAstVisitor<'a> { fn visit_regex(&mut self, _regex_query: &'a RegexQuery) -> Result<(), Self::Err> { Ok(()) } + + fn visit_cache_node(&mut self, _cache_node: &'a CacheNode) -> Result<(), Self::Err> { + Ok(()) + } } /// Simple trait to implement a Visitor over the QueryAst. @@ -134,6 +139,7 @@ pub trait QueryAstTransformer { QueryAst::FieldPresence(exists) => self.transform_exists(exists), QueryAst::Wildcard(wildcard) => self.transform_wildcard(wildcard), QueryAst::Regex(regex) => self.transform_regex(regex), + QueryAst::Cache(cache_node) => self.transform_cache_node(cache_node), } } @@ -236,4 +242,11 @@ pub trait QueryAstTransformer { fn transform_regex(&mut self, regex_query: RegexQuery) -> Result, Self::Err> { Ok(Some(QueryAst::Regex(regex_query))) } + + fn transform_cache_node( + &mut self, + cache_node: CacheNode, + ) -> Result, Self::Err> { + Ok(Some(QueryAst::Cache(cache_node))) + } } diff --git a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs index bed0f949111..84176f4a4aa 100644 --- a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs +++ b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs @@ -21,7 +21,7 @@ use tantivy::Term; use tantivy::schema::{Field, FieldType, Schema as TantivySchema}; use super::{BuildTantivyAst, QueryAst}; -use crate::query_ast::{AutomatonQuery, JsonPathPrefix, TantivyQueryAst}; +use crate::query_ast::{AutomatonQuery, BuildTantivyAstContext, JsonPathPrefix, TantivyQueryAst}; use crate::tokenizers::TokenizerManager; use crate::{InvalidQuery, find_field_or_hit_dynamic}; @@ -183,12 +183,9 @@ impl WildcardQuery { impl BuildTantivyAst for WildcardQuery { fn build_tantivy_ast_impl( &self, - schema: &TantivySchema, - tokenizer_manager: &TokenizerManager, - _search_fields: &[String], - _with_validation: bool, + context: &BuildTantivyAstContext, ) -> Result { - let (field, path, regex) = match self.to_regex(schema, tokenizer_manager) { + let (field, path, regex) = match self.to_regex(context.schema, context.tokenizer_manager) { Ok(res) => res, Err(InvalidQuery::FieldDoesNotExist { .. }) if self.lenient => { return Ok(TantivyQueryAst::match_none()); From 97c9e0f29dd06dfefc418e05849e0a952db92553 Mon Sep 17 00:00:00 2001 From: trinity Pointard Date: Thu, 4 Dec 2025 18:43:26 +0100 Subject: [PATCH 2/8] implement CacheNode --- quickwit/Cargo.lock | 1 + quickwit/Cargo.toml | 1 + .../src/doc_mapper/doc_mapper_impl.rs | 6 +- .../quickwit-doc-mapper/src/doc_mapper/mod.rs | 6 +- .../src/actors/merge_executor.rs | 3 +- quickwit/quickwit-query/Cargo.toml | 1 + .../src/query_ast/cache_node.rs | 273 +++++++++++++++++- quickwit/quickwit-query/src/query_ast/mod.rs | 12 + .../src/query_ast/range_query.rs | 2 +- .../quickwit-query/src/query_ast/visitor.rs | 13 +- quickwit/quickwit-search/src/fetch_docs.rs | 2 +- quickwit/quickwit-search/src/leaf.rs | 6 +- quickwit/quickwit-search/src/root.rs | 5 +- .../src/delete_task_api/handler.rs | 2 +- 14 files changed, 310 insertions(+), 23 deletions(-) diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index 20b6f2a70b8..cf173c3bf2d 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -7370,6 +7370,7 @@ dependencies = [ "serde_json", "serde_with", "tantivy", + "tantivy-common", "tantivy-fst", "thiserror 2.0.17", "time", diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml index 810f1d1f7f2..40bdf05bee2 100644 --- a/quickwit/Cargo.toml +++ b/quickwit/Cargo.toml @@ -353,6 +353,7 @@ tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "618e3bd", d "zstd-compression", "columnar-zstd-compression", ] } +tantivy-common = { git = "https://github.com/quickwit-oss/tantivy/", rev = "25d44fcec8"} tantivy-fst = "0.5" # This is actually not used directly the goal is to fix the version diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs index 04379de3631..e9bbfde0860 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs @@ -636,11 +636,11 @@ impl DocMapper { pub fn query( &self, split_schema: Schema, - query_ast: &QueryAst, + query_ast: QueryAst, with_validation: bool, ) -> Result<(Box, WarmupInfo), QueryParserError> { build_query( - query_ast, + &query_ast, &BuildTantivyAstContext { schema: &split_schema, tokenizer_manager: self.tokenizer_manager(), @@ -2070,7 +2070,7 @@ mod tests { .parse_user_query(doc_mapper.default_search_fields()) .map_err(|err| err.to_string())?; let (query, _) = doc_mapper - .query(doc_mapper.schema(), &query_ast, true) + .query(doc_mapper.schema(), query_ast, true) .map_err(|err| err.to_string())?; Ok(format!("{query:?}")) } diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs index 2ee6119ddf2..81b0f5eca06 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs @@ -290,7 +290,7 @@ mod tests { } .parse_user_query(&[]) .unwrap(); - let (query, _) = doc_mapper.query(schema, &query_ast, true).unwrap(); + let (query, _) = doc_mapper.query(schema, query_ast, true).unwrap(); assert_eq!( format!("{query:?}"), r#"TermQuery(Term(field=2, type=Json, path=toto.titi, type=Str, "hello"))"# @@ -304,7 +304,7 @@ mod tests { let query_ast = query_ast_from_user_text("toto.titi:hello", None) .parse_user_query(doc_mapper.default_search_fields()) .unwrap(); - let (query, _) = doc_mapper.query(schema, &query_ast, true).unwrap(); + let (query, _) = doc_mapper.query(schema, query_ast, true).unwrap(); assert_eq!( format!("{query:?}"), r#"TermQuery(Term(field=1, type=Json, path=toto.titi, type=Str, "hello"))"# @@ -318,7 +318,7 @@ mod tests { let query_ast = query_ast_from_user_text("toto:5", None) .parse_user_query(&[]) .unwrap(); - let (query, _) = doc_mapper.query(schema, &query_ast, true).unwrap(); + let (query, _) = doc_mapper.query(schema, query_ast, true).unwrap(); assert_eq!( format!("{query:?}"), r#"BooleanQuery { subqueries: [(Should, TermQuery(Term(field=1, type=Json, path=toto, type=I64, 5))), (Should, TermQuery(Term(field=1, type=Json, path=toto, type=Str, "5")))], minimum_number_should_match: 1 }"# diff --git a/quickwit/quickwit-indexing/src/actors/merge_executor.rs b/quickwit/quickwit-indexing/src/actors/merge_executor.rs index 4c1a5215752..4dcf3502913 100644 --- a/quickwit/quickwit-indexing/src/actors/merge_executor.rs +++ b/quickwit/quickwit-indexing/src/actors/merge_executor.rs @@ -533,8 +533,7 @@ impl MergeExecutor { "Delete all documents matched by query `{:?}`", parsed_query_ast ); - let (query, _) = - doc_mapper.query(union_index.schema(), &parsed_query_ast, false)?; + let (query, _) = doc_mapper.query(union_index.schema(), parsed_query_ast, false)?; index_writer.delete_query(query)?; } debug!("commit-delete-operations"); diff --git a/quickwit/quickwit-query/Cargo.toml b/quickwit/quickwit-query/Cargo.toml index b6f5f173a26..7dc0d056666 100644 --- a/quickwit/quickwit-query/Cargo.toml +++ b/quickwit/quickwit-query/Cargo.toml @@ -23,6 +23,7 @@ serde = { workspace = true } serde_json = { workspace = true } serde_with = { workspace = true } tantivy = { workspace = true } +tantivy-common = { workspace = true } tantivy-fst = { workspace = true } time = { workspace = true } thiserror = { workspace = true } diff --git a/quickwit/quickwit-query/src/query_ast/cache_node.rs b/quickwit/quickwit-query/src/query_ast/cache_node.rs index 68bcf41a413..4ee898f9d0a 100644 --- a/quickwit/quickwit-query/src/query_ast/cache_node.rs +++ b/quickwit/quickwit-query/src/query_ast/cache_node.rs @@ -12,22 +12,60 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::sync::Arc; + use serde::{Deserialize, Serialize}; use super::{BuildTantivyAst, BuildTantivyAstContext, TantivyQueryAst}; use crate::InvalidQuery; use crate::query_ast::QueryAst; -/// A node caching the posting list of the inner query. +/// A node caching the result of an inner query. /// /// This can be used when it's known that some sub-ast might appear in many queries, /// or that the same query might be run, with various aggregations. /// /// /!\ Sprinkling this everywhere can lead to performance degradations: the whole posting -/// list of the underlying query will need to be evaluated to build the cache. -#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Clone)] +/// list of the underlying query will need to be evaluated to build the cache, whereas it could +/// have been largely skipped if some other part of the query is very selective. +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct CacheNode { pub inner: Box, + #[serde(skip)] + pub state: CacheState, +} + +#[derive(Default, Clone)] +pub enum CacheState { + // This is the state a CacheNode should be before + #[default] + Uninitialized, + CacheHit(CacheEntry), + CacheMiss(CacheFiller), +} + +type Cache = Arc< + std::sync::Mutex< + std::collections::HashMap<(String, String), Vec<(SegmentId, tantivy_common::BitSet)>>, + >, +>; + +impl std::fmt::Debug for CacheState { + fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + CacheState::Uninitialized => fmt.debug_tuple("Uninitialized").finish(), + CacheState::CacheHit(_) => fmt.debug_tuple("CacheHit").finish_non_exhaustive(), + CacheState::CacheMiss(_) => fmt.debug_tuple("CacheMiss").finish_non_exhaustive(), + } + } +} + +// cache state shouldn't impact a CacheNode equality +impl Eq for CacheNode {} +impl PartialEq for CacheNode { + fn eq(&self, other: &Self) -> bool { + self.inner == other.inner + } } impl From for QueryAst { @@ -36,11 +74,238 @@ impl From for QueryAst { } } +impl CacheNode { + pub fn fill_cache_state(&mut self, cache: &Cache, split_id: String) { + let Ok(query) = serde_json::to_string(&self.inner) else { + return; + }; + if let Some(entries) = cache + .lock() + .unwrap() + .get(&(split_id.clone(), query.clone())) + { + self.state = CacheState::CacheHit(CacheEntry { + entries: entries.clone(), + }); + } else { + self.state = CacheState::CacheMiss(CacheFiller { + cache: cache.clone(), + split_id, + query, + }); + } + } +} + impl BuildTantivyAst for CacheNode { fn build_tantivy_ast_impl( &self, context: &BuildTantivyAstContext, ) -> Result { - self.inner.build_tantivy_ast_impl(context) + match &self.state { + CacheState::Uninitialized => self.inner.build_tantivy_ast_call(context), + CacheState::CacheHit(cache_entry) => Ok(CacheHitQuery { + cache_entry: cache_entry.clone(), + } + .into()), + CacheState::CacheMiss(cache_filler) => { + let tantivy_query: Box = self + .inner + .build_tantivy_ast_call(context)? + .simplify() + .into(); + Ok(CacheFillerQuery { + inner_query: Arc::new(tantivy_query), + cache_filler: cache_filler.clone(), + } + .into()) + } + } + } +} + +use tantivy::index::SegmentId; +use tantivy::query::{BitSetDocSet, EnableScoring, Explanation, Query, Scorer, Weight}; +use tantivy::{DocId, DocSet, Score, SegmentReader, TantivyError}; + +#[derive(Clone, Debug)] +pub struct CacheHitQuery { + cache_entry: CacheEntry, +} + +impl Query for CacheHitQuery { + fn weight(&self, enable_scoring: EnableScoring<'_>) -> tantivy::Result> { + if enable_scoring.is_scoring_enabled() { + Err(tantivy::TantivyError::InternalError( + "Predicate cache doesn't support scoring yet".to_string(), + )) + } else { + Ok(Box::new(CacheHitWeight { + cache_entry: self.cache_entry.clone(), + })) + } + } +} + +/// Weight associated with the `AllQuery` query. +pub struct CacheHitWeight { + cache_entry: CacheEntry, +} + +impl Weight for CacheHitWeight { + fn scorer(&self, reader: &SegmentReader, boost: Score) -> tantivy::Result> { + // we could try to run the query if for some reason we don't actually find an entry in + // cache, but that would have required loading stuff during warmup which we skipped. + // An error is the best we can do + let bitset = self + .cache_entry + .for_segment(reader.segment_id()) + .ok_or_else(|| TantivyError::InternalError("Segment not found in cache".to_string()))?; + let scorer = CacheHitScorer { + bitset: bitset.into(), + boost, + }; + Ok(Box::new(scorer)) + } + + fn explain(&self, reader: &SegmentReader, doc: DocId) -> tantivy::Result { + let mut scorer = self.scorer(reader, 1.0)?; + if scorer.seek(doc) == doc { + Ok(Explanation::new("CacheHitScorer", 1.0)) + } else { + Err(TantivyError::InvalidArgument( + "Document does not exist".to_string(), + )) + } + } +} + +pub struct CacheHitScorer { + bitset: BitSetDocSet, + boost: Score, +} + +impl DocSet for CacheHitScorer { + fn advance(&mut self) -> DocId { + self.bitset.advance() + } + + fn seek(&mut self, target: DocId) -> DocId { + self.bitset.seek(target) + } + + fn doc(&self) -> DocId { + self.bitset.doc() + } + + fn size_hint(&self) -> u32 { + self.bitset.size_hint() + } +} + +impl Scorer for CacheHitScorer { + fn score(&mut self) -> f32 { + self.boost + } +} + +#[derive(Clone)] +pub struct CacheEntry { + // TODO we probably want to use something more compact than a bitset in the future + entries: Vec<(SegmentId, tantivy_common::BitSet)>, +} + +impl std::fmt::Debug for CacheEntry { + fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + fmt.debug_map() + .entries(self.entries.iter().map(|entry| (&entry.0, "<..>"))) + .finish() + } +} + +impl CacheEntry { + fn for_segment(&self, segment_id: SegmentId) -> Option { + self.entries + .iter() + .find(|entry| entry.0 == segment_id) + .map(|entry| entry.1.clone()) + } +} + +#[derive(Clone)] +pub struct CacheFiller { + cache: Cache, + split_id: String, + query: String, +} + +impl std::fmt::Debug for CacheFiller { + fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + fmt.debug_struct("CacheFiller") + .field("split_id", &self.split_id) + .finish_non_exhaustive() + } +} + +impl CacheFiller { + fn fill_segment(&self, segment_id: SegmentId, value: tantivy_common::BitSet) { + let mut cache = self.cache.lock().unwrap(); + let entry = cache + .entry((self.split_id.clone(), self.query.clone())) + .or_default(); + if !entry + .iter() + .any(|(entry_segment, _)| *entry_segment == segment_id) + { + entry.push((segment_id, value)); + } + } +} + +#[derive(Clone, Debug)] +pub struct CacheFillerQuery { + inner_query: Arc, + cache_filler: CacheFiller, +} + +impl Query for CacheFillerQuery { + fn weight(&self, enable_scoring: EnableScoring<'_>) -> tantivy::Result> { + if enable_scoring.is_scoring_enabled() { + Err(tantivy::TantivyError::InternalError( + "Predicate cache doesn't support scoring yet".to_string(), + )) + } else { + Ok(Box::new(CacheFillerWeight { + inner_weight: self.inner_query.weight(enable_scoring)?, + cache_filler: self.cache_filler.clone(), + })) + } + } +} + +/// Weight associated with the `AllQuery` query. +pub struct CacheFillerWeight { + inner_weight: Box, + cache_filler: CacheFiller, +} + +impl Weight for CacheFillerWeight { + fn scorer(&self, reader: &SegmentReader, boost: Score) -> tantivy::Result> { + let mut bitset = tantivy_common::BitSet::with_max_value(reader.max_doc()); + let mut scorer = self.inner_weight.scorer(reader, boost)?; + while let doc_id @ 0..tantivy::TERMINATED = scorer.advance() { + bitset.insert(doc_id); + } + self.cache_filler + .fill_segment(reader.segment_id(), bitset.clone()); + CacheHitScorer { + bitset: bitset.into(), + boost, + }; + Ok(Box::new(scorer)) + } + + fn explain(&self, reader: &SegmentReader, doc: DocId) -> tantivy::Result { + self.inner_weight.explain(reader, doc) } } diff --git a/quickwit/quickwit-query/src/query_ast/mod.rs b/quickwit/quickwit-query/src/query_ast/mod.rs index 918e04ecd60..3032759b20c 100644 --- a/quickwit/quickwit-query/src/query_ast/mod.rs +++ b/quickwit/quickwit-query/src/query_ast/mod.rs @@ -122,6 +122,10 @@ impl QueryAst { let inner = cache_node.inner.parse_user_query(default_search_fields)?; Ok(CacheNode { inner: Box::new(inner), + // inner got modified, the result is supposed to be equivalent, but to be safe, + // lets reinitialize the cache in practice this function + // shouldn't ever be called after cache was resolved + state: cache_node::CacheState::Uninitialized, } .into()) } @@ -173,6 +177,14 @@ macro_rules! test_context { with_validation: true, } }; + ($schema:expr, validation=$validation:expr) => { + &$crate::query_ast::BuildTantivyAstContext { + schema: &$schema, + tokenizer_manager: &$crate::create_default_quickwit_tokenizer_manager(), + search_fields: &[], + with_validation: $validation, + } + }; } pub use test_context; diff --git a/quickwit/quickwit-query/src/query_ast/range_query.rs b/quickwit/quickwit-query/src/query_ast/range_query.rs index 0854fa44cf0..9494d865610 100644 --- a/quickwit/quickwit-query/src/query_ast/range_query.rs +++ b/quickwit/quickwit-query/src/query_ast/range_query.rs @@ -372,7 +372,7 @@ mod tests { // without validation assert_eq!( range_query - .build_tantivy_ast_call(test_context!(schema),) + .build_tantivy_ast_call(test_context!(schema, validation = false),) .unwrap() .const_predicate(), Some(MatchAllOrNone::MatchNone) diff --git a/quickwit/quickwit-query/src/query_ast/visitor.rs b/quickwit/quickwit-query/src/query_ast/visitor.rs index 0211205c02a..493ad11842e 100644 --- a/quickwit/quickwit-query/src/query_ast/visitor.rs +++ b/quickwit/quickwit-query/src/query_ast/visitor.rs @@ -113,8 +113,8 @@ pub trait QueryAstVisitor<'a> { Ok(()) } - fn visit_cache_node(&mut self, _cache_node: &'a CacheNode) -> Result<(), Self::Err> { - Ok(()) + fn visit_cache_node(&mut self, cache_node: &'a CacheNode) -> Result<(), Self::Err> { + self.visit(&cache_node.inner) } } @@ -247,6 +247,13 @@ pub trait QueryAstTransformer { &mut self, cache_node: CacheNode, ) -> Result, Self::Err> { - Ok(Some(QueryAst::Cache(cache_node))) + self.transform(*cache_node.inner).map(|maybe_ast| { + maybe_ast.map(|inner| { + QueryAst::Cache(CacheNode { + inner: Box::new(inner), + state: Default::default(), + }) + }) + }) } } diff --git a/quickwit/quickwit-search/src/fetch_docs.rs b/quickwit/quickwit-search/src/fetch_docs.rs index d3dd0d3082d..59d6e90e06e 100644 --- a/quickwit/quickwit-search/src/fetch_docs.rs +++ b/quickwit/quickwit-search/src/fetch_docs.rs @@ -304,7 +304,7 @@ async fn create_fields_snippet_generator( let schema = searcher.schema(); let query_ast_resolved = serde_json::from_str(&snippet_request.query_ast_resolved) .context("failed to deserialize QueryAst")?; - let (query, _) = doc_mapper.query(schema.clone(), &query_ast_resolved, false)?; + let (query, _) = doc_mapper.query(schema.clone(), query_ast_resolved, false)?; let mut snippet_generators = HashMap::new(); for field_name in &snippet_request.snippet_fields { let field = schema.get_field(field_name)?; diff --git a/quickwit/quickwit-search/src/leaf.rs b/quickwit/quickwit-search/src/leaf.rs index f8aa60cf98f..815dce416c4 100644 --- a/quickwit/quickwit-search/src/leaf.rs +++ b/quickwit/quickwit-search/src/leaf.rs @@ -499,9 +499,9 @@ async fn leaf_search_single_split( make_collector_for_split(split_id.clone(), &search_request, agg_context_params)?; let split_schema = index.schema(); - let (query, mut warmup_info) = ctx - .doc_mapper - .query(split_schema.clone(), &query_ast, false)?; + let (query, mut warmup_info) = + ctx.doc_mapper + .query(split_schema.clone(), query_ast.clone(), false)?; let collector_warmup_info = collector.warmup_info(); warmup_info.merge(collector_warmup_info); diff --git a/quickwit/quickwit-search/src/root.rs b/quickwit/quickwit-search/src/root.rs index c2225b7524e..0d8f03d6c70 100644 --- a/quickwit/quickwit-search/src/root.rs +++ b/quickwit/quickwit-search/src/root.rs @@ -249,7 +249,7 @@ fn validate_request_and_build_metadata( )?; // Validates the query by effectively building it against the current schema. - doc_mapper.query(doc_mapper.schema(), &query_ast_resolved_for_index, true)?; + doc_mapper.query(doc_mapper.schema(), query_ast_resolved_for_index, true)?; let index_metadata_for_leaf_search = IndexMetasForLeafSearch { index_uri: index_metadata.index_uri().clone(), @@ -648,6 +648,7 @@ pub fn is_metadata_count_request(request: &SearchRequest) -> bool { /// /// The passed query_ast should match the serialized on in request. pub fn is_metadata_count_request_with_ast(query_ast: &QueryAst, request: &SearchRequest) -> bool { + // TODO detect Cache(MatchAll), Boost(MatchAll) and Bool{must/should:MatchAll} if query_ast != &QueryAst::MatchAll { return false; } @@ -1296,7 +1297,7 @@ pub async fn search_plan( let (query, mut warmup_info) = doc_mapper.query( doc_mapper.schema(), - &request_metadata.query_ast_resolved, + request_metadata.query_ast_resolved.clone(), true, )?; let merge_collector = make_merge_collector(&search_request, Default::default())?; diff --git a/quickwit/quickwit-serve/src/delete_task_api/handler.rs b/quickwit/quickwit-serve/src/delete_task_api/handler.rs index f1e305bd004..a582fb0a3f6 100644 --- a/quickwit/quickwit-serve/src/delete_task_api/handler.rs +++ b/quickwit/quickwit-serve/src/delete_task_api/handler.rs @@ -173,7 +173,7 @@ pub async fn post_delete_request( let query_ast: QueryAst = serde_json::from_str(&delete_search_request.query_ast) .map_err(|err| JanitorError::InvalidDeleteQuery(err.to_string()))?; doc_mapper - .query(doc_mapper.schema(), &query_ast, true) + .query(doc_mapper.schema(), query_ast, true) .map_err(|error| JanitorError::InvalidDeleteQuery(error.to_string()))?; let delete_task = metastore.create_delete_task(delete_query).await?; Ok(delete_task) From dc7fcdeb3e8056091835f5f17c7198b89acb4de5 Mon Sep 17 00:00:00 2001 From: trinity Pointard Date: Mon, 15 Dec 2025 19:33:35 +0100 Subject: [PATCH 3/8] implement predicate cache --- quickwit/Cargo.lock | 3 +- quickwit/Cargo.toml | 2 +- .../quickwit-config/src/node_config/mod.rs | 2 + .../src/node_config/serialize.rs | 1 + .../src/doc_mapper/doc_mapper_impl.rs | 7 +- .../quickwit-doc-mapper/src/doc_mapper/mod.rs | 6 +- .../quickwit-doc-mapper/src/query_builder.rs | 33 +- .../src/actors/merge_executor.rs | 3 +- quickwit/quickwit-query/Cargo.toml | 3 +- .../src/query_ast/cache_node.rs | 686 ++++++++++++++++-- quickwit/quickwit-query/src/query_ast/mod.rs | 2 +- .../quickwit-query/src/query_ast/visitor.rs | 28 +- quickwit/quickwit-search/src/fetch_docs.rs | 2 +- quickwit/quickwit-search/src/leaf.rs | 12 +- quickwit/quickwit-search/src/leaf_cache.rs | 46 ++ quickwit/quickwit-search/src/root.rs | 8 +- quickwit/quickwit-search/src/service.rs | 9 +- .../src/delete_task_api/handler.rs | 2 +- quickwit/quickwit-storage/src/metrics.rs | 2 + 19 files changed, 750 insertions(+), 107 deletions(-) diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index cf173c3bf2d..c948b190817 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -7355,6 +7355,7 @@ version = "0.8.0" dependencies = [ "anyhow", "base64 0.22.1", + "bitpacking", "criterion", "hex", "lindera-core", @@ -7364,13 +7365,13 @@ dependencies = [ "proptest", "quickwit-common", "quickwit-datetime", + "quickwit-proto", "regex", "rustc-hash", "serde", "serde_json", "serde_with", "tantivy", - "tantivy-common", "tantivy-fst", "thiserror 2.0.17", "time", diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml index 40bdf05bee2..c8e8370230d 100644 --- a/quickwit/Cargo.toml +++ b/quickwit/Cargo.toml @@ -87,6 +87,7 @@ async-trait = "0.1" backtrace = "0.3" base64 = "0.22" binggan = { version = "0.14" } +bitpacking = "0.9.2" bytes = { version = "1", features = ["serde"] } bytesize = { version = "1.3", features = ["serde"] } bytestring = "1.4" @@ -353,7 +354,6 @@ tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "618e3bd", d "zstd-compression", "columnar-zstd-compression", ] } -tantivy-common = { git = "https://github.com/quickwit-oss/tantivy/", rev = "25d44fcec8"} tantivy-fst = "0.5" # This is actually not used directly the goal is to fix the version diff --git a/quickwit/quickwit-config/src/node_config/mod.rs b/quickwit/quickwit-config/src/node_config/mod.rs index bb8a17daaeb..66217cabbd9 100644 --- a/quickwit/quickwit-config/src/node_config/mod.rs +++ b/quickwit/quickwit-config/src/node_config/mod.rs @@ -267,6 +267,7 @@ pub struct SearcherConfig { pub fast_field_cache_capacity: ByteSize, pub split_footer_cache_capacity: ByteSize, pub partial_request_cache_capacity: ByteSize, + pub predicate_cache_capacity: ByteSize, pub max_num_concurrent_split_searches: usize, // Deprecated: stream search requests are no longer supported. #[serde(alias = "max_num_concurrent_split_streams", default, skip_serializing)] @@ -324,6 +325,7 @@ impl Default for SearcherConfig { fast_field_cache_capacity: ByteSize::gb(1), split_footer_cache_capacity: ByteSize::mb(500), partial_request_cache_capacity: ByteSize::mb(64), + predicate_cache_capacity: ByteSize::mb(128), max_num_concurrent_split_searches: 100, _max_num_concurrent_split_streams: None, aggregation_memory_limit: ByteSize::mb(500), diff --git a/quickwit/quickwit-config/src/node_config/serialize.rs b/quickwit/quickwit-config/src/node_config/serialize.rs index b5f39ceb0ac..7986e1dbd13 100644 --- a/quickwit/quickwit-config/src/node_config/serialize.rs +++ b/quickwit/quickwit-config/src/node_config/serialize.rs @@ -661,6 +661,7 @@ mod tests { fast_field_cache_capacity: ByteSize::gb(10), split_footer_cache_capacity: ByteSize::gb(1), partial_request_cache_capacity: ByteSize::mb(64), + predicate_cache_capacity: ByteSize::mb(128), max_num_concurrent_split_searches: 150, _max_num_concurrent_split_streams: Some(serde::de::IgnoredAny), split_cache: None, diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs index e9bbfde0860..0ee9e439ba5 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs @@ -14,6 +14,7 @@ use std::collections::{BTreeMap, BTreeSet, HashSet}; use std::num::NonZeroU32; +use std::sync::Arc; use anyhow::{Context, bail}; use fnv::FnvHashSet; @@ -638,15 +639,17 @@ impl DocMapper { split_schema: Schema, query_ast: QueryAst, with_validation: bool, + cache_context: Option<(Arc, String)>, ) -> Result<(Box, WarmupInfo), QueryParserError> { build_query( - &query_ast, + query_ast, &BuildTantivyAstContext { schema: &split_schema, tokenizer_manager: self.tokenizer_manager(), search_fields: &self.default_search_field_names[..], with_validation, }, + cache_context, ) } @@ -2070,7 +2073,7 @@ mod tests { .parse_user_query(doc_mapper.default_search_fields()) .map_err(|err| err.to_string())?; let (query, _) = doc_mapper - .query(doc_mapper.schema(), query_ast, true) + .query(doc_mapper.schema(), query_ast, true, None) .map_err(|err| err.to_string())?; Ok(format!("{query:?}")) } diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs index 81b0f5eca06..c8b9f0bd432 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs @@ -290,7 +290,7 @@ mod tests { } .parse_user_query(&[]) .unwrap(); - let (query, _) = doc_mapper.query(schema, query_ast, true).unwrap(); + let (query, _) = doc_mapper.query(schema, query_ast, true, None).unwrap(); assert_eq!( format!("{query:?}"), r#"TermQuery(Term(field=2, type=Json, path=toto.titi, type=Str, "hello"))"# @@ -304,7 +304,7 @@ mod tests { let query_ast = query_ast_from_user_text("toto.titi:hello", None) .parse_user_query(doc_mapper.default_search_fields()) .unwrap(); - let (query, _) = doc_mapper.query(schema, query_ast, true).unwrap(); + let (query, _) = doc_mapper.query(schema, query_ast, true, None).unwrap(); assert_eq!( format!("{query:?}"), r#"TermQuery(Term(field=1, type=Json, path=toto.titi, type=Str, "hello"))"# @@ -318,7 +318,7 @@ mod tests { let query_ast = query_ast_from_user_text("toto:5", None) .parse_user_query(&[]) .unwrap(); - let (query, _) = doc_mapper.query(schema, query_ast, true).unwrap(); + let (query, _) = doc_mapper.query(schema, query_ast, true, None).unwrap(); assert_eq!( format!("{query:?}"), r#"BooleanQuery { subqueries: [(Should, TermQuery(Term(field=1, type=Json, path=toto, type=I64, 5))), (Should, TermQuery(Term(field=1, type=Json, path=toto, type=Str, "5")))], minimum_number_should_match: 1 }"# diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs index 03c71a856ad..9bc874975ce 100644 --- a/quickwit/quickwit-doc-mapper/src/query_builder.rs +++ b/quickwit/quickwit-doc-mapper/src/query_builder.rs @@ -15,10 +15,11 @@ use std::collections::{HashMap, HashSet}; use std::convert::Infallible; use std::ops::Bound; +use std::sync::Arc; use quickwit_query::query_ast::{ BuildTantivyAstContext, FieldPresenceQuery, FullTextQuery, PhrasePrefixQuery, QueryAst, - QueryAstVisitor, RangeQuery, RegexQuery, TermSetQuery, WildcardQuery, + QueryAstTransformer, QueryAstVisitor, RangeQuery, RegexQuery, TermSetQuery, WildcardQuery, }; use quickwit_query::tokenizers::TokenizerManager; use quickwit_query::{InvalidQuery, find_field_or_hit_dynamic}; @@ -154,14 +155,24 @@ impl<'a, 'f> QueryAstVisitor<'a> for ExistsQueryFastFields<'f> { /// Build a `Query` with field resolution & forbidding range clauses. pub(crate) fn build_query( - query_ast: &QueryAst, + query_ast: QueryAst, context: &BuildTantivyAstContext, + cache_context: Option<(Arc, String)>, ) -> Result<(Box, WarmupInfo), QueryParserError> { let mut fast_fields: HashSet = HashSet::new(); + let query_ast = if let Some((cache, split_id)) = cache_context { + let Ok(query_ast) = + quickwit_query::query_ast::CachePreIgniter { cache, split_id }.transform(query_ast); + // this transformer isn't supposed to ever remove a node + query_ast.unwrap_or(QueryAst::MatchAll) + } else { + query_ast + }; + let mut range_query_fields = RangeQueryFields::default(); // This cannot fail. The error type is Infallible. - let Ok(_) = range_query_fields.visit(query_ast); + let Ok(_) = range_query_fields.visit(&query_ast); let range_query_fast_fields = range_query_fields .range_query_field_names @@ -176,20 +187,20 @@ pub(crate) fn build_query( fields: &mut fast_fields, schema: context.schema.clone(), } - .visit(query_ast); + .visit(&query_ast); let Ok(_) = ExistsQueryFastFields { fields: &mut fast_fields, schema: context.schema.clone(), } - .visit(query_ast); + .visit(&query_ast); let query = query_ast.build_tantivy_query(context)?; - let term_set_query_fields = extract_term_set_query_fields(query_ast, context.schema)?; + let term_set_query_fields = extract_term_set_query_fields(&query_ast, context.schema)?; let (term_ranges_grouped_by_field, automatons_grouped_by_field) = extract_prefix_term_ranges_and_automaton( - query_ast, + &query_ast, context.schema, context.tokenizer_manager, )?; @@ -502,7 +513,7 @@ mod test { .parse_user_query(&[]) .map_err(|err| err.to_string())?; let schema = make_schema(dynamic_mode); - let query_result = build_query(&query_ast, quickwit_query::test_context!(schema)); + let query_result = build_query(query_ast, quickwit_query::test_context!(schema), None); query_result .map(|query| format!("{query:?}")) .map_err(|err| err.to_string()) @@ -877,8 +888,9 @@ mod test { .unwrap(); let (_, warmup_info) = build_query( - &query_with_set, + query_with_set, quickwit_query::test_context!(make_schema(true)), + None, ) .unwrap(); assert_eq!(warmup_info.term_dict_fields.len(), 1); @@ -889,8 +901,9 @@ mod test { ); let (_, warmup_info) = build_query( - &query_without_set, + query_without_set, quickwit_query::test_context!(make_schema(true)), + None, ) .unwrap(); assert!(warmup_info.term_dict_fields.is_empty()); diff --git a/quickwit/quickwit-indexing/src/actors/merge_executor.rs b/quickwit/quickwit-indexing/src/actors/merge_executor.rs index 4dcf3502913..d6ada9dde0f 100644 --- a/quickwit/quickwit-indexing/src/actors/merge_executor.rs +++ b/quickwit/quickwit-indexing/src/actors/merge_executor.rs @@ -533,7 +533,8 @@ impl MergeExecutor { "Delete all documents matched by query `{:?}`", parsed_query_ast ); - let (query, _) = doc_mapper.query(union_index.schema(), parsed_query_ast, false)?; + let (query, _) = + doc_mapper.query(union_index.schema(), parsed_query_ast, false, None)?; index_writer.delete_query(query)?; } debug!("commit-delete-operations"); diff --git a/quickwit/quickwit-query/Cargo.toml b/quickwit/quickwit-query/Cargo.toml index 7dc0d056666..8b92e30f9ff 100644 --- a/quickwit/quickwit-query/Cargo.toml +++ b/quickwit/quickwit-query/Cargo.toml @@ -13,6 +13,7 @@ license.workspace = true [dependencies] anyhow = { workspace = true } base64 = { workspace = true } +bitpacking = { workspace = true } hex = { workspace = true } lindera-core = { workspace = true, optional = true } lindera-dictionary = { workspace = true, optional = true } @@ -23,7 +24,6 @@ serde = { workspace = true } serde_json = { workspace = true } serde_with = { workspace = true } tantivy = { workspace = true } -tantivy-common = { workspace = true } tantivy-fst = { workspace = true } time = { workspace = true } thiserror = { workspace = true } @@ -32,6 +32,7 @@ whichlang = { workspace = true, optional = true } quickwit-common = { workspace = true } quickwit-datetime = { workspace = true } +quickwit-proto = { workspace = true } [dev-dependencies] criterion = { workspace = true } diff --git a/quickwit/quickwit-query/src/query_ast/cache_node.rs b/quickwit/quickwit-query/src/query_ast/cache_node.rs index 4ee898f9d0a..a8f523bafeb 100644 --- a/quickwit/quickwit-query/src/query_ast/cache_node.rs +++ b/quickwit/quickwit-query/src/query_ast/cache_node.rs @@ -14,6 +14,8 @@ use std::sync::Arc; +use bitpacking::{BitPacker, BitPacker1x}; +use quickwit_proto::types::SplitId; use serde::{Deserialize, Serialize}; use super::{BuildTantivyAst, BuildTantivyAstContext, TantivyQueryAst}; @@ -44,12 +46,6 @@ pub enum CacheState { CacheMiss(CacheFiller), } -type Cache = Arc< - std::sync::Mutex< - std::collections::HashMap<(String, String), Vec<(SegmentId, tantivy_common::BitSet)>>, - >, ->; - impl std::fmt::Debug for CacheState { fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { @@ -75,22 +71,23 @@ impl From for QueryAst { } impl CacheNode { - pub fn fill_cache_state(&mut self, cache: &Cache, split_id: String) { + pub fn new(ast: QueryAst) -> Self { + CacheNode { + inner: Box::new(ast), + state: CacheState::Uninitialized, + } + } + + pub fn fill_cache_state(&mut self, cache: &Arc, split_id: &str) { let Ok(query) = serde_json::to_string(&self.inner) else { return; }; - if let Some(entries) = cache - .lock() - .unwrap() - .get(&(split_id.clone(), query.clone())) - { - self.state = CacheState::CacheHit(CacheEntry { - entries: entries.clone(), - }); + if let Some((segment_id, hits)) = cache.get(split_id.to_string(), query.clone()) { + self.state = CacheState::CacheHit(CacheEntry { segment_id, hits }); } else { self.state = CacheState::CacheMiss(CacheFiller { cache: cache.clone(), - split_id, + split_id: split_id.to_string(), query, }); } @@ -124,8 +121,9 @@ impl BuildTantivyAst for CacheNode { } } +use tantivy::directory::OwnedBytes; use tantivy::index::SegmentId; -use tantivy::query::{BitSetDocSet, EnableScoring, Explanation, Query, Scorer, Weight}; +use tantivy::query::{EnableScoring, Explanation, Query, Scorer, Weight}; use tantivy::{DocId, DocSet, Score, SegmentReader, TantivyError}; #[derive(Clone, Debug)] @@ -157,21 +155,18 @@ impl Weight for CacheHitWeight { // we could try to run the query if for some reason we don't actually find an entry in // cache, but that would have required loading stuff during warmup which we skipped. // An error is the best we can do - let bitset = self + let mut hit_set = self .cache_entry .for_segment(reader.segment_id()) .ok_or_else(|| TantivyError::InternalError("Segment not found in cache".to_string()))?; - let scorer = CacheHitScorer { - bitset: bitset.into(), - boost, - }; - Ok(Box::new(scorer)) + hit_set.boost = boost; + Ok(Box::new(hit_set)) } fn explain(&self, reader: &SegmentReader, doc: DocId) -> tantivy::Result { let mut scorer = self.scorer(reader, 1.0)?; if scorer.seek(doc) == doc { - Ok(Explanation::new("CacheHitScorer", 1.0)) + Ok(Explanation::new("HitSet", 1.0)) } else { Err(TantivyError::InvalidArgument( "Document does not exist".to_string(), @@ -180,61 +175,199 @@ impl Weight for CacheHitWeight { } } -pub struct CacheHitScorer { - bitset: BitSetDocSet, +#[derive(Clone)] +pub struct CacheEntry { + segment_id: SegmentId, + hits: HitSet, +} + +impl std::fmt::Debug for CacheEntry { + fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + fmt.debug_struct("CacheEntry") + .field("segment_id", &self.segment_id) + .finish_non_exhaustive() + } +} + +impl CacheEntry { + fn for_segment(&self, segment_id: SegmentId) -> Option { + if segment_id == self.segment_id { + Some(self.hits.clone()) + } else { + None + } + } +} + +#[derive(Debug, Clone)] +pub struct HitSet { + // TODO we could make this an OwnedBytes to save on memcpy + buffer: OwnedBytes, + buffer_pos: usize, + previous_last_val: Option, + current_block: [u32; BitPacker1x::BLOCK_LEN], + block_pos: usize, + finished: bool, boost: Score, } -impl DocSet for CacheHitScorer { - fn advance(&mut self) -> DocId { - self.bitset.advance() +impl HitSet { + #[cfg(test)] + fn empty() -> Self { + Self::from_buffer(OwnedBytes::new(vec![0, 0, 0, 0])) + } + + /// Build a HitSet from its serialized form. + /// + /// The provided buffer must come from `HitSet::into_buffer` + pub fn from_buffer(buffer: OwnedBytes) -> Self { + let mut this = Self { + buffer, + // skip count + buffer_pos: 4, + previous_last_val: None, + current_block: [0; BitPacker1x::BLOCK_LEN], + // we set this to block_len minus 1 so we can call advance() once to initialize + // everything + block_pos: BitPacker1x::BLOCK_LEN - 1, + finished: false, + boost: 1.0, + }; + this.advance(); + this } - fn seek(&mut self, target: DocId) -> DocId { - self.bitset.seek(target) + /// Return a buffer representing the underlying data. + /// + /// This does not preserve where in the DocSet you are. + pub fn into_buffer(self) -> OwnedBytes { + self.buffer + } +} + +impl DocSet for HitSet { + fn advance(&mut self) -> DocId { + if self.block_pos == BitPacker1x::BLOCK_LEN - 1 { + self.block_pos = 0; + let Some(num_bits) = self.buffer.get(self.buffer_pos) else { + self.finished = true; + return self.doc(); + }; + self.buffer_pos += 1; + if *num_bits == 0x80 { + // final block, decode as many ids as possible + let mut i = 0; + for chunk in self.buffer[self.buffer_pos..].as_chunks().0 { + self.current_block[i] = u32::from_le_bytes(*chunk); + i += 1; + } + while i < BitPacker1x::BLOCK_LEN { + self.current_block[i] = tantivy::TERMINATED; + i += 1; + } + self.buffer_pos = self.buffer.len(); + } else { + self.buffer_pos += BitPacker1x.decompress_strictly_sorted( + self.previous_last_val, + &self.buffer[self.buffer_pos..], + &mut self.current_block, + *num_bits, + ); + self.previous_last_val = self.current_block.last().copied(); + } + } else { + self.block_pos += 1; + } + self.doc() } + // fn seek(&mut self, target: DocId) -> DocId { + // } + fn doc(&self) -> DocId { - self.bitset.doc() + if self.finished { + tantivy::TERMINATED + } else { + self.current_block[self.block_pos] + } } fn size_hint(&self) -> u32 { - self.bitset.size_hint() + u32::from_le_bytes(self.buffer[0..4].try_into().unwrap()) } } -impl Scorer for CacheHitScorer { +impl Scorer for HitSet { fn score(&mut self) -> f32 { self.boost } } -#[derive(Clone)] -pub struct CacheEntry { - // TODO we probably want to use something more compact than a bitset in the future - entries: Vec<(SegmentId, tantivy_common::BitSet)>, +pub struct HitSetBuilder { + count: u32, + current_block: [u32; BitPacker1x::BLOCK_LEN], + previous_last_val: Option, + buffer: Vec, } -impl std::fmt::Debug for CacheEntry { - fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - fmt.debug_map() - .entries(self.entries.iter().map(|entry| (&entry.0, "<..>"))) - .finish() +impl HitSetBuilder { + pub fn new() -> Self { + HitSetBuilder { + count: 0, + current_block: [0; BitPacker1x::BLOCK_LEN], + previous_last_val: None, + buffer: vec![0; 4], + } } -} -impl CacheEntry { - fn for_segment(&self, segment_id: SegmentId) -> Option { - self.entries - .iter() - .find(|entry| entry.0 == segment_id) - .map(|entry| entry.1.clone()) + fn in_block_pos(&self) -> usize { + (self.count % BitPacker1x::BLOCK_LEN as u32) as usize + } + + fn end_of_block(&self) -> bool { + self.in_block_pos() == (BitPacker1x::BLOCK_LEN - 1) + } + + fn flush_block(&mut self) { + let num_bits = + BitPacker1x.num_bits_strictly_sorted(self.previous_last_val, &self.current_block); + self.buffer.push(num_bits); + let current_buffer_pos = self.buffer.len(); + let new_end = current_buffer_pos + BitPacker1x::BLOCK_LEN / 8 * num_bits as usize; + self.buffer.resize(new_end, 0); + BitPacker1x.compress_strictly_sorted( + self.previous_last_val, + &self.current_block, + &mut self.buffer[current_buffer_pos..], + num_bits, + ); + self.previous_last_val = self.current_block.last().copied(); + } + + pub fn insert(&mut self, value: u32) { + self.current_block[self.in_block_pos()] = value; + if self.end_of_block() { + self.flush_block(); + } + self.count += 1; + } + + pub fn build(mut self) -> HitSet { + if self.in_block_pos() != 0 { + self.buffer.push(0x80); + for elem in &self.current_block[..self.in_block_pos()] { + self.buffer.extend_from_slice(&elem.to_le_bytes()); + } + } + // write back the count of items + self.buffer[0..4].copy_from_slice(&self.count.to_le_bytes()); + HitSet::from_buffer(OwnedBytes::new(self.buffer)) } } #[derive(Clone)] pub struct CacheFiller { - cache: Cache, + cache: Arc, split_id: String, query: String, } @@ -248,17 +381,9 @@ impl std::fmt::Debug for CacheFiller { } impl CacheFiller { - fn fill_segment(&self, segment_id: SegmentId, value: tantivy_common::BitSet) { - let mut cache = self.cache.lock().unwrap(); - let entry = cache - .entry((self.split_id.clone(), self.query.clone())) - .or_default(); - if !entry - .iter() - .any(|(entry_segment, _)| *entry_segment == segment_id) - { - entry.push((segment_id, value)); - } + fn fill_segment(&self, segment_id: SegmentId, value: HitSet) { + self.cache + .put(self.split_id.clone(), self.query.clone(), segment_id, value); } } @@ -281,6 +406,9 @@ impl Query for CacheFillerQuery { })) } } + fn query_terms<'a>(&'a self, visitor: &mut dyn FnMut(&'a tantivy::Term, bool)) { + self.inner_query.query_terms(visitor) + } } /// Weight associated with the `AllQuery` query. @@ -291,21 +419,435 @@ pub struct CacheFillerWeight { impl Weight for CacheFillerWeight { fn scorer(&self, reader: &SegmentReader, boost: Score) -> tantivy::Result> { - let mut bitset = tantivy_common::BitSet::with_max_value(reader.max_doc()); - let mut scorer = self.inner_weight.scorer(reader, boost)?; - while let doc_id @ 0..tantivy::TERMINATED = scorer.advance() { - bitset.insert(doc_id); + let mut hit_set_builder = HitSetBuilder::new(); + let mut scorer = self.inner_weight.scorer(reader, 1.0)?; + let mut doc_id = scorer.doc(); + while doc_id < tantivy::TERMINATED { + hit_set_builder.insert(doc_id); + doc_id = scorer.advance(); } + let mut hit_set = hit_set_builder.build(); self.cache_filler - .fill_segment(reader.segment_id(), bitset.clone()); - CacheHitScorer { - bitset: bitset.into(), - boost, - }; - Ok(Box::new(scorer)) + .fill_segment(reader.segment_id(), hit_set.clone()); + hit_set.boost = boost; + Ok(Box::new(hit_set)) } fn explain(&self, reader: &SegmentReader, doc: DocId) -> tantivy::Result { self.inner_weight.explain(reader, doc) } } + +pub struct CachePreIgniter { + pub cache: Arc, + pub split_id: String, +} + +impl crate::query_ast::QueryAstTransformer for CachePreIgniter { + type Err = std::convert::Infallible; + + fn transform_cache_node( + &mut self, + mut cache_node: CacheNode, + ) -> Result, Self::Err> { + cache_node.fill_cache_state(&self.cache, &self.split_id); + self.transform(*cache_node.inner).map(|maybe_ast| { + maybe_ast.map(|inner| { + QueryAst::Cache(CacheNode { + inner: Box::new(inner), + state: cache_node.state, + }) + }) + }) + } +} + +// we use a trait to dodge circular dependancies with quickwit-storage +pub trait PredicateCache: Send + Sync + 'static { + fn get(&self, split_id: SplitId, query_ast_json: String) -> Option<(SegmentId, HitSet)>; + + fn put(&self, split_id: SplitId, query_ast_json: String, segment: SegmentId, results: HitSet); +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::sync::Mutex; + + use tantivy::DocSet; + use tantivy::query::Query as TantivyQuery; + use tantivy::schema::{Schema, TEXT}; + + use super::*; + use crate::query_ast::{QueryAstTransformer, QueryAstVisitor, TermQuery}; + use crate::test_context; + + impl PredicateCache for Mutex> { + fn get(&self, split_id: SplitId, query_ast_json: String) -> Option<(SegmentId, HitSet)> { + self.lock() + .unwrap() + .get(&(split_id, query_ast_json)) + .cloned() + } + + fn put( + &self, + split_id: SplitId, + query_ast_json: String, + segment: SegmentId, + results: HitSet, + ) { + self.lock() + .unwrap() + .insert((split_id, query_ast_json), (segment, results)); + } + } + + #[track_caller] + fn test_hit_set_roundtrip_helper + Clone>(iter: I) { + let mut hitset_builder = HitSetBuilder::new(); + for i in iter.clone() { + hitset_builder.insert(i); + } + let mut hitset = hitset_builder.build(); + + for val in iter { + assert_eq!(hitset.doc(), val); + hitset.advance(); + } + for _ in 0..96 { + assert_eq!(hitset.doc(), tantivy::TERMINATED); + hitset.advance(); + } + } + + #[test] + fn test_hit_set_roundtrip() { + // this generate a pseurorandom strictrly increasing sequence + let generator = std::iter::successors(Some(0u32), |x| Some(x + x.trailing_ones() + 1)); + + // empty + test_hit_set_roundtrip_helper(generator.clone().take(0)); + // one item + test_hit_set_roundtrip_helper(generator.clone().take(1)); + test_hit_set_roundtrip_helper(generator.clone().skip(10).take(1)); + // partial block + test_hit_set_roundtrip_helper(generator.clone().take(24)); + test_hit_set_roundtrip_helper(generator.clone().skip(10).take(24)); + + // one block + test_hit_set_roundtrip_helper(generator.clone().take(32)); + test_hit_set_roundtrip_helper(generator.clone().skip(10).take(32)); + // two blocks + test_hit_set_roundtrip_helper(generator.clone().take(64)); + test_hit_set_roundtrip_helper(generator.clone().skip(10).take(64)); + + // many blocks, partial last block + test_hit_set_roundtrip_helper(generator.clone().take(1024 + 6)); + test_hit_set_roundtrip_helper(generator.clone().skip(10).take(1024 + 6)); + } + + #[test] + fn test_built_tantivy_ast() { + let mut schema_builder = Schema::builder(); + schema_builder.add_text_field("body", TEXT); + let schema = schema_builder.build(); + let term_query: QueryAst = TermQuery { + field: "body".to_string(), + value: "val".to_string(), + } + .into(); + let tantivy_term_query: Box = term_query + .build_tantivy_ast_impl(test_context!(schema)) + .unwrap() + .into(); + + { + let ast = CacheNode { + inner: Box::new(term_query.clone()), + state: CacheState::Uninitialized, + }; + let uninit_cache_query: Box = ast + .build_tantivy_ast_impl(test_context!(schema)) + .unwrap() + .into(); + assert_eq!( + format!("{uninit_cache_query:?}"), + format!("{tantivy_term_query:?}") + ); + } + + { + let cache_entry = CacheEntry { + segment_id: SegmentId::from_uuid_string("1686a000d4f7a91939d0e71df1646d7a") + .unwrap(), + hits: HitSet::empty(), + }; + let ast = CacheNode { + inner: Box::new(term_query.clone()), + state: CacheState::CacheHit(cache_entry), + }; + let cache_hit_query: Box = ast + .build_tantivy_ast_impl(test_context!(schema)) + .unwrap() + .into(); + + let debug_query = format!("{cache_hit_query:?}"); + assert!(debug_query.contains("CacheHitQuery")); + assert!(!debug_query.contains("TermQuery")); + } + { + let cache_filler = CacheFiller { + cache: Arc::new(Mutex::new(HashMap::new())), + split_id: "split_id".to_string(), + query: "{}".to_string(), + }; + let ast = CacheNode { + inner: Box::new(term_query.clone().into()), + state: CacheState::CacheMiss(cache_filler), + }; + let cache_miss_query: Box = ast + .build_tantivy_ast_impl(test_context!(schema)) + .unwrap() + .into(); + + let debug_query = format!("{cache_miss_query:?}"); + assert!(debug_query.contains("CacheFillerQuery")); + assert!(debug_query.contains(&format!("{tantivy_term_query:?}"))); + } + } + + struct FoundATermVisitor(bool); + impl QueryAstVisitor<'_> for FoundATermVisitor { + type Err = std::convert::Infallible; + fn visit_term(&mut self, _term: &TermQuery) -> Result<(), Self::Err> { + self.0 = true; + Ok(()) + } + } + + impl QueryAstTransformer for FoundATermVisitor { + type Err = std::convert::Infallible; + fn transform_term(&mut self, term: TermQuery) -> Result, Self::Err> { + self.0 = true; + Ok(Some(term.into())) + } + } + + #[test] + fn test_default_visitor_ignore_cached_node() { + let term_query: QueryAst = TermQuery { + field: "body".to_string(), + value: "val".to_string(), + } + .into(); + { + let ast = CacheNode { + inner: Box::new(term_query.clone()), + state: CacheState::Uninitialized, + } + .into(); + + let mut visitor = FoundATermVisitor(false); + visitor.visit(&ast).unwrap(); + assert!(visitor.0); + let mut visitor = FoundATermVisitor(false); + visitor.transform(ast).unwrap(); + assert!(visitor.0); + } + { + let cache_entry = CacheEntry { + segment_id: SegmentId::from_uuid_string("1686a000d4f7a91939d0e71df1646d7a") + .unwrap(), + hits: HitSet::empty(), + }; + let ast = CacheNode { + inner: Box::new(term_query.clone()), + state: CacheState::CacheHit(cache_entry), + } + .into(); + + let mut visitor = FoundATermVisitor(false); + visitor.visit(&ast).unwrap(); + assert!(!visitor.0); + let mut visitor = FoundATermVisitor(false); + visitor.transform(ast).unwrap(); + assert!(!visitor.0); + } + { + let cache_filler = CacheFiller { + cache: Arc::new(Mutex::new(HashMap::new())), + split_id: "split_id".to_string(), + query: "{}".to_string(), + }; + let ast = CacheNode { + inner: Box::new(term_query.clone().into()), + state: CacheState::CacheMiss(cache_filler), + } + .into(); + + let mut visitor = FoundATermVisitor(false); + visitor.visit(&ast).unwrap(); + assert!(visitor.0); + let mut visitor = FoundATermVisitor(false); + visitor.transform(ast).unwrap(); + assert!(visitor.0); + } + } + + #[test] + fn test_cache_preigniter_fills_cache() { + let term_query: QueryAst = TermQuery { + field: "body".to_string(), + value: "val".to_string(), + } + .into(); + let cache_node = CacheNode { + inner: Box::new(term_query.clone()), + state: CacheState::Uninitialized, + }; + let query_json = serde_json::to_string(&cache_node.inner).unwrap(); + let ast: QueryAst = cache_node.into(); + let cache = Arc::new(Mutex::new(HashMap::new())); + cache.put( + "split_2".to_string(), + query_json, + SegmentId::from_uuid_string("1686a000d4f7a91939d0e71df1646d7a").unwrap(), + HitSet::empty(), + ); + + { + let mut pre_igniter = CachePreIgniter { + cache: cache.clone(), + split_id: "split_1".to_string(), + }; + let filled = pre_igniter.transform(ast.clone()).unwrap().unwrap(); + assert!(matches!( + filled, + QueryAst::Cache(CacheNode { + state: CacheState::CacheMiss(_), + .. + }) + )); + } + + { + let mut pre_igniter = CachePreIgniter { + cache: cache.clone(), + split_id: "split_2".to_string(), + }; + let filled = pre_igniter.transform(ast.clone()).unwrap().unwrap(); + assert!(matches!( + filled, + QueryAst::Cache(CacheNode { + state: CacheState::CacheHit(_), + .. + }) + )); + } + } + + #[test] + fn test_cache_hit_returns_correct_docs() { + let mut schema_builder = Schema::builder(); + let host_field = schema_builder.add_text_field("host", TEXT); + let schema = schema_builder.build(); + let index = tantivy::IndexBuilder::new() + .schema(schema.clone()) + .create_in_ram() + .unwrap(); + let mut index_writer = index.writer_with_num_threads(1, 20_000_000).unwrap(); + for count in 1..13 { + let mut doc = tantivy::TantivyDocument::default(); + doc.add_text(host_field, format!("host_{count}")); + for _ in 0..count { + index_writer.add_document(doc.clone()).unwrap(); + } + } + index_writer.commit().unwrap(); + let searcher = index.reader().unwrap().searcher(); + let segment_id = searcher.segment_readers()[0].segment_id(); + + let generator = + std::iter::successors(Some(0u32), |x| Some(x + x.trailing_ones() + 1)).take(500); + let mut hitset_builder = HitSetBuilder::new(); + for i in generator { + hitset_builder.insert(i); + } + let hitset = hitset_builder.build(); + + // this query isn't even valid for that split, but that's not relevant as it won't get run + let term_query: QueryAst = TermQuery { + field: "body".to_string(), + value: "val".to_string(), + } + .into(); + let cache_entry = CacheEntry { + segment_id, + hits: hitset, + }; + let ast = CacheNode { + inner: Box::new(term_query.clone()), + state: CacheState::CacheHit(cache_entry), + }; + let cache_hit_query: Box = ast + .build_tantivy_ast_impl(test_context!(schema)) + .unwrap() + .into(); + + assert_eq!(cache_hit_query.count(&searcher).unwrap(), 500); + } + + #[test] + fn test_cache_miss_returns_correct_docs_and_fill_cache() { + let mut schema_builder = Schema::builder(); + let host_field = schema_builder.add_text_field("host", TEXT); + let schema = schema_builder.build(); + let index = tantivy::IndexBuilder::new() + .schema(schema.clone()) + .create_in_ram() + .unwrap(); + let mut index_writer = index.writer_with_num_threads(1, 20_000_000).unwrap(); + for count in 1..13 { + let mut doc = tantivy::TantivyDocument::default(); + doc.add_text(host_field, format!("host_{count}")); + for _ in 0..count { + index_writer.add_document(doc.clone()).unwrap(); + } + } + index_writer.commit().unwrap(); + let searcher = index.reader().unwrap().searcher(); + let segment_id = searcher.segment_readers()[0].segment_id(); + + let term_query: QueryAst = TermQuery { + field: "host".to_string(), + value: "11".to_string(), + } + .into(); + let cache = Arc::new(Mutex::new(HashMap::new())); + let cache_filler = CacheFiller { + cache: cache.clone(), + split_id: "split_id".to_string(), + query: "{some_query}".to_string(), + }; + let ast = CacheNode { + inner: Box::new(term_query.clone()), + state: CacheState::CacheMiss(cache_filler), + }; + let cache_hit_query: Box = ast + .build_tantivy_ast_impl(test_context!(schema)) + .unwrap() + .into(); + + assert_eq!(cache_hit_query.count(&searcher).unwrap(), 11); + let mut cache_entry = cache + .get("split_id".to_string(), "{some_query}".to_string()) + .unwrap(); + assert_eq!(cache_entry.0, segment_id); + let expected = (10 * 11 / 2)..(11 * 12 / 2); + for doc_id in expected { + assert_eq!(cache_entry.1.doc(), doc_id); + cache_entry.1.advance(); + } + } +} diff --git a/quickwit/quickwit-query/src/query_ast/mod.rs b/quickwit/quickwit-query/src/query_ast/mod.rs index 3032759b20c..268c0b7dc48 100644 --- a/quickwit/quickwit-query/src/query_ast/mod.rs +++ b/quickwit/quickwit-query/src/query_ast/mod.rs @@ -34,7 +34,7 @@ mod visitor; mod wildcard_query; pub use bool_query::BoolQuery; -pub use cache_node::CacheNode; +pub use cache_node::{CacheNode, CachePreIgniter, HitSet, PredicateCache}; pub use field_presence::FieldPresenceQuery; pub use full_text_query::{FullTextMode, FullTextParams, FullTextQuery}; pub use phrase_prefix_query::PhrasePrefixQuery; diff --git a/quickwit/quickwit-query/src/query_ast/visitor.rs b/quickwit/quickwit-query/src/query_ast/visitor.rs index 493ad11842e..49a6e7a260a 100644 --- a/quickwit/quickwit-query/src/query_ast/visitor.rs +++ b/quickwit/quickwit-query/src/query_ast/visitor.rs @@ -13,6 +13,7 @@ // limitations under the License. use crate::not_nan_f32::NotNaNf32; +use crate::query_ast::cache_node::CacheState; use crate::query_ast::field_presence::FieldPresenceQuery; use crate::query_ast::user_input_query::UserInputQuery; use crate::query_ast::{ @@ -114,7 +115,16 @@ pub trait QueryAstVisitor<'a> { } fn visit_cache_node(&mut self, cache_node: &'a CacheNode) -> Result<(), Self::Err> { - self.visit(&cache_node.inner) + // this goes a bit again how the rest of the default Visitor behave. The rational is that in + // practice, on a cache hit, we don't want to do anything with that node. + // On unitialized cache, any kind of data extract could make sense (extracing tags or + // timestamp bounds) On cache miss, we still want to know what we need for warmup. + // But on cache hit, it's too late to do optimisation based on tags and timestamps, and we + // don't want to warmup anything. + if !matches!(cache_node.state, CacheState::CacheHit(_)) { + self.visit(&cache_node.inner)? + } + Ok(()) } } @@ -247,13 +257,17 @@ pub trait QueryAstTransformer { &mut self, cache_node: CacheNode, ) -> Result, Self::Err> { - self.transform(*cache_node.inner).map(|maybe_ast| { - maybe_ast.map(|inner| { - QueryAst::Cache(CacheNode { - inner: Box::new(inner), - state: Default::default(), + if !matches!(cache_node.state, CacheState::CacheHit(_)) { + self.transform(*cache_node.inner).map(|maybe_ast| { + maybe_ast.map(|inner| { + QueryAst::Cache(CacheNode { + inner: Box::new(inner), + state: Default::default(), + }) }) }) - }) + } else { + Ok(Some(cache_node.into())) + } } } diff --git a/quickwit/quickwit-search/src/fetch_docs.rs b/quickwit/quickwit-search/src/fetch_docs.rs index 59d6e90e06e..9d225113249 100644 --- a/quickwit/quickwit-search/src/fetch_docs.rs +++ b/quickwit/quickwit-search/src/fetch_docs.rs @@ -304,7 +304,7 @@ async fn create_fields_snippet_generator( let schema = searcher.schema(); let query_ast_resolved = serde_json::from_str(&snippet_request.query_ast_resolved) .context("failed to deserialize QueryAst")?; - let (query, _) = doc_mapper.query(schema.clone(), query_ast_resolved, false)?; + let (query, _) = doc_mapper.query(schema.clone(), query_ast_resolved, false, None)?; let mut snippet_generators = HashMap::new(); for field_name in &snippet_request.snippet_fields { let field = schema.get_field(field_name)?; diff --git a/quickwit/quickwit-search/src/leaf.rs b/quickwit/quickwit-search/src/leaf.rs index 815dce416c4..2b310209e80 100644 --- a/quickwit/quickwit-search/src/leaf.rs +++ b/quickwit/quickwit-search/src/leaf.rs @@ -499,9 +499,15 @@ async fn leaf_search_single_split( make_collector_for_split(split_id.clone(), &search_request, agg_context_params)?; let split_schema = index.schema(); - let (query, mut warmup_info) = - ctx.doc_mapper - .query(split_schema.clone(), query_ast.clone(), false)?; + let (query, mut warmup_info) = ctx.doc_mapper.query( + split_schema.clone(), + query_ast.clone(), + false, + Some(( + ctx.searcher_context.predicate_cache.clone(), + split.split_id.clone(), + )), + )?; let collector_warmup_info = collector.warmup_info(); warmup_info.merge(collector_warmup_info); diff --git a/quickwit/quickwit-search/src/leaf_cache.rs b/quickwit/quickwit-search/src/leaf_cache.rs index e1f284214f7..abc756763ef 100644 --- a/quickwit/quickwit-search/src/leaf_cache.rs +++ b/quickwit/quickwit-search/src/leaf_cache.rs @@ -20,6 +20,7 @@ use quickwit_proto::search::{ }; use quickwit_proto::types::SplitId; use quickwit_storage::{MemorySizedCache, OwnedBytes}; +use tantivy::index::SegmentId; /// A cache to memoize `leaf_search_single_split` results. pub struct LeafSearchCache { @@ -187,6 +188,51 @@ impl RangeBounds for HalfOpenRange { } } +pub struct PredicateCacheImpl { + content: MemorySizedCache<(SplitId, String)>, +} + +impl PredicateCacheImpl { + pub fn new(capacity: usize) -> Self { + PredicateCacheImpl { + content: MemorySizedCache::with_capacity_in_bytes( + capacity, + &quickwit_storage::STORAGE_METRICS.predicate_cache, + ), + } + } +} + +impl quickwit_query::query_ast::PredicateCache for PredicateCacheImpl { + fn get( + &self, + split_id: SplitId, + query_ast_json: String, + ) -> Option<(SegmentId, quickwit_query::query_ast::HitSet)> { + let encoded_result = self.content.get(&(split_id, query_ast_json))?; + let (segment_id_bytes, hits_buffer) = encoded_result.split(32); + let segment_id = + SegmentId::from_uuid_string(str::from_utf8(&segment_id_bytes).ok()?).ok()?; + let hits = quickwit_query::query_ast::HitSet::from_buffer(hits_buffer); + Some((segment_id, hits)) + } + + fn put( + &self, + split_id: SplitId, + query_ast_json: String, + segment: SegmentId, + hits: quickwit_query::query_ast::HitSet, + ) { + let hits_buffer = hits.into_buffer(); + let mut buffer = Vec::with_capacity(32 + hits_buffer.len()); + buffer.extend_from_slice(segment.uuid_string().as_bytes()); + buffer.extend_from_slice(&hits_buffer); + self.content + .put((split_id, query_ast_json), OwnedBytes::new(buffer)); + } +} + #[cfg(test)] mod tests { use quickwit_proto::search::{ diff --git a/quickwit/quickwit-search/src/root.rs b/quickwit/quickwit-search/src/root.rs index 0d8f03d6c70..8dcf59d6a83 100644 --- a/quickwit/quickwit-search/src/root.rs +++ b/quickwit/quickwit-search/src/root.rs @@ -249,7 +249,12 @@ fn validate_request_and_build_metadata( )?; // Validates the query by effectively building it against the current schema. - doc_mapper.query(doc_mapper.schema(), query_ast_resolved_for_index, true)?; + doc_mapper.query( + doc_mapper.schema(), + query_ast_resolved_for_index, + true, + None, + )?; let index_metadata_for_leaf_search = IndexMetasForLeafSearch { index_uri: index_metadata.index_uri().clone(), @@ -1299,6 +1304,7 @@ pub async fn search_plan( doc_mapper.schema(), request_metadata.query_ast_resolved.clone(), true, + None, )?; let merge_collector = make_merge_collector(&search_request, Default::default())?; warmup_info.merge(merge_collector.warmup_info()); diff --git a/quickwit/quickwit-search/src/service.rs b/quickwit/quickwit-search/src/service.rs index 88dba389c83..e33b67339ff 100644 --- a/quickwit/quickwit-search/src/service.rs +++ b/quickwit/quickwit-search/src/service.rs @@ -34,7 +34,7 @@ use quickwit_storage::{ use tantivy::aggregation::AggregationLimitsGuard; use crate::leaf::multi_index_leaf_search; -use crate::leaf_cache::LeafSearchCache; +use crate::leaf_cache::{LeafSearchCache, PredicateCacheImpl}; use crate::list_fields::{leaf_list_fields, root_list_fields}; use crate::list_fields_cache::ListFieldsCache; use crate::list_terms::{leaf_list_terms, root_list_terms}; @@ -405,8 +405,10 @@ pub struct SearcherContext { pub search_permit_provider: SearchPermitProvider, /// Split footer cache. pub split_footer_cache: MemorySizedCache, - /// Recent sub-query cache. + /// Per-split and per-query cache. pub leaf_search_cache: LeafSearchCache, + /// Per-split and per-predicate cache. + pub predicate_cache: Arc, /// Search split cache. `None` if no split cache is configured. pub split_cache_opt: Option>, /// List fields cache. Caches the list fields response for a given split. @@ -446,6 +448,8 @@ impl SearcherContext { let storage_long_term_cache = Arc::new(QuickwitCache::new(fast_field_cache_capacity)); let leaf_search_cache = LeafSearchCache::new(searcher_config.partial_request_cache_capacity.as_u64() as usize); + let predicate_cache = + PredicateCacheImpl::new(searcher_config.predicate_cache_capacity.as_u64() as usize); let list_fields_cache = ListFieldsCache::new(searcher_config.partial_request_cache_capacity.as_u64() as usize); let aggregation_limit = AggregationLimitsGuard::new( @@ -456,6 +460,7 @@ impl SearcherContext { Self { searcher_config, fast_fields_cache: storage_long_term_cache, + predicate_cache: predicate_cache.into(), search_permit_provider: leaf_search_split_semaphore, split_footer_cache: global_split_footer_cache, leaf_search_cache, diff --git a/quickwit/quickwit-serve/src/delete_task_api/handler.rs b/quickwit/quickwit-serve/src/delete_task_api/handler.rs index a582fb0a3f6..33c295df008 100644 --- a/quickwit/quickwit-serve/src/delete_task_api/handler.rs +++ b/quickwit/quickwit-serve/src/delete_task_api/handler.rs @@ -173,7 +173,7 @@ pub async fn post_delete_request( let query_ast: QueryAst = serde_json::from_str(&delete_search_request.query_ast) .map_err(|err| JanitorError::InvalidDeleteQuery(err.to_string()))?; doc_mapper - .query(doc_mapper.schema(), query_ast, true) + .query(doc_mapper.schema(), query_ast, true, None) .map_err(|error| JanitorError::InvalidDeleteQuery(error.to_string()))?; let delete_task = metastore.create_delete_task(delete_query).await?; Ok(delete_task) diff --git a/quickwit/quickwit-storage/src/metrics.rs b/quickwit/quickwit-storage/src/metrics.rs index 43ef588e192..3cf72fa679b 100644 --- a/quickwit/quickwit-storage/src/metrics.rs +++ b/quickwit/quickwit-storage/src/metrics.rs @@ -24,6 +24,7 @@ use quickwit_common::metrics::{ pub struct StorageMetrics { pub shortlived_cache: CacheMetrics, pub partial_request_cache: CacheMetrics, + pub predicate_cache: CacheMetrics, pub fd_cache_metrics: CacheMetrics, pub fast_field_cache: CacheMetrics, pub split_footer_cache: CacheMetrics, @@ -92,6 +93,7 @@ impl Default for StorageMetrics { fast_field_cache: CacheMetrics::for_component("fastfields"), fd_cache_metrics: CacheMetrics::for_component("fd"), partial_request_cache: CacheMetrics::for_component("partial_request"), + predicate_cache: CacheMetrics::for_component("predicate"), searcher_split_cache: CacheMetrics::for_component("searcher_split"), shortlived_cache: CacheMetrics::for_component("shortlived"), split_footer_cache: CacheMetrics::for_component("splitfooter"), From 950f446f40d40af93d102ca2406a1c79b40e9786 Mon Sep 17 00:00:00 2001 From: trinity Pointard Date: Tue, 16 Dec 2025 11:17:44 +0100 Subject: [PATCH 4/8] make CacheFiller cloneable without Arc --- .../quickwit-query/src/query_ast/cache_node.rs | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/quickwit/quickwit-query/src/query_ast/cache_node.rs b/quickwit/quickwit-query/src/query_ast/cache_node.rs index a8f523bafeb..8a409392e32 100644 --- a/quickwit/quickwit-query/src/query_ast/cache_node.rs +++ b/quickwit/quickwit-query/src/query_ast/cache_node.rs @@ -112,7 +112,7 @@ impl BuildTantivyAst for CacheNode { .simplify() .into(); Ok(CacheFillerQuery { - inner_query: Arc::new(tantivy_query), + inner_query: Box::new(tantivy_query), cache_filler: cache_filler.clone(), } .into()) @@ -387,12 +387,21 @@ impl CacheFiller { } } -#[derive(Clone, Debug)] +#[derive(Debug)] pub struct CacheFillerQuery { - inner_query: Arc, + inner_query: Box, cache_filler: CacheFiller, } +impl Clone for CacheFillerQuery { + fn clone(&self) -> Self { + Self { + inner_query: self.inner_query.box_clone(), + cache_filler: self.cache_filler.clone(), + } + } +} + impl Query for CacheFillerQuery { fn weight(&self, enable_scoring: EnableScoring<'_>) -> tantivy::Result> { if enable_scoring.is_scoring_enabled() { From f34765c7bec6806dc8fbb50a40b228aa4c2f6c00 Mon Sep 17 00:00:00 2001 From: trinity Pointard Date: Tue, 16 Dec 2025 11:23:57 +0100 Subject: [PATCH 5/8] emit cachenode for search_after --- quickwit/quickwit-search/src/leaf.rs | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/quickwit/quickwit-search/src/leaf.rs b/quickwit/quickwit-search/src/leaf.rs index 2b310209e80..076ef019488 100644 --- a/quickwit/quickwit-search/src/leaf.rs +++ b/quickwit/quickwit-search/src/leaf.rs @@ -29,7 +29,9 @@ use quickwit_proto::search::{ CountHits, LeafSearchRequest, LeafSearchResponse, PartialHit, ResourceStats, SearchRequest, SortOrder, SortValue, SplitIdAndFooterOffsets, SplitSearchError, }; -use quickwit_query::query_ast::{BoolQuery, QueryAst, QueryAstTransformer, RangeQuery, TermQuery}; +use quickwit_query::query_ast::{ + BoolQuery, CacheNode, QueryAst, QueryAstTransformer, RangeQuery, TermQuery, +}; use quickwit_query::tokenizers::TokenizerManager; use quickwit_storage::{ BundleStorage, ByteRangeCache, MemorySizedCache, OwnedBytes, SplitCache, Storage, @@ -614,6 +616,18 @@ fn rewrite_request( remove_redundant_timestamp_range(search_request, split, timestamp_field); } rewrite_aggregation(search_request); + if search_request.search_after.is_some() { + add_top_cache_node(search_request) + } +} + +fn add_top_cache_node(search_request: &mut SearchRequest) { + let Ok(query_ast) = serde_json::from_str(search_request.query_ast.as_str()) else { + // an error will get raised a bit after anyway + return; + }; + let new_ast: QueryAst = CacheNode::new(query_ast).into(); + search_request.query_ast = serde_json::to_string(&new_ast).unwrap(); } /// Rewrite aggregation to make them easier to cache From 52a1a5ab987b4e28f14f192dcee4d541c5540fbe Mon Sep 17 00:00:00 2001 From: trinity Pointard Date: Tue, 16 Dec 2025 11:33:08 +0100 Subject: [PATCH 6/8] resolved todo --- quickwit/quickwit-query/src/query_ast/cache_node.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/quickwit/quickwit-query/src/query_ast/cache_node.rs b/quickwit/quickwit-query/src/query_ast/cache_node.rs index 8a409392e32..b77bb7ce879 100644 --- a/quickwit/quickwit-query/src/query_ast/cache_node.rs +++ b/quickwit/quickwit-query/src/query_ast/cache_node.rs @@ -201,7 +201,6 @@ impl CacheEntry { #[derive(Debug, Clone)] pub struct HitSet { - // TODO we could make this an OwnedBytes to save on memcpy buffer: OwnedBytes, buffer_pos: usize, previous_last_val: Option, From b54dcf05e981cfc90a5c6eff79f6f030b57d50ed Mon Sep 17 00:00:00 2001 From: trinity Pointard Date: Tue, 16 Dec 2025 16:21:43 +0100 Subject: [PATCH 7/8] cr --- .../quickwit-doc-mapper/src/doc_mapper/mod.rs | 2 +- .../quickwit-doc-mapper/src/query_builder.rs | 30 ++--- .../src/query_ast/cache_node.rs | 123 ++++++++++-------- .../src/query_ast/full_text_query.rs | 10 +- quickwit/quickwit-query/src/query_ast/mod.rs | 47 +++---- .../src/query_ast/range_query.rs | 14 +- .../src/query_ast/term_query.rs | 12 +- .../src/query_ast/user_input_query.rs | 8 +- .../quickwit-query/src/query_ast/visitor.rs | 19 ++- 9 files changed, 135 insertions(+), 130 deletions(-) diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs index c8b9f0bd432..bed4b18b90f 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs @@ -857,7 +857,7 @@ mod tests { field: "multilang".to_string(), value: "JPN:す".to_string(), }); - let (query, _) = doc_mapper.query(schema, &query_ast, false).unwrap(); + let (query, _) = doc_mapper.query(schema, query_ast, false, None).unwrap(); assert_eq!( format!("{query:?}"), r#"TermQuery(Term(field=2, type=Str, "JPN:す"))"# diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs index 9bc874975ce..5900b577795 100644 --- a/quickwit/quickwit-doc-mapper/src/query_builder.rs +++ b/quickwit/quickwit-doc-mapper/src/query_builder.rs @@ -17,6 +17,7 @@ use std::convert::Infallible; use std::ops::Bound; use std::sync::Arc; +use quickwit_proto::types::SplitId; use quickwit_query::query_ast::{ BuildTantivyAstContext, FieldPresenceQuery, FullTextQuery, PhrasePrefixQuery, QueryAst, QueryAstTransformer, QueryAstVisitor, RangeQuery, RegexQuery, TermSetQuery, WildcardQuery, @@ -157,13 +158,13 @@ impl<'a, 'f> QueryAstVisitor<'a> for ExistsQueryFastFields<'f> { pub(crate) fn build_query( query_ast: QueryAst, context: &BuildTantivyAstContext, - cache_context: Option<(Arc, String)>, + cache_context: Option<(Arc, SplitId)>, ) -> Result<(Box, WarmupInfo), QueryParserError> { let mut fast_fields: HashSet = HashSet::new(); let query_ast = if let Some((cache, split_id)) = cache_context { - let Ok(query_ast) = - quickwit_query::query_ast::CachePreIgniter { cache, split_id }.transform(query_ast); + let Ok(query_ast) = quickwit_query::query_ast::PredicateCacheInjector { cache, split_id } + .transform(query_ast); // this transformer isn't supposed to ever remove a node query_ast.unwrap_or(QueryAst::MatchAll) } else { @@ -426,8 +427,8 @@ mod test { use quickwit_common::shared_consts::FIELD_PRESENCE_FIELD_NAME; use quickwit_query::query_ast::{ - FullTextMode, FullTextParams, PhrasePrefixQuery, QueryAstVisitor, UserInputQuery, - query_ast_from_user_text, + BuildTantivyAstContext, FullTextMode, FullTextParams, PhrasePrefixQuery, QueryAstVisitor, + UserInputQuery, query_ast_from_user_text, }; use quickwit_query::{ BooleanOperand, MatchAllOrNone, create_default_quickwit_tokenizer_manager, @@ -513,7 +514,7 @@ mod test { .parse_user_query(&[]) .map_err(|err| err.to_string())?; let schema = make_schema(dynamic_mode); - let query_result = build_query(query_ast, quickwit_query::test_context!(schema), None); + let query_result = build_query(query_ast, &BuildTantivyAstContext::for_test(&schema), None); query_result .map(|query| format!("{query:?}")) .map_err(|err| err.to_string()) @@ -887,12 +888,10 @@ mod test { .parse_user_query(&[]) .unwrap(); - let (_, warmup_info) = build_query( - query_with_set, - quickwit_query::test_context!(make_schema(true)), - None, - ) - .unwrap(); + let schema = make_schema(true); + let context = BuildTantivyAstContext::for_test(&schema); + + let (_, warmup_info) = build_query(query_with_set, &context, None).unwrap(); assert_eq!(warmup_info.term_dict_fields.len(), 1); assert!( warmup_info @@ -900,12 +899,7 @@ mod test { .contains(&tantivy::schema::Field::from_field_id(2)) ); - let (_, warmup_info) = build_query( - query_without_set, - quickwit_query::test_context!(make_schema(true)), - None, - ) - .unwrap(); + let (_, warmup_info) = build_query(query_without_set, &context, None).unwrap(); assert!(warmup_info.term_dict_fields.is_empty()); } diff --git a/quickwit/quickwit-query/src/query_ast/cache_node.rs b/quickwit/quickwit-query/src/query_ast/cache_node.rs index b77bb7ce879..9c449aef4fb 100644 --- a/quickwit/quickwit-query/src/query_ast/cache_node.rs +++ b/quickwit/quickwit-query/src/query_ast/cache_node.rs @@ -206,10 +206,11 @@ pub struct HitSet { previous_last_val: Option, current_block: [u32; BitPacker1x::BLOCK_LEN], block_pos: usize, - finished: bool, boost: Score, } +const INCOMPLETE_BLOCK_MARKER: u8 = 0x80; + impl HitSet { #[cfg(test)] fn empty() -> Self { @@ -229,7 +230,6 @@ impl HitSet { // we set this to block_len minus 1 so we can call advance() once to initialize // everything block_pos: BitPacker1x::BLOCK_LEN - 1, - finished: false, boost: 1.0, }; this.advance(); @@ -242,57 +242,60 @@ impl HitSet { pub fn into_buffer(self) -> OwnedBytes { self.buffer } + + fn load_new_block(&mut self) { + let Some(num_bits) = self.buffer.get(self.buffer_pos) else { + // we ended iteration: simply fill the current_block full of TERMINATED + self.current_block = [tantivy::TERMINATED; 32]; + return; + }; + self.buffer_pos += 1; + if *num_bits == INCOMPLETE_BLOCK_MARKER { + // final block, decode as many ids as possible + let mut i = 0; + for chunk in self.buffer[self.buffer_pos..].as_chunks().0 { + self.current_block[i] = u32::from_ne_bytes(*chunk); + i += 1; + } + // pad with TERMINATED + while i < BitPacker1x::BLOCK_LEN { + self.current_block[i] = tantivy::TERMINATED; + i += 1; + } + self.buffer_pos = self.buffer.len(); + } else { + self.buffer_pos += BitPacker1x.decompress_strictly_sorted( + self.previous_last_val, + &self.buffer[self.buffer_pos..], + &mut self.current_block, + *num_bits, + ); + self.previous_last_val = self.current_block.last().copied(); + } + } } impl DocSet for HitSet { fn advance(&mut self) -> DocId { - if self.block_pos == BitPacker1x::BLOCK_LEN - 1 { - self.block_pos = 0; - let Some(num_bits) = self.buffer.get(self.buffer_pos) else { - self.finished = true; - return self.doc(); - }; - self.buffer_pos += 1; - if *num_bits == 0x80 { - // final block, decode as many ids as possible - let mut i = 0; - for chunk in self.buffer[self.buffer_pos..].as_chunks().0 { - self.current_block[i] = u32::from_le_bytes(*chunk); - i += 1; - } - while i < BitPacker1x::BLOCK_LEN { - self.current_block[i] = tantivy::TERMINATED; - i += 1; - } - self.buffer_pos = self.buffer.len(); - } else { - self.buffer_pos += BitPacker1x.decompress_strictly_sorted( - self.previous_last_val, - &self.buffer[self.buffer_pos..], - &mut self.current_block, - *num_bits, - ); - self.previous_last_val = self.current_block.last().copied(); - } - } else { - self.block_pos += 1; + self.block_pos += 1; + if let Some(doc_id) = self.current_block.get(self.block_pos) { + return *doc_id; } - self.doc() + self.load_new_block(); + self.block_pos = 0; + self.current_block[0] } // fn seek(&mut self, target: DocId) -> DocId { // } + #[inline(always)] fn doc(&self) -> DocId { - if self.finished { - tantivy::TERMINATED - } else { - self.current_block[self.block_pos] - } + self.current_block[self.block_pos] } fn size_hint(&self) -> u32 { - u32::from_le_bytes(self.buffer[0..4].try_into().unwrap()) + u32::from_ne_bytes(self.buffer[0..4].try_into().unwrap()) } } @@ -332,7 +335,7 @@ impl HitSetBuilder { BitPacker1x.num_bits_strictly_sorted(self.previous_last_val, &self.current_block); self.buffer.push(num_bits); let current_buffer_pos = self.buffer.len(); - let new_end = current_buffer_pos + BitPacker1x::BLOCK_LEN / 8 * num_bits as usize; + let new_end = current_buffer_pos + (BitPacker1x::BLOCK_LEN * num_bits as usize) / 8; self.buffer.resize(new_end, 0); BitPacker1x.compress_strictly_sorted( self.previous_last_val, @@ -353,13 +356,13 @@ impl HitSetBuilder { pub fn build(mut self) -> HitSet { if self.in_block_pos() != 0 { - self.buffer.push(0x80); + self.buffer.push(INCOMPLETE_BLOCK_MARKER); for elem in &self.current_block[..self.in_block_pos()] { - self.buffer.extend_from_slice(&elem.to_le_bytes()); + self.buffer.extend_from_slice(&elem.to_ne_bytes()); } } // write back the count of items - self.buffer[0..4].copy_from_slice(&self.count.to_le_bytes()); + self.buffer[0..4].copy_from_slice(&self.count.to_ne_bytes()); HitSet::from_buffer(OwnedBytes::new(self.buffer)) } } @@ -446,12 +449,17 @@ impl Weight for CacheFillerWeight { } } -pub struct CachePreIgniter { +/// A transformer that goes through a QueryAst, and change the state of all CacheNodes +/// to Hit/Miss based on the provided cache. +/// +/// This must be called for any CacheNode inside a QueryAst to do anything (though not calling +/// it isn't an error, it just means no cache will be used). +pub struct PredicateCacheInjector { pub cache: Arc, pub split_id: String, } -impl crate::query_ast::QueryAstTransformer for CachePreIgniter { +impl crate::query_ast::QueryAstTransformer for PredicateCacheInjector { type Err = std::convert::Infallible; fn transform_cache_node( @@ -487,8 +495,9 @@ mod tests { use tantivy::schema::{Schema, TEXT}; use super::*; - use crate::query_ast::{QueryAstTransformer, QueryAstVisitor, TermQuery}; - use crate::test_context; + use crate::query_ast::{ + BuildTantivyAstContext, QueryAstTransformer, QueryAstVisitor, TermQuery, + }; impl PredicateCache for Mutex> { fn get(&self, split_id: SplitId, query_ast_json: String) -> Option<(SegmentId, HitSet)> { @@ -566,7 +575,7 @@ mod tests { } .into(); let tantivy_term_query: Box = term_query - .build_tantivy_ast_impl(test_context!(schema)) + .build_tantivy_ast_impl(&BuildTantivyAstContext::for_test(&schema)) .unwrap() .into(); @@ -576,7 +585,7 @@ mod tests { state: CacheState::Uninitialized, }; let uninit_cache_query: Box = ast - .build_tantivy_ast_impl(test_context!(schema)) + .build_tantivy_ast_impl(&BuildTantivyAstContext::for_test(&schema)) .unwrap() .into(); assert_eq!( @@ -596,7 +605,7 @@ mod tests { state: CacheState::CacheHit(cache_entry), }; let cache_hit_query: Box = ast - .build_tantivy_ast_impl(test_context!(schema)) + .build_tantivy_ast_impl(&BuildTantivyAstContext::for_test(&schema)) .unwrap() .into(); @@ -611,11 +620,11 @@ mod tests { query: "{}".to_string(), }; let ast = CacheNode { - inner: Box::new(term_query.clone().into()), + inner: Box::new(term_query.clone()), state: CacheState::CacheMiss(cache_filler), }; let cache_miss_query: Box = ast - .build_tantivy_ast_impl(test_context!(schema)) + .build_tantivy_ast_impl(&BuildTantivyAstContext::for_test(&schema)) .unwrap() .into(); @@ -689,7 +698,7 @@ mod tests { query: "{}".to_string(), }; let ast = CacheNode { - inner: Box::new(term_query.clone().into()), + inner: Box::new(term_query.clone()), state: CacheState::CacheMiss(cache_filler), } .into(); @@ -725,7 +734,7 @@ mod tests { ); { - let mut pre_igniter = CachePreIgniter { + let mut pre_igniter = PredicateCacheInjector { cache: cache.clone(), split_id: "split_1".to_string(), }; @@ -740,7 +749,7 @@ mod tests { } { - let mut pre_igniter = CachePreIgniter { + let mut pre_igniter = PredicateCacheInjector { cache: cache.clone(), split_id: "split_2".to_string(), }; @@ -799,7 +808,7 @@ mod tests { state: CacheState::CacheHit(cache_entry), }; let cache_hit_query: Box = ast - .build_tantivy_ast_impl(test_context!(schema)) + .build_tantivy_ast_impl(&BuildTantivyAstContext::for_test(&schema)) .unwrap() .into(); @@ -843,7 +852,7 @@ mod tests { state: CacheState::CacheMiss(cache_filler), }; let cache_hit_query: Box = ast - .build_tantivy_ast_impl(test_context!(schema)) + .build_tantivy_ast_impl(&BuildTantivyAstContext::for_test(&schema)) .unwrap() .into(); diff --git a/quickwit/quickwit-query/src/query_ast/full_text_query.rs b/quickwit/quickwit-query/src/query_ast/full_text_query.rs index 158782b1dea..508b1eb03ab 100644 --- a/quickwit/quickwit-query/src/query_ast/full_text_query.rs +++ b/quickwit/quickwit-query/src/query_ast/full_text_query.rs @@ -305,7 +305,7 @@ mod tests { use crate::BooleanOperand; use crate::query_ast::tantivy_query_ast::TantivyQueryAst; - use crate::query_ast::{BuildTantivyAst, FullTextMode, FullTextQuery, test_context}; + use crate::query_ast::{BuildTantivyAst, BuildTantivyAstContext, FullTextMode, FullTextQuery}; #[test] fn test_zero_terms() { @@ -323,7 +323,7 @@ mod tests { schema_builder.add_text_field("body", TEXT); let schema = schema_builder.build(); let ast: TantivyQueryAst = full_text_query - .build_tantivy_ast_call(test_context!(schema)) + .build_tantivy_ast_call(&BuildTantivyAstContext::for_test(&schema)) .unwrap(); assert_eq!(ast.const_predicate(), Some(crate::MatchAllOrNone::MatchAll)); } @@ -344,7 +344,7 @@ mod tests { schema_builder.add_text_field("body", TEXT); let schema = schema_builder.build(); let ast: TantivyQueryAst = full_text_query - .build_tantivy_ast_call(test_context!(schema)) + .build_tantivy_ast_call(&BuildTantivyAstContext::for_test(&schema)) .unwrap(); let leaf = ast.as_leaf().unwrap(); assert_eq!( @@ -370,7 +370,7 @@ mod tests { schema_builder.add_text_field("body", TEXT); let schema = schema_builder.build(); let ast: TantivyQueryAst = full_text_query - .build_tantivy_ast_call(test_context!(schema)) + .build_tantivy_ast_call(&BuildTantivyAstContext::for_test(&schema)) .unwrap(); let leaf = ast.as_leaf().unwrap(); assert_eq!( @@ -395,7 +395,7 @@ mod tests { schema_builder.add_text_field("body", TEXT); let schema = schema_builder.build(); let ast: TantivyQueryAst = full_text_query - .build_tantivy_ast_call(test_context!(schema)) + .build_tantivy_ast_call(&BuildTantivyAstContext::for_test(&schema)) .unwrap(); let bool_query = ast.as_bool_query().unwrap(); assert_eq!(bool_query.must.len(), 2); diff --git a/quickwit/quickwit-query/src/query_ast/mod.rs b/quickwit/quickwit-query/src/query_ast/mod.rs index 268c0b7dc48..be2242e941c 100644 --- a/quickwit/quickwit-query/src/query_ast/mod.rs +++ b/quickwit/quickwit-query/src/query_ast/mod.rs @@ -34,7 +34,7 @@ mod visitor; mod wildcard_query; pub use bool_query::BoolQuery; -pub use cache_node::{CacheNode, CachePreIgniter, HitSet, PredicateCache}; +pub use cache_node::{CacheNode, HitSet, PredicateCache, PredicateCacheInjector}; pub use field_presence::FieldPresenceQuery; pub use full_text_query::{FullTextMode, FullTextParams, FullTextQuery}; pub use phrase_prefix_query::PhrasePrefixQuery; @@ -166,27 +166,27 @@ pub struct BuildTantivyAstContext<'a> { pub with_validation: bool, } -/// Shorthand to build a context from a schema in tests. -#[macro_export] -macro_rules! test_context { - ($schema:expr) => { - &$crate::query_ast::BuildTantivyAstContext { - schema: &$schema, - tokenizer_manager: &$crate::create_default_quickwit_tokenizer_manager(), +impl<'a> BuildTantivyAstContext<'a> { + pub fn for_test(schema: &'a TantivySchema) -> Self { + use once_cell::sync::Lazy; + + // we do that to have a TokenizerManager with a long enough lifetime + static DEFAULT_TOKENIZER_MANAGER: Lazy = + Lazy::new(crate::create_default_quickwit_tokenizer_manager); + + BuildTantivyAstContext { + schema, + tokenizer_manager: &DEFAULT_TOKENIZER_MANAGER, search_fields: &[], with_validation: true, } - }; - ($schema:expr, validation=$validation:expr) => { - &$crate::query_ast::BuildTantivyAstContext { - schema: &$schema, - tokenizer_manager: &$crate::create_default_quickwit_tokenizer_manager(), - search_fields: &[], - with_validation: $validation, - } - }; + } + + pub fn without_validation(mut self) -> Self { + self.with_validation = false; + self + } } -pub use test_context; trait BuildTantivyAst { /// Transforms a query Ast node into a TantivyQueryAst. @@ -226,7 +226,7 @@ impl BuildTantivyAst for QueryAst { QueryAst::MatchAll => Ok(TantivyQueryAst::match_all()), QueryAst::MatchNone => Ok(TantivyQueryAst::match_none()), QueryAst::Boost { boost, underlying } => { - let underlying = underlying.build_tantivy_ast_call(context)?; + let underlying = underlying.build_tantivy_ast_call(context)?.simplify(); let boost_query = TantivyBoostQuery::new(underlying.into(), (*boost).into()); Ok(boost_query.into()) } @@ -316,7 +316,8 @@ pub fn query_ast_from_user_text(user_text: &str, default_fields: Option Schema { @@ -321,7 +321,7 @@ mod tests { upper_bound: Bound::Included(upper_value), }; let tantivy_ast = range_query - .build_tantivy_ast_call(test_context!(schema)) + .build_tantivy_ast_call(&BuildTantivyAstContext::for_test(&schema)) .unwrap() .simplify(); let leaf = tantivy_ast.as_leaf().unwrap(); @@ -364,7 +364,7 @@ mod tests { }; // with validation let invalid_query: InvalidQuery = range_query - .build_tantivy_ast_call(test_context!(schema)) + .build_tantivy_ast_call(&BuildTantivyAstContext::for_test(&schema)) .unwrap_err(); assert!( matches!(invalid_query, InvalidQuery::FieldDoesNotExist { full_path } if full_path == "missing_field.toto") @@ -372,7 +372,9 @@ mod tests { // without validation assert_eq!( range_query - .build_tantivy_ast_call(test_context!(schema, validation = false),) + .build_tantivy_ast_call( + &BuildTantivyAstContext::for_test(&schema).without_validation() + ) .unwrap() .const_predicate(), Some(MatchAllOrNone::MatchNone) @@ -388,7 +390,7 @@ mod tests { }; let schema = make_schema(true); let tantivy_ast = range_query - .build_tantivy_ast_call(test_context!(schema)) + .build_tantivy_ast_call(&BuildTantivyAstContext::for_test(&schema)) .unwrap(); assert_eq!( format!("{tantivy_ast:?}"), @@ -411,7 +413,7 @@ mod tests { }; let schema = make_schema(false); let err = range_query - .build_tantivy_ast_call(test_context!(schema)) + .build_tantivy_ast_call(&BuildTantivyAstContext::for_test(&schema)) .unwrap_err(); assert!(matches!(err, InvalidQuery::SchemaError { .. })); } diff --git a/quickwit/quickwit-query/src/query_ast/term_query.rs b/quickwit/quickwit-query/src/query_ast/term_query.rs index e2c9fbfc8c0..f0a732b6f8c 100644 --- a/quickwit/quickwit-query/src/query_ast/term_query.rs +++ b/quickwit/quickwit-query/src/query_ast/term_query.rs @@ -118,7 +118,7 @@ impl From for HashMap { mod tests { use tantivy::schema::{INDEXED, Schema}; - use crate::query_ast::{BuildTantivyAst, TermQuery, test_context}; + use crate::query_ast::{BuildTantivyAst, BuildTantivyAstContext, TermQuery}; #[test] fn test_term_query_with_ipaddr_ipv4() { @@ -130,7 +130,7 @@ mod tests { schema_builder.add_ip_addr_field("ip", INDEXED); let schema = schema_builder.build(); let tantivy_query_ast = term_query - .build_tantivy_ast_call(test_context!(schema)) + .build_tantivy_ast_call(&BuildTantivyAstContext::for_test(&schema)) .unwrap(); let leaf = tantivy_query_ast.as_leaf().unwrap(); assert_eq!( @@ -149,7 +149,7 @@ mod tests { schema_builder.add_ip_addr_field("ip", INDEXED); let schema = schema_builder.build(); let tantivy_query_ast = term_query - .build_tantivy_ast_call(test_context!(schema)) + .build_tantivy_ast_call(&BuildTantivyAstContext::for_test(&schema)) .unwrap(); let leaf = tantivy_query_ast.as_leaf().unwrap(); assert_eq!( @@ -168,7 +168,7 @@ mod tests { schema_builder.add_bytes_field("bytes", INDEXED); let schema = schema_builder.build(); let tantivy_query_ast = term_query - .build_tantivy_ast_call(test_context!(schema)) + .build_tantivy_ast_call(&BuildTantivyAstContext::for_test(&schema)) .unwrap(); let leaf = tantivy_query_ast.as_leaf().unwrap(); assert_eq!( @@ -187,7 +187,7 @@ mod tests { schema_builder.add_bytes_field("bytes", INDEXED); let schema = schema_builder.build(); let tantivy_query_ast = term_query - .build_tantivy_ast_call(test_context!(schema)) + .build_tantivy_ast_call(&BuildTantivyAstContext::for_test(&schema)) .unwrap(); let leaf = tantivy_query_ast.as_leaf().unwrap(); assert_eq!( @@ -206,7 +206,7 @@ mod tests { schema_builder.add_date_field("timestamp", INDEXED); let schema = schema_builder.build(); let tantivy_query_ast = term_query - .build_tantivy_ast_call(test_context!(schema)) + .build_tantivy_ast_call(&BuildTantivyAstContext::for_test(&schema)) .unwrap(); let leaf = tantivy_query_ast.as_leaf().unwrap(); // The date should have been truncated to seconds precision. diff --git a/quickwit/quickwit-query/src/query_ast/user_input_query.rs b/quickwit/quickwit-query/src/query_ast/user_input_query.rs index 1a30b89d483..8f83461932e 100644 --- a/quickwit/quickwit-query/src/query_ast/user_input_query.rs +++ b/quickwit/quickwit-query/src/query_ast/user_input_query.rs @@ -318,8 +318,8 @@ fn convert_user_input_literal( #[cfg(test)] mod tests { use crate::query_ast::{ - BoolQuery, BuildTantivyAst, FullTextMode, FullTextQuery, QueryAst, UserInputQuery, - test_context, + BoolQuery, BuildTantivyAst, BuildTantivyAstContext, FullTextMode, FullTextQuery, QueryAst, + UserInputQuery, }; use crate::{BooleanOperand, InvalidQuery}; @@ -334,13 +334,13 @@ mod tests { let schema = tantivy::schema::Schema::builder().build(); { let invalid_query = user_input_query - .build_tantivy_ast_call(test_context!(schema)) + .build_tantivy_ast_call(&BuildTantivyAstContext::for_test(&schema)) .unwrap_err(); assert!(matches!(invalid_query, InvalidQuery::UserQueryNotParsed)); } { let invalid_query = user_input_query - .build_tantivy_ast_call(test_context!(schema)) + .build_tantivy_ast_call(&BuildTantivyAstContext::for_test(&schema)) .unwrap_err(); assert!(matches!(invalid_query, InvalidQuery::UserQueryNotParsed)); } diff --git a/quickwit/quickwit-query/src/query_ast/visitor.rs b/quickwit/quickwit-query/src/query_ast/visitor.rs index 49a6e7a260a..a6d2a5c07d4 100644 --- a/quickwit/quickwit-query/src/query_ast/visitor.rs +++ b/quickwit/quickwit-query/src/query_ast/visitor.rs @@ -257,17 +257,16 @@ pub trait QueryAstTransformer { &mut self, cache_node: CacheNode, ) -> Result, Self::Err> { - if !matches!(cache_node.state, CacheState::CacheHit(_)) { - self.transform(*cache_node.inner).map(|maybe_ast| { - maybe_ast.map(|inner| { - QueryAst::Cache(CacheNode { - inner: Box::new(inner), - state: Default::default(), - }) + if matches!(cache_node.state, CacheState::CacheHit(_)) { + return Ok(Some(cache_node.into())); + } + self.transform(*cache_node.inner).map(|maybe_ast| { + maybe_ast.map(|inner| { + QueryAst::Cache(CacheNode { + inner: Box::new(inner), + state: Default::default(), }) }) - } else { - Ok(Some(cache_node.into())) - } + }) } } From 8e0e861c537cea759bbe64d3198088d64bb94b1f Mon Sep 17 00:00:00 2001 From: trinity Pointard Date: Tue, 16 Dec 2025 16:49:34 +0100 Subject: [PATCH 8/8] cr part2 --- quickwit/Cargo.lock | 1 + quickwit/quickwit-config/src/node_config/mod.rs | 2 +- .../quickwit-config/src/node_config/serialize.rs | 2 +- quickwit/quickwit-query/Cargo.toml | 1 + quickwit/quickwit-query/src/query_ast/mod.rs | 13 +++++++++++++ quickwit/quickwit-search/src/leaf.rs | 2 ++ 6 files changed, 19 insertions(+), 2 deletions(-) diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index c948b190817..708c287cba7 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -7375,6 +7375,7 @@ dependencies = [ "tantivy-fst", "thiserror 2.0.17", "time", + "tracing", "whichlang", ] diff --git a/quickwit/quickwit-config/src/node_config/mod.rs b/quickwit/quickwit-config/src/node_config/mod.rs index 66217cabbd9..8217f8becb2 100644 --- a/quickwit/quickwit-config/src/node_config/mod.rs +++ b/quickwit/quickwit-config/src/node_config/mod.rs @@ -325,7 +325,7 @@ impl Default for SearcherConfig { fast_field_cache_capacity: ByteSize::gb(1), split_footer_cache_capacity: ByteSize::mb(500), partial_request_cache_capacity: ByteSize::mb(64), - predicate_cache_capacity: ByteSize::mb(128), + predicate_cache_capacity: ByteSize::mb(256), max_num_concurrent_split_searches: 100, _max_num_concurrent_split_streams: None, aggregation_memory_limit: ByteSize::mb(500), diff --git a/quickwit/quickwit-config/src/node_config/serialize.rs b/quickwit/quickwit-config/src/node_config/serialize.rs index 7986e1dbd13..6d6360163fc 100644 --- a/quickwit/quickwit-config/src/node_config/serialize.rs +++ b/quickwit/quickwit-config/src/node_config/serialize.rs @@ -661,7 +661,7 @@ mod tests { fast_field_cache_capacity: ByteSize::gb(10), split_footer_cache_capacity: ByteSize::gb(1), partial_request_cache_capacity: ByteSize::mb(64), - predicate_cache_capacity: ByteSize::mb(128), + predicate_cache_capacity: ByteSize::mb(256), max_num_concurrent_split_searches: 150, _max_num_concurrent_split_streams: Some(serde::de::IgnoredAny), split_cache: None, diff --git a/quickwit/quickwit-query/Cargo.toml b/quickwit/quickwit-query/Cargo.toml index 8b92e30f9ff..462331ee830 100644 --- a/quickwit/quickwit-query/Cargo.toml +++ b/quickwit/quickwit-query/Cargo.toml @@ -25,6 +25,7 @@ serde_json = { workspace = true } serde_with = { workspace = true } tantivy = { workspace = true } tantivy-fst = { workspace = true } +tracing = { workspace = true } time = { workspace = true } thiserror = { workspace = true } rustc-hash = { workspace = true } diff --git a/quickwit/quickwit-query/src/query_ast/mod.rs b/quickwit/quickwit-query/src/query_ast/mod.rs index be2242e941c..d51e36eb93a 100644 --- a/quickwit/quickwit-query/src/query_ast/mod.rs +++ b/quickwit/quickwit-query/src/query_ast/mod.rs @@ -120,6 +120,19 @@ impl QueryAst { } QueryAst::Cache(cache_node) => { let inner = cache_node.inner.parse_user_query(default_search_fields)?; + let uninitialized = + matches!(cache_node.state, cache_node::CacheState::Uninitialized); + debug_assert!( + uninitialized, + "QueryAst::parse_user_query called on initialized CacheNode, this is probably \ + a misstake" + ); + if !uninitialized { + tracing::warn!( + "QueryAst::parse_user_query called on initialized CacheNode, cache \ + discarded" + ); + } Ok(CacheNode { inner: Box::new(inner), // inner got modified, the result is supposed to be equivalent, but to be safe, diff --git a/quickwit/quickwit-search/src/leaf.rs b/quickwit/quickwit-search/src/leaf.rs index 076ef019488..abf31eadcb7 100644 --- a/quickwit/quickwit-search/src/leaf.rs +++ b/quickwit/quickwit-search/src/leaf.rs @@ -616,6 +616,8 @@ fn rewrite_request( remove_redundant_timestamp_range(search_request, split, timestamp_field); } rewrite_aggregation(search_request); + // we add a top level cache node when search_after is set, this won't help for this query (which + // is the 2nd in its series), but should speedup every other request that comes after if search_request.search_after.is_some() { add_top_cache_node(search_request) }