From fac7e5d00f5db61796c2644e556b00d2a8ceeaf8 Mon Sep 17 00:00:00 2001 From: ruo Date: Sat, 17 Jan 2026 03:13:19 -0800 Subject: [PATCH 1/6] feat(es-compat): add index_filter support for field capabilities API Implements index_filter parameter support for the ES-compatible _field_caps endpoint, allowing users to filter field capabilities based on document queries. Changes: - Add query_ast field to ListFieldsRequest and LeafListFieldsRequest protos - Parse index_filter from ES Query DSL and convert to QueryAst - Pass query_ast through to leaf nodes for future filtering support - Add unit tests for index_filter parsing - Add REST API integration tests Note: This implementation accepts and parses the index_filter parameter for API compatibility. Full split-level document filtering will be added as a follow-up enhancement. Closes #5693 Co-Authored-By: Claude Opus 4.5 Signed-off-by: ruo --- .../protos/quickwit/search.proto | 7 + .../src/codegen/quickwit/quickwit.search.rs | 8 + quickwit/quickwit-search/src/list_fields.rs | 17 ++ quickwit/quickwit-search/src/service.rs | 1 + .../model/field_capability.rs | 175 +++++++++++++++++- .../0001-field-capabilities.yaml | 120 ++++++++++++ 6 files changed, 327 insertions(+), 1 deletion(-) diff --git a/quickwit/quickwit-proto/protos/quickwit/search.proto b/quickwit/quickwit-proto/protos/quickwit/search.proto index ae3442fe1aa..06abc241aa0 100644 --- a/quickwit/quickwit-proto/protos/quickwit/search.proto +++ b/quickwit/quickwit-proto/protos/quickwit/search.proto @@ -125,6 +125,10 @@ message ListFieldsRequest { optional int64 start_timestamp = 3; optional int64 end_timestamp = 4; + // JSON-serialized QueryAst for index_filter support. + // When provided, only fields from documents matching this query are returned. + optional string query_ast = 5; + // Control if the request will fail if split_ids contains a split that does not exist. 
// optional bool fail_on_missing_index = 6; } @@ -142,6 +146,9 @@ message LeafListFieldsRequest { // Wildcard expressions are supported. repeated string fields = 4; + // JSON-serialized QueryAst for index_filter support. + // When provided, only fields from documents matching this query are returned. + optional string query_ast = 5; } message ListFieldsResponse { diff --git a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs index 1e933055cd3..4c0d3c6a042 100644 --- a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs +++ b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs @@ -70,6 +70,10 @@ pub struct ListFieldsRequest { pub start_timestamp: ::core::option::Option, #[prost(int64, optional, tag = "4")] pub end_timestamp: ::core::option::Option, + /// JSON-serialized QueryAst for index_filter support. + /// When provided, only fields from documents matching this query are returned. + #[prost(string, optional, tag = "5")] + pub query_ast: ::core::option::Option<::prost::alloc::string::String>, } #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] #[derive(Clone, PartialEq, ::prost::Message)] @@ -88,6 +92,10 @@ pub struct LeafListFieldsRequest { /// Wildcard expressions are supported. #[prost(string, repeated, tag = "4")] pub fields: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, + /// JSON-serialized QueryAst for index_filter support. + /// When provided, only fields from documents matching this query are returned. 
+ #[prost(string, optional, tag = "5")] + pub query_ast: ::core::option::Option<::prost::alloc::string::String>, } #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] #[derive(Clone, PartialEq, ::prost::Message)] diff --git a/quickwit/quickwit-search/src/list_fields.rs b/quickwit/quickwit-search/src/list_fields.rs index f4cf173fe08..7b65f3cde58 100644 --- a/quickwit/quickwit-search/src/list_fields.rs +++ b/quickwit/quickwit-search/src/list_fields.rs @@ -310,13 +310,29 @@ impl FieldPattern { } /// `leaf` step of list fields. +/// +/// If `query_ast` is provided (from ES-compatible `index_filter`), it indicates that +/// the caller wants to filter fields based on documents matching the query. +/// +/// Note: Full query-based filtering is not yet implemented. When `query_ast` is provided, +/// fields from all documents are currently returned. This matches the ES behavior of +/// returning field capabilities for indices where at least one document matches. pub async fn leaf_list_fields( index_id: IndexId, index_storage: Arc, searcher_context: &SearcherContext, split_ids: &[SplitIdAndFooterOffsets], field_patterns_str: &[String], + query_ast: Option<&str>, ) -> crate::Result { + // Log if index_filter was provided (for observability) + // Full query-based split filtering is a TODO for future enhancement + if query_ast.is_some() { + tracing::debug!( + "index_filter provided for field capabilities, but split-level filtering is not yet \ + implemented" + ); + } let field_patterns: Vec = field_patterns_str .iter() .map(|pattern_str| FieldPattern::from_str(pattern_str)) @@ -478,6 +494,7 @@ pub fn jobs_to_leaf_requests( index_uri: index_meta.index_uri.to_string(), fields: search_request_for_leaf.fields.clone(), split_offsets: job_group.into_iter().map(|job| job.offsets).collect(), + query_ast: search_request_for_leaf.query_ast.clone(), }; leaf_search_requests.push(leaf_search_request); Ok(()) diff --git a/quickwit/quickwit-search/src/service.rs 
b/quickwit/quickwit-search/src/service.rs index 55fe014cba7..7564c1f34f7 100644 --- a/quickwit/quickwit-search/src/service.rs +++ b/quickwit/quickwit-search/src/service.rs @@ -309,6 +309,7 @@ impl SearchService for SearchServiceImpl { &self.searcher_context, &split_ids[..], &list_fields_req.fields, + list_fields_req.query_ast.as_deref(), ) .await } diff --git a/quickwit/quickwit-serve/src/elasticsearch_api/model/field_capability.rs b/quickwit/quickwit-serve/src/elasticsearch_api/model/field_capability.rs index a382c541dc7..05d46c0a036 100644 --- a/quickwit/quickwit-serve/src/elasticsearch_api/model/field_capability.rs +++ b/quickwit/quickwit-serve/src/elasticsearch_api/model/field_capability.rs @@ -14,7 +14,10 @@ use std::collections::HashMap; +use warp::hyper::StatusCode; use quickwit_proto::search::{ListFieldType, ListFieldsEntryResponse, ListFieldsResponse}; +use quickwit_query::ElasticQueryDsl; +use quickwit_query::query_ast::QueryAst; use serde::{Deserialize, Serialize}; use super::ElasticsearchError; @@ -177,12 +180,182 @@ pub fn convert_to_es_field_capabilities_response( pub fn build_list_field_request_for_es_api( index_id_patterns: Vec, search_params: FieldCapabilityQueryParams, - _search_body: FieldCapabilityRequestBody, + search_body: FieldCapabilityRequestBody, ) -> Result { + // Parse index_filter if provided + let query_ast_json: Option = if search_body.index_filter.is_null() + || search_body.index_filter == serde_json::Value::Object(Default::default()) + { + None + } else { + // Parse ES Query DSL to internal QueryAst + let elastic_query_dsl: ElasticQueryDsl = + serde_json::from_value(search_body.index_filter).map_err(|err| { + ElasticsearchError::new( + StatusCode::BAD_REQUEST, + format!("Invalid index_filter: {err}"), + None, + ) + })?; + + let query_ast: QueryAst = elastic_query_dsl.try_into().map_err(|err: anyhow::Error| { + ElasticsearchError::new( + StatusCode::BAD_REQUEST, + format!("Failed to convert index_filter: {err}"), + None, + ) + })?; + 
+ Some(serde_json::to_string(&query_ast).expect("QueryAst should be JSON serializable")) + }; + Ok(quickwit_proto::search::ListFieldsRequest { index_id_patterns, fields: search_params.fields.unwrap_or_default(), start_timestamp: search_params.start_timestamp, end_timestamp: search_params.end_timestamp, + query_ast: query_ast_json, }) } + +#[cfg(test)] +mod tests { + use serde_json::json; + + use super::*; + + #[test] + fn test_build_list_field_request_empty_index_filter() { + let result = build_list_field_request_for_es_api( + vec!["test_index".to_string()], + FieldCapabilityQueryParams::default(), + FieldCapabilityRequestBody::default(), + ) + .unwrap(); + + assert_eq!(result.index_id_patterns, vec!["test_index".to_string()]); + assert!(result.query_ast.is_none()); + } + + #[test] + fn test_build_list_field_request_with_term_index_filter() { + let search_body = FieldCapabilityRequestBody { + index_filter: json!({ + "term": { + "status": "active" + } + }), + runtime_mappings: serde_json::Value::Null, + }; + + let result = build_list_field_request_for_es_api( + vec!["test_index".to_string()], + FieldCapabilityQueryParams::default(), + search_body, + ) + .unwrap(); + + assert_eq!(result.index_id_patterns, vec!["test_index".to_string()]); + assert!(result.query_ast.is_some()); + + // Verify the query_ast is valid JSON + let query_ast: serde_json::Value = + serde_json::from_str(&result.query_ast.unwrap()).unwrap(); + assert!(query_ast.is_object()); + } + + #[test] + fn test_build_list_field_request_with_bool_index_filter() { + let search_body = FieldCapabilityRequestBody { + index_filter: json!({ + "bool": { + "must": [ + { "term": { "status": "active" } } + ], + "filter": [ + { "range": { "age": { "gte": 18 } } } + ] + } + }), + runtime_mappings: serde_json::Value::Null, + }; + + let result = build_list_field_request_for_es_api( + vec!["test_index".to_string()], + FieldCapabilityQueryParams::default(), + search_body, + ) + .unwrap(); + + 
assert!(result.query_ast.is_some()); + } + + #[test] + fn test_build_list_field_request_with_invalid_index_filter() { + let search_body = FieldCapabilityRequestBody { + index_filter: json!({ + "invalid_query_type": { + "field": "value" + } + }), + runtime_mappings: serde_json::Value::Null, + }; + + let result = build_list_field_request_for_es_api( + vec!["test_index".to_string()], + FieldCapabilityQueryParams::default(), + search_body, + ); + + assert!(result.is_err()); + let err = result.unwrap_err(); + assert_eq!(err.status, StatusCode::BAD_REQUEST); + } + + #[test] + fn test_build_list_field_request_with_null_index_filter() { + let search_body = FieldCapabilityRequestBody { + index_filter: serde_json::Value::Null, + runtime_mappings: serde_json::Value::Null, + }; + + let result = build_list_field_request_for_es_api( + vec!["test_index".to_string()], + FieldCapabilityQueryParams::default(), + search_body, + ) + .unwrap(); + + assert!(result.query_ast.is_none()); + } + + #[test] + fn test_build_list_field_request_preserves_other_params() { + let search_params = FieldCapabilityQueryParams { + fields: Some(vec!["field1".to_string(), "field2".to_string()]), + start_timestamp: Some(1000), + end_timestamp: Some(2000), + ..Default::default() + }; + + let search_body = FieldCapabilityRequestBody { + index_filter: json!({ "match_all": {} }), + runtime_mappings: serde_json::Value::Null, + }; + + let result = build_list_field_request_for_es_api( + vec!["test_index".to_string()], + search_params, + search_body, + ) + .unwrap(); + + assert_eq!( + result.fields, + vec!["field1".to_string(), "field2".to_string()] + ); + assert_eq!(result.start_timestamp, Some(1000)); + assert_eq!(result.end_timestamp, Some(2000)); + assert!(result.query_ast.is_some()); + } +} diff --git a/quickwit/rest-api-tests/scenarii/es_field_capabilities/0001-field-capabilities.yaml b/quickwit/rest-api-tests/scenarii/es_field_capabilities/0001-field-capabilities.yaml index bd3cd917acd..0ff68f7d567 100644 
--- a/quickwit/rest-api-tests/scenarii/es_field_capabilities/0001-field-capabilities.yaml +++ b/quickwit/rest-api-tests/scenarii/es_field_capabilities/0001-field-capabilities.yaml @@ -334,3 +334,123 @@ engines: - elasticsearch endpoint: doesno*texist/_field_caps?fields=date status_code: 200 +--- +# Test _field_caps API with index_filter (term query) +method: [POST] +engines: + - quickwit +endpoint: fieldcaps/_field_caps?fields=* +json_body: + index_filter: + term: + name: "Alice" +expected: + indices: + - fieldcaps + fields: + name: + keyword: + type: keyword + metadata_field: false + searchable: true + aggregatable: true + text: + type: text + metadata_field: false + searchable: true + aggregatable: true +--- +# Test _field_caps API with index_filter (match_all query) +method: [POST] +engines: + - quickwit +endpoint: fieldcaps/_field_caps?fields=name +json_body: + index_filter: + match_all: {} +expected: + indices: + - fieldcaps + fields: + name: + keyword: + type: keyword + metadata_field: false + searchable: true + aggregatable: true + text: + type: text + metadata_field: false + searchable: true + aggregatable: true +--- +# Test _field_caps API with index_filter (bool query) +method: [POST] +engines: + - quickwit +endpoint: fieldcaps/_field_caps?fields=response,name +json_body: + index_filter: + bool: + must: + - term: + name: "Alice" + filter: + - range: + response: + gte: 100 +expected: + indices: + - fieldcaps + fields: + response: + long: + type: long + metadata_field: false + searchable: true + aggregatable: true + name: + keyword: + type: keyword + metadata_field: false + searchable: true + aggregatable: true + text: + type: text + metadata_field: false + searchable: true + aggregatable: true +--- +# Test _field_caps API with invalid index_filter +method: [POST] +engines: + - quickwit +endpoint: fieldcaps/_field_caps?fields=* +json_body: + index_filter: + invalid_query_type: + field: "value" +status_code: 400 +--- +# Test _field_caps API with empty 
index_filter (should work like no filter) +method: [POST] +engines: + - quickwit +endpoint: fieldcaps/_field_caps?fields=name +json_body: + index_filter: {} +expected: + indices: + - fieldcaps + fields: + name: + keyword: + type: keyword + metadata_field: false + searchable: true + aggregatable: true + text: + type: text + metadata_field: false + searchable: true + aggregatable: true From 9618c5479cdc6ded798b1d3df6599f8698a5682f Mon Sep 17 00:00:00 2001 From: ruo Date: Wed, 21 Jan 2026 00:54:14 -0800 Subject: [PATCH 2/6] feat(es-compat): implement split-level filtering for field_caps index_filter Address PR review comments for index_filter support in _field_caps API: - Extract `parse_index_filter_to_query_ast()` function with clean prototype - Implement split-level filtering via `split_matches_query()` using lightweight `query.count()` execution (no document materialization) - Add proper async handling with ByteRangeCache, warmup(), and run_cpu_intensive() for Quickwit's async-only storage - Add metastore-level pruning: - Tag extraction via `extract_tags_from_query()` - Time range extraction via `refine_start_end_timestamp_from_ast()` - Build DocMapper only when query_ast is provided (no overhead for common path without index_filter) - Fix REST API tests: use `json:` key (not `json_body:`), use lowercase term values to match tokenizer behavior - Update tests to run against both quickwit and elasticsearch engines Two-level filtering now implemented: 1. Metastore level: tags + time range from query AST 2. 
Split level: lightweight query execution for accurate filtering Co-Authored-By: Claude Opus 4.5 Signed-off-by: ruo --- .../protos/quickwit/search.proto | 6 +- .../src/codegen/quickwit/quickwit.search.rs | 6 +- quickwit/quickwit-search/src/list_fields.rs | 222 +++++++++++++++--- quickwit/quickwit-search/src/service.rs | 1 + .../model/field_capability.rs | 93 +++++--- .../0001-field-capabilities.yaml | 22 +- 6 files changed, 274 insertions(+), 76 deletions(-) diff --git a/quickwit/quickwit-proto/protos/quickwit/search.proto b/quickwit/quickwit-proto/protos/quickwit/search.proto index 06abc241aa0..4c0bccb2582 100644 --- a/quickwit/quickwit-proto/protos/quickwit/search.proto +++ b/quickwit/quickwit-proto/protos/quickwit/search.proto @@ -147,8 +147,12 @@ message LeafListFieldsRequest { repeated string fields = 4; // JSON-serialized QueryAst for index_filter support. - // When provided, only fields from documents matching this query are returned. + // When provided, only splits containing documents matching this query are included. optional string query_ast = 5; + + // JSON-serialized DocMapper for query execution. + // Required when query_ast is provided to build and execute the query. + optional string doc_mapper = 6; } message ListFieldsResponse { diff --git a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs index 4c0d3c6a042..1315b5b5599 100644 --- a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs +++ b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs @@ -93,9 +93,13 @@ pub struct LeafListFieldsRequest { #[prost(string, repeated, tag = "4")] pub fields: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, /// JSON-serialized QueryAst for index_filter support. - /// When provided, only fields from documents matching this query are returned. + /// When provided, only splits containing documents matching this query are included. 
#[prost(string, optional, tag = "5")] pub query_ast: ::core::option::Option<::prost::alloc::string::String>, + /// JSON-serialized DocMapper for query execution. + /// Required when query_ast is provided to build and execute the query. + #[prost(string, optional, tag = "6")] + pub doc_mapper: ::core::option::Option<::prost::alloc::string::String>, } #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] #[derive(Clone, PartialEq, ::prost::Message)] diff --git a/quickwit/quickwit-search/src/list_fields.rs b/quickwit/quickwit-search/src/list_fields.rs index 7b65f3cde58..e225617d0a8 100644 --- a/quickwit/quickwit-search/src/list_fields.rs +++ b/quickwit/quickwit-search/src/list_fields.rs @@ -24,6 +24,9 @@ use itertools::Itertools; use quickwit_common::rate_limited_warn; use quickwit_common::shared_consts::{FIELD_PRESENCE_FIELD_NAME, SPLIT_FIELDS_FILE_NAME}; use quickwit_common::uri::Uri; +use quickwit_config::build_doc_mapper; +use quickwit_doc_mapper::DocMapper; +use quickwit_doc_mapper::tag_pruning::extract_tags_from_query; use quickwit_metastore::SplitMetadata; use quickwit_proto::metastore::MetastoreServiceClient; use quickwit_proto::search::{ @@ -31,9 +34,11 @@ use quickwit_proto::search::{ ListFieldsResponse, SplitIdAndFooterOffsets, deserialize_split_fields, }; use quickwit_proto::types::{IndexId, IndexUid}; -use quickwit_storage::Storage; +use quickwit_query::query_ast::QueryAst; +use quickwit_storage::{ByteRangeCache, Storage}; +use tantivy::ReloadPolicy; -use crate::leaf::open_split_bundle; +use crate::leaf::{open_index_with_caches, open_split_bundle, warmup}; use crate::search_job_placer::group_jobs_by_index_id; use crate::service::SearcherContext; use crate::{ @@ -309,36 +314,127 @@ impl FieldPattern { } } -/// `leaf` step of list fields. +/// Checks if any documents in the split match the query. +/// Returns true if at least one document matches, false otherwise. 
/// -/// If `query_ast` is provided (from ES-compatible `index_filter`), it indicates that -/// the caller wants to filter fields based on documents matching the query. +/// This is a lightweight query execution that only counts matches without +/// materializing documents, used for split-level filtering in field capabilities. +async fn split_matches_query( + searcher_context: &SearcherContext, + index_storage: Arc, + split: &SplitIdAndFooterOffsets, + doc_mapper: &DocMapper, + query_ast: &QueryAst, +) -> crate::Result { + let byte_range_cache = + ByteRangeCache::with_infinite_capacity(&quickwit_storage::STORAGE_METRICS.shortlived_cache); + // Open split with caches + let (index, _hot_directory) = open_index_with_caches( + searcher_context, + index_storage, + split, + Some(doc_mapper.tokenizer_manager()), + Some(byte_range_cache), + ) + .await?; + + // Create searcher with manual reload policy + let reader = index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into() + .map_err(|err| SearchError::Internal(format!("failed to create index reader: {err}")))?; + let searcher = reader.searcher(); + + // Build query from QueryAst + let (query, mut warmup_info) = doc_mapper + .query(searcher.schema().clone(), query_ast.clone(), false, None) + .map_err(|err| SearchError::InvalidQuery(format!("failed to build query: {err}")))?; + + // Warmup to ensure all bytes are fetched asynchronously before sync search + warmup_info.simplify(); + warmup(&searcher, &warmup_info) + .await + .map_err(|err| SearchError::Internal(format!("failed to warmup query: {err}")))?; + + // Check if any docs match (lightweight count) + let count = search_thread_pool() + .run_cpu_intensive(move || { + query + .count(&searcher) + .map_err(|err| SearchError::Internal(format!("failed to count matches: {err}"))) + }) + .await + .map_err(|_| SearchError::Internal("split matches query panicked".to_string()))??; + + Ok(count > 0) +} + +/// `leaf` step of list fields. 
/// -/// Note: Full query-based filtering is not yet implemented. When `query_ast` is provided, -/// fields from all documents are currently returned. This matches the ES behavior of -/// returning field capabilities for indices where at least one document matches. +/// Returns field metadata from the assigned splits. When `query_ast` and `doc_mapper_str` +/// are provided, splits are filtered to only include those containing at least one +/// matching document (lightweight query execution for split-level filtering). pub async fn leaf_list_fields( index_id: IndexId, index_storage: Arc, searcher_context: &SearcherContext, split_ids: &[SplitIdAndFooterOffsets], field_patterns_str: &[String], - query_ast: Option<&str>, + query_ast_str: Option<&str>, + doc_mapper_str: Option<&str>, ) -> crate::Result { - // Log if index_filter was provided (for observability) - // Full query-based split filtering is a TODO for future enhancement - if query_ast.is_some() { - tracing::debug!( - "index_filter provided for field capabilities, but split-level filtering is not yet \ - implemented" - ); - } let field_patterns: Vec = field_patterns_str .iter() .map(|pattern_str| FieldPattern::from_str(pattern_str)) .collect::>()?; - let single_split_list_fields_futures: Vec<_> = split_ids + // If no splits, return empty response + if split_ids.is_empty() { + return Ok(ListFieldsResponse { fields: Vec::new() }); + } + + // Filter splits based on query if both query_ast and doc_mapper are provided + let matching_splits: Vec<&SplitIdAndFooterOffsets> = match (query_ast_str, doc_mapper_str) { + (Some(ast_json), Some(mapper_json)) => { + let query_ast: QueryAst = serde_json::from_str(ast_json) + .map_err(|err| SearchError::InvalidQuery(err.to_string()))?; + let doc_mapper = crate::service::deserialize_doc_mapper(mapper_json)?; + + let split_match_tasks: Vec<_> = split_ids + .iter() + .map(|split| { + let index_storage = index_storage.clone(); + async { + split_matches_query( + searcher_context, + 
index_storage, + split, + &doc_mapper, + &query_ast, + ) + .await + } + }) + .collect(); + + let matches_vec = try_join_all(split_match_tasks).await?; + split_ids + .iter() + .zip(matches_vec) + .filter_map(|(split, matches)| matches.then_some(split)) + .collect() + } + _ => split_ids.iter().collect(), + }; + + // If no splits match, return empty response + if matching_splits.is_empty() { + return Ok(ListFieldsResponse { fields: Vec::new() }); + } + + // Get fields from matching splits + let single_split_list_fields_futures: Vec<_> = matching_splits .iter() .map(|split_id| { get_fields_from_split( @@ -391,13 +487,15 @@ pub async fn leaf_list_fields( Ok(ListFieldsResponse { fields }) } -/// Index metas needed for executing a leaf search request. +/// Index metas needed for executing a leaf list fields request. #[derive(Clone, Debug)] pub struct IndexMetasForLeafSearch { /// Index id. pub index_id: IndexId, /// Index URI. pub index_uri: Uri, + /// Serialized DocMapper for query execution (only set when query_ast is provided). + pub doc_mapper_str: Option, } /// Performs a distributed list fields request. 
@@ -415,29 +513,74 @@ pub async fn root_list_fields( if indexes_metadata.is_empty() { return Ok(ListFieldsResponse { fields: Vec::new() }); } - let index_uid_to_index_meta: HashMap = indexes_metadata - .iter() - .map(|index_metadata| { - let index_metadata_for_leaf_search = IndexMetasForLeafSearch { - index_uri: index_metadata.index_uri().clone(), - index_id: index_metadata.index_config.index_id.to_string(), - }; - - ( - index_metadata.index_uid.clone(), - index_metadata_for_leaf_search, + + // Build index metadata map, including doc_mapper if query_ast is provided + let has_query_ast = list_fields_req.query_ast.is_some(); + let mut index_uid_to_index_meta: HashMap = HashMap::new(); + let mut index_uids: Vec = Vec::new(); + let mut timestamp_field_opt: Option = None; + + for index_metadata in indexes_metadata { + // Only build doc_mapper when query_ast is provided (needed for split-level filtering) + let doc_mapper_str = if has_query_ast { + let doc_mapper = build_doc_mapper( + &index_metadata.index_config.doc_mapping, + &index_metadata.index_config.search_settings, ) - }) - .collect(); - let index_uids: Vec = indexes_metadata - .into_iter() - .map(|index_metadata| index_metadata.index_uid) - .collect(); + .map_err(|err| SearchError::Internal(format!("failed to build doc mapper: {err}")))?; + + // Capture timestamp field for time range extraction (use first index's field) + if timestamp_field_opt.is_none() { + timestamp_field_opt = doc_mapper.timestamp_field_name().map(|s| s.to_string()); + } + + Some(serde_json::to_string(&doc_mapper).map_err(|err| { + SearchError::Internal(format!("failed to serialize doc mapper: {err}")) + })?) 
+ } else { + None + }; + + let index_metadata_for_leaf_search = IndexMetasForLeafSearch { + index_uri: index_metadata.index_uri().clone(), + index_id: index_metadata.index_config.index_id.to_string(), + doc_mapper_str, + }; + + index_uids.push(index_metadata.index_uid.clone()); + index_uid_to_index_meta.insert( + index_metadata.index_uid.clone(), + index_metadata_for_leaf_search, + ); + } + + // Extract tags and refine time range from query_ast for split pruning + let mut start_timestamp = list_fields_req.start_timestamp; + let mut end_timestamp = list_fields_req.end_timestamp; + let tags_filter_opt = if let Some(ref query_ast_json) = list_fields_req.query_ast { + let query_ast: QueryAst = serde_json::from_str(query_ast_json) + .map_err(|err| SearchError::InvalidQuery(err.to_string()))?; + + // Refine time range from query AST if timestamp field is available + if let Some(ref timestamp_field) = timestamp_field_opt { + crate::root::refine_start_end_timestamp_from_ast( + &query_ast, + timestamp_field, + &mut start_timestamp, + &mut end_timestamp, + ); + } + + extract_tags_from_query(query_ast) + } else { + None + }; + let split_metadatas: Vec = list_relevant_splits( index_uids, - list_fields_req.start_timestamp, - list_fields_req.end_timestamp, - None, + start_timestamp, + end_timestamp, + tags_filter_opt, &mut metastore, ) .await?; @@ -495,6 +638,7 @@ pub fn jobs_to_leaf_requests( fields: search_request_for_leaf.fields.clone(), split_offsets: job_group.into_iter().map(|job| job.offsets).collect(), query_ast: search_request_for_leaf.query_ast.clone(), + doc_mapper: index_meta.doc_mapper_str.clone(), }; leaf_search_requests.push(leaf_search_request); Ok(()) diff --git a/quickwit/quickwit-search/src/service.rs b/quickwit/quickwit-search/src/service.rs index 7564c1f34f7..d63cd85cc08 100644 --- a/quickwit/quickwit-search/src/service.rs +++ b/quickwit/quickwit-search/src/service.rs @@ -310,6 +310,7 @@ impl SearchService for SearchServiceImpl { &split_ids[..], 
&list_fields_req.fields, list_fields_req.query_ast.as_deref(), + list_fields_req.doc_mapper.as_deref(), ) .await } diff --git a/quickwit/quickwit-serve/src/elasticsearch_api/model/field_capability.rs b/quickwit/quickwit-serve/src/elasticsearch_api/model/field_capability.rs index 05d46c0a036..7d1f650274a 100644 --- a/quickwit/quickwit-serve/src/elasticsearch_api/model/field_capability.rs +++ b/quickwit/quickwit-serve/src/elasticsearch_api/model/field_capability.rs @@ -14,11 +14,11 @@ use std::collections::HashMap; -use warp::hyper::StatusCode; use quickwit_proto::search::{ListFieldType, ListFieldsEntryResponse, ListFieldsResponse}; use quickwit_query::ElasticQueryDsl; use quickwit_query::query_ast::QueryAst; use serde::{Deserialize, Serialize}; +use warp::hyper::StatusCode; use super::ElasticsearchError; use super::search_query_params::*; @@ -176,38 +176,48 @@ pub fn convert_to_es_field_capabilities_response( FieldCapabilityResponse { indices, fields } } -#[allow(clippy::result_large_err)] -pub fn build_list_field_request_for_es_api( - index_id_patterns: Vec, - search_params: FieldCapabilityQueryParams, - search_body: FieldCapabilityRequestBody, -) -> Result { - // Parse index_filter if provided - let query_ast_json: Option = if search_body.index_filter.is_null() - || search_body.index_filter == serde_json::Value::Object(Default::default()) - { - None - } else { - // Parse ES Query DSL to internal QueryAst - let elastic_query_dsl: ElasticQueryDsl = - serde_json::from_value(search_body.index_filter).map_err(|err| { - ElasticsearchError::new( - StatusCode::BAD_REQUEST, - format!("Invalid index_filter: {err}"), - None, - ) - })?; - - let query_ast: QueryAst = elastic_query_dsl.try_into().map_err(|err: anyhow::Error| { +/// Parses an Elasticsearch index_filter JSON value into a Quickwit QueryAst. +/// +/// Returns `Ok(None)` if the index_filter is null or empty. +/// Returns `Ok(Some(QueryAst))` if the index_filter is valid. 
+/// Returns `Err` if the index_filter is invalid or cannot be converted. +pub fn parse_index_filter_to_query_ast( + index_filter: serde_json::Value, +) -> Result, ElasticsearchError> { + if index_filter.is_null() || index_filter == serde_json::Value::Object(Default::default()) { + return Ok(None); + } + + // Parse ES Query DSL to internal QueryAst + let elastic_query_dsl: ElasticQueryDsl = + serde_json::from_value(index_filter).map_err(|err| { ElasticsearchError::new( StatusCode::BAD_REQUEST, - format!("Failed to convert index_filter: {err}"), + format!("Invalid index_filter: {err}"), None, ) })?; - Some(serde_json::to_string(&query_ast).expect("QueryAst should be JSON serializable")) - }; + let query_ast: QueryAst = elastic_query_dsl.try_into().map_err(|err: anyhow::Error| { + ElasticsearchError::new( + StatusCode::BAD_REQUEST, + format!("Failed to convert index_filter: {err}"), + None, + ) + })?; + + Ok(Some(query_ast)) +} + +#[allow(clippy::result_large_err)] +pub fn build_list_field_request_for_es_api( + index_id_patterns: Vec, + search_params: FieldCapabilityQueryParams, + search_body: FieldCapabilityRequestBody, +) -> Result { + let query_ast = parse_index_filter_to_query_ast(search_body.index_filter)?; + let query_ast_json = query_ast + .map(|ast| serde_json::to_string(&ast).expect("QueryAst should be JSON serializable")); Ok(quickwit_proto::search::ListFieldsRequest { index_id_patterns, @@ -358,4 +368,33 @@ mod tests { assert_eq!(result.end_timestamp, Some(2000)); assert!(result.query_ast.is_some()); } + + #[test] + fn test_parse_index_filter_to_query_ast_null() { + let result = parse_index_filter_to_query_ast(serde_json::Value::Null).unwrap(); + assert!(result.is_none()); + } + + #[test] + fn test_parse_index_filter_to_query_ast_empty_object() { + let result = parse_index_filter_to_query_ast(json!({})).unwrap(); + assert!(result.is_none()); + } + + #[test] + fn test_parse_index_filter_to_query_ast_valid_term() { + let result = 
parse_index_filter_to_query_ast(json!({ + "term": { "status": "active" } + })) + .unwrap(); + assert!(result.is_some()); + } + + #[test] + fn test_parse_index_filter_to_query_ast_invalid() { + let result = parse_index_filter_to_query_ast(json!({ + "invalid_query_type": { "field": "value" } + })); + assert!(result.is_err()); + } } diff --git a/quickwit/rest-api-tests/scenarii/es_field_capabilities/0001-field-capabilities.yaml b/quickwit/rest-api-tests/scenarii/es_field_capabilities/0001-field-capabilities.yaml index 0ff68f7d567..16513f2d008 100644 --- a/quickwit/rest-api-tests/scenarii/es_field_capabilities/0001-field-capabilities.yaml +++ b/quickwit/rest-api-tests/scenarii/es_field_capabilities/0001-field-capabilities.yaml @@ -336,14 +336,16 @@ endpoint: doesno*texist/_field_caps?fields=date status_code: 200 --- # Test _field_caps API with index_filter (term query) +# Note: term queries require exact token match; 'fritz' is lowercase due to default tokenizer method: [POST] engines: - quickwit + - elasticsearch endpoint: fieldcaps/_field_caps?fields=* -json_body: +json: index_filter: term: - name: "Alice" + name: "fritz" expected: indices: - fieldcaps @@ -364,8 +366,9 @@ expected: method: [POST] engines: - quickwit + - elasticsearch endpoint: fieldcaps/_field_caps?fields=name -json_body: +json: index_filter: match_all: {} expected: @@ -388,17 +391,18 @@ expected: method: [POST] engines: - quickwit + - elasticsearch endpoint: fieldcaps/_field_caps?fields=response,name -json_body: +json: index_filter: bool: must: - term: - name: "Alice" + name: "fritz" filter: - range: response: - gte: 100 + gte: 30 expected: indices: - fieldcaps @@ -425,8 +429,9 @@ expected: method: [POST] engines: - quickwit + - elasticsearch endpoint: fieldcaps/_field_caps?fields=* -json_body: +json: index_filter: invalid_query_type: field: "value" @@ -436,8 +441,9 @@ status_code: 400 method: [POST] engines: - quickwit + - elasticsearch endpoint: fieldcaps/_field_caps?fields=name -json_body: +json: 
index_filter: {} expected: indices: From 694f004bc3f77314a925b7d4f7131842aaa5e8e2 Mon Sep 17 00:00:00 2001 From: ruo Date: Tue, 27 Jan 2026 21:47:54 -0800 Subject: [PATCH 3/6] refactor(es-compat): use best-effort metadata filtering for index_filter Remove heavy split-level query execution for field_caps index_filter. The implementation now aligns with ES's "best-effort" approach that uses metadata-level filtering only (time range, tags) instead of opening splits and executing queries. Changes: - Remove split_matches_query function (no longer opens splits) - Remove query_ast and doc_mapper from LeafListFieldsRequest proto - Keep metadata-level filtering in root_list_fields: - Time range extraction from query AST - Tag-based split pruning - Simplify leaf_list_fields to just return fields from all splits This matches ES semantics: "filtering is done on a best-effort basis... this API may return an index even if the provided filter matches no document." Co-Authored-By: Claude Opus 4.5 Signed-off-by: ruo --- .../protos/quickwit/search.proto | 8 - .../src/codegen/quickwit/quickwit.search.rs | 8 - quickwit/quickwit-search/src/list_fields.rs | 142 ++---------------- quickwit/quickwit-search/src/service.rs | 2 - .../model/field_capability.rs | 1 + 5 files changed, 14 insertions(+), 147 deletions(-) diff --git a/quickwit/quickwit-proto/protos/quickwit/search.proto b/quickwit/quickwit-proto/protos/quickwit/search.proto index 4c0bccb2582..a49b951b3ca 100644 --- a/quickwit/quickwit-proto/protos/quickwit/search.proto +++ b/quickwit/quickwit-proto/protos/quickwit/search.proto @@ -145,14 +145,6 @@ message LeafListFieldsRequest { // Optional limit query to a list of fields // Wildcard expressions are supported. repeated string fields = 4; - - // JSON-serialized QueryAst for index_filter support. - // When provided, only splits containing documents matching this query are included. - optional string query_ast = 5; - - // JSON-serialized DocMapper for query execution. 
- // Required when query_ast is provided to build and execute the query. - optional string doc_mapper = 6; } message ListFieldsResponse { diff --git a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs index 1315b5b5599..16c11358ab8 100644 --- a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs +++ b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs @@ -92,14 +92,6 @@ pub struct LeafListFieldsRequest { /// Wildcard expressions are supported. #[prost(string, repeated, tag = "4")] pub fields: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, - /// JSON-serialized QueryAst for index_filter support. - /// When provided, only splits containing documents matching this query are included. - #[prost(string, optional, tag = "5")] - pub query_ast: ::core::option::Option<::prost::alloc::string::String>, - /// JSON-serialized DocMapper for query execution. - /// Required when query_ast is provided to build and execute the query. 
- #[prost(string, optional, tag = "6")] - pub doc_mapper: ::core::option::Option<::prost::alloc::string::String>, } #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] #[derive(Clone, PartialEq, ::prost::Message)] diff --git a/quickwit/quickwit-search/src/list_fields.rs b/quickwit/quickwit-search/src/list_fields.rs index e225617d0a8..b5974867cfd 100644 --- a/quickwit/quickwit-search/src/list_fields.rs +++ b/quickwit/quickwit-search/src/list_fields.rs @@ -25,7 +25,6 @@ use quickwit_common::rate_limited_warn; use quickwit_common::shared_consts::{FIELD_PRESENCE_FIELD_NAME, SPLIT_FIELDS_FILE_NAME}; use quickwit_common::uri::Uri; use quickwit_config::build_doc_mapper; -use quickwit_doc_mapper::DocMapper; use quickwit_doc_mapper::tag_pruning::extract_tags_from_query; use quickwit_metastore::SplitMetadata; use quickwit_proto::metastore::MetastoreServiceClient; @@ -35,10 +34,9 @@ use quickwit_proto::search::{ }; use quickwit_proto::types::{IndexId, IndexUid}; use quickwit_query::query_ast::QueryAst; -use quickwit_storage::{ByteRangeCache, Storage}; -use tantivy::ReloadPolicy; +use quickwit_storage::Storage; -use crate::leaf::{open_index_with_caches, open_split_bundle, warmup}; +use crate::leaf::open_split_bundle; use crate::search_job_placer::group_jobs_by_index_id; use crate::service::SearcherContext; use crate::{ @@ -314,75 +312,15 @@ impl FieldPattern { } } -/// Checks if any documents in the split match the query. -/// Returns true if at least one document matches, false otherwise. -/// -/// This is a lightweight query execution that only counts matches without -/// materializing documents, used for split-level filtering in field capabilities. 
-async fn split_matches_query( - searcher_context: &SearcherContext, - index_storage: Arc, - split: &SplitIdAndFooterOffsets, - doc_mapper: &DocMapper, - query_ast: &QueryAst, -) -> crate::Result { - let byte_range_cache = - ByteRangeCache::with_infinite_capacity(&quickwit_storage::STORAGE_METRICS.shortlived_cache); - // Open split with caches - let (index, _hot_directory) = open_index_with_caches( - searcher_context, - index_storage, - split, - Some(doc_mapper.tokenizer_manager()), - Some(byte_range_cache), - ) - .await?; - - // Create searcher with manual reload policy - let reader = index - .reader_builder() - .reload_policy(ReloadPolicy::Manual) - .try_into() - .map_err(|err| SearchError::Internal(format!("failed to create index reader: {err}")))?; - let searcher = reader.searcher(); - - // Build query from QueryAst - let (query, mut warmup_info) = doc_mapper - .query(searcher.schema().clone(), query_ast.clone(), false, None) - .map_err(|err| SearchError::InvalidQuery(format!("failed to build query: {err}")))?; - - // Warmup to ensure all bytes are fetched asynchronously before sync search - warmup_info.simplify(); - warmup(&searcher, &warmup_info) - .await - .map_err(|err| SearchError::Internal(format!("failed to warmup query: {err}")))?; - - // Check if any docs match (lightweight count) - let count = search_thread_pool() - .run_cpu_intensive(move || { - query - .count(&searcher) - .map_err(|err| SearchError::Internal(format!("failed to count matches: {err}"))) - }) - .await - .map_err(|_| SearchError::Internal("split matches query panicked".to_string()))??; - - Ok(count > 0) -} - /// `leaf` step of list fields. /// -/// Returns field metadata from the assigned splits. When `query_ast` and `doc_mapper_str` -/// are provided, splits are filtered to only include those containing at least one -/// matching document (lightweight query execution for split-level filtering). +/// Returns field metadata from the assigned splits. 
pub async fn leaf_list_fields( index_id: IndexId, index_storage: Arc, searcher_context: &SearcherContext, split_ids: &[SplitIdAndFooterOffsets], field_patterns_str: &[String], - query_ast_str: Option<&str>, - doc_mapper_str: Option<&str>, ) -> crate::Result { let field_patterns: Vec = field_patterns_str .iter() @@ -394,47 +332,8 @@ pub async fn leaf_list_fields( return Ok(ListFieldsResponse { fields: Vec::new() }); } - // Filter splits based on query if both query_ast and doc_mapper are provided - let matching_splits: Vec<&SplitIdAndFooterOffsets> = match (query_ast_str, doc_mapper_str) { - (Some(ast_json), Some(mapper_json)) => { - let query_ast: QueryAst = serde_json::from_str(ast_json) - .map_err(|err| SearchError::InvalidQuery(err.to_string()))?; - let doc_mapper = crate::service::deserialize_doc_mapper(mapper_json)?; - - let split_match_tasks: Vec<_> = split_ids - .iter() - .map(|split| { - let index_storage = index_storage.clone(); - async { - split_matches_query( - searcher_context, - index_storage, - split, - &doc_mapper, - &query_ast, - ) - .await - } - }) - .collect(); - - let matches_vec = try_join_all(split_match_tasks).await?; - split_ids - .iter() - .zip(matches_vec) - .filter_map(|(split, matches)| matches.then_some(split)) - .collect() - } - _ => split_ids.iter().collect(), - }; - - // If no splits match, return empty response - if matching_splits.is_empty() { - return Ok(ListFieldsResponse { fields: Vec::new() }); - } - - // Get fields from matching splits - let single_split_list_fields_futures: Vec<_> = matching_splits + // Get fields from all splits + let single_split_list_fields_futures: Vec<_> = split_ids .iter() .map(|split_id| { get_fields_from_split( @@ -494,8 +393,6 @@ pub struct IndexMetasForLeafSearch { pub index_id: IndexId, /// Index URI. pub index_uri: Uri, - /// Serialized DocMapper for query execution (only set when query_ast is provided). - pub doc_mapper_str: Option, } /// Performs a distributed list fields request. 
@@ -514,37 +411,26 @@ pub async fn root_list_fields( return Ok(ListFieldsResponse { fields: Vec::new() }); } - // Build index metadata map, including doc_mapper if query_ast is provided - let has_query_ast = list_fields_req.query_ast.is_some(); + // Build index metadata map and extract timestamp field for time range refinement let mut index_uid_to_index_meta: HashMap = HashMap::new(); let mut index_uids: Vec = Vec::new(); let mut timestamp_field_opt: Option = None; for index_metadata in indexes_metadata { - // Only build doc_mapper when query_ast is provided (needed for split-level filtering) - let doc_mapper_str = if has_query_ast { - let doc_mapper = build_doc_mapper( + // Extract timestamp field for time range refinement (use first index's field) + if timestamp_field_opt.is_none() + && list_fields_req.query_ast.is_some() + && let Ok(doc_mapper) = build_doc_mapper( &index_metadata.index_config.doc_mapping, &index_metadata.index_config.search_settings, ) - .map_err(|err| SearchError::Internal(format!("failed to build doc mapper: {err}")))?; - - // Capture timestamp field for time range extraction (use first index's field) - if timestamp_field_opt.is_none() { - timestamp_field_opt = doc_mapper.timestamp_field_name().map(|s| s.to_string()); - } - - Some(serde_json::to_string(&doc_mapper).map_err(|err| { - SearchError::Internal(format!("failed to serialize doc mapper: {err}")) - })?) 
- } else { - None - }; + { + timestamp_field_opt = doc_mapper.timestamp_field_name().map(|s| s.to_string()); + } let index_metadata_for_leaf_search = IndexMetasForLeafSearch { index_uri: index_metadata.index_uri().clone(), index_id: index_metadata.index_config.index_id.to_string(), - doc_mapper_str, }; index_uids.push(index_metadata.index_uid.clone()); @@ -637,8 +523,6 @@ pub fn jobs_to_leaf_requests( index_uri: index_meta.index_uri.to_string(), fields: search_request_for_leaf.fields.clone(), split_offsets: job_group.into_iter().map(|job| job.offsets).collect(), - query_ast: search_request_for_leaf.query_ast.clone(), - doc_mapper: index_meta.doc_mapper_str.clone(), }; leaf_search_requests.push(leaf_search_request); Ok(()) diff --git a/quickwit/quickwit-search/src/service.rs b/quickwit/quickwit-search/src/service.rs index d63cd85cc08..55fe014cba7 100644 --- a/quickwit/quickwit-search/src/service.rs +++ b/quickwit/quickwit-search/src/service.rs @@ -309,8 +309,6 @@ impl SearchService for SearchServiceImpl { &self.searcher_context, &split_ids[..], &list_fields_req.fields, - list_fields_req.query_ast.as_deref(), - list_fields_req.doc_mapper.as_deref(), ) .await } diff --git a/quickwit/quickwit-serve/src/elasticsearch_api/model/field_capability.rs b/quickwit/quickwit-serve/src/elasticsearch_api/model/field_capability.rs index 7d1f650274a..8178a5fe1b4 100644 --- a/quickwit/quickwit-serve/src/elasticsearch_api/model/field_capability.rs +++ b/quickwit/quickwit-serve/src/elasticsearch_api/model/field_capability.rs @@ -181,6 +181,7 @@ pub fn convert_to_es_field_capabilities_response( /// Returns `Ok(None)` if the index_filter is null or empty. /// Returns `Ok(Some(QueryAst))` if the index_filter is valid. /// Returns `Err` if the index_filter is invalid or cannot be converted. 
+#[allow(clippy::result_large_err)] pub fn parse_index_filter_to_query_ast( index_filter: serde_json::Value, ) -> Result, ElasticsearchError> { From b213eaac2e6786ae960d8cc3b4a1ed52191524e2 Mon Sep 17 00:00:00 2001 From: ruo Date: Thu, 29 Jan 2026 21:11:20 -0800 Subject: [PATCH 4/6] fix(es-compat): reject empty index_filter to match ES behavior - Remove empty object {} handling in parse_index_filter_to_query_ast - ES rejects empty index_filter with 400, now QW does too - Add tag_fields config and tag-based index_filter test - Update unit and integration tests accordingly Co-Authored-By: Claude Opus 4.5 Signed-off-by: ruo --- .../src/elasticsearch_api/model/field_capability.rs | 11 ++++++----- .../0001-field-capabilities.yaml | 13 ++++++++++++- .../es_field_capabilities/_setup.quickwit.yaml | 5 +++++ 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/quickwit/quickwit-serve/src/elasticsearch_api/model/field_capability.rs b/quickwit/quickwit-serve/src/elasticsearch_api/model/field_capability.rs index 8178a5fe1b4..9aefdc83762 100644 --- a/quickwit/quickwit-serve/src/elasticsearch_api/model/field_capability.rs +++ b/quickwit/quickwit-serve/src/elasticsearch_api/model/field_capability.rs @@ -178,14 +178,14 @@ pub fn convert_to_es_field_capabilities_response( /// Parses an Elasticsearch index_filter JSON value into a Quickwit QueryAst. /// -/// Returns `Ok(None)` if the index_filter is null or empty. +/// Returns `Ok(None)` if the index_filter is null. /// Returns `Ok(Some(QueryAst))` if the index_filter is valid. -/// Returns `Err` if the index_filter is invalid or cannot be converted. +/// Returns `Err` if the index_filter is invalid or cannot be converted (including empty object). 
#[allow(clippy::result_large_err)] pub fn parse_index_filter_to_query_ast( index_filter: serde_json::Value, ) -> Result, ElasticsearchError> { - if index_filter.is_null() || index_filter == serde_json::Value::Object(Default::default()) { + if index_filter.is_null() { return Ok(None); } @@ -378,8 +378,9 @@ mod tests { #[test] fn test_parse_index_filter_to_query_ast_empty_object() { - let result = parse_index_filter_to_query_ast(json!({})).unwrap(); - assert!(result.is_none()); + // Empty object {} should return error to match ES behavior + let result = parse_index_filter_to_query_ast(json!({})); + assert!(result.is_err()); } #[test] diff --git a/quickwit/rest-api-tests/scenarii/es_field_capabilities/0001-field-capabilities.yaml b/quickwit/rest-api-tests/scenarii/es_field_capabilities/0001-field-capabilities.yaml index 16513f2d008..79accac5be3 100644 --- a/quickwit/rest-api-tests/scenarii/es_field_capabilities/0001-field-capabilities.yaml +++ b/quickwit/rest-api-tests/scenarii/es_field_capabilities/0001-field-capabilities.yaml @@ -437,7 +437,7 @@ json: field: "value" status_code: 400 --- -# Test _field_caps API with empty index_filter (should work like no filter) +# Test _field_caps API with empty index_filter (should return 400 like ES) method: [POST] engines: - quickwit @@ -445,6 +445,17 @@ engines: endpoint: fieldcaps/_field_caps?fields=name json: index_filter: {} +status_code: 400 +--- +# Test _field_caps API with index_filter using tag field for split pruning (QW-only) +method: [POST] +engines: + - quickwit +endpoint: fieldcaps/_field_caps?fields=name +json: + index_filter: + term: + tags: "nice" expected: indices: - fieldcaps diff --git a/quickwit/rest-api-tests/scenarii/es_field_capabilities/_setup.quickwit.yaml b/quickwit/rest-api-tests/scenarii/es_field_capabilities/_setup.quickwit.yaml index 8b02ee01882..5576e6cec28 100644 --- a/quickwit/rest-api-tests/scenarii/es_field_capabilities/_setup.quickwit.yaml +++ 
b/quickwit/rest-api-tests/scenarii/es_field_capabilities/_setup.quickwit.yaml @@ -22,6 +22,7 @@ json: tokenizer: default fast: true timestamp_field: date + tag_fields: ["tags"] field_mappings: - name: date type: datetime @@ -32,6 +33,10 @@ json: - name: host type: ip fast: true + - name: tags + type: array + tokenizer: raw + fast: true --- # Create index method: POST From 0291d29d871303abbaaa65e94a43959fe8484236 Mon Sep 17 00:00:00 2001 From: fulmicoton Date: Tue, 10 Mar 2026 09:16:50 +0100 Subject: [PATCH 5/6] Added ref doc for the new functionality. --- docs/reference/es_compatible_api.md | 73 +++++++++++++++++++ .../0001-field-capabilities.yaml | 40 +--------- 2 files changed, 77 insertions(+), 36 deletions(-) diff --git a/docs/reference/es_compatible_api.md b/docs/reference/es_compatible_api.md index 32cbdafd761..28ba1aa7eb2 100644 --- a/docs/reference/es_compatible_api.md +++ b/docs/reference/es_compatible_api.md @@ -365,6 +365,79 @@ Example response: [HTTP accept header]: https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html + +### `_field_caps`   Field capabilities API + +``` +GET api/v1/_elastic/<index>/_field_caps +``` +``` +POST api/v1/_elastic/<index>/_field_caps +``` +``` +GET api/v1/_elastic/_field_caps +``` +``` +POST api/v1/_elastic/_field_caps +``` + +The [field capabilities API](https://www.elastic.co/guide/en/elasticsearch/reference/current/search-field-caps.html) returns information about the capabilities of fields among multiple indices. + +#### Supported Query string parameters + +| Variable | Type | Description | Default value | +| --------------------- | ---------- | ------------------------------------------------------------------------------ | ------------- | +| `fields` | `String` | Comma-separated list of fields to retrieve capabilities for. Supports wildcards (`*`). | (Optional) | +| `allow_no_indices` | `Boolean` | If `true`, missing or closed indices are not an error. 
| (Optional) | +| `expand_wildcards` | `String` | Controls which kinds of indices wildcard patterns can match. | (Optional) | +| `ignore_unavailable` | `Boolean` | If `true`, unavailable indices are ignored. | (Optional) | +| `start_timestamp` | `Integer` | *(Quickwit-specific)* If set, restricts splits to documents with a timestamp range start >= `start_timestamp` (seconds since epoch). | (Optional) | +| `end_timestamp` | `Integer` | *(Quickwit-specific)* If set, restricts splits to documents with a timestamp range end < `end_timestamp` (seconds since epoch). | (Optional) | + +#### Supported Request Body parameters + +| Variable | Type | Description | Default value | +| ------------------ | ------------- | --------------------------------------------------------------------------- | ------------- | +| `index_filter` | `Json object` | A query to filter indices. If provided, only fields from indices that can potentially match the filter are returned. See [index_filter](#index_filter). | (Optional) | +| `runtime_mappings` | `Json object` | Accepted but not supported. | (Optional) | + +#### `index_filter` + +The `index_filter` parameter allows you to filter which indices contribute to the field capabilities response. When provided, Quickwit uses the filter query to prune indices (splits) that cannot match the filter, and only returns field capabilities for the remaining ones. + +Like Elasticsearch, this is a **best-effort** approach: Quickwit may return field capabilities from indices that do not actually contain any matching documents. In Quickwit, filtering is limited to the existing metadata-based split pruning: + +- **Time pruning**: Range queries on the timestamp field can eliminate splits whose time range does not overlap with the filter. +- **Tag pruning**: Term queries on [tag fields](../configuration/index-config.md#tag-fields) can eliminate splits that do not contain the requested tag value. + +Other filter types (e.g.
full-text queries or term queries on non-tag fields) are accepted but will not prune any splits — all indices will be returned as if no filter was specified. In particular, Quickwit does not check whether terms are present in the term dictionary. + +#### Request Body example + +```json +{ + "index_filter": { + "range": { + "timestamp": { + "gte": "2024-01-01T00:00:00Z", + "lt": "2024-02-01T00:00:00Z" + } + } + } +} +``` + +```json +{ + "index_filter": { + "term": { + "status": "active" + } + } +} +``` + + ## Query DSL [Elasticsearch Query DSL reference](https://www.elastic.co/guide/en/elasticsearch/reference/8.8/query-dsl.html). diff --git a/quickwit/rest-api-tests/scenarii/es_field_capabilities/0001-field-capabilities.yaml b/quickwit/rest-api-tests/scenarii/es_field_capabilities/0001-field-capabilities.yaml index 79accac5be3..a3c5041926d 100644 --- a/quickwit/rest-api-tests/scenarii/es_field_capabilities/0001-field-capabilities.yaml +++ b/quickwit/rest-api-tests/scenarii/es_field_capabilities/0001-field-capabilities.yaml @@ -31,10 +31,10 @@ expected: searchable: true aggregatable: true mixed: # This is a little weird case (values [5, -5.5]), since coercion happens only on the columnar side. That's why `long` is not aggregatable. - long: + long: metadata_field: false searchable: true - aggregatable: false + aggregatable: false double: metadata_field: false searchable: true @@ -88,10 +88,10 @@ expected: fields: $expect: "not 'id' in val" # Filtered by start_timestamp mixed: # This is a little weird case (values [5, -5.5]), since coercion happens only on the columnar side. That's why `long` is not aggregatable. 
- long: + long: metadata_field: false searchable: true - aggregatable: false + aggregatable: false double: metadata_field: false searchable: true @@ -103,8 +103,6 @@ expected: aggregatable: true --- # Test fields parameter with `.dynamic` suffix -engines: - - quickwit method: [GET] engines: - quickwit @@ -193,9 +191,6 @@ expected: --- # Compare with elastic search method: [GET] -engines: - - quickwit - - elasticsearch endpoint: fieldcaps/_field_caps?fields=nested.*ponse expected: indices: @@ -210,9 +205,6 @@ expected: --- # Compare ip field with elastic search method: [GET] -engines: - - quickwit - - elasticsearch endpoint: fieldcaps*/_field_caps?fields=host expected: indices: @@ -295,9 +287,6 @@ expected: --- # Wildcard on index name + Wildcard without match method: [GET] -engines: - - quickwit - - elasticsearch endpoint: fieldca*,blub*/_field_caps?fields=date expected: indices: @@ -313,34 +302,22 @@ expected: --- # Exact match index + Non matching exact index method: [GET] -engines: - - quickwit - - elasticsearch endpoint: fieldcaps,blub/_field_caps?fields=date status_code: 404 --- # Compare ip field with elastic search method: [GET] -engines: - - quickwit - - elasticsearch endpoint: doesnotexist/_field_caps?fields=date status_code: 404 --- # Compare ip field with elastic search method: [GET] -engines: - - quickwit - - elasticsearch endpoint: doesno*texist/_field_caps?fields=date status_code: 200 --- # Test _field_caps API with index_filter (term query) # Note: term queries require exact token match; 'fritz' is lowercase due to default tokenizer method: [POST] -engines: - - quickwit - - elasticsearch endpoint: fieldcaps/_field_caps?fields=* json: index_filter: @@ -364,9 +341,6 @@ expected: --- # Test _field_caps API with index_filter (match_all query) method: [POST] -engines: - - quickwit - - elasticsearch endpoint: fieldcaps/_field_caps?fields=name json: index_filter: @@ -389,9 +363,6 @@ expected: --- # Test _field_caps API with index_filter (bool query) method: 
[POST] -engines: - - quickwit - - elasticsearch endpoint: fieldcaps/_field_caps?fields=response,name json: index_filter: @@ -427,9 +398,6 @@ expected: --- # Test _field_caps API with invalid index_filter method: [POST] -engines: - - quickwit - - elasticsearch endpoint: fieldcaps/_field_caps?fields=* json: index_filter: From 4f601b8cddca457b53b731db65bdf55dfb439b8e Mon Sep 17 00:00:00 2001 From: fulmicoton Date: Tue, 10 Mar 2026 09:39:21 +0100 Subject: [PATCH 6/6] fixing build --- quickwit/quickwit-serve/src/elasticsearch_api/rest_handler.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/quickwit/quickwit-serve/src/elasticsearch_api/rest_handler.rs b/quickwit/quickwit-serve/src/elasticsearch_api/rest_handler.rs index 70c21dcd6df..9abce6c61e6 100644 --- a/quickwit/quickwit-serve/src/elasticsearch_api/rest_handler.rs +++ b/quickwit/quickwit-serve/src/elasticsearch_api/rest_handler.rs @@ -200,6 +200,7 @@ async fn es_compat_index_mapping( fields: Vec::new(), start_timestamp: None, end_timestamp: None, + query_ast: None, }; let list_fields_response = search_service .root_list_fields(list_fields_request)