diff --git a/quickwit/quickwit-common/src/lib.rs b/quickwit/quickwit-common/src/lib.rs index 92d51696b7f..0882d7f8ee6 100644 --- a/quickwit/quickwit-common/src/lib.rs +++ b/quickwit/quickwit-common/src/lib.rs @@ -65,6 +65,18 @@ pub use socket_addr_legacy_hash::SocketAddrLegacyHash; pub use stream_utils::{BoxStream, ServiceStream}; use tracing::{error, info}; +/// Always returns `true`. This function is mostly used with serde to initialize boolean +/// fields to true. +pub const fn true_fn() -> bool { + true +} + +/// Returns whether the given boolean value is true. This function is mostly used with serde to skip +/// serializing boolean fields with `skip_serializing_if = "is_true"` when the value is true. +pub fn is_true(value: &bool) -> bool { + *value +} + pub fn chunk_range(range: Range, chunk_size: usize) -> impl Iterator> { range.clone().step_by(chunk_size).map(move |block_start| { let block_end = (block_start + chunk_size).min(range.end); diff --git a/quickwit/quickwit-config/src/index_config/mod.rs b/quickwit/quickwit-config/src/index_config/mod.rs index e62e556bce3..c258c8052d8 100644 --- a/quickwit/quickwit-config/src/index_config/mod.rs +++ b/quickwit/quickwit-config/src/index_config/mod.rs @@ -26,6 +26,7 @@ use chrono::Utc; use cron::Schedule; use humantime::parse_duration; use quickwit_common::uri::Uri; +use quickwit_common::{is_true, true_fn}; use quickwit_doc_mapper::{DocMapper, DocMapperBuilder, DocMapping}; use quickwit_proto::types::IndexId; use serde::{Deserialize, Serialize}; @@ -170,6 +171,16 @@ pub struct IngestSettings { #[schema(default = 1, value_type = usize)] #[serde(default = "IngestSettings::default_min_shards")] pub min_shards: NonZeroUsize, + /// Whether to validate documents against the current doc mapping during ingestion. + /// Defaults to true. 
When false, documents will be written directly to the WAL without + /// validation, but might still be rejected during indexing when applying the doc mapping + /// in the doc processor. In that case, the documents are dropped and a warning is logged. + /// + /// Note that when a source has a VRL transform configured, documents are not validated against + /// the doc mapping during ingestion either. + #[schema(default = true, value_type = bool)] + #[serde(default = "true_fn", skip_serializing_if = "is_true")] + pub validate_docs: bool, } impl IngestSettings { @@ -182,6 +193,7 @@ impl Default for IngestSettings { fn default() -> Self { Self { min_shards: Self::default_min_shards(), + validate_docs: true, } } } @@ -481,6 +493,7 @@ impl crate::TestableForRegression for IndexConfig { }; let ingest_settings = IngestSettings { min_shards: NonZeroUsize::new(12).unwrap(), + validate_docs: true, }; let search_settings = SearchSettings { default_search_fields: vec!["message".to_string()], @@ -942,18 +955,30 @@ mod tests { #[test] fn test_ingest_settings_serde() { - let ingest_settings = IngestSettings { + let settings = IngestSettings { min_shards: NonZeroUsize::MIN, + validate_docs: false, }; - let ingest_settings_yaml = serde_yaml::to_string(&ingest_settings).unwrap(); - let ingest_settings_roundtrip: IngestSettings = - serde_yaml::from_str(&ingest_settings_yaml).unwrap(); - assert_eq!(ingest_settings, ingest_settings_roundtrip); + let settings_yaml = serde_yaml::to_string(&settings).unwrap(); + assert!(settings_yaml.contains("validate_docs")); + + let expected_settings: IngestSettings = serde_yaml::from_str(&settings_yaml).unwrap(); + assert_eq!(settings, expected_settings); + + let settings = IngestSettings { + min_shards: NonZeroUsize::MIN, + validate_docs: true, + }; + let settings_yaml = serde_yaml::to_string(&settings).unwrap(); + assert!(!settings_yaml.contains("validate_docs")); + + let expected_settings: IngestSettings = 
serde_yaml::from_str(&settings_yaml).unwrap(); + assert_eq!(settings, expected_settings); - let ingest_settings_yaml = r#" + let settings_yaml = r#" min_shards: 0 "#; - let error = serde_yaml::from_str::(ingest_settings_yaml).unwrap_err(); + let error = serde_yaml::from_str::(settings_yaml).unwrap_err(); assert!(error.to_string().contains("expected a nonzero")); } } diff --git a/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs b/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs index bc6242cf5d1..4296c57311b 100644 --- a/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs +++ b/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs @@ -789,11 +789,13 @@ impl IngestController { let index_metadata = model .index_metadata(&source_uid.index_uid) .expect("index should exist"); - let validate_docs = model + let has_transform = model .source_metadata(source_uid) .expect("source should exist") .transform_config - .is_none(); + .is_some(); + let validate_docs = + index_metadata.index_config.ingest_settings.validate_docs && !has_transform; let doc_mapping = &index_metadata.index_config.doc_mapping; let doc_mapping_uid = doc_mapping.doc_mapping_uid; let doc_mapping_json = serde_utils::to_json_str(doc_mapping)?; diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/date_time_type.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/date_time_type.rs index cf10e06a735..772819d6dc2 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/date_time_type.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/date_time_type.rs @@ -13,13 +13,12 @@ // limitations under the License. 
use indexmap::IndexSet; +use quickwit_common::true_fn; use quickwit_datetime::{DateTimeInputFormat, DateTimeOutputFormat, TantivyDateTime}; use serde::{Deserialize, Deserializer, Serialize}; use serde_json::Value as JsonValue; use tantivy::schema::{DateTimePrecision, OwnedValue as TantivyValue}; -use super::default_as_true; - /// A struct holding DateTime field options. #[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)] #[serde(deny_unknown_fields)] @@ -41,10 +40,10 @@ pub struct QuickwitDateTimeOptions { #[serde(alias = "precision")] pub fast_precision: DateTimePrecision, - #[serde(default = "default_as_true")] + #[serde(default = "true_fn")] pub indexed: bool, - #[serde(default = "default_as_true")] + #[serde(default = "true_fn")] pub stored: bool, #[serde(default)] diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/field_mapping_entry.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/field_mapping_entry.rs index 45f7916db5d..ae3388aee32 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/field_mapping_entry.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/field_mapping_entry.rs @@ -18,6 +18,7 @@ use std::convert::TryFrom; use anyhow::bail; use base64::prelude::{BASE64_STANDARD, Engine}; use once_cell::sync::Lazy; +use quickwit_common::true_fn; use regex::Regex; use serde::{Deserialize, Serialize}; use serde_json::Value as JsonValue; @@ -26,8 +27,8 @@ use tantivy::schema::{ TextOptions, Type, }; +use super::FieldMappingType; use super::date_time_type::QuickwitDateTimeOptions; -use super::{FieldMappingType, default_as_true}; use crate::doc_mapper::field_mapping_type::QuickwitFieldType; use crate::{Cardinality, QW_RESERVED_FIELD_NAMES}; @@ -85,13 +86,13 @@ pub struct QuickwitNumericOptions { #[serde(default)] #[serde(skip_serializing_if = "Option::is_none")] pub description: Option, - #[serde(default = "default_as_true")] + #[serde(default = "true_fn")] pub stored: bool, - #[serde(default = "default_as_true")] + #[serde(default = 
"true_fn")] pub indexed: bool, #[serde(default)] pub fast: bool, - #[serde(default = "default_as_true")] + #[serde(default = "true_fn")] pub coerce: bool, #[serde(default)] pub output_format: NumericOutputFormat, @@ -116,9 +117,9 @@ pub struct QuickwitBoolOptions { #[serde(default)] #[serde(skip_serializing_if = "Option::is_none")] pub description: Option, - #[serde(default = "default_as_true")] + #[serde(default = "true_fn")] pub stored: bool, - #[serde(default = "default_as_true")] + #[serde(default = "true_fn")] pub indexed: bool, #[serde(default)] pub fast: bool, @@ -144,10 +145,10 @@ pub struct QuickwitBytesOptions { #[serde(skip_serializing_if = "Option::is_none")] pub description: Option, /// If true, the field will be stored in the doc store. - #[serde(default = "default_as_true")] + #[serde(default = "true_fn")] pub stored: bool, /// If true, the field will be indexed. - #[serde(default = "default_as_true")] + #[serde(default = "true_fn")] pub indexed: bool, /// If true, the field will be stored in columnar format. 
#[serde(default)] @@ -245,9 +246,9 @@ pub struct QuickwitIpAddrOptions { #[serde(default)] #[serde(skip_serializing_if = "Option::is_none")] pub description: Option, - #[serde(default = "default_as_true")] + #[serde(default = "true_fn")] pub stored: bool, - #[serde(default = "default_as_true")] + #[serde(default = "true_fn")] pub indexed: bool, #[serde(default)] pub fast: bool, @@ -433,7 +434,7 @@ pub struct QuickwitTextOptions { deserializer = TextIndexingOptions::from_parts_text, serializer = TextIndexingOptions::to_parts_text, fields = ( - #[serde(default = "default_as_true")] + #[serde(default = "true_fn")] pub indexed: bool, #[serde(default)] #[serde(skip_serializing_if = "Option::is_none")] @@ -447,7 +448,7 @@ pub struct QuickwitTextOptions { ), )] pub indexing_options: Option, - #[serde(default = "default_as_true")] + #[serde(default = "true_fn")] pub stored: bool, #[serde(default)] pub fast: FastFieldOptions, @@ -577,7 +578,7 @@ pub struct QuickwitJsonOptions { serializer = TextIndexingOptions::to_parts_json, fields = ( /// If true, all of the element in the json object will be indexed. - #[serde(default = "default_as_true")] + #[serde(default = "true_fn")] pub indexed: bool, /// Sets the tokenize that should be used with the text fields in the /// json object. @@ -597,10 +598,10 @@ pub struct QuickwitJsonOptions { /// Options for indexing text in a Json field. pub indexing_options: Option, /// If true, the field will be stored in the doc store. - #[serde(default = "default_as_true")] + #[serde(default = "true_fn")] pub stored: bool, /// If true, the '.' in json keys will be expanded. - #[serde(default = "default_as_true")] + #[serde(default = "true_fn")] pub expand_dots: bool, /// If true, the json object will be stored in columnar format. 
#[serde(default)] diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs index de79188f3a8..2ee6119ddf2 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs @@ -48,11 +48,6 @@ pub(crate) use tokenizer_entry::{ }; pub use tokenizer_entry::{TokenizerConfig, TokenizerEntry, analyze_text}; -/// Function used with serde to initialize boolean value at true if there is no value in json. -fn default_as_true() -> bool { - true -} - pub type Partition = u64; /// An alias for serde_json's object type. diff --git a/quickwit/quickwit-metastore/src/tests/index.rs b/quickwit/quickwit-metastore/src/tests/index.rs index 6d7adf8ced7..225d9e98b2a 100644 --- a/quickwit/quickwit-metastore/src/tests/index.rs +++ b/quickwit/quickwit-metastore/src/tests/index.rs @@ -160,6 +160,7 @@ pub async fn test_metastore_update_ingest_settings< let ingest_settings = IngestSettings { min_shards: NonZeroUsize::new(12).unwrap(), + ..Default::default() }; let index_update_request = UpdateIndexRequest::try_from_updates( index_uid.clone(),