Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions quickwit/quickwit-common/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,18 @@ pub use socket_addr_legacy_hash::SocketAddrLegacyHash;
pub use stream_utils::{BoxStream, ServiceStream};
use tracing::{error, info};

/// Returns true at compile time. This function is mostly used with serde to initialize boolean
/// fields to true.
pub const fn true_fn() -> bool {
true
}

/// Returns whether the given boolean value is true. This function is mostly used with serde to skip
/// serializing boolean fields with `skip_serializing_if = "is_true"` when the value is true.
pub fn is_true(value: &bool) -> bool {
*value
}

pub fn chunk_range(range: Range<usize>, chunk_size: usize) -> impl Iterator<Item = Range<usize>> {
range.clone().step_by(chunk_size).map(move |block_start| {
let block_end = (block_start + chunk_size).min(range.end);
Expand Down
39 changes: 32 additions & 7 deletions quickwit/quickwit-config/src/index_config/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ use chrono::Utc;
use cron::Schedule;
use humantime::parse_duration;
use quickwit_common::uri::Uri;
use quickwit_common::{is_true, true_fn};
use quickwit_doc_mapper::{DocMapper, DocMapperBuilder, DocMapping};
use quickwit_proto::types::IndexId;
use serde::{Deserialize, Serialize};
Expand Down Expand Up @@ -170,6 +171,16 @@ pub struct IngestSettings {
#[schema(default = 1, value_type = usize)]
#[serde(default = "IngestSettings::default_min_shards")]
pub min_shards: NonZeroUsize,
/// Whether to validate documents against the current doc mapping during ingestion.
/// Defaults to true. When false, documents will be written directly to the WAL without
/// validation, but might still be rejected during indexing when applying the doc mapping
/// in the doc processor, in that case the documents are dropped and a warning is logged.
///
/// Note that when a source has a VRL transform configured, documents are not validated against
/// the doc mapping during ingestion either.
#[schema(default = true, value_type = bool)]
#[serde(default = "true_fn", skip_serializing_if = "is_true")]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why skip_serializing_if? In most case it is better to serialize defaults no?
Here this is index config, so the data is deserialized -> serialized and end up in the metastore.

Serializing the default ensure changing default won't affect existing indexes, which is usually a good thing.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with you with one caveat: after adding a new field, if you have to rollback then the cluster is borked because of the deny_unknown_fields that we have. Maybe we should remove them and use serde_ignored instead to log a warning.

pub validate_docs: bool,
}

impl IngestSettings {
Expand All @@ -182,6 +193,7 @@ impl Default for IngestSettings {
fn default() -> Self {
Self {
min_shards: Self::default_min_shards(),
validate_docs: true,
}
}
}
Expand Down Expand Up @@ -481,6 +493,7 @@ impl crate::TestableForRegression for IndexConfig {
};
let ingest_settings = IngestSettings {
min_shards: NonZeroUsize::new(12).unwrap(),
validate_docs: true,
};
let search_settings = SearchSettings {
default_search_fields: vec!["message".to_string()],
Expand Down Expand Up @@ -942,18 +955,30 @@ mod tests {

#[test]
fn test_ingest_settings_serde() {
let ingest_settings = IngestSettings {
let settings = IngestSettings {
min_shards: NonZeroUsize::MIN,
validate_docs: false,
};
let ingest_settings_yaml = serde_yaml::to_string(&ingest_settings).unwrap();
let ingest_settings_roundtrip: IngestSettings =
serde_yaml::from_str(&ingest_settings_yaml).unwrap();
assert_eq!(ingest_settings, ingest_settings_roundtrip);
let settings_yaml = serde_yaml::to_string(&settings).unwrap();
assert!(settings_yaml.contains("validate_docs"));

let expected_settings: IngestSettings = serde_yaml::from_str(&settings_yaml).unwrap();
assert_eq!(settings, expected_settings);

let settings = IngestSettings {
min_shards: NonZeroUsize::MIN,
validate_docs: true,
};
let settings_yaml = serde_yaml::to_string(&settings).unwrap();
assert!(!settings_yaml.contains("validate_docs"));

let expected_settings: IngestSettings = serde_yaml::from_str(&settings_yaml).unwrap();
assert_eq!(settings, expected_settings);

let ingest_settings_yaml = r#"
let settings_yaml = r#"
min_shards: 0
"#;
let error = serde_yaml::from_str::<IngestSettings>(ingest_settings_yaml).unwrap_err();
let error = serde_yaml::from_str::<IngestSettings>(settings_yaml).unwrap_err();
assert!(error.to_string().contains("expected a nonzero"));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -789,11 +789,13 @@ impl IngestController {
let index_metadata = model
.index_metadata(&source_uid.index_uid)
.expect("index should exist");
let validate_docs = model
let has_transform = model
.source_metadata(source_uid)
.expect("source should exist")
.transform_config
.is_none();
.is_some();
let validate_docs =
index_metadata.index_config.ingest_settings.validate_docs && !has_transform;
let doc_mapping = &index_metadata.index_config.doc_mapping;
let doc_mapping_uid = doc_mapping.doc_mapping_uid;
let doc_mapping_json = serde_utils::to_json_str(doc_mapping)?;
Expand Down
7 changes: 3 additions & 4 deletions quickwit/quickwit-doc-mapper/src/doc_mapper/date_time_type.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,12 @@
// limitations under the License.

use indexmap::IndexSet;
use quickwit_common::true_fn;
use quickwit_datetime::{DateTimeInputFormat, DateTimeOutputFormat, TantivyDateTime};
use serde::{Deserialize, Deserializer, Serialize};
use serde_json::Value as JsonValue;
use tantivy::schema::{DateTimePrecision, OwnedValue as TantivyValue};

use super::default_as_true;

/// A struct holding DateTime field options.
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
Expand All @@ -41,10 +40,10 @@ pub struct QuickwitDateTimeOptions {
#[serde(alias = "precision")]
pub fast_precision: DateTimePrecision,

#[serde(default = "default_as_true")]
#[serde(default = "true_fn")]
pub indexed: bool,

#[serde(default = "default_as_true")]
#[serde(default = "true_fn")]
pub stored: bool,

#[serde(default)]
Expand Down
31 changes: 16 additions & 15 deletions quickwit/quickwit-doc-mapper/src/doc_mapper/field_mapping_entry.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ use std::convert::TryFrom;
use anyhow::bail;
use base64::prelude::{BASE64_STANDARD, Engine};
use once_cell::sync::Lazy;
use quickwit_common::true_fn;
use regex::Regex;
use serde::{Deserialize, Serialize};
use serde_json::Value as JsonValue;
Expand All @@ -26,8 +27,8 @@ use tantivy::schema::{
TextOptions, Type,
};

use super::FieldMappingType;
use super::date_time_type::QuickwitDateTimeOptions;
use super::{FieldMappingType, default_as_true};
use crate::doc_mapper::field_mapping_type::QuickwitFieldType;
use crate::{Cardinality, QW_RESERVED_FIELD_NAMES};

Expand Down Expand Up @@ -85,13 +86,13 @@ pub struct QuickwitNumericOptions {
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub description: Option<String>,
#[serde(default = "default_as_true")]
#[serde(default = "true_fn")]
pub stored: bool,
#[serde(default = "default_as_true")]
#[serde(default = "true_fn")]
pub indexed: bool,
#[serde(default)]
pub fast: bool,
#[serde(default = "default_as_true")]
#[serde(default = "true_fn")]
pub coerce: bool,
#[serde(default)]
pub output_format: NumericOutputFormat,
Expand All @@ -116,9 +117,9 @@ pub struct QuickwitBoolOptions {
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub description: Option<String>,
#[serde(default = "default_as_true")]
#[serde(default = "true_fn")]
pub stored: bool,
#[serde(default = "default_as_true")]
#[serde(default = "true_fn")]
pub indexed: bool,
#[serde(default)]
pub fast: bool,
Expand All @@ -144,10 +145,10 @@ pub struct QuickwitBytesOptions {
#[serde(skip_serializing_if = "Option::is_none")]
pub description: Option<String>,
/// If true, the field will be stored in the doc store.
#[serde(default = "default_as_true")]
#[serde(default = "true_fn")]
pub stored: bool,
/// If true, the field will be indexed.
#[serde(default = "default_as_true")]
#[serde(default = "true_fn")]
pub indexed: bool,
/// If true, the field will be stored in columnar format.
#[serde(default)]
Expand Down Expand Up @@ -245,9 +246,9 @@ pub struct QuickwitIpAddrOptions {
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub description: Option<String>,
#[serde(default = "default_as_true")]
#[serde(default = "true_fn")]
pub stored: bool,
#[serde(default = "default_as_true")]
#[serde(default = "true_fn")]
pub indexed: bool,
#[serde(default)]
pub fast: bool,
Expand Down Expand Up @@ -433,7 +434,7 @@ pub struct QuickwitTextOptions {
deserializer = TextIndexingOptions::from_parts_text,
serializer = TextIndexingOptions::to_parts_text,
fields = (
#[serde(default = "default_as_true")]
#[serde(default = "true_fn")]
pub indexed: bool,
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
Expand All @@ -447,7 +448,7 @@ pub struct QuickwitTextOptions {
),
)]
pub indexing_options: Option<TextIndexingOptions>,
#[serde(default = "default_as_true")]
#[serde(default = "true_fn")]
pub stored: bool,
#[serde(default)]
pub fast: FastFieldOptions,
Expand Down Expand Up @@ -577,7 +578,7 @@ pub struct QuickwitJsonOptions {
serializer = TextIndexingOptions::to_parts_json,
fields = (
/// If true, all of the element in the json object will be indexed.
#[serde(default = "default_as_true")]
#[serde(default = "true_fn")]
pub indexed: bool,
/// Sets the tokenize that should be used with the text fields in the
/// json object.
Expand All @@ -597,10 +598,10 @@ pub struct QuickwitJsonOptions {
/// Options for indexing text in a Json field.
pub indexing_options: Option<TextIndexingOptions>,
/// If true, the field will be stored in the doc store.
#[serde(default = "default_as_true")]
#[serde(default = "true_fn")]
pub stored: bool,
/// If true, the '.' in json keys will be expanded.
#[serde(default = "default_as_true")]
#[serde(default = "true_fn")]
pub expand_dots: bool,
/// If true, the json object will be stored in columnar format.
#[serde(default)]
Expand Down
5 changes: 0 additions & 5 deletions quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,6 @@ pub(crate) use tokenizer_entry::{
};
pub use tokenizer_entry::{TokenizerConfig, TokenizerEntry, analyze_text};

/// Function used with serde to initialize boolean value at true if there is no value in json.
fn default_as_true() -> bool {
true
}

pub type Partition = u64;

/// An alias for serde_json's object type.
Expand Down
1 change: 1 addition & 0 deletions quickwit/quickwit-metastore/src/tests/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ pub async fn test_metastore_update_ingest_settings<

let ingest_settings = IngestSettings {
min_shards: NonZeroUsize::new(12).unwrap(),
..Default::default()
};
let index_update_request = UpdateIndexRequest::try_from_updates(
index_uid.clone(),
Expand Down
Loading