From d892f232b8bb9e7541a6e77fc8f31714124bc5d0 Mon Sep 17 00:00:00 2001
From: Matthew Kim <matthew.kim@datadoghq.com>
Date: Wed, 18 Mar 2026 11:01:55 -0400
Subject: [PATCH 01/35] feat: replace fixed MetricDataPoint fields with dynamic
 tag HashMap

---
 .../src/otlp/arrow_metrics.rs                 | 520 +++++++-----------
 .../quickwit-opentelemetry/src/otlp/mod.rs    |   2 +-
 .../src/otlp/otel_metrics.rs                  | 408 ++++++++------
 3 files changed, 425 insertions(+), 505 deletions(-)

diff --git a/quickwit/quickwit-opentelemetry/src/otlp/arrow_metrics.rs b/quickwit/quickwit-opentelemetry/src/otlp/arrow_metrics.rs
index 1811a63a909..8a4b3142501 100644
--- a/quickwit/quickwit-opentelemetry/src/otlp/arrow_metrics.rs
+++ b/quickwit/quickwit-opentelemetry/src/otlp/arrow_metrics.rs
@@ -12,272 +12,139 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-//! Arrow-based batch building for metrics with dictionary encoding.
+//! Arrow-based batch building for metrics with dynamic schema discovery.
 //!
 //! This module provides Arrow RecordBatch construction with dictionary-encoded
-//! string columns for efficient storage of metrics with low cardinality tags.
+//! string columns for efficient storage of metrics with dynamic tag keys.
+//! The schema is discovered at `finish()` time by scanning all accumulated
+//! data points for the union of tag keys.
 
+use std::collections::BTreeSet;
 use std::io::Cursor;
 use std::sync::Arc;
 
 use arrow::array::{
-    ArrayBuilder, ArrayRef, Float64Builder, RecordBatch, StringBuilder, StringDictionaryBuilder,
-    UInt8Builder, UInt64Builder,
+    ArrayRef, Float64Builder, RecordBatch, StringDictionaryBuilder, UInt64Builder, UInt8Builder,
 };
-use arrow::datatypes::{DataType, Field, Fields, Int32Type, Schema as ArrowSchema};
+use arrow::datatypes::{DataType, Field, Int32Type, Schema as ArrowSchema};
 use arrow::ipc::reader::StreamReader;
 use arrow::ipc::writer::StreamWriter;
-use parquet::variant::{VariantArrayBuilder, VariantBuilderExt, VariantType};
 use quickwit_proto::bytes::Bytes;
 use quickwit_proto::ingest::{DocBatchV2, DocFormat};
 use quickwit_proto::types::DocUid;
 
 use super::otel_metrics::{MetricDataPoint, MetricType};
 
-/// Creates the Arrow schema for metrics with dictionary-encoded string columns.
-///
-/// Dictionary encoding stores unique string values once and references them by
-/// integer index, providing significant compression for low cardinality tag values.
-pub fn metrics_arrow_schema() -> ArrowSchema {
-    ArrowSchema::new(vec![
-        // Dictionary-encoded string columns for low cardinality fields
-        Field::new(
-            "metric_name",
-            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
-            false,
-        ),
-        // MetricType enum stored as UInt8 (only ~5 possible values)
-        Field::new("metric_type", DataType::UInt8, false),
-        Field::new("metric_unit", DataType::Utf8, true),
-        // Measurement timestamp in seconds since Unix epoch.
-        Field::new("timestamp_secs", DataType::UInt64, false),
-        Field::new("start_timestamp_secs", DataType::UInt64, true),
-        Field::new("value", DataType::Float64, false),
-        // Dictionary-encoded tag columns (low cardinality expected)
-        Field::new(
-            "tag_service",
-            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
-            true,
-        ),
-        Field::new(
-            "tag_env",
-            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
-            true,
-        ),
-        Field::new(
-            "tag_datacenter",
-            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
-            true,
-        ),
-        Field::new(
-            "tag_region",
-            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
-            true,
-        ),
-        Field::new(
-            "tag_host",
-            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
-            true,
-        ),
-        // VARIANT fields for semi-structured attributes
-        // VariantArrayBuilder produces BinaryView fields, not Binary
-        Field::new(
-            "attributes",
-            DataType::Struct(Fields::from(vec![
-                Field::new("metadata", DataType::BinaryView, false),
-                Field::new("value", DataType::BinaryView, false),
-            ])),
-            true,
-        )
-        .with_extension_type(VariantType),
-        // Service name (low cardinality)
-        Field::new(
-            "service_name",
-            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
-            false,
-        ),
-        Field::new(
-            "resource_attributes",
-            DataType::Struct(Fields::from(vec![
-                Field::new("metadata", DataType::BinaryView, false),
-                Field::new("value", DataType::BinaryView, false),
-            ])),
-            true,
-        )
-        .with_extension_type(VariantType),
-    ])
-}
-
 /// Builder for creating Arrow RecordBatch from MetricDataPoints.
 ///
-/// Uses dictionary encoding for low cardinality string columns
-/// (tags, service names, metric names) to achieve significant compression.
-/// Uses VARIANT encoding for semi-structured attributes.
+/// Accumulates data points and discovers the schema dynamically at `finish()`
+/// time. Uses dictionary encoding for string columns (metric_name, all tags).
 pub struct ArrowMetricsBatchBuilder {
-    metric_name: StringDictionaryBuilder<Int32Type>,
-    metric_type: UInt8Builder,
-    metric_unit: StringBuilder,
-    timestamp_secs: UInt64Builder,
-    start_timestamp_secs: UInt64Builder,
-    value: Float64Builder,
-    tag_service: StringDictionaryBuilder<Int32Type>,
-    tag_env: StringDictionaryBuilder<Int32Type>,
-    tag_datacenter: StringDictionaryBuilder<Int32Type>,
-    tag_region: StringDictionaryBuilder<Int32Type>,
-    tag_host: StringDictionaryBuilder<Int32Type>,
-    attributes: VariantArrayBuilder,
-    service_name: StringDictionaryBuilder<Int32Type>,
-    resource_attributes: VariantArrayBuilder,
+    data_points: Vec<MetricDataPoint>,
 }
 
 impl ArrowMetricsBatchBuilder {
     /// Creates a new builder with pre-allocated capacity.
     pub fn with_capacity(capacity: usize) -> Self {
         Self {
-            metric_name: StringDictionaryBuilder::new(),
-            metric_type: UInt8Builder::with_capacity(capacity),
-            metric_unit: StringBuilder::with_capacity(capacity, capacity * 8),
-            timestamp_secs: UInt64Builder::with_capacity(capacity),
-            start_timestamp_secs: UInt64Builder::with_capacity(capacity),
-            value: Float64Builder::with_capacity(capacity),
-            tag_service: StringDictionaryBuilder::new(),
-            tag_env: StringDictionaryBuilder::new(),
-            tag_datacenter: StringDictionaryBuilder::new(),
-            tag_region: StringDictionaryBuilder::new(),
-            tag_host: StringDictionaryBuilder::new(),
-            attributes: VariantArrayBuilder::new(capacity),
-            service_name: StringDictionaryBuilder::new(),
-            resource_attributes: VariantArrayBuilder::new(capacity),
+            data_points: Vec::with_capacity(capacity),
         }
     }
 
     /// Appends a MetricDataPoint to the batch.
-    pub fn append(&mut self, data_point: &MetricDataPoint) {
-        self.metric_name.append_value(&data_point.metric_name);
-        self.metric_type.append_value(data_point.metric_type as u8);
+    pub fn append(&mut self, data_point: MetricDataPoint) {
+        self.data_points.push(data_point);
+    }
 
-        match &data_point.metric_unit {
-            Some(unit) => self.metric_unit.append_value(unit),
-            None => self.metric_unit.append_null(),
+    /// Finalizes and returns the RecordBatch.
+    ///
+    /// Performs two passes:
+    /// 1. Schema discovery: scans all data points to collect the union of tag keys.
+    /// 2. Array building: creates per-column builders and populates them.
+    pub fn finish(self) -> RecordBatch {
+        let num_rows = self.data_points.len();
+
+        // Pass 1: discover all tag keys across all data points.
+        let mut tag_keys: BTreeSet<&str> = BTreeSet::new();
+        for dp in &self.data_points {
+            for key in dp.tags.keys() {
+                tag_keys.insert(key.as_str());
+            }
         }
+        let sorted_tag_keys: Vec<&str> = tag_keys.into_iter().collect();
 
-        self.timestamp_secs.append_value(data_point.timestamp_secs);
-        match data_point.start_timestamp_secs {
-            Some(ts) => self.start_timestamp_secs.append_value(ts),
-            None => self.start_timestamp_secs.append_null(),
+        // Build the Arrow schema dynamically
+        let mut fields = Vec::with_capacity(4 + sorted_tag_keys.len());
+        fields.push(Field::new(
+            "metric_name",
+            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+            false,
+        ));
+        fields.push(Field::new("metric_type", DataType::UInt8, false));
+        fields.push(Field::new("timestamp_secs", DataType::UInt64, false));
+        fields.push(Field::new("value", DataType::Float64, false));
+
+        for &tag_key in &sorted_tag_keys {
+            fields.push(Field::new(
+                tag_key,
+                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+                true,
+            ));
         }
-        self.value.append_value(data_point.value);
-
-        append_optional_dict(&mut self.tag_service, &data_point.tag_service);
-        append_optional_dict(&mut self.tag_env, &data_point.tag_env);
-        append_optional_dict(&mut self.tag_datacenter, &data_point.tag_datacenter);
-        append_optional_dict(&mut self.tag_region, &data_point.tag_region);
-        append_optional_dict(&mut self.tag_host, &data_point.tag_host);
-
-        if data_point.attributes.is_empty() {
-            self.attributes.append_null();
-        } else {
-            append_variant_object(&mut self.attributes, &data_point.attributes);
+
+        let schema = Arc::new(ArrowSchema::new(fields));
+
+        // Pass 2: build arrays
+        let mut metric_name_builder: StringDictionaryBuilder<Int32Type> =
+            StringDictionaryBuilder::new();
+        let mut metric_type_builder = UInt8Builder::with_capacity(num_rows);
+        let mut timestamp_secs_builder = UInt64Builder::with_capacity(num_rows);
+        let mut value_builder = Float64Builder::with_capacity(num_rows);
+
+        let mut tag_builders: Vec<StringDictionaryBuilder<Int32Type>> = sorted_tag_keys
+            .iter()
+            .map(|_| StringDictionaryBuilder::new())
+            .collect();
+
+        for dp in &self.data_points {
+            metric_name_builder.append_value(&dp.metric_name);
+            metric_type_builder.append_value(dp.metric_type as u8);
+            timestamp_secs_builder.append_value(dp.timestamp_secs);
+            value_builder.append_value(dp.value);
+
+            for (tag_idx, tag_key) in sorted_tag_keys.iter().enumerate() {
+                match dp.tags.get(*tag_key) {
+                    Some(tag_val) => tag_builders[tag_idx].append_value(tag_val),
+                    None => tag_builders[tag_idx].append_null(),
+                }
+            }
         }
 
-        self.service_name.append_value(&data_point.service_name);
+        let mut arrays: Vec<ArrayRef> = Vec::with_capacity(4 + sorted_tag_keys.len());
+        arrays.push(Arc::new(metric_name_builder.finish()));
+        arrays.push(Arc::new(metric_type_builder.finish()));
+        arrays.push(Arc::new(timestamp_secs_builder.finish()));
+        arrays.push(Arc::new(value_builder.finish()));
 
-        if data_point.resource_attributes.is_empty() {
-            self.resource_attributes.append_null();
-        } else {
-            append_variant_object(
-                &mut self.resource_attributes,
-                &data_point.resource_attributes,
-            );
+        for tag_builder in &mut tag_builders {
+            arrays.push(Arc::new(tag_builder.finish()));
         }
-    }
 
-    /// Finalizes and returns the RecordBatch.
-    pub fn finish(mut self) -> RecordBatch {
-        // Build variant arrays and convert to ArrayRef
-        let attributes_array = self.attributes.build();
-        let resource_attributes_array = self.resource_attributes.build();
-
-        let arrays: Vec<ArrayRef> = vec![
-            Arc::new(self.metric_name.finish()),
-            Arc::new(self.metric_type.finish()),
-            Arc::new(self.metric_unit.finish()),
-            Arc::new(self.timestamp_secs.finish()),
-            Arc::new(self.start_timestamp_secs.finish()),
-            Arc::new(self.value.finish()),
-            Arc::new(self.tag_service.finish()),
-            Arc::new(self.tag_env.finish()),
-            Arc::new(self.tag_datacenter.finish()),
-            Arc::new(self.tag_region.finish()),
-            Arc::new(self.tag_host.finish()),
-            ArrayRef::from(attributes_array),
-            Arc::new(self.service_name.finish()),
-            ArrayRef::from(resource_attributes_array),
-        ];
-
-        RecordBatch::try_new(Arc::new(metrics_arrow_schema()), arrays)
+        RecordBatch::try_new(schema, arrays)
             .expect("record batch should match Arrow schema")
     }
 
     /// Returns the number of rows appended so far.
     pub fn len(&self) -> usize {
-        self.timestamp_secs.len()
+        self.data_points.len()
     }
 
     /// Returns true if no rows have been appended.
     pub fn is_empty(&self) -> bool {
-        self.len() == 0
+        self.data_points.is_empty()
     }
 }
 
-/// Helper to append optional string values to dictionary builder.
-fn append_optional_dict(builder: &mut StringDictionaryBuilder<Int32Type>, value: &Option<String>) {
-    match value {
-        Some(s) => builder.append_value(s),
-        None => builder.append_null(),
-    }
-}
-
-/// Helper to append a HashMap as a VARIANT object to the builder.
-fn append_variant_object(
-    builder: &mut VariantArrayBuilder,
-    map: &std::collections::HashMap<String, serde_json::Value>,
-) {
-    // Use a macro-like approach with fold to build the object
-    // We need to chain with_field calls which consume and return the builder
-    let obj_builder = builder.new_object();
-
-    // Build object by folding over the map entries
-    let final_builder = map.iter().fold(obj_builder, |b, (key, value)| {
-        match value {
-            serde_json::Value::Null => b.with_field(key.as_str(), ()),
-            serde_json::Value::Bool(v) => b.with_field(key.as_str(), *v),
-            serde_json::Value::Number(n) => {
-                if let Some(i) = n.as_i64() {
-                    b.with_field(key.as_str(), i)
-                } else if let Some(f) = n.as_f64() {
-                    b.with_field(key.as_str(), f)
-                } else {
-                    b.with_field(key.as_str(), ())
-                }
-            }
-            serde_json::Value::String(s) => b.with_field(key.as_str(), s.as_str()),
-            serde_json::Value::Array(arr) => {
-                // For arrays, serialize to JSON string as fallback
-                let json_str = serde_json::to_string(arr).unwrap_or_default();
-                b.with_field(key.as_str(), json_str.as_str())
-            }
-            serde_json::Value::Object(obj) => {
-                // For nested objects, serialize to JSON string as fallback
-                let json_str = serde_json::to_string(obj).unwrap_or_default();
-                b.with_field(key.as_str(), json_str.as_str())
-            }
-        }
-    });
-
-    final_builder.finish();
-}
-
 /// Error type for Arrow IPC operations.
 #[derive(Debug, thiserror::Error)]
 pub enum ArrowIpcError {
@@ -462,32 +329,26 @@ impl ArrowDocBatchV2Builder {
 mod tests {
     use std::collections::HashMap;
 
-    use serde_json::Value as JsonValue;
-
     use super::*;
 
     fn make_test_data_point() -> MetricDataPoint {
+        let mut tags = HashMap::new();
+        tags.insert("service".to_string(), "api".to_string());
+        tags.insert("env".to_string(), "prod".to_string());
+        tags.insert("datacenter".to_string(), "us-east-1a".to_string());
+        tags.insert("region".to_string(), "us-east-1".to_string());
+        tags.insert("host".to_string(), "server-001".to_string());
+        tags.insert("endpoint".to_string(), "/health".to_string());
+        tags.insert("metric_unit".to_string(), "%".to_string());
+        tags.insert("start_timestamp_secs".to_string(), "1704067190".to_string());
+        tags.insert("service_name".to_string(), "api-service".to_string());
+
         MetricDataPoint {
             metric_name: "cpu.usage".to_string(),
             metric_type: MetricType::Gauge,
-            metric_unit: Some("%".to_string()),
             timestamp_secs: 1704067200,
-            start_timestamp_secs: Some(1704067190),
             value: 85.5,
-            tag_service: Some("api".to_string()),
-            tag_env: Some("prod".to_string()),
-            tag_datacenter: Some("us-east-1a".to_string()),
-            tag_region: Some("us-east-1".to_string()),
-            tag_host: Some("server-001".to_string()),
-            attributes: HashMap::from([(
-                "endpoint".to_string(),
-                JsonValue::String("/health".to_string()),
-            )]),
-            service_name: "api-service".to_string(),
-            resource_attributes: HashMap::from([(
-                "k8s.pod".to_string(),
-                JsonValue::String("pod-123".to_string()),
-            )]),
+            tags,
         }
     }
 
@@ -495,38 +356,36 @@ mod tests {
     fn test_arrow_batch_builder_single_row() {
         let dp = make_test_data_point();
         let mut builder = ArrowMetricsBatchBuilder::with_capacity(1);
-        builder.append(&dp);
+        builder.append(dp);
 
         assert_eq!(builder.len(), 1);
         assert!(!builder.is_empty());
 
         let batch = builder.finish();
         assert_eq!(batch.num_rows(), 1);
-        assert_eq!(batch.num_columns(), 14);
+        // 4 fixed columns + 9 tag columns
+        assert_eq!(batch.num_columns(), 13);
     }
 
     #[test]
     fn test_arrow_batch_builder_multiple_rows() {
         let mut builder = ArrowMetricsBatchBuilder::with_capacity(100);
 
-        for i in 0..100 {
+        for idx in 0..100 {
+            let mut tags = HashMap::new();
+            tags.insert("service".to_string(), format!("service-{}", idx % 10));
+            tags.insert("env".to_string(), "prod".to_string());
+            tags.insert("host".to_string(), format!("host-{}", idx % 5));
+            tags.insert("service_name".to_string(), "test-service".to_string());
+
             let dp = MetricDataPoint {
                 metric_name: "test.metric".to_string(),
                 metric_type: MetricType::Gauge,
-                metric_unit: None,
-                timestamp_secs: 1704067200 + i as u64,
-                start_timestamp_secs: None,
-                value: i as f64 * 0.1,
-                tag_service: Some(format!("service-{}", i % 10)), // 10 unique values
-                tag_env: Some("prod".to_string()),                // 1 unique value
-                tag_datacenter: None,
-                tag_region: None,
-                tag_host: Some(format!("host-{}", i % 5)), // 5 unique values
-                attributes: HashMap::new(),
-                service_name: "test-service".to_string(),
-                resource_attributes: HashMap::new(),
+                timestamp_secs: 1704067200 + idx as u64,
+                value: idx as f64 * 0.1,
+                tags,
             };
-            builder.append(&dp);
+            builder.append(dp);
         }
 
         assert_eq!(builder.len(), 100);
@@ -539,89 +398,108 @@ mod tests {
         let mut builder = ArrowMetricsBatchBuilder::with_capacity(1000);
 
         // Create 1000 data points with only 10 unique service values
-        for i in 0..1000 {
+        for idx in 0..1000 {
+            let mut tags = HashMap::new();
+            tags.insert("service".to_string(), format!("service-{}", idx % 10));
+            tags.insert("env".to_string(), "prod".to_string());
+            tags.insert("datacenter".to_string(), format!("dc-{}", idx % 4));
+            tags.insert("service_name".to_string(), format!("svc-{}", idx % 5));
+
             let dp = MetricDataPoint {
                 metric_name: "test.metric".to_string(),
                 metric_type: MetricType::Gauge,
-                metric_unit: None,
-                timestamp_secs: 1704067200 + i as u64,
-                start_timestamp_secs: None,
-                value: i as f64,
-                tag_service: Some(format!("service-{}", i % 10)),
-                tag_env: Some("prod".to_string()),
-                tag_datacenter: Some(format!("dc-{}", i % 4)),
-                tag_region: None,
-                tag_host: None,
-                attributes: HashMap::new(),
-                service_name: format!("svc-{}", i % 5),
-                resource_attributes: HashMap::new(),
+                timestamp_secs: 1704067200 + idx as u64,
+                value: idx as f64,
+                tags,
             };
-            builder.append(&dp);
+            builder.append(dp);
         }
 
         let batch = builder.finish();
         assert_eq!(batch.num_rows(), 1000);
 
         // Verify the batch was created successfully with dictionary encoding
-        // The dictionary arrays should have far fewer unique values than rows
         let schema = batch.schema();
 
-        // Check that tag_service uses dictionary encoding
-        let tag_service_field = schema.field_with_name("tag_service").unwrap();
+        // Check that the service tag uses dictionary encoding
+        let service_field = schema.field_with_name("service").unwrap();
         assert!(matches!(
-            tag_service_field.data_type(),
+            service_field.data_type(),
             DataType::Dictionary(_, _)
         ));
     }
 
     #[test]
     fn test_null_handling() {
+        let mut tags = HashMap::new();
+        tags.insert("service_name".to_string(), "unknown".to_string());
+
         let dp = MetricDataPoint {
             metric_name: "minimal.metric".to_string(),
             metric_type: MetricType::Gauge,
-            metric_unit: None, // null
             timestamp_secs: 1704067200,
-            start_timestamp_secs: None, // null
             value: 0.0,
-            tag_service: None, // null
-            tag_env: None,     // null
-            tag_datacenter: None,
-            tag_region: None,
-            tag_host: None,
-            attributes: HashMap::new(), // empty -> null
-            service_name: "unknown".to_string(),
-            resource_attributes: HashMap::new(), // empty -> null
+            tags,
         };
 
         let mut builder = ArrowMetricsBatchBuilder::with_capacity(1);
-        builder.append(&dp);
+        builder.append(dp);
         let batch = builder.finish();
 
         assert_eq!(batch.num_rows(), 1);
-        // The batch should handle nulls correctly
+        // 4 fixed columns + 1 tag column (service_name)
+        assert_eq!(batch.num_columns(), 5);
     }
 
     #[test]
-    fn test_schema_field_count() {
-        let schema = metrics_arrow_schema();
-        assert_eq!(schema.fields().len(), 14);
+    fn test_dynamic_schema_discovery() {
+        let mut builder = ArrowMetricsBatchBuilder::with_capacity(2);
 
-        // Verify field names
+        // First data point has tags: env, host
+        let mut tags1 = HashMap::new();
+        tags1.insert("env".to_string(), "prod".to_string());
+        tags1.insert("host".to_string(), "server-1".to_string());
+
+        builder.append(MetricDataPoint {
+            metric_name: "metric.a".to_string(),
+            metric_type: MetricType::Gauge,
+            timestamp_secs: 1704067200,
+            value: 1.0,
+            tags: tags1,
+        });
+
+        // Second data point has tags: env, region (different set)
+        let mut tags2 = HashMap::new();
+        tags2.insert("env".to_string(), "staging".to_string());
+        tags2.insert("region".to_string(), "us-west".to_string());
+
+        builder.append(MetricDataPoint {
+            metric_name: "metric.b".to_string(),
+            metric_type: MetricType::Sum,
+            timestamp_secs: 1704067201,
+            value: 2.0,
+            tags: tags2,
+        });
+
+        let batch = builder.finish();
+        assert_eq!(batch.num_rows(), 2);
+        // 4 fixed + 3 tag columns (env, host, region) - sorted alphabetically
+        assert_eq!(batch.num_columns(), 7);
+
+        let schema = batch.schema();
         let field_names: Vec<&str> = schema.fields().iter().map(|f| f.name().as_str()).collect();
-        assert!(field_names.contains(&"metric_name"));
-        assert!(field_names.contains(&"metric_type"));
-        assert!(field_names.contains(&"metric_unit"));
-        assert!(field_names.contains(&"timestamp_secs"));
-        assert!(field_names.contains(&"start_timestamp_secs"));
-        assert!(field_names.contains(&"value"));
-        assert!(field_names.contains(&"tag_service"));
-        assert!(field_names.contains(&"tag_env"));
-        assert!(field_names.contains(&"tag_datacenter"));
-        assert!(field_names.contains(&"tag_region"));
-        assert!(field_names.contains(&"tag_host"));
-        assert!(field_names.contains(&"attributes"));
-        assert!(field_names.contains(&"service_name"));
-        assert!(field_names.contains(&"resource_attributes"));
+        assert_eq!(
+            field_names,
+            vec![
+                "metric_name",
+                "metric_type",
+                "timestamp_secs",
+                "value",
+                "env",
+                "host",
+                "region",
+            ]
+        );
     }
 
     #[test]
@@ -638,24 +516,20 @@ mod tests {
     fn test_ipc_round_trip() {
         // Build a RecordBatch
         let mut builder = ArrowMetricsBatchBuilder::with_capacity(10);
-        for i in 0..10 {
+        for idx in 0..10 {
+            let mut tags = HashMap::new();
+            tags.insert("service".to_string(), format!("service-{}", idx % 3));
+            tags.insert("env".to_string(), "prod".to_string());
+            tags.insert("service_name".to_string(), "test-service".to_string());
+
             let dp = MetricDataPoint {
                 metric_name: "test.metric".to_string(),
                 metric_type: MetricType::Gauge,
-                metric_unit: None,
-                timestamp_secs: 1704067200 + i as u64,
-                start_timestamp_secs: None,
-                value: i as f64 * 0.1,
-                tag_service: Some(format!("service-{}", i % 3)),
-                tag_env: Some("prod".to_string()),
-                tag_datacenter: None,
-                tag_region: None,
-                tag_host: None,
-                attributes: HashMap::new(),
-                service_name: "test-service".to_string(),
-                resource_attributes: HashMap::new(),
+                timestamp_secs: 1704067200 + idx as u64,
+                value: idx as f64 * 0.1,
+                tags,
             };
-            builder.append(&dp);
+            builder.append(dp);
         }
         let original_batch = builder.finish();
 
@@ -682,24 +556,18 @@ mod tests {
         let mut doc_uid_generator = DocUidGenerator::default();
         let mut doc_uids = Vec::new();
 
-        for i in 0..5 {
+        for idx in 0..5 {
+            let mut tags = HashMap::new();
+            tags.insert("service_name".to_string(), "test".to_string());
+
             let dp = MetricDataPoint {
                 metric_name: "test.metric".to_string(),
                 metric_type: MetricType::Gauge,
-                metric_unit: None,
-                timestamp_secs: 1704067200 + i as u64,
-                start_timestamp_secs: None,
-                value: i as f64,
-                tag_service: None,
-                tag_env: None,
-                tag_datacenter: None,
-                tag_region: None,
-                tag_host: None,
-                attributes: HashMap::new(),
-                service_name: "test".to_string(),
-                resource_attributes: HashMap::new(),
+                timestamp_secs: 1704067200 + idx as u64,
+                value: idx as f64,
+                tags,
             };
-            builder.append(&dp);
+            builder.append(dp);
             doc_uids.push(doc_uid_generator.next_doc_uid());
         }
         let batch = builder.finish();
diff --git a/quickwit/quickwit-opentelemetry/src/otlp/mod.rs b/quickwit/quickwit-opentelemetry/src/otlp/mod.rs
index 67abd6c1d68..9927c4fd800 100644
--- a/quickwit/quickwit-opentelemetry/src/otlp/mod.rs
+++ b/quickwit/quickwit-opentelemetry/src/otlp/mod.rs
@@ -37,7 +37,7 @@ mod traces;
 
 pub use arrow_metrics::{
     ArrowDocBatchV2Builder, ArrowIpcError, ArrowMetricsBatchBuilder, ipc_to_json_values,
-    ipc_to_record_batch, metrics_arrow_schema, record_batch_to_ipc,
+    ipc_to_record_batch, record_batch_to_ipc,
 };
 pub use logs::{
     JsonLogIterator, OTEL_LOGS_INDEX_ID, OtlpGrpcLogsService, OtlpLogsError, parse_otlp_logs_json,
diff --git a/quickwit/quickwit-opentelemetry/src/otlp/otel_metrics.rs b/quickwit/quickwit-opentelemetry/src/otlp/otel_metrics.rs
index bcb67c5fadb..1a19e5fea53 100644
--- a/quickwit/quickwit-opentelemetry/src/otlp/otel_metrics.rs
+++ b/quickwit/quickwit-opentelemetry/src/otlp/otel_metrics.rs
@@ -134,59 +134,14 @@ impl From<OtlpMetricsError> for tonic::Status {
     }
 }
 
-/// Represents a single metric data point document
+/// Represents a single metric data point document.
 #[derive(Debug, Clone)]
 pub struct MetricDataPoint {
-    // Metric identity
     pub metric_name: String,
     pub metric_type: MetricType,
-    pub metric_unit: Option<String>,
-
-    // Timestamps (seconds granularity)
     pub timestamp_secs: u64,
-    pub start_timestamp_secs: Option<u64>,
-
-    // Value (f64 only)
     pub value: f64,
-
-    // Explicit tag columns
-    pub tag_service: Option<String>,
-    pub tag_env: Option<String>,
-    pub tag_datacenter: Option<String>,
-    pub tag_region: Option<String>,
-    pub tag_host: Option<String>,
-
-    // Dynamic tags (remaining attributes)
-    pub attributes: HashMap<String, JsonValue>,
-
-    // Resource metadata
-    pub service_name: String,
-    pub resource_attributes: HashMap<String, JsonValue>,
-}
-
-struct ExplicitTags {
-    service: Option<String>,
-    env: Option<String>,
-    datacenter: Option<String>,
-    region: Option<String>,
-    host: Option<String>,
-}
-
-fn extract_string_tag(attributes: &mut HashMap<String, JsonValue>, key: &str) -> Option<String> {
-    attributes.remove(key).and_then(|v| match v {
-        JsonValue::String(s) => Some(s),
-        _ => None,
-    })
-}
-
-fn extract_explicit_tags(attributes: &mut HashMap<String, JsonValue>) -> ExplicitTags {
-    ExplicitTags {
-        service: extract_string_tag(attributes, "service"),
-        env: extract_string_tag(attributes, "env"),
-        datacenter: extract_string_tag(attributes, "datacenter"),
-        region: extract_string_tag(attributes, "region"),
-        host: extract_string_tag(attributes, "host"),
-    }
+    pub tags: HashMap<String, String>,
 }
 
 /// Convert nanoseconds to seconds
@@ -194,6 +149,16 @@ fn nanos_to_secs(nanos: u64) -> u64 {
     nanos / 1_000_000_000
 }
 
+/// Convert a `serde_json::Value` to a plain `String`.
+fn json_value_to_string(value: JsonValue) -> String {
+    match value {
+        JsonValue::String(s) => s,
+        JsonValue::Number(n) => n.to_string(),
+        JsonValue::Bool(b) => b.to_string(),
+        other => serde_json::to_string(&other).unwrap_or_default(),
+    }
+}
+
 struct ParsedMetrics {
     doc_batch: DocBatchV2,
     num_data_points: u64,
@@ -243,7 +208,7 @@ impl OtlpGrpcMetricsService {
             Status::internal("failed to parse metric records")
         })??;
 
-        if num_data_points == num_parse_errors {
+        if num_data_points > 0 && num_data_points == num_parse_errors {
             return Err(tonic::Status::internal(error_message));
         }
 
@@ -274,15 +239,19 @@ impl OtlpGrpcMetricsService {
         request: ExportMetricsServiceRequest,
         parent_span: RuntimeSpan,
     ) -> tonic::Result<ParsedMetrics> {
-        let data_points = parse_otlp_metrics(request)?;
-        let num_data_points = data_points.len() as u64;
-
-        // Build Arrow RecordBatch from data points
-        let mut arrow_builder = ArrowMetricsBatchBuilder::with_capacity(num_data_points as usize);
+        let ParseOtlpResult {
+            data_points,
+            num_rejected,
+        } = parse_otlp_metrics(request);
+        let num_data_points = data_points.len() as u64 + num_rejected;
+
+        // Build Arrow RecordBatch from valid data points
+        let mut arrow_builder =
+            ArrowMetricsBatchBuilder::with_capacity(data_points.len());
         let mut doc_uid_generator = DocUidGenerator::default();
-        let mut doc_uids = Vec::with_capacity(num_data_points as usize);
+        let mut doc_uids = Vec::with_capacity(data_points.len());
 
-        for data_point in &data_points {
+        for data_point in data_points {
             arrow_builder.append(data_point);
             doc_uids.push(doc_uid_generator.next_doc_uid());
         }
@@ -300,13 +269,22 @@ impl OtlpGrpcMetricsService {
         let current_span = RuntimeSpan::current();
         current_span.record("num_data_points", num_data_points);
         current_span.record("num_bytes", doc_batch.num_bytes());
-        current_span.record("num_parse_errors", 0u64);
+        current_span.record("num_parse_errors", num_rejected);
+
+        let error_message = if num_rejected > 0 {
+            format!(
+                "{num_rejected} data point(s) rejected (unsupported temporality or missing \
+                 required fields)"
+            )
+        } else {
+            String::new()
+        };
 
         let parsed_metrics = ParsedMetrics {
             doc_batch,
             num_data_points,
-            num_parse_errors: 0,
-            error_message: String::new(),
+            num_parse_errors: num_rejected,
+            error_message,
         };
         Ok(parsed_metrics)
     }
@@ -381,16 +359,19 @@ impl MetricsService for OtlpGrpcMetricsService {
     }
 }
 
-fn parse_otlp_metrics(
-    request: ExportMetricsServiceRequest,
-) -> Result<Vec<MetricDataPoint>, OtlpMetricsError> {
+struct ParseOtlpResult {
+    data_points: Vec<MetricDataPoint>,
+    num_rejected: u64,
+}
+
+fn parse_otlp_metrics(request: ExportMetricsServiceRequest) -> ParseOtlpResult {
     let mut data_points = Vec::new();
+    let mut num_rejected: u64 = 0;
 
     for resource_metrics in request.resource_metrics {
         let mut resource_attributes = extract_attributes(
             resource_metrics
                 .resource
-                .clone()
                 .map(|rsrc| rsrc.attributes)
                 .unwrap_or_default(),
         );
@@ -401,25 +382,23 @@ fn parse_otlp_metrics(
 
         for scope_metrics in resource_metrics.scope_metrics {
             for metric in scope_metrics.metrics {
-                parse_metric(
-                    &metric,
-                    &service_name,
-                    &resource_attributes,
-                    &mut data_points,
-                )?;
+                parse_metric(&metric, &service_name, &mut data_points, &mut num_rejected);
             }
         }
     }
 
-    Ok(data_points)
+    ParseOtlpResult {
+        data_points,
+        num_rejected,
+    }
 }
 
 fn parse_metric(
     metric: &Metric,
     service_name: &str,
-    resource_attributes: &HashMap<String, JsonValue>,
     data_points: &mut Vec<MetricDataPoint>,
-) -> Result<(), OtlpMetricsError> {
+    num_rejected: &mut u64,
+) {
     let metric_name = metric.name.clone();
     let metric_unit = if metric.unit.is_empty() {
         None
@@ -430,36 +409,47 @@ fn parse_metric(
     match &metric.data {
         Some(metric::Data::Gauge(gauge)) => {
             for dp in &gauge.data_points {
-                let data_point = create_number_data_point(
+                match create_number_data_point(
                     &metric_name,
                     MetricType::Gauge,
                     &metric_unit,
                     dp,
                     service_name,
-                    resource_attributes,
-                )?;
-                data_points.push(data_point);
+                ) {
+                    Ok(Some(data_point)) => data_points.push(data_point),
+                    Ok(None) => *num_rejected += 1,
+                    Err(err) => {
+                        warn!(error = %err, metric_name, "skipping invalid gauge data point");
+                        *num_rejected += 1;
+                    }
+                }
             }
         }
         Some(metric::Data::Sum(sum)) => {
-            // Only support DELTA temporality
             if sum.aggregation_temporality == AggregationTemporality::Cumulative as i32 {
-                return Err(OtlpMetricsError::InvalidArgument(
-                    "cumulative aggregation temporality is not supported, only delta is supported"
-                        .to_string(),
-                ));
+                warn!(
+                    metric_name,
+                    "skipping sum metric with cumulative temporality (only delta is supported)"
+                );
+                *num_rejected += sum.data_points.len() as u64;
+                return;
             }
 
             for dp in &sum.data_points {
-                let data_point = create_number_data_point(
+                match create_number_data_point(
                     &metric_name,
                     MetricType::Sum,
                     &metric_unit,
                     dp,
                     service_name,
-                    resource_attributes,
-                )?;
-                data_points.push(data_point);
+                ) {
+                    Ok(Some(data_point)) => data_points.push(data_point),
+                    Ok(None) => *num_rejected += 1,
+                    Err(err) => {
+                        warn!(error = %err, metric_name, "skipping invalid sum data point");
+                        *num_rejected += 1;
+                    }
+                }
             }
         }
         Some(metric::Data::Histogram(_)) => {
@@ -475,8 +465,6 @@ fn parse_metric(
             warn!("metric has no data, skipping");
         }
     }
-
-    Ok(())
 }
 
 fn create_number_data_point(
@@ -485,8 +473,20 @@ fn create_number_data_point(
     metric_unit: &Option<String>,
     dp: &NumberDataPoint,
     service_name: &str,
-    resource_attributes: &HashMap<String, JsonValue>,
-) -> Result<MetricDataPoint, OtlpMetricsError> {
+) -> Result<Option<MetricDataPoint>, OtlpMetricsError> {
+    // Convert timestamps to seconds
+    let timestamp_secs = nanos_to_secs(dp.time_unix_nano);
+
+    // Validate: skip data points with empty metric_name or zero timestamp
+    if metric_name.is_empty() {
+        warn!("skipping data point with empty metric_name");
+        return Ok(None);
+    }
+    if timestamp_secs == 0 {
+        warn!("skipping data point with zero timestamp_secs");
+        return Ok(None);
+    }
+
     // Extract value as f64
     let value = match &dp.value {
         Some(
@@ -500,34 +500,39 @@ fn create_number_data_point(
         None => 0.0,
     };
 
-    // Extract attributes and explicit tags
-    let mut attributes = extract_attributes(dp.attributes.clone());
-    let explicit_tags = extract_explicit_tags(&mut attributes);
+    // Extract attributes and convert all values to strings for tags
+    let attributes = extract_attributes(dp.attributes.clone());
+    let mut tags = HashMap::with_capacity(attributes.len() + 3);
 
-    // Convert timestamps to seconds
-    let timestamp_secs = nanos_to_secs(dp.time_unix_nano);
-    let start_timestamp_secs = if dp.start_time_unix_nano != 0 {
-        Some(nanos_to_secs(dp.start_time_unix_nano))
-    } else {
-        None
-    };
+    for (key, json_val) in attributes {
+        tags.insert(key, json_value_to_string(json_val));
+    }
+
+    // Add metric_unit and start_timestamp_secs using or_insert_with so a
+    // data-point attribute with the same name is not silently overwritten.
+    if let Some(unit) = metric_unit {
+        tags.entry("metric_unit".to_string())
+            .or_insert_with(|| unit.clone());
+    }
 
-    Ok(MetricDataPoint {
+    if dp.start_time_unix_nano != 0 {
+        let start_ts = nanos_to_secs(dp.start_time_unix_nano);
+        tags.entry("start_timestamp_secs".to_string())
+            .or_insert_with(|| start_ts.to_string());
+    }
+
+    // Fall back to the resource-level service.name if no data-point-level
+    // "service" tag was set. Data-point attributes take precedence.
+    tags.entry("service".to_string())
+        .or_insert_with(|| service_name.to_string());
+
+    Ok(Some(MetricDataPoint {
         metric_name: metric_name.to_string(),
         metric_type,
-        metric_unit: metric_unit.clone(),
         timestamp_secs,
-        start_timestamp_secs,
         value,
-        tag_service: explicit_tags.service,
-        tag_env: explicit_tags.env,
-        tag_datacenter: explicit_tags.datacenter,
-        tag_region: explicit_tags.region,
-        tag_host: explicit_tags.host,
-        attributes,
-        service_name: service_name.to_string(),
-        resource_attributes: resource_attributes.clone(),
-    })
+        tags,
+    }))
 }
 
 #[cfg(test)]
@@ -562,42 +567,6 @@ mod tests {
         assert_eq!(nanos_to_secs(2_000_000_000), 2);
     }
 
-    #[test]
-    fn test_extract_explicit_tags() {
-        let mut attributes = HashMap::from([
-            ("service".to_string(), JsonValue::String("api".to_string())),
-            ("env".to_string(), JsonValue::String("prod".to_string())),
-            (
-                "datacenter".to_string(),
-                JsonValue::String("us-east".to_string()),
-            ),
-            (
-                "region".to_string(),
-                JsonValue::String("us-east-1".to_string()),
-            ),
-            (
-                "host".to_string(),
-                JsonValue::String("server-1".to_string()),
-            ),
-            (
-                "custom_tag".to_string(),
-                JsonValue::String("custom_value".to_string()),
-            ),
-        ]);
-
-        let explicit_tags = extract_explicit_tags(&mut attributes);
-
-        assert_eq!(explicit_tags.service, Some("api".to_string()));
-        assert_eq!(explicit_tags.env, Some("prod".to_string()));
-        assert_eq!(explicit_tags.datacenter, Some("us-east".to_string()));
-        assert_eq!(explicit_tags.region, Some("us-east-1".to_string()));
-        assert_eq!(explicit_tags.host, Some("server-1".to_string()));
-
-        // custom_tag should remain in attributes
-        assert_eq!(attributes.len(), 1);
-        assert!(attributes.contains_key("custom_tag"));
-    }
-
     fn make_test_gauge_request() -> ExportMetricsServiceRequest {
         use quickwit_proto::opentelemetry::proto::common::v1::{AnyValue, KeyValue, any_value};
         use quickwit_proto::opentelemetry::proto::metrics::v1::{
@@ -747,35 +716,44 @@ mod tests {
     #[test]
     fn test_parse_gauge_metrics() {
         let request = make_test_gauge_request();
-        let data_points = parse_otlp_metrics(request).unwrap();
+        let data_points = parse_otlp_metrics(request).data_points;
 
         assert_eq!(data_points.len(), 1);
         let dp = &data_points[0];
         assert_eq!(dp.metric_name, "cpu.usage");
         assert_eq!(dp.metric_type, MetricType::Gauge);
-        assert_eq!(dp.metric_unit, Some("%".to_string()));
+        assert_eq!(dp.tags.get("metric_unit").map(|s| s.as_str()), Some("%"));
         assert_eq!(dp.timestamp_secs, 2);
-        assert_eq!(dp.start_timestamp_secs, Some(1));
+        assert_eq!(
+            dp.tags.get("start_timestamp_secs").map(|s| s.as_str()),
+            Some("1")
+        );
         assert_eq!(dp.value, 85.5);
-        assert_eq!(dp.tag_service, Some("api".to_string()));
-        assert_eq!(dp.tag_env, Some("prod".to_string()));
-        assert_eq!(dp.service_name, "test-service");
+        // Data-point attribute "service" takes precedence over resource-level service.name.
+        assert_eq!(dp.tags.get("service").map(|s| s.as_str()), Some("api"));
+        assert_eq!(dp.tags.get("env").map(|s| s.as_str()), Some("prod"));
     }
 
     #[test]
     fn test_parse_sum_delta_metrics() {
         let request = make_test_sum_delta_request();
-        let data_points = parse_otlp_metrics(request).unwrap();
+        let data_points = parse_otlp_metrics(request).data_points;
 
         assert_eq!(data_points.len(), 1);
         let dp = &data_points[0];
         assert_eq!(dp.metric_name, "http.requests");
         assert_eq!(dp.metric_type, MetricType::Sum);
-        assert_eq!(dp.metric_unit, Some("1".to_string()));
+        assert_eq!(dp.tags.get("metric_unit").map(|s| s.as_str()), Some("1"));
         assert_eq!(dp.timestamp_secs, 2);
         assert_eq!(dp.value, 100.0); // int converted to f64
-        assert_eq!(dp.tag_host, Some("server-1".to_string()));
-        assert_eq!(dp.service_name, "counter-service");
+        assert_eq!(
+            dp.tags.get("host").map(|s| s.as_str()),
+            Some("server-1")
+        );
+        assert_eq!(
+            dp.tags.get("service").map(|s| s.as_str()),
+            Some("counter-service")
+        );
     }
 
     #[test]
@@ -783,11 +761,10 @@ mod tests {
         let request = make_test_sum_cumulative_request();
         let result = parse_otlp_metrics(request);
 
-        assert!(result.is_err());
-        match result.unwrap_err() {
-            OtlpMetricsError::InvalidArgument(_) => {}
-            err => panic!("unexpected error type: {:?}", err),
-        }
+        // Cumulative sums are skipped (not a hard error) so other metrics in the same
+        // request can still be processed. The rejected count is incremented instead.
+        assert_eq!(result.data_points.len(), 0);
+        assert_eq!(result.num_rejected, 1);
     }
 
     /// Test parsing metrics with various attribute types
@@ -860,19 +837,17 @@ mod tests {
             }],
         };
 
-        let data_points = parse_otlp_metrics(request).unwrap();
+        let data_points = parse_otlp_metrics(request).data_points;
         assert_eq!(data_points.len(), 1);
 
         let dp = &data_points[0];
-        assert_eq!(dp.service_name, "test");
-
-        // Verify resource attributes contain the non-service.name attributes
-        assert!(dp.resource_attributes.contains_key("int_attr"));
-        assert!(dp.resource_attributes.contains_key("bool_attr"));
-        assert!(dp.resource_attributes.contains_key("double_attr"));
+        assert_eq!(dp.tags.get("service").map(|s| s.as_str()), Some("test"));
 
-        // Verify data point attributes
-        assert!(dp.attributes.contains_key("string_tag"));
+        // Verify data point attributes are in tags as strings
+        assert_eq!(
+            dp.tags.get("string_tag").map(|s| s.as_str()),
+            Some("value")
+        );
     }
 
     /// Test metrics with empty and missing values
@@ -908,17 +883,94 @@ mod tests {
             }],
         };
 
-        let data_points = parse_otlp_metrics(request).unwrap();
+        let data_points = parse_otlp_metrics(request).data_points;
         assert_eq!(data_points.len(), 1);
 
         let dp = &data_points[0];
         assert_eq!(dp.metric_name, "minimal.metric");
-        assert_eq!(dp.service_name, "unknown_service"); // Default value
-        assert!(dp.metric_unit.is_none());
-        assert!(dp.start_timestamp_secs.is_none());
-        assert!(dp.tag_service.is_none());
-        assert!(dp.tag_env.is_none());
-        assert!(dp.attributes.is_empty());
-        assert!(dp.resource_attributes.is_empty());
+        assert_eq!(
+            dp.tags.get("service").map(|s| s.as_str()),
+            Some("unknown_service")
+        );
+        // No metric_unit tag when unit is empty
+        assert!(dp.tags.get("metric_unit").is_none());
+        // No start_timestamp_secs tag when start time is 0
+        assert!(dp.tags.get("start_timestamp_secs").is_none());
+        // Only "service" should be in tags (no attributes, no unit, no start time)
+        assert_eq!(dp.tags.len(), 1);
+    }
+
+    /// Test that data points with empty metric_name are skipped
+    #[test]
+    fn test_skip_empty_metric_name() {
+        use quickwit_proto::opentelemetry::proto::metrics::v1::{
+            Gauge, ResourceMetrics, ScopeMetrics, number_data_point,
+        };
+
+        let request = ExportMetricsServiceRequest {
+            resource_metrics: vec![ResourceMetrics {
+                resource: None,
+                scope_metrics: vec![ScopeMetrics {
+                    scope: None,
+                    metrics: vec![Metric {
+                        name: String::new(), // Empty name
+                        description: String::new(),
+                        unit: String::new(),
+                        data: Some(metric::Data::Gauge(Gauge {
+                            data_points: vec![NumberDataPoint {
+                                attributes: Vec::new(),
+                                start_time_unix_nano: 0,
+                                time_unix_nano: 1_000_000_000,
+                                exemplars: Vec::new(),
+                                flags: 0,
+                                value: Some(number_data_point::Value::AsDouble(1.0)),
+                            }],
+                        })),
+                    }],
+                    schema_url: String::new(),
+                }],
+                schema_url: String::new(),
+            }],
+        };
+
+        let data_points = parse_otlp_metrics(request).data_points;
+        assert_eq!(data_points.len(), 0);
+    }
+
+    /// Test that data points with zero timestamp are skipped
+    #[test]
+    fn test_skip_zero_timestamp() {
+        use quickwit_proto::opentelemetry::proto::metrics::v1::{
+            Gauge, ResourceMetrics, ScopeMetrics, number_data_point,
+        };
+
+        let request = ExportMetricsServiceRequest {
+            resource_metrics: vec![ResourceMetrics {
+                resource: None,
+                scope_metrics: vec![ScopeMetrics {
+                    scope: None,
+                    metrics: vec![Metric {
+                        name: "test.metric".to_string(),
+                        description: String::new(),
+                        unit: String::new(),
+                        data: Some(metric::Data::Gauge(Gauge {
+                            data_points: vec![NumberDataPoint {
+                                attributes: Vec::new(),
+                                start_time_unix_nano: 0,
+                                time_unix_nano: 0, // Zero timestamp
+                                exemplars: Vec::new(),
+                                flags: 0,
+                                value: Some(number_data_point::Value::AsDouble(1.0)),
+                            }],
+                        })),
+                    }],
+                    schema_url: String::new(),
+                }],
+                schema_url: String::new(),
+            }],
+        };
+
+        let data_points = parse_otlp_metrics(request).data_points;
+        assert_eq!(data_points.len(), 0);
     }
 }

From 4e928fea88d1217d8416aa2939538671345d62ea Mon Sep 17 00:00:00 2001
From: Matthew Kim <matthew.kim@datadoghq.com>
Date: Wed, 18 Mar 2026 11:02:04 -0400
Subject: [PATCH 02/35] feat: replace ParquetField enum with constants and
 dynamic validation

---
 quickwit/Cargo.lock                           |   1 +
 quickwit/quickwit-parquet-engine/Cargo.toml   |   1 +
 .../src/schema/fields.rs                      | 180 ++++--------------
 .../quickwit-parquet-engine/src/schema/mod.rs |   2 +-
 .../src/schema/parquet.rs                     |  68 +++----
 5 files changed, 76 insertions(+), 176 deletions(-)

diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock
index d099964a383..684303cb6a8 100644
--- a/quickwit/Cargo.lock
+++ b/quickwit/Cargo.lock
@@ -7611,6 +7611,7 @@ dependencies = [
 name = "quickwit-parquet-engine"
 version = "0.8.0"
 dependencies = [
+ "anyhow",
  "arrow",
  "parquet",
  "proptest",
diff --git a/quickwit/quickwit-parquet-engine/Cargo.toml b/quickwit/quickwit-parquet-engine/Cargo.toml
index f886083eded..9842ceb6f00 100644
--- a/quickwit/quickwit-parquet-engine/Cargo.toml
+++ b/quickwit/quickwit-parquet-engine/Cargo.toml
@@ -11,6 +11,7 @@ authors.workspace = true
 license.workspace = true
 
 [dependencies]
+anyhow = { workspace = true }
 arrow = { workspace = true }
 parquet = { workspace = true }
 quickwit-common = { workspace = true }
diff --git a/quickwit/quickwit-parquet-engine/src/schema/fields.rs b/quickwit/quickwit-parquet-engine/src/schema/fields.rs
index b6a52c738aa..a8646823017 100644
--- a/quickwit/quickwit-parquet-engine/src/schema/fields.rs
+++ b/quickwit/quickwit-parquet-engine/src/schema/fields.rs
@@ -12,151 +12,55 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-//! Parquet field definitions with column metadata.
+//! Parquet field definitions with sort order constants and validation.
 
-use arrow::datatypes::{DataType, Field, Fields};
-use parquet::variant::VariantType;
+use anyhow::{bail, Result};
+use arrow::datatypes::DataType;
 
-/// All fields in the parquet schema.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
-pub enum ParquetField {
-    MetricName,
-    MetricType,
-    MetricUnit,
-    TimestampSecs,
-    StartTimestampSecs,
-    Value,
-    TagService,
-    TagEnv,
-    TagDatacenter,
-    TagRegion,
-    TagHost,
-    Attributes,
-    ServiceName,
-    ResourceAttributes,
-}
+/// Required field names that must exist in every batch.
+pub const REQUIRED_FIELDS: &[&str] = &["metric_name", "metric_type", "timestamp_secs", "value"];
 
-impl ParquetField {
-    /// Field name as stored in Parquet.
-    pub fn name(&self) -> &'static str {
-        match self {
-            Self::MetricName => "metric_name",
-            Self::MetricType => "metric_type",
-            Self::MetricUnit => "metric_unit",
-            Self::TimestampSecs => "timestamp_secs",
-            Self::StartTimestampSecs => "start_timestamp_secs",
-            Self::Value => "value",
-            Self::TagService => "tag_service",
-            Self::TagEnv => "tag_env",
-            Self::TagDatacenter => "tag_datacenter",
-            Self::TagRegion => "tag_region",
-            Self::TagHost => "tag_host",
-            Self::Attributes => "attributes",
-            Self::ServiceName => "service_name",
-            Self::ResourceAttributes => "resource_attributes",
-        }
-    }
+/// Sort order column names. Columns not present in a batch are skipped.
+pub const SORT_ORDER: &[&str] = &[
+    "metric_name",
+    "service",
+    "env",
+    "datacenter",
+    "region",
+    "host",
+    "timestamp_secs",
+];
 
-    /// Whether this field is nullable.
-    pub fn is_nullable(&self) -> bool {
-        matches!(
-            self,
-            Self::MetricUnit
-                | Self::StartTimestampSecs
-                | Self::TagService
-                | Self::TagEnv
-                | Self::TagDatacenter
-                | Self::TagRegion
-                | Self::TagHost
-                | Self::Attributes
-                | Self::ResourceAttributes
-        )
+/// Arrow type for required fields by name.
+pub fn required_field_type(name: &str) -> Option<DataType> {
+    match name {
+        "metric_name" => Some(DataType::Dictionary(
+            Box::new(DataType::Int32),
+            Box::new(DataType::Utf8),
+        )),
+        "metric_type" => Some(DataType::UInt8),
+        "timestamp_secs" => Some(DataType::UInt64),
+        "value" => Some(DataType::Float64),
+        _ => None,
     }
+}
 
-    /// Arrow DataType for this field.
-    /// Use dictionary encoding for high-cardinality strings.
-    pub fn arrow_type(&self) -> DataType {
-        match self {
-            // Dictionary-encoded strings for high cardinality
-            Self::MetricName | Self::ServiceName => {
-                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8))
+/// Validate that a batch schema contains all required fields with correct types.
+pub fn validate_required_fields(schema: &arrow::datatypes::Schema) -> Result<()> {
+    for &name in REQUIRED_FIELDS {
+        match schema.index_of(name) {
+            Ok(idx) => {
+                let expected_type = required_field_type(name).unwrap();
+                let actual_type = schema.field(idx).data_type();
+                if *actual_type != expected_type {
+                    bail!(
+                        "field '{}' has type {:?}, expected {:?}",
+                        name, actual_type, expected_type
+                    );
+                }
             }
-            // Dictionary-encoded optional tags
-            Self::TagService
-            | Self::TagEnv
-            | Self::TagDatacenter
-            | Self::TagRegion
-            | Self::TagHost => {
-                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8))
-            }
-            // Enum stored as UInt8
-            Self::MetricType => DataType::UInt8,
-            // Timestamps as UInt64 seconds
-            Self::TimestampSecs | Self::StartTimestampSecs => DataType::UInt64,
-            // Metric value
-            Self::Value => DataType::Float64,
-            // Plain string for metric unit
-            Self::MetricUnit => DataType::Utf8,
-            // VARIANT type for semi-structured attributes
-            // Uses the Parquet Variant binary encoding format
-            Self::Attributes | Self::ResourceAttributes => {
-                // VARIANT is stored as a struct with metadata and value BinaryView fields
-                // VariantArrayBuilder produces BinaryView, not Binary
-                DataType::Struct(Fields::from(vec![
-                    Field::new("metadata", DataType::BinaryView, false),
-                    Field::new("value", DataType::BinaryView, false),
-                ]))
-            }
-        }
-    }
-
-    /// Convert to Arrow Field.
-    pub fn to_arrow_field(&self) -> Field {
-        let field = Field::new(self.name(), self.arrow_type(), self.is_nullable());
-
-        // Add VARIANT extension type metadata for attributes fields
-        match self {
-            Self::Attributes | Self::ResourceAttributes => field.with_extension_type(VariantType),
-            _ => field,
+            Err(_) => bail!("missing required field '{}'", name),
         }
     }
-
-    /// All fields in schema order.
-    pub fn all() -> &'static [ParquetField] {
-        &[
-            Self::MetricName,
-            Self::MetricType,
-            Self::MetricUnit,
-            Self::TimestampSecs,
-            Self::StartTimestampSecs,
-            Self::Value,
-            Self::TagService,
-            Self::TagEnv,
-            Self::TagDatacenter,
-            Self::TagRegion,
-            Self::TagHost,
-            Self::Attributes,
-            Self::ServiceName,
-            Self::ResourceAttributes,
-        ]
-    }
-
-    /// Sort order for metrics data (used for pruning).
-    /// Order: metric_name, common tags (service, env, datacenter, region, host), timestamp.
-    pub fn sort_order() -> &'static [ParquetField] {
-        &[
-            Self::MetricName,
-            Self::TagService,
-            Self::TagEnv,
-            Self::TagDatacenter,
-            Self::TagRegion,
-            Self::TagHost,
-            Self::TimestampSecs,
-        ]
-    }
-
-    /// Get the column index in the schema.
-    pub fn column_index(&self) -> usize {
-        Self::all().iter().position(|f| f == self).unwrap()
-    }
+    Ok(())
 }
diff --git a/quickwit/quickwit-parquet-engine/src/schema/mod.rs b/quickwit/quickwit-parquet-engine/src/schema/mod.rs
index 4cbe31d357d..f9b5c06d9c4 100644
--- a/quickwit/quickwit-parquet-engine/src/schema/mod.rs
+++ b/quickwit/quickwit-parquet-engine/src/schema/mod.rs
@@ -20,5 +20,5 @@
 mod fields;
 mod parquet;
 
-pub use fields::ParquetField;
+pub use fields::{REQUIRED_FIELDS, SORT_ORDER, required_field_type, validate_required_fields};
 pub use parquet::ParquetSchema;
diff --git a/quickwit/quickwit-parquet-engine/src/schema/parquet.rs b/quickwit/quickwit-parquet-engine/src/schema/parquet.rs
index 83ef99e351c..d0991f89807 100644
--- a/quickwit/quickwit-parquet-engine/src/schema/parquet.rs
+++ b/quickwit/quickwit-parquet-engine/src/schema/parquet.rs
@@ -14,13 +14,7 @@
 
 //! Parquet schema construction for metrics.
 
-use std::sync::Arc;
-
-use arrow::datatypes::{Schema as ArrowSchema, SchemaRef};
-use parquet::arrow::ArrowSchemaConverter;
-use parquet::schema::types::SchemaDescriptor;
-
-use super::fields::ParquetField;
+use arrow::datatypes::SchemaRef;
 
 /// Parquet schema for storage.
 #[derive(Debug, Clone)]
@@ -29,15 +23,11 @@ pub struct ParquetSchema {
 }
 
 impl ParquetSchema {
-    /// Create a new ParquetSchema.
-    pub fn new() -> Self {
-        let fields: Vec<_> = ParquetField::all()
-            .iter()
-            .map(|f| f.to_arrow_field())
-            .collect();
-
-        let arrow_schema = Arc::new(ArrowSchema::new(fields));
-        Self { arrow_schema }
+    /// Create a ParquetSchema from an Arrow schema.
+    pub fn from_arrow_schema(schema: SchemaRef) -> Self {
+        Self {
+            arrow_schema: schema,
+        }
     }
 
     /// Get the Arrow schema.
@@ -45,11 +35,6 @@ impl ParquetSchema {
         &self.arrow_schema
     }
 
-    /// Convert to Parquet schema descriptor.
-    pub fn parquet_schema(&self) -> Result<SchemaDescriptor, parquet::errors::ParquetError> {
-        ArrowSchemaConverter::new().convert(&self.arrow_schema)
-    }
-
     /// Get field by name.
     pub fn field(&self, name: &str) -> Option<&arrow::datatypes::Field> {
         self.arrow_schema.field_with_name(name).ok()
@@ -61,33 +46,42 @@ impl ParquetSchema {
     }
 }
 
-impl Default for ParquetSchema {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
 #[cfg(test)]
 mod tests {
+    use std::sync::Arc;
+
+    use arrow::datatypes::{DataType, Field, Schema as ArrowSchema};
+
     use super::*;
 
+    fn create_test_schema() -> SchemaRef {
+        Arc::new(ArrowSchema::new(vec![
+            Field::new(
+                "metric_name",
+                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+                false,
+            ),
+            Field::new("metric_type", DataType::UInt8, false),
+            Field::new("timestamp_secs", DataType::UInt64, false),
+            Field::new("value", DataType::Float64, false),
+            Field::new(
+                "service",
+                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+                true,
+            ),
+        ]))
+    }
+
     #[test]
     fn test_schema_creation() {
-        let schema = ParquetSchema::new();
-        assert_eq!(schema.num_fields(), 14);
+        let schema = ParquetSchema::from_arrow_schema(create_test_schema());
+        assert_eq!(schema.num_fields(), 5);
     }
 
     #[test]
     fn test_field_lookup() {
-        let schema = ParquetSchema::new();
+        let schema = ParquetSchema::from_arrow_schema(create_test_schema());
         let field = schema.field("metric_name").unwrap();
         assert!(!field.is_nullable());
     }
-
-    #[test]
-    fn test_parquet_conversion() {
-        let schema = ParquetSchema::new();
-        let parquet_schema = schema.parquet_schema();
-        assert!(parquet_schema.is_ok());
-    }
 }

From d8d71ed515260592ce0d142ba018ce0224dd8e6d Mon Sep 17 00:00:00 2001
From: Matthew Kim <matthew.kim@datadoghq.com>
Date: Wed, 18 Mar 2026 11:02:13 -0400
Subject: [PATCH 03/35] feat: derive sort order and bloom filters from batch
 schema

---
 .../src/storage/config.rs                     | 247 ++++++-----
 .../src/storage/split_writer.rs               |  10 +-
 .../src/storage/writer.rs                     | 401 ++++++------------
 3 files changed, 244 insertions(+), 414 deletions(-)

diff --git a/quickwit/quickwit-parquet-engine/src/storage/config.rs b/quickwit/quickwit-parquet-engine/src/storage/config.rs
index f87845d2ba9..a10a423cf9c 100644
--- a/quickwit/quickwit-parquet-engine/src/storage/config.rs
+++ b/quickwit/quickwit-parquet-engine/src/storage/config.rs
@@ -14,12 +14,13 @@
 
 //! Parquet writer configuration for metrics storage.
 
+use arrow::datatypes::{DataType, Schema as ArrowSchema};
 use parquet::basic::Compression as ParquetCompression;
 use parquet::file::metadata::SortingColumn;
 use parquet::file::properties::{EnabledStatistics, WriterProperties, WriterPropertiesBuilder};
 use parquet::schema::types::ColumnPath;
 
-use crate::schema::ParquetField;
+use crate::schema::SORT_ORDER;
 
 /// Default row group size: 128K rows for efficient columnar scans.
 const DEFAULT_ROW_GROUP_SIZE: usize = 128 * 1024;
@@ -117,8 +118,9 @@ impl ParquetWriterConfig {
         self
     }
 
-    /// Convert to Parquet WriterProperties.
-    pub fn to_writer_properties(&self) -> WriterProperties {
+    /// Convert to Parquet WriterProperties using the given Arrow schema to configure
+    /// per-column settings like dictionary encoding and bloom filters.
+    pub fn to_writer_properties(&self, schema: &ArrowSchema) -> WriterProperties {
         let mut builder = WriterProperties::builder()
             .set_max_row_group_size(self.row_group_size)
             .set_data_page_size_limit(self.data_page_size)
@@ -126,7 +128,7 @@ impl ParquetWriterConfig {
             // Enable column index for efficient pruning on sorted data (64 bytes default)
             .set_column_index_truncate_length(Some(64))
             // Set sorting columns metadata for readers to use during pruning
-            .set_sorting_columns(Some(Self::sorting_columns()))
+            .set_sorting_columns(Some(Self::sorting_columns(schema)))
             // Enable row group level statistics (min/max/null_count) for query pruning
             // This allows DataFusion to skip row groups based on timestamp ranges
             .set_statistics_enabled(EnabledStatistics::Chunk);
@@ -142,77 +144,58 @@ impl ParquetWriterConfig {
             Compression::Uncompressed => builder.set_compression(ParquetCompression::UNCOMPRESSED),
         };
 
-        // Apply RLE_DICTIONARY encoding and bloom filters for dictionary columns
-        builder = Self::configure_dictionary_columns(builder);
+        // Apply dictionary encoding and bloom filters based on schema column types
+        builder = Self::configure_columns(builder, schema);
 
         builder.build()
     }
 
-    /// Configure dictionary encoding and bloom filters for high-cardinality columns.
+    /// Configure dictionary encoding and bloom filters based on the Arrow schema.
     ///
-    /// Dictionary-encoded columns benefit from:
-    /// - Dictionary encoding: Enabled by default, uses RLE for dictionary indices
-    /// - Bloom filters: Enable efficient equality filtering without scanning
-    ///
-    /// Note: Dictionary encoding is ON by default in Parquet. When enabled, the dictionary
-    /// indices are automatically encoded using RLE (run-length encoding), which efficiently
-    /// compresses runs of repeated values. This is ideal for sorted data where consecutive
-    /// rows often share the same dictionary index.
-    fn configure_dictionary_columns(
+    /// - Dictionary encoding is enabled on all Dictionary(Int32, Utf8) columns.
+    /// - Bloom filters are enabled on metric_name and sort order tag columns.
+    fn configure_columns(
         mut builder: WriterPropertiesBuilder,
+        schema: &ArrowSchema,
     ) -> WriterPropertiesBuilder {
-        // Dictionary-encoded columns - ensure dictionary encoding is explicitly enabled
-        // (default is true, but being explicit documents intent)
-        let dictionary_columns = [
-            ParquetField::MetricName,
-            ParquetField::TagService,
-            ParquetField::TagEnv,
-            ParquetField::TagDatacenter,
-            ParquetField::TagRegion,
-            ParquetField::TagHost,
-            ParquetField::ServiceName,
-        ];
-
-        // Columns that benefit from bloom filters (used in WHERE clauses)
-        // Note: We enable bloom filters on filtering columns, not timestamp_secs or value
-        let bloom_filter_columns = [
-            (ParquetField::MetricName, BLOOM_FILTER_NDV_METRIC_NAME),
-            (ParquetField::TagService, BLOOM_FILTER_NDV_TAGS),
-            (ParquetField::TagEnv, BLOOM_FILTER_NDV_TAGS),
-            (ParquetField::TagDatacenter, BLOOM_FILTER_NDV_TAGS),
-            (ParquetField::TagHost, BLOOM_FILTER_NDV_TAGS),
-            (ParquetField::ServiceName, BLOOM_FILTER_NDV_TAGS),
-        ];
-
-        // Ensure dictionary encoding is enabled on dictionary columns
-        // (dictionary encoding uses RLE for indices automatically)
-        for field in dictionary_columns {
+        for field in schema.fields() {
             let col_path = ColumnPath::new(vec![field.name().to_string()]);
-            builder = builder.set_column_dictionary_enabled(col_path, true);
-        }
 
-        // Enable bloom filters on filtering columns
-        for (field, ndv) in bloom_filter_columns {
-            let col_path = ColumnPath::new(vec![field.name().to_string()]);
-            builder = builder
-                .set_column_bloom_filter_enabled(col_path.clone(), true)
-                .set_column_bloom_filter_fpp(col_path.clone(), BLOOM_FILTER_FPP)
-                .set_column_bloom_filter_ndv(col_path, ndv);
-        }
+            // Enable dictionary encoding on all Dictionary(_, _) columns
+            if matches!(field.data_type(), DataType::Dictionary(_, _)) {
+                builder = builder.set_column_dictionary_enabled(col_path.clone(), true);
+            }
 
+            // Enable bloom filters on dictionary-typed metric_name and sort order tag columns.
+            // Exclude non-dictionary columns, like timestamp_secs.
+            let is_bloom_column = matches!(field.data_type(), DataType::Dictionary(_, _))
+                && (field.name() == "metric_name"
+                    || SORT_ORDER.contains(&field.name().as_str()));
+            if is_bloom_column {
+                let ndv = if field.name() == "metric_name" {
+                    BLOOM_FILTER_NDV_METRIC_NAME
+                } else {
+                    BLOOM_FILTER_NDV_TAGS
+                };
+                builder = builder
+                    .set_column_bloom_filter_enabled(col_path.clone(), true)
+                    .set_column_bloom_filter_fpp(col_path.clone(), BLOOM_FILTER_FPP)
+                    .set_column_bloom_filter_ndv(col_path, ndv);
+            }
+        }
         builder
     }
 
-    /// Get the sorting columns for parquet metadata.
-    /// Order: metric_name, tag_service, tag_env, tag_datacenter, tag_region, tag_host,
-    /// timestamp_secs.
-    fn sorting_columns() -> Vec<SortingColumn> {
-        ParquetField::sort_order()
+    /// Get the sorting columns for parquet metadata, computed from the schema
+    /// and SORT_ORDER. Only columns present in the schema are included.
+    fn sorting_columns(schema: &ArrowSchema) -> Vec<SortingColumn> {
+        SORT_ORDER
             .iter()
-            .map(|field| SortingColumn {
-                column_idx: field.column_index() as i32,
+            .filter_map(|name| schema.index_of(name).ok())
+            .map(|idx| SortingColumn {
+                column_idx: idx as i32,
                 descending: false,
-                nulls_first: true,
+                nulls_first: false,
             })
             .collect()
     }
@@ -220,8 +203,41 @@ impl ParquetWriterConfig {
 
 #[cfg(test)]
 mod tests {
+    use std::sync::Arc;
+
+    use arrow::datatypes::Field;
+
     use super::*;
 
+    /// Create a test schema with required fields + some tag columns.
+    fn create_test_schema() -> ArrowSchema {
+        ArrowSchema::new(vec![
+            Field::new(
+                "metric_name",
+                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+                false,
+            ),
+            Field::new("metric_type", DataType::UInt8, false),
+            Field::new("timestamp_secs", DataType::UInt64, false),
+            Field::new("value", DataType::Float64, false),
+            Field::new(
+                "service",
+                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+                true,
+            ),
+            Field::new(
+                "env",
+                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+                true,
+            ),
+            Field::new(
+                "host",
+                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+                true,
+            ),
+        ])
+    }
+
     #[test]
     fn test_default_config() {
         let config = ParquetWriterConfig::default();
@@ -245,93 +261,89 @@ mod tests {
     #[test]
     fn test_to_writer_properties_zstd() {
         let config = ParquetWriterConfig::default();
-        let props = config.to_writer_properties();
-        // WriterProperties doesn't expose compression directly, but we can verify it builds
+        let schema = create_test_schema();
+        let props = config.to_writer_properties(&schema);
         assert!(props.max_row_group_size() == 128 * 1024);
     }
 
     #[test]
     fn test_to_writer_properties_snappy() {
         let config = ParquetWriterConfig::new().with_compression(Compression::Snappy);
-        let props = config.to_writer_properties();
+        let schema = create_test_schema();
+        let props = config.to_writer_properties(&schema);
         assert!(props.max_row_group_size() == 128 * 1024);
     }
 
     #[test]
     fn test_to_writer_properties_uncompressed() {
         let config = ParquetWriterConfig::new().with_compression(Compression::Uncompressed);
-        let props = config.to_writer_properties();
+        let schema = create_test_schema();
+        let props = config.to_writer_properties(&schema);
         assert!(props.max_row_group_size() == 128 * 1024);
     }
 
     #[test]
     fn test_bloom_filter_configuration() {
         let config = ParquetWriterConfig::default();
-        let props = config.to_writer_properties();
+        let schema = create_test_schema();
+        let props = config.to_writer_properties(&schema);
 
         // Verify bloom filter is enabled on metric_name column
         let metric_name_path = ColumnPath::new(vec!["metric_name".to_string()]);
         let bloom_props = props.bloom_filter_properties(&metric_name_path);
         assert!(
             bloom_props.is_some(),
-            "Bloom filter should be enabled for metric_name"
+            "bloom filter should be enabled for metric_name"
         );
 
         let bloom_props = bloom_props.unwrap();
         assert!(
             (bloom_props.fpp - BLOOM_FILTER_FPP).abs() < 0.001,
-            "Bloom filter FPP should be {}",
+            "bloom filter FPP should be {}",
             BLOOM_FILTER_FPP
         );
         assert_eq!(
             bloom_props.ndv, BLOOM_FILTER_NDV_METRIC_NAME,
-            "Bloom filter NDV for metric_name should be {}",
+            "bloom filter NDV for metric_name should be {}",
             BLOOM_FILTER_NDV_METRIC_NAME
         );
 
-        // Verify bloom filter is enabled on tag columns
-        let tag_service_path = ColumnPath::new(vec!["tag_service".to_string()]);
-        let bloom_props = props.bloom_filter_properties(&tag_service_path);
+        // Verify bloom filter is enabled on service tag column (in SORT_ORDER)
+        let service_path = ColumnPath::new(vec!["service".to_string()]);
+        let bloom_props = props.bloom_filter_properties(&service_path);
         assert!(
             bloom_props.is_some(),
-            "Bloom filter should be enabled for tag_service"
+            "bloom filter should be enabled for service"
         );
 
         let bloom_props = bloom_props.unwrap();
         assert_eq!(
             bloom_props.ndv, BLOOM_FILTER_NDV_TAGS,
-            "Bloom filter NDV for tag columns should be {}",
+            "bloom filter NDV for tag columns should be {}",
             BLOOM_FILTER_NDV_TAGS
         );
 
-        // Verify bloom filter is NOT enabled on timestamp_secs (not a filtering column)
-        let timestamp_path = ColumnPath::new(vec!["timestamp_secs".to_string()]);
-        let bloom_props = props.bloom_filter_properties(&timestamp_path);
-        assert!(
-            bloom_props.is_none(),
-            "Bloom filter should NOT be enabled for timestamp_secs"
-        );
-
         // Verify bloom filter is NOT enabled on value column
         let value_path = ColumnPath::new(vec!["value".to_string()]);
         let bloom_props = props.bloom_filter_properties(&value_path);
         assert!(
             bloom_props.is_none(),
-            "Bloom filter should NOT be enabled for value"
+            "bloom filter should NOT be enabled for value"
         );
     }
 
     #[test]
     fn test_statistics_enabled() {
         let config = ParquetWriterConfig::default();
-        let props = config.to_writer_properties();
+        let schema = create_test_schema();
+        let props = config.to_writer_properties(&schema);
 
         // Verify statistics are enabled at Chunk (row group) level
         let metric_name_path = ColumnPath::new(vec!["metric_name".to_string()]);
         assert_eq!(
             props.statistics_enabled(&metric_name_path),
             EnabledStatistics::Chunk,
-            "Statistics should be enabled at Chunk level"
+            "statistics should be enabled at Chunk level"
         );
 
         // Verify for timestamp column as well (important for time range pruning)
@@ -339,31 +351,24 @@ mod tests {
         assert_eq!(
             props.statistics_enabled(&timestamp_path),
             EnabledStatistics::Chunk,
-            "Statistics should be enabled at Chunk level for timestamp"
+            "statistics should be enabled at Chunk level for timestamp"
         );
     }
 
     #[test]
     fn test_dictionary_encoding_enabled() {
         let config = ParquetWriterConfig::default();
-        let props = config.to_writer_properties();
-
-        // Verify dictionary encoding is enabled for dictionary columns
-        let dictionary_columns = [
-            "metric_name",
-            "tag_service",
-            "tag_env",
-            "tag_datacenter",
-            "tag_region",
-            "tag_host",
-            "service_name",
-        ];
+        let schema = create_test_schema();
+        let props = config.to_writer_properties(&schema);
+
+        // Verify dictionary encoding is enabled for dictionary-typed columns
+        let dictionary_columns = ["metric_name", "service", "env", "host"];
 
         for col_name in dictionary_columns {
             let col_path = ColumnPath::new(vec![col_name.to_string()]);
             assert!(
                 props.dictionary_enabled(&col_path),
-                "Dictionary encoding should be enabled for {}",
+                "dictionary encoding should be enabled for {}",
                 col_name
             );
         }
@@ -371,37 +376,31 @@ mod tests {
 
     #[test]
     fn test_sorting_columns_order() {
-        let sorting_cols = ParquetWriterConfig::sorting_columns();
+        let schema = create_test_schema();
+        let sorting_cols = ParquetWriterConfig::sorting_columns(&schema);
 
-        // Verify we have the expected number of sorting columns
+        // The test schema has metric_name (idx 0), timestamp_secs (idx 2),
+        // service (idx 4), env (idx 5), host (idx 6).
+        // SORT_ORDER is: metric_name, service, env, datacenter, region, host, timestamp_secs
+        // Only present columns are included, so: metric_name, service, env, host, timestamp_secs
         assert_eq!(
             sorting_cols.len(),
-            7,
-            "Should have 7 sorting columns: metric_name, 5 tags, timestamp"
+            5,
+            "should have 5 sorting columns from the test schema"
         );
 
-        // Verify sort order matches expected: metric_name, tag_service, tag_env,
-        // tag_datacenter, tag_region, tag_host, timestamp_secs
-        let expected_order = [
-            ParquetField::MetricName,
-            ParquetField::TagService,
-            ParquetField::TagEnv,
-            ParquetField::TagDatacenter,
-            ParquetField::TagRegion,
-            ParquetField::TagHost,
-            ParquetField::TimestampSecs,
-        ];
-
-        for (i, expected_field) in expected_order.iter().enumerate() {
-            assert_eq!(
-                sorting_cols[i].column_idx,
-                expected_field.column_index() as i32,
-                "Sorting column {} should be {}",
-                i,
-                expected_field.name()
-            );
-            assert!(!sorting_cols[i].descending, "Sorting should be ascending");
-            assert!(sorting_cols[i].nulls_first, "Nulls should be first");
+        // Verify all are ascending with nulls first
+        for col in &sorting_cols {
+            assert!(!col.descending, "sorting should be ascending");
+            assert!(!col.nulls_first, "nulls should be last");
         }
+
+        // Verify order matches SORT_ORDER filtered by schema presence:
+        // metric_name (idx 0), service (idx 4), env (idx 5), host (idx 6), timestamp_secs (idx 2)
+        assert_eq!(sorting_cols[0].column_idx, 0); // metric_name
+        assert_eq!(sorting_cols[1].column_idx, 4); // service
+        assert_eq!(sorting_cols[2].column_idx, 5); // env
+        assert_eq!(sorting_cols[3].column_idx, 6); // host
+        assert_eq!(sorting_cols[4].column_idx, 2); // timestamp_secs
     }
 }
diff --git a/quickwit/quickwit-parquet-engine/src/storage/split_writer.rs b/quickwit/quickwit-parquet-engine/src/storage/split_writer.rs
index 8505eb0f75c..ae5fa340722 100644
--- a/quickwit/quickwit-parquet-engine/src/storage/split_writer.rs
+++ b/quickwit/quickwit-parquet-engine/src/storage/split_writer.rs
@@ -25,7 +25,6 @@ use tracing::{debug, info, instrument};
 
 use super::config::ParquetWriterConfig;
 use super::writer::{ParquetWriteError, ParquetWriter};
-use crate::schema::{ParquetField, ParquetSchema};
 use crate::split::{MetricsSplitMetadata, ParquetSplit, SplitId, TAG_SERVICE, TimeRange};
 
 /// Writer that produces complete ParquetSplit with metadata from RecordBatch data.
@@ -40,16 +39,11 @@ impl ParquetSplitWriter {
     /// Create a new ParquetSplitWriter.
     ///
     /// # Arguments
-    /// * `schema` - The metrics schema for validation
     /// * `config` - Parquet writer configuration
     /// * `base_path` - Directory where split files will be written
-    pub fn new(
-        schema: ParquetSchema,
-        config: ParquetWriterConfig,
-        base_path: impl Into<PathBuf>,
-    ) -> Self {
+    pub fn new(config: ParquetWriterConfig, base_path: impl Into<PathBuf>) -> Self {
         Self {
-            writer: ParquetWriter::new(schema, config),
+            writer: ParquetWriter::new(config),
             base_path: base_path.into(),
         }
     }
diff --git a/quickwit/quickwit-parquet-engine/src/storage/writer.rs b/quickwit/quickwit-parquet-engine/src/storage/writer.rs
index 50a375f42b8..134702103eb 100644
--- a/quickwit/quickwit-parquet-engine/src/storage/writer.rs
+++ b/quickwit/quickwit-parquet-engine/src/storage/writer.rs
@@ -27,7 +27,7 @@ use thiserror::Error;
 use tracing::{debug, instrument};
 
 use super::config::ParquetWriterConfig;
-use crate::schema::{ParquetField, ParquetSchema};
+use crate::schema::{SORT_ORDER, validate_required_fields};
 
 /// Errors that can occur during parquet writing.
 #[derive(Debug, Error)]
@@ -44,21 +44,20 @@ pub enum ParquetWriteError {
     #[error("Arrow error: {0}")]
     ArrowError(#[from] arrow::error::ArrowError),
 
-    /// Schema mismatch between RecordBatch and ParquetSchema.
-    #[error("Schema mismatch: expected {expected} fields, got {got}")]
-    SchemaMismatch { expected: usize, got: usize },
+    /// Schema validation failed.
+    #[error("Schema validation failed: {0}")]
+    SchemaValidation(String),
 }
 
 /// Writer for metrics data to Parquet format.
 pub struct ParquetWriter {
     config: ParquetWriterConfig,
-    schema: ParquetSchema,
 }
 
 impl ParquetWriter {
     /// Create a new ParquetWriter.
-    pub fn new(schema: ParquetSchema, config: ParquetWriterConfig) -> Self {
-        Self { config, schema }
+    pub fn new(config: ParquetWriterConfig) -> Self {
+        Self { config }
     }
 
     /// Get the writer configuration.
@@ -66,58 +65,58 @@ impl ParquetWriter {
         &self.config
     }
 
-    /// Get the metrics schema.
-    pub fn schema(&self) -> &ParquetSchema {
-        &self.schema
-    }
-
-    /// Validate that a RecordBatch matches the expected schema.
-    fn validate_batch(&self, batch: &RecordBatch) -> Result<(), ParquetWriteError> {
-        let expected = self.schema.num_fields();
-        let got = batch.num_columns();
-        if expected != got {
-            return Err(ParquetWriteError::SchemaMismatch { expected, got });
-        }
-        Ok(())
-    }
-
     /// Sort a RecordBatch by the metrics sort order.
-    /// Order: metric_name, tag_service, tag_env, tag_datacenter, tag_region, tag_host,
-    /// timestamp_secs. This sorting enables efficient pruning during query execution.
+    /// Columns from SORT_ORDER that are present in the batch schema are used;
+    /// missing columns are skipped.
     fn sort_batch(&self, batch: &RecordBatch) -> Result<RecordBatch, ParquetWriteError> {
-        // Build sort columns from the defined sort order
-        let sort_columns: Vec<SortColumn> = ParquetField::sort_order()
+        let schema = batch.schema();
+        let mut sort_columns: Vec<SortColumn> = SORT_ORDER
             .iter()
-            .map(|field| {
-                let col_idx = field.column_index();
-                SortColumn {
-                    values: Arc::clone(batch.column(col_idx)),
-                    options: Some(SortOptions {
-                        descending: false,
-                        nulls_first: true,
-                    }),
-                }
+            .filter_map(|name| schema.index_of(name).ok())
+            .map(|idx| SortColumn {
+                values: Arc::clone(batch.column(idx)),
+                options: Some(SortOptions {
+                    descending: false,
+                    nulls_first: false,
+                }),
             })
             .collect();
 
-        // Compute sorted indices
-        let indices = lexsort_to_indices(&sort_columns, None)?;
+        if sort_columns.is_empty() {
+            return Ok(batch.clone());
+        }
 
-        // Reorder the batch using the sorted indices
-        let sorted_batch = take_record_batch(batch, &indices)?;
-        Ok(sorted_batch)
+        // Append the original row index as a tiebreaker so that rows with
+        // identical sort keys keep their arrival order (stable sort semantics).
+        // lexsort_to_indices uses an unstable sort internally; the tiebreaker
+        // makes it behave stably at a small cost (one u32 comparison per
+        // equal-key pair, 4 bytes × num_rows of extra allocation).
+        let row_indices = Arc::new(arrow::array::UInt32Array::from_iter_values(
+            0..batch.num_rows() as u32,
+        ));
+        sort_columns.push(SortColumn {
+            values: row_indices,
+            options: Some(SortOptions {
+                descending: false,
+                nulls_first: false,
+            }),
+        });
+
+        let indices = lexsort_to_indices(&sort_columns, None)?;
+        Ok(take_record_batch(batch, &indices)?)
     }
 
     /// Write a RecordBatch to Parquet bytes in memory.
     /// The batch is sorted before writing by: metric_name, common tags, timestamp.
     #[instrument(skip(self, batch), fields(batch_rows = batch.num_rows()))]
     pub fn write_to_bytes(&self, batch: &RecordBatch) -> Result<Vec<u8>, ParquetWriteError> {
-        self.validate_batch(batch)?;
+        validate_required_fields(&batch.schema())
+            .map_err(|e| ParquetWriteError::SchemaValidation(e.to_string()))?;
 
         // Sort the batch before writing for efficient pruning
         let sorted_batch = self.sort_batch(batch)?;
 
-        let props = self.config.to_writer_properties();
+        let props = self.config.to_writer_properties(&sorted_batch.schema());
         let buffer = Cursor::new(Vec::new());
 
         let mut writer = ArrowWriter::try_new(buffer, sorted_batch.schema(), Some(props))?;
@@ -125,7 +124,7 @@ impl ParquetWriter {
         let buffer = writer.into_inner()?;
 
         let bytes = buffer.into_inner();
-        debug!(bytes_written = bytes.len(), "Completed write to bytes");
+        debug!(bytes_written = bytes.len(), "completed write to bytes");
         Ok(bytes)
     }
 
@@ -139,12 +138,13 @@ impl ParquetWriter {
         batch: &RecordBatch,
         path: &Path,
     ) -> Result<u64, ParquetWriteError> {
-        self.validate_batch(batch)?;
+        validate_required_fields(&batch.schema())
+            .map_err(|e| ParquetWriteError::SchemaValidation(e.to_string()))?;
 
         // Sort the batch before writing for efficient pruning
         let sorted_batch = self.sort_batch(batch)?;
 
-        let props = self.config.to_writer_properties();
+        let props = self.config.to_writer_properties(&sorted_batch.schema());
         let file = File::create(path)?;
 
         let mut writer = ArrowWriter::try_new(file, sorted_batch.schema(), Some(props))?;
@@ -152,7 +152,7 @@ impl ParquetWriter {
         let file = writer.into_inner()?;
 
         let bytes_written = file.metadata()?.len();
-        debug!(bytes_written, "Completed write to file");
+        debug!(bytes_written, "completed write to file");
         Ok(bytes_written)
     }
 }
@@ -161,152 +161,26 @@ impl ParquetWriter {
 mod tests {
     use std::sync::Arc;
 
-    use arrow::array::{
-        ArrayRef, DictionaryArray, Float64Array, StringArray, UInt8Array, UInt64Array,
-    };
+    use arrow::array::{ArrayRef, DictionaryArray, Float64Array, StringArray, UInt64Array, UInt8Array};
     use arrow::datatypes::{DataType, Field, Int32Type, Schema};
-    use parquet::variant::{VariantArrayBuilder, VariantBuilderExt};
+    use crate::test_helpers::create_test_batch_with_tags;
 
     use super::*;
 
-    /// Create dictionary array for string fields with Int32 keys.
-    fn create_dict_array(values: Vec<&str>) -> ArrayRef {
-        let string_array = StringArray::from(values);
-        Arc::new(
-            DictionaryArray::<Int32Type>::try_new(
-                arrow::array::Int32Array::from(vec![0i32]),
-                Arc::new(string_array),
-            )
-            .unwrap(),
-        )
-    }
-
-    /// Create nullable dictionary array for optional string fields.
-    fn create_nullable_dict_array(value: Option<&str>) -> ArrayRef {
-        match value {
-            Some(v) => {
-                let string_array = StringArray::from(vec![v]);
-                Arc::new(
-                    DictionaryArray::<Int32Type>::try_new(
-                        arrow::array::Int32Array::from(vec![0i32]),
-                        Arc::new(string_array),
-                    )
-                    .unwrap(),
-                )
-            }
-            None => {
-                let string_array = StringArray::from(vec![None::<&str>]);
-                Arc::new(
-                    DictionaryArray::<Int32Type>::try_new(
-                        arrow::array::Int32Array::from(vec![None::<i32>]),
-                        Arc::new(string_array),
-                    )
-                    .unwrap(),
-                )
-            }
-        }
-    }
-
-    /// Create a VARIANT array for testing.
-    fn create_variant_array(fields: Option<&[(&str, &str)]>) -> ArrayRef {
-        let mut builder = VariantArrayBuilder::new(1);
-        match fields {
-            Some(kv_pairs) => {
-                let mut obj = builder.new_object();
-                for (key, value) in kv_pairs {
-                    obj = obj.with_field(key, *value);
-                }
-                obj.finish();
-            }
-            None => {
-                builder.append_null();
-            }
-        }
-        ArrayRef::from(builder.build())
-    }
-
     fn create_test_batch() -> RecordBatch {
-        let schema = ParquetSchema::new();
-
-        // Create arrays for all 14 fields in ParquetSchema matching fields.rs:
-        // MetricName: Dictionary(Int32, Utf8)
-        let metric_name: ArrayRef = create_dict_array(vec!["test.metric"]);
-
-        // MetricType: UInt8
-        let metric_type: ArrayRef = Arc::new(UInt8Array::from(vec![0u8])); // gauge
-
-        // MetricUnit: Utf8 (nullable)
-        let metric_unit: ArrayRef = Arc::new(StringArray::from(vec![Some("bytes")]));
-
-        // TimestampSecs: UInt64
-        let timestamp_secs: ArrayRef = Arc::new(UInt64Array::from(vec![1704067200u64]));
-
-        // StartTimestampSecs: UInt64 (nullable)
-        let start_timestamp_secs: ArrayRef = Arc::new(UInt64Array::from(vec![None::<u64>]));
-
-        // Value: Float64
-        let value: ArrayRef = Arc::new(Float64Array::from(vec![42.0]));
-
-        // TagService: Dictionary(Int32, Utf8) (nullable)
-        let tag_service: ArrayRef = create_nullable_dict_array(Some("web"));
-
-        // TagEnv: Dictionary(Int32, Utf8) (nullable)
-        let tag_env: ArrayRef = create_nullable_dict_array(Some("prod"));
-
-        // TagDatacenter: Dictionary(Int32, Utf8) (nullable)
-        let tag_datacenter: ArrayRef = create_nullable_dict_array(Some("us-east-1"));
-
-        // TagRegion: Dictionary(Int32, Utf8) (nullable)
-        let tag_region: ArrayRef = create_nullable_dict_array(None);
-
-        // TagHost: Dictionary(Int32, Utf8) (nullable)
-        let tag_host: ArrayRef = create_nullable_dict_array(Some("host-001"));
-
-        // Attributes: VARIANT (nullable)
-        let attributes: ArrayRef = create_variant_array(Some(&[("key", "value")]));
-
-        // ServiceName: Dictionary(Int32, Utf8)
-        let service_name: ArrayRef = create_dict_array(vec!["my-service"]);
-
-        // ResourceAttributes: VARIANT (nullable)
-        let resource_attributes: ArrayRef = create_variant_array(None);
-
-        RecordBatch::try_new(
-            schema.arrow_schema().clone(),
-            vec![
-                metric_name,
-                metric_type,
-                metric_unit,
-                timestamp_secs,
-                start_timestamp_secs,
-                value,
-                tag_service,
-                tag_env,
-                tag_datacenter,
-                tag_region,
-                tag_host,
-                attributes,
-                service_name,
-                resource_attributes,
-            ],
-        )
-        .unwrap()
+        create_test_batch_with_tags(1, &["service", "env"])
     }
 
     #[test]
     fn test_writer_creation() {
-        let schema = ParquetSchema::new();
         let config = ParquetWriterConfig::default();
-        let writer = ParquetWriter::new(schema, config);
-
-        assert_eq!(writer.schema().num_fields(), 14);
+        let _writer = ParquetWriter::new(config);
     }
 
     #[test]
     fn test_write_to_bytes() {
-        let schema = ParquetSchema::new();
         let config = ParquetWriterConfig::default();
-        let writer = ParquetWriter::new(schema, config);
+        let writer = ParquetWriter::new(config);
 
         let batch = create_test_batch();
         let bytes = writer.write_to_bytes(&batch).unwrap();
@@ -318,9 +192,8 @@ mod tests {
 
     #[test]
     fn test_write_to_file() {
-        let schema = ParquetSchema::new();
         let config = ParquetWriterConfig::default();
-        let writer = ParquetWriter::new(schema, config);
+        let writer = ParquetWriter::new(config);
 
         let batch = create_test_batch();
         let temp_dir = std::env::temp_dir();
@@ -334,12 +207,11 @@ mod tests {
     }
 
     #[test]
-    fn test_schema_mismatch() {
-        let schema = ParquetSchema::new();
+    fn test_schema_validation_missing_field() {
         let config = ParquetWriterConfig::default();
-        let writer = ParquetWriter::new(schema, config);
+        let writer = ParquetWriter::new(config);
 
-        // Create a batch with wrong number of columns
+        // Create a batch missing required fields
         let wrong_schema = Arc::new(Schema::new(vec![Field::new(
             "single_field",
             DataType::Utf8,
@@ -352,22 +224,42 @@ mod tests {
         .unwrap();
 
         let result = writer.write_to_bytes(&wrong_batch);
-        assert!(matches!(
-            result,
-            Err(ParquetWriteError::SchemaMismatch {
-                expected: 14,
-                got: 1
-            })
-        ));
+        assert!(matches!(result, Err(ParquetWriteError::SchemaValidation(_))));
+    }
+
+    #[test]
+    fn test_schema_validation_wrong_type() {
+        let config = ParquetWriterConfig::default();
+        let writer = ParquetWriter::new(config);
+
+        // Create a batch where metric_name has wrong type (Utf8 instead of Dictionary)
+        let wrong_schema = Arc::new(Schema::new(vec![
+            Field::new("metric_name", DataType::Utf8, false),
+            Field::new("metric_type", DataType::UInt8, false),
+            Field::new("timestamp_secs", DataType::UInt64, false),
+            Field::new("value", DataType::Float64, false),
+        ]));
+        let wrong_batch = RecordBatch::try_new(
+            wrong_schema,
+            vec![
+                Arc::new(StringArray::from(vec!["test"])) as ArrayRef,
+                Arc::new(UInt8Array::from(vec![0u8])) as ArrayRef,
+                Arc::new(UInt64Array::from(vec![100u64])) as ArrayRef,
+                Arc::new(Float64Array::from(vec![1.0])) as ArrayRef,
+            ],
+        )
+        .unwrap();
+
+        let result = writer.write_to_bytes(&wrong_batch);
+        assert!(matches!(result, Err(ParquetWriteError::SchemaValidation(_))));
     }
 
     #[test]
     fn test_write_with_snappy_compression() {
         use super::super::config::Compression;
 
-        let schema = ParquetSchema::new();
         let config = ParquetWriterConfig::new().with_compression(Compression::Snappy);
-        let writer = ParquetWriter::new(schema, config);
+        let writer = ParquetWriter::new(config);
 
         let batch = create_test_batch();
         let bytes = writer.write_to_bytes(&batch).unwrap();
@@ -382,9 +274,25 @@ mod tests {
 
         use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
 
-        let schema = ParquetSchema::new();
         let config = ParquetWriterConfig::default();
-        let writer = ParquetWriter::new(schema.clone(), config);
+        let writer = ParquetWriter::new(config);
+
+        // Create a schema with required fields + service tag for sort verification
+        let schema = Arc::new(Schema::new(vec![
+            Field::new(
+                "metric_name",
+                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+                false,
+            ),
+            Field::new("metric_type", DataType::UInt8, false),
+            Field::new("timestamp_secs", DataType::UInt64, false),
+            Field::new("value", DataType::Float64, false),
+            Field::new(
+                "service",
+                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+                true,
+            ),
+        ]));
 
         // Create unsorted batch with multiple rows:
         // Row 0: metric_b, service_a, timestamp=300
@@ -393,11 +301,6 @@ mod tests {
         // Expected sorted order: metric_a/service_a/200, metric_a/service_b/100,
         // metric_b/service_a/300
 
-        // Build arrays for 3 rows (original unsorted order in comments above)
-        let timestamps = [300u64, 100u64, 200u64];
-        let values = [1.0, 2.0, 3.0];
-
-        // metric_name: Dictionary(Int32, Utf8)
         let metric_name: ArrayRef = {
             let keys = arrow::array::Int32Array::from(vec![0i32, 1, 1]);
             let values = StringArray::from(vec!["metric_b", "metric_a"]);
@@ -405,88 +308,18 @@ mod tests {
         };
 
         let metric_type: ArrayRef = Arc::new(UInt8Array::from(vec![0u8, 0, 0]));
-        let metric_unit: ArrayRef = Arc::new(StringArray::from(vec![
-            Some("bytes"),
-            Some("bytes"),
-            Some("bytes"),
-        ]));
-        let timestamp_secs: ArrayRef = Arc::new(UInt64Array::from(timestamps.to_vec()));
-        let start_timestamp_secs: ArrayRef =
-            Arc::new(UInt64Array::from(vec![None::<u64>, None, None]));
-        let value: ArrayRef = Arc::new(Float64Array::from(values.to_vec()));
+        let timestamp_secs: ArrayRef = Arc::new(UInt64Array::from(vec![300u64, 100u64, 200u64]));
+        let value: ArrayRef = Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0]));
 
-        // tag_service: Dictionary(Int32, Utf8) (nullable)
-        let tag_service: ArrayRef = {
+        let service: ArrayRef = {
             let keys = arrow::array::Int32Array::from(vec![Some(0i32), Some(1), Some(0)]);
             let values = StringArray::from(vec!["service_a", "service_b"]);
             Arc::new(DictionaryArray::<Int32Type>::try_new(keys, Arc::new(values)).unwrap())
         };
 
-        let tag_env: ArrayRef = {
-            let keys = arrow::array::Int32Array::from(vec![Some(0i32), Some(0), Some(0)]);
-            let values = StringArray::from(vec!["prod"]);
-            Arc::new(DictionaryArray::<Int32Type>::try_new(keys, Arc::new(values)).unwrap())
-        };
-
-        let tag_datacenter: ArrayRef = {
-            let keys = arrow::array::Int32Array::from(vec![None::<i32>, None, None]);
-            let values = StringArray::from(vec![None::<&str>]);
-            Arc::new(DictionaryArray::<Int32Type>::try_new(keys, Arc::new(values)).unwrap())
-        };
-
-        let tag_region: ArrayRef = {
-            let keys = arrow::array::Int32Array::from(vec![None::<i32>, None, None]);
-            let values = StringArray::from(vec![None::<&str>]);
-            Arc::new(DictionaryArray::<Int32Type>::try_new(keys, Arc::new(values)).unwrap())
-        };
-
-        let tag_host: ArrayRef = {
-            let keys = arrow::array::Int32Array::from(vec![None::<i32>, None, None]);
-            let values = StringArray::from(vec![None::<&str>]);
-            Arc::new(DictionaryArray::<Int32Type>::try_new(keys, Arc::new(values)).unwrap())
-        };
-
-        // Build VARIANT arrays for 3 rows
-        let attributes: ArrayRef = {
-            let mut builder = VariantArrayBuilder::new(3);
-            for _ in 0..3 {
-                builder.append_null();
-            }
-            ArrayRef::from(builder.build())
-        };
-
-        let service_name: ArrayRef = {
-            let keys = arrow::array::Int32Array::from(vec![0i32, 1, 0]);
-            let values = StringArray::from(vec!["service_a", "service_b"]);
-            Arc::new(DictionaryArray::<Int32Type>::try_new(keys, Arc::new(values)).unwrap())
-        };
-
-        let resource_attributes: ArrayRef = {
-            let mut builder = VariantArrayBuilder::new(3);
-            for _ in 0..3 {
-                builder.append_null();
-            }
-            ArrayRef::from(builder.build())
-        };
-
         let batch = RecordBatch::try_new(
-            schema.arrow_schema().clone(),
-            vec![
-                metric_name,
-                metric_type,
-                metric_unit,
-                timestamp_secs,
-                start_timestamp_secs,
-                value,
-                tag_service,
-                tag_env,
-                tag_datacenter,
-                tag_region,
-                tag_host,
-                attributes,
-                service_name,
-                resource_attributes,
-            ],
+            schema,
+            vec![metric_name, metric_type, timestamp_secs, value, service],
         )
         .unwrap();
 
@@ -509,25 +342,29 @@ mod tests {
         assert_eq!(result.num_rows(), 3);
 
         // Extract metric names and timestamps to verify sort order
+        let metric_idx = result.schema().index_of("metric_name").unwrap();
+        let ts_idx = result.schema().index_of("timestamp_secs").unwrap();
+        let service_idx = result.schema().index_of("service").unwrap();
+
         let metric_col = result
-            .column(ParquetField::MetricName.column_index())
+            .column(metric_idx)
             .as_any()
             .downcast_ref::<DictionaryArray<Int32Type>>()
             .unwrap();
         let ts_col = result
-            .column(ParquetField::TimestampSecs.column_index())
+            .column(ts_idx)
             .as_any()
             .downcast_ref::<UInt64Array>()
             .unwrap();
         let service_col = result
-            .column(ParquetField::TagService.column_index())
+            .column(service_idx)
             .as_any()
             .downcast_ref::<DictionaryArray<Int32Type>>()
             .unwrap();
 
         // Get string values from dictionary
-        let get_metric = |i: usize| -> &str {
-            let key = metric_col.keys().value(i);
+        let get_metric = |row: usize| -> &str {
+            let key = metric_col.keys().value(row);
             metric_col
                 .values()
                 .as_any()
@@ -535,8 +372,8 @@ mod tests {
                 .unwrap()
                 .value(key as usize)
         };
-        let get_service = |i: usize| -> &str {
-            let key = service_col.keys().value(i);
+        let get_service = |row: usize| -> &str {
+            let key = service_col.keys().value(row);
             service_col
                 .values()
                 .as_any()
@@ -545,7 +382,7 @@ mod tests {
                 .value(key as usize)
         };
 
-        // Expected sort order: metric_name ASC, tag_service ASC, timestamp ASC
+        // Expected sort order: metric_name ASC, service ASC, timestamp_secs ASC
         // Row 0: metric_a, service_a, 200 (original row 2)
         // Row 1: metric_a, service_b, 100 (original row 1)
         // Row 2: metric_b, service_a, 300 (original row 0)

From ad115bff5da04e0c153277dae8f5b1508b930f41 Mon Sep 17 00:00:00 2001
From: Matthew Kim <matthew.kim@datadoghq.com>
Date: Wed, 18 Mar 2026 11:02:23 -0400
Subject: [PATCH 04/35] feat: union schema accumulation and schema-agnostic
 ingest validation

---
 .../src/index/accumulator.rs                  | 251 ++++++++++--------
 .../src/ingest/processor.rs                   | 179 ++-----------
 quickwit/quickwit-parquet-engine/src/lib.rs   |   3 +
 3 files changed, 160 insertions(+), 273 deletions(-)

diff --git a/quickwit/quickwit-parquet-engine/src/index/accumulator.rs b/quickwit/quickwit-parquet-engine/src/index/accumulator.rs
index 29065847136..d9eb01f4411 100644
--- a/quickwit/quickwit-parquet-engine/src/index/accumulator.rs
+++ b/quickwit/quickwit-parquet-engine/src/index/accumulator.rs
@@ -14,15 +14,18 @@
 
 //! Batch accumulator for producing splits from RecordBatches.
 
+use std::collections::BTreeMap;
+use std::sync::Arc;
 use std::time::Instant;
 
+use arrow::array::{new_null_array, ArrayRef};
 use arrow::compute::concat_batches;
+use arrow::datatypes::{DataType, Field, Schema as ArrowSchema, SchemaRef};
 use arrow::record_batch::RecordBatch;
 use tracing::{debug, info};
 
 use super::config::ParquetIndexingConfig;
 use crate::metrics::PARQUET_ENGINE_METRICS;
-use crate::schema::ParquetSchema;
 
 /// Error type for index operations.
 #[derive(Debug, thiserror::Error)]
@@ -38,11 +41,14 @@ pub enum IndexingError {
 /// Batches are accumulated until either `max_rows` or `max_bytes` threshold is reached,
 /// at which point they are concatenated and returned for downstream processing (writing to
 /// Parquet by the ParquetPackager actor).
+///
+/// Consecutive batches may have different column sets. The accumulator tracks the union
+/// schema incrementally and aligns all batches to it on flush.
 pub struct ParquetBatchAccumulator {
     /// Configuration for accumulation thresholds.
     config: ParquetIndexingConfig,
-    /// Metrics schema for concatenation.
-    schema: ParquetSchema,
+    /// Union of all fields seen across pending batches.
+    union_fields: BTreeMap<String, (DataType, bool)>,
     /// Pending batches waiting to be flushed.
     pending_batches: Vec<RecordBatch>,
     /// Total rows in pending batches.
@@ -57,11 +63,9 @@ impl ParquetBatchAccumulator {
     /// # Arguments
     /// * `config` - Configuration for accumulation thresholds
     pub fn new(config: ParquetIndexingConfig) -> Self {
-        let schema = ParquetSchema::new();
-
         Self {
             config,
-            schema,
+            union_fields: BTreeMap::new(),
             pending_batches: Vec::new(),
             pending_rows: 0,
             pending_bytes: 0,
@@ -83,6 +87,13 @@ impl ParquetBatchAccumulator {
             .index_rows_total
             .inc_by(batch_rows as u64);
 
+        // Merge fields into union schema before pushing (we need the schema reference)
+        for field in batch.schema().fields() {
+            self.union_fields
+                .entry(field.name().clone())
+                .or_insert_with(|| (field.data_type().clone(), field.is_nullable()));
+        }
+
         self.pending_batches.push(batch);
         self.pending_rows += batch_rows;
         self.pending_bytes += batch_bytes;
@@ -92,7 +103,7 @@ impl ParquetBatchAccumulator {
             batch_bytes,
             total_pending_rows = self.pending_rows,
             total_pending_bytes = self.pending_bytes,
-            "Added batch to accumulator"
+            "added batch to accumulator"
         );
 
         let flushed = if self.should_flush() {
@@ -101,7 +112,7 @@ impl ParquetBatchAccumulator {
                 pending_bytes = self.pending_bytes,
                 max_rows = self.config.max_rows,
                 max_bytes = self.config.max_bytes,
-                "Threshold exceeded, triggering flush"
+                "threshold exceeded, triggering flush"
             );
             self.flush_internal()?
         } else {
@@ -119,6 +130,7 @@ impl ParquetBatchAccumulator {
     /// Discard all pending data without producing output.
     pub fn discard(&mut self) {
         self.pending_batches.clear();
+        self.union_fields.clear();
         self.pending_rows = 0;
         self.pending_bytes = 0;
     }
@@ -136,14 +148,33 @@ impl ParquetBatchAccumulator {
             return Ok(None);
         }
 
-        // Concatenate all pending batches into one
-        let combined = concat_batches(self.schema.arrow_schema(), self.pending_batches.iter())?;
+        // Build the union schema from accumulated fields.
+        // All fields are marked nullable=true regardless of their source schema:
+        // any field that appears in some batches but not others will be null-filled
+        // for the missing batches, so non-nullable would cause Arrow to reject the concat.
+        let fields: Vec<Field> = self
+            .union_fields
+            .iter()
+            .map(|(name, (data_type, _nullable))| Field::new(name, data_type.clone(), true))
+            .collect();
+        let union_schema: SchemaRef = Arc::new(ArrowSchema::new(fields));
+
+        // Align each pending batch to the union schema
+        let aligned: Vec<RecordBatch> = self
+            .pending_batches
+            .iter()
+            .map(|batch| align_batch_to_schema(batch, &union_schema))
+            .collect::<Result<Vec<_>, _>>()?;
+
+        // Concatenate all aligned batches
+        let combined = concat_batches(&union_schema, aligned.iter())?;
 
         let num_rows = combined.num_rows();
-        info!(num_rows, "Flushed accumulated batches");
+        info!(num_rows, "flushed accumulated batches");
 
         // Reset state
         self.pending_batches.clear();
+        self.union_fields.clear();
         self.pending_rows = 0;
         self.pending_bytes = 0;
 
@@ -173,6 +204,24 @@ impl ParquetBatchAccumulator {
     }
 }
 
+/// Align a RecordBatch to a target schema, inserting null columns where needed.
+fn align_batch_to_schema(
+    batch: &RecordBatch,
+    target_schema: &SchemaRef,
+) -> Result<RecordBatch, arrow::error::ArrowError> {
+    let num_rows = batch.num_rows();
+    let batch_schema = batch.schema();
+    let columns: Vec<ArrayRef> = target_schema
+        .fields()
+        .iter()
+        .map(|field| match batch_schema.index_of(field.name()) {
+            Ok(idx) => Arc::clone(batch.column(idx)),
+            Err(_) => new_null_array(field.data_type(), num_rows),
+        })
+        .collect();
+    RecordBatch::try_new(target_schema.clone(), columns)
+}
+
 /// Estimate the memory size of a RecordBatch.
 fn estimate_batch_bytes(batch: &RecordBatch) -> usize {
     batch
@@ -184,107 +233,10 @@ fn estimate_batch_bytes(batch: &RecordBatch) -> usize {
 
 #[cfg(test)]
 mod tests {
-    use std::sync::Arc;
-
-    use arrow::array::{
-        ArrayRef, DictionaryArray, Float64Array, Int32Array, StringArray, UInt8Array, UInt64Array,
-    };
-    use arrow::datatypes::Int32Type;
-    use parquet::variant::{VariantArrayBuilder, VariantBuilderExt};
+    use crate::test_helpers::{create_test_batch, create_test_batch_with_tags};
 
     use super::*;
 
-    /// Create dictionary array for string fields with Int32 keys.
-    fn create_dict_array(values: &[&str]) -> ArrayRef {
-        let keys: Vec<i32> = (0..values.len()).map(|i| i as i32).collect();
-        let string_array = StringArray::from(values.to_vec());
-        Arc::new(
-            DictionaryArray::<Int32Type>::try_new(Int32Array::from(keys), Arc::new(string_array))
-                .unwrap(),
-        )
-    }
-
-    /// Create nullable dictionary array for optional string fields.
-    fn create_nullable_dict_array(values: &[Option<&str>]) -> ArrayRef {
-        let keys: Vec<Option<i32>> = values
-            .iter()
-            .enumerate()
-            .map(|(i, v)| v.map(|_| i as i32))
-            .collect();
-        let string_values: Vec<&str> = values.iter().filter_map(|v| *v).collect();
-        let string_array = StringArray::from(string_values);
-        Arc::new(
-            DictionaryArray::<Int32Type>::try_new(Int32Array::from(keys), Arc::new(string_array))
-                .unwrap(),
-        )
-    }
-
-    /// Create a VARIANT array for testing with specified number of rows.
-    fn create_variant_array(num_rows: usize, fields: Option<&[(&str, &str)]>) -> ArrayRef {
-        let mut builder = VariantArrayBuilder::new(num_rows);
-        for _ in 0..num_rows {
-            match fields {
-                Some(kv_pairs) => {
-                    let mut obj = builder.new_object();
-                    for (key, value) in kv_pairs {
-                        obj = obj.with_field(key, *value);
-                    }
-                    obj.finish();
-                }
-                None => {
-                    builder.append_null();
-                }
-            }
-        }
-        ArrayRef::from(builder.build())
-    }
-
-    /// Create a test batch matching the metrics schema.
-    fn create_test_batch(num_rows: usize) -> RecordBatch {
-        let schema = ParquetSchema::new();
-
-        let metric_names: Vec<&str> = vec!["cpu.usage"; num_rows];
-        let metric_name: ArrayRef = create_dict_array(&metric_names);
-        let metric_type: ArrayRef = Arc::new(UInt8Array::from(vec![0u8; num_rows]));
-        let metric_unit: ArrayRef = Arc::new(StringArray::from(vec![Some("bytes"); num_rows]));
-        let timestamps: Vec<u64> = (0..num_rows).map(|i| 100 + i as u64).collect();
-        let timestamp_secs: ArrayRef = Arc::new(UInt64Array::from(timestamps));
-        let start_timestamp_secs: ArrayRef =
-            Arc::new(UInt64Array::from(vec![None::<u64>; num_rows]));
-        let values: Vec<f64> = (0..num_rows).map(|i| 42.0 + i as f64).collect();
-        let value: ArrayRef = Arc::new(Float64Array::from(values));
-        let tag_service: ArrayRef = create_nullable_dict_array(&vec![Some("web"); num_rows]);
-        let tag_env: ArrayRef = create_nullable_dict_array(&vec![Some("prod"); num_rows]);
-        let tag_datacenter: ArrayRef =
-            create_nullable_dict_array(&vec![Some("us-east-1"); num_rows]);
-        let tag_region: ArrayRef = create_nullable_dict_array(&vec![None; num_rows]);
-        let tag_host: ArrayRef = create_nullable_dict_array(&vec![Some("host-001"); num_rows]);
-        let attributes: ArrayRef = create_variant_array(num_rows, None);
-        let service_name: ArrayRef = create_dict_array(&vec!["my-service"; num_rows]);
-        let resource_attributes: ArrayRef = create_variant_array(num_rows, None);
-
-        RecordBatch::try_new(
-            schema.arrow_schema().clone(),
-            vec![
-                metric_name,
-                metric_type,
-                metric_unit,
-                timestamp_secs,
-                start_timestamp_secs,
-                value,
-                tag_service,
-                tag_env,
-                tag_datacenter,
-                tag_region,
-                tag_host,
-                attributes,
-                service_name,
-                resource_attributes,
-            ],
-        )
-        .unwrap()
-    }
-
     #[test]
     fn test_accumulator_below_threshold() {
         let config = ParquetIndexingConfig::default().with_max_rows(1000);
@@ -351,6 +303,87 @@ mod tests {
         assert_eq!(combined.num_rows(), 50);
     }
 
+    #[test]
+    fn test_accumulator_merges_different_tag_sets() {
+        let config = ParquetIndexingConfig::default().with_max_rows(1000);
+        let mut accumulator = ParquetBatchAccumulator::new(config);
+
+        // First batch has "service" tag
+        let batch1 = create_test_batch_with_tags(3, &["service"]);
+        let _ = accumulator.add_batch(batch1).unwrap();
+
+        // Second batch has "host" tag
+        let batch2 = create_test_batch_with_tags(2, &["host"]);
+        let _ = accumulator.add_batch(batch2).unwrap();
+
+        let combined = accumulator.flush().unwrap().unwrap();
+        assert_eq!(combined.num_rows(), 5);
+
+        // Union schema should have all 4 required fields + both tags
+        let schema = combined.schema();
+        assert!(schema.index_of("metric_name").is_ok());
+        assert!(schema.index_of("metric_type").is_ok());
+        assert!(schema.index_of("timestamp_secs").is_ok());
+        assert!(schema.index_of("value").is_ok());
+        assert!(schema.index_of("service").is_ok());
+        assert!(schema.index_of("host").is_ok());
+        assert_eq!(schema.fields().len(), 6);
+
+        // First 3 rows should have null "host", last 2 rows should have null "service"
+        let host_idx = schema.index_of("host").unwrap();
+        let host_col = combined.column(host_idx);
+        assert_eq!(host_col.null_count(), 3); // first batch had no host
+
+        let service_idx = schema.index_of("service").unwrap();
+        let service_col = combined.column(service_idx);
+        assert_eq!(service_col.null_count(), 2); // second batch had no service
+
+        // No duplicate column names — each name appears exactly once.
+        let field_names: Vec<&str> = schema.fields().iter().map(|f| f.name().as_str()).collect();
+        let unique_count = field_names.iter().collect::<std::collections::HashSet<_>>().len();
+        assert_eq!(
+            unique_count,
+            field_names.len(),
+            "duplicate columns in union schema: {field_names:?}"
+        );
+    }
+
+    #[test]
+    fn test_accumulator_no_duplicates_with_overlapping_tags() {
+        let config = ParquetIndexingConfig::default().with_max_rows(1000);
+        let mut accumulator = ParquetBatchAccumulator::new(config);
+
+        // Both batches share "service"; second also has "host".
+        // "service" must appear exactly once in the flushed schema.
+        let batch1 = create_test_batch_with_tags(3, &["service"]);
+        let batch2 = create_test_batch_with_tags(2, &["service", "host"]);
+        let _ = accumulator.add_batch(batch1).unwrap();
+        let _ = accumulator.add_batch(batch2).unwrap();
+
+        let combined = accumulator.flush().unwrap().unwrap();
+        assert_eq!(combined.num_rows(), 5);
+
+        let schema = combined.schema();
+        let field_names: Vec<&str> = schema.fields().iter().map(|f| f.name().as_str()).collect();
+        let unique_count = field_names.iter().collect::<std::collections::HashSet<_>>().len();
+        assert_eq!(
+            unique_count,
+            field_names.len(),
+            "duplicate columns in union schema: {field_names:?}"
+        );
+
+        // 4 required + service + host = 6
+        assert_eq!(schema.fields().len(), 6);
+
+        // Rows from batch1 have no "host" → 3 nulls; batch2 has "host" for all 2 rows → 0 nulls.
+        let host_idx = schema.index_of("host").unwrap();
+        assert_eq!(combined.column(host_idx).null_count(), 3);
+
+        // "service" present in both batches → 0 nulls total.
+        let service_idx = schema.index_of("service").unwrap();
+        assert_eq!(combined.column(service_idx).null_count(), 0);
+    }
+
     #[test]
     fn test_estimate_batch_bytes() {
         let batch = create_test_batch(100);
diff --git a/quickwit/quickwit-parquet-engine/src/ingest/processor.rs b/quickwit/quickwit-parquet-engine/src/ingest/processor.rs
index 80dd4f322e5..6097d7c3c32 100644
--- a/quickwit/quickwit-parquet-engine/src/ingest/processor.rs
+++ b/quickwit/quickwit-parquet-engine/src/ingest/processor.rs
@@ -21,7 +21,7 @@ use arrow::record_batch::RecordBatch;
 use tracing::{debug, instrument, warn};
 
 use crate::metrics::PARQUET_ENGINE_METRICS;
-use crate::schema::ParquetSchema;
+use crate::schema::validate_required_fields;
 
 /// Error type for ingest operations.
 #[derive(Debug, thiserror::Error)]
@@ -34,9 +34,9 @@ pub enum IngestError {
     #[error("IPC decode error: {0}")]
     IpcDecode(#[from] arrow::error::ArrowError),
 
-    /// RecordBatch schema doesn't match expected metrics schema.
-    #[error("Schema mismatch: expected {expected} fields, got {actual}")]
-    SchemaMismatch { expected: usize, actual: usize },
+    /// RecordBatch schema validation failed.
+    #[error("Schema validation failed: {0}")]
+    SchemaValidation(String),
 
     /// Expected exactly one RecordBatch in IPC stream.
     #[error("Expected 1 RecordBatch in IPC stream, got {0}")]
@@ -53,17 +53,10 @@ pub enum IngestError {
 
 /// Processor that converts Arrow IPC bytes to RecordBatch.
 ///
-/// Validates that the decoded batch matches the expected metrics schema.
-pub struct ParquetIngestProcessor {
-    schema: ParquetSchema,
-}
+/// Validates that the decoded batch contains all required fields.
+pub struct ParquetIngestProcessor;
 
 impl ParquetIngestProcessor {
-    /// Create a new ParquetIngestProcessor.
-    pub fn new(schema: ParquetSchema) -> Self {
-        Self { schema }
-    }
-
     /// Convert Arrow IPC bytes to RecordBatch.
     ///
     /// Returns error if IPC is malformed or schema doesn't match.
@@ -95,65 +88,15 @@ impl ParquetIngestProcessor {
 
         debug!(
             num_rows = batch.num_rows(),
-            "Successfully decoded IPC to RecordBatch"
+            "successfully decoded IPC to RecordBatch"
         );
         Ok(batch)
     }
 
-    /// Validate that the RecordBatch schema matches expected metrics schema.
+    /// Validate that the RecordBatch schema contains all required fields.
     fn validate_schema(&self, batch: &RecordBatch) -> Result<(), IngestError> {
-        let expected_fields = self.schema.num_fields();
-        let actual_fields = batch.schema().fields().len();
-
-        if expected_fields != actual_fields {
-            warn!(
-                expected = expected_fields,
-                actual = actual_fields,
-                "Schema mismatch: field count differs"
-            );
-            return Err(IngestError::SchemaMismatch {
-                expected: expected_fields,
-                actual: actual_fields,
-            });
-        }
-
-        // Verify field names match (in order)
-        let schema_fields = self.schema.arrow_schema().fields();
-        let batch_schema = batch.schema();
-        let batch_fields = batch_schema.fields();
-
-        for (expected, actual) in schema_fields.iter().zip(batch_fields.iter()) {
-            if expected.name() != actual.name() {
-                warn!(
-                    expected_name = expected.name(),
-                    actual_name = actual.name(),
-                    "Schema mismatch: field name differs"
-                );
-                return Err(IngestError::SchemaMismatch {
-                    expected: expected_fields,
-                    actual: actual_fields,
-                });
-            }
-            if expected.data_type() != actual.data_type() {
-                warn!(
-                    field = expected.name(),
-                    expected_type = ?expected.data_type(),
-                    actual_type = ?actual.data_type(),
-                    "Schema mismatch: field type differs"
-                );
-                return Err(IngestError::SchemaMismatch {
-                    expected: expected_fields,
-                    actual: actual_fields,
-                });
-            }
-        }
-
-        Ok(())
-    }
-
-    /// Get a reference to the schema.
-    pub fn schema(&self) -> &ParquetSchema {
-        &self.schema
+        validate_required_fields(batch.schema().as_ref())
+            .map_err(|e| IngestError::SchemaValidation(e.to_string()))
     }
 }
 
@@ -168,11 +111,7 @@ fn ipc_to_record_batch(ipc_bytes: &[u8]) -> Result<RecordBatch, IngestError> {
         return Err(IngestError::UnexpectedBatchCount(batches.len()));
     }
 
-    // Safe: we verified exactly 1 batch above, but use ok_or for defensive programming
-    batches
-        .into_iter()
-        .next()
-        .ok_or(IngestError::UnexpectedBatchCount(0))
+    Ok(batches.into_iter().next().expect("len verified to be 1 above"))
 }
 
 /// Serialize a RecordBatch to Arrow IPC stream format.
@@ -190,100 +129,13 @@ pub fn record_batch_to_ipc(batch: &RecordBatch) -> Result<Vec<u8>, IngestError>
 
 #[cfg(test)]
 mod tests {
-    use std::sync::Arc;
-
-    use arrow::array::{
-        ArrayRef, DictionaryArray, Float64Array, Int32Array, StringArray, UInt8Array, UInt64Array,
-    };
-    use arrow::datatypes::Int32Type;
-    use parquet::variant::VariantArrayBuilder;
+    use crate::test_helpers::create_test_batch;
 
     use super::*;
 
-    /// Create dictionary array for string fields with Int32 keys.
-    fn create_dict_array(values: &[&str]) -> ArrayRef {
-        let keys: Vec<i32> = (0..values.len()).map(|i| i as i32).collect();
-        let string_array = StringArray::from(values.to_vec());
-        Arc::new(
-            DictionaryArray::<Int32Type>::try_new(Int32Array::from(keys), Arc::new(string_array))
-                .unwrap(),
-        )
-    }
-
-    /// Create nullable dictionary array for optional string fields.
-    fn create_nullable_dict_array(values: &[Option<&str>]) -> ArrayRef {
-        let keys: Vec<Option<i32>> = values
-            .iter()
-            .enumerate()
-            .map(|(i, v)| v.map(|_| i as i32))
-            .collect();
-        let string_values: Vec<&str> = values.iter().filter_map(|v| *v).collect();
-        let string_array = StringArray::from(string_values);
-        Arc::new(
-            DictionaryArray::<Int32Type>::try_new(Int32Array::from(keys), Arc::new(string_array))
-                .unwrap(),
-        )
-    }
-
-    /// Create a VARIANT array for testing with specified number of rows (all nulls).
-    fn create_variant_array(num_rows: usize) -> ArrayRef {
-        let mut builder = VariantArrayBuilder::new(num_rows);
-        for _ in 0..num_rows {
-            builder.append_null();
-        }
-        ArrayRef::from(builder.build())
-    }
-
-    /// Create a test batch matching the metrics schema.
-    fn create_test_batch(num_rows: usize) -> RecordBatch {
-        let schema = ParquetSchema::new();
-
-        let metric_names: Vec<&str> = vec!["cpu.usage"; num_rows];
-        let metric_name: ArrayRef = create_dict_array(&metric_names);
-        let metric_type: ArrayRef = Arc::new(UInt8Array::from(vec![0u8; num_rows]));
-        let metric_unit: ArrayRef = Arc::new(StringArray::from(vec![Some("bytes"); num_rows]));
-        let timestamps: Vec<u64> = (0..num_rows).map(|i| 100 + i as u64).collect();
-        let timestamp_secs: ArrayRef = Arc::new(UInt64Array::from(timestamps));
-        let start_timestamp_secs: ArrayRef =
-            Arc::new(UInt64Array::from(vec![None::<u64>; num_rows]));
-        let values: Vec<f64> = (0..num_rows).map(|i| 42.0 + i as f64).collect();
-        let value: ArrayRef = Arc::new(Float64Array::from(values));
-        let tag_service: ArrayRef = create_nullable_dict_array(&vec![Some("web"); num_rows]);
-        let tag_env: ArrayRef = create_nullable_dict_array(&vec![Some("prod"); num_rows]);
-        let tag_datacenter: ArrayRef =
-            create_nullable_dict_array(&vec![Some("us-east-1"); num_rows]);
-        let tag_region: ArrayRef = create_nullable_dict_array(&vec![None; num_rows]);
-        let tag_host: ArrayRef = create_nullable_dict_array(&vec![Some("host-001"); num_rows]);
-        let attributes: ArrayRef = create_variant_array(num_rows);
-        let service_name: ArrayRef = create_dict_array(&vec!["my-service"; num_rows]);
-        let resource_attributes: ArrayRef = create_variant_array(num_rows);
-
-        RecordBatch::try_new(
-            schema.arrow_schema().clone(),
-            vec![
-                metric_name,
-                metric_type,
-                metric_unit,
-                timestamp_secs,
-                start_timestamp_secs,
-                value,
-                tag_service,
-                tag_env,
-                tag_datacenter,
-                tag_region,
-                tag_host,
-                attributes,
-                service_name,
-                resource_attributes,
-            ],
-        )
-        .unwrap()
-    }
-
     #[test]
     fn test_process_ipc() {
-        let schema = ParquetSchema::new();
-        let processor = ParquetIngestProcessor::new(schema);
+        let processor = ParquetIngestProcessor;
 
         // Create a valid batch
         let batch = create_test_batch(10);
@@ -295,13 +147,12 @@ mod tests {
 
         let recovered = result.unwrap();
         assert_eq!(recovered.num_rows(), 10);
-        assert_eq!(recovered.num_columns(), 14);
+        assert_eq!(recovered.num_columns(), 6); // 4 required + 2 tags
     }
 
     #[test]
     fn test_process_ipc_invalid_bytes() {
-        let schema = ParquetSchema::new();
-        let processor = ParquetIngestProcessor::new(schema);
+        let processor = ParquetIngestProcessor;
 
         let result = processor.process_ipc(&[0u8; 10]);
         assert!(result.is_err());
diff --git a/quickwit/quickwit-parquet-engine/src/lib.rs b/quickwit/quickwit-parquet-engine/src/lib.rs
index eac11bfa8e9..c6d8640e801 100644
--- a/quickwit/quickwit-parquet-engine/src/lib.rs
+++ b/quickwit/quickwit-parquet-engine/src/lib.rs
@@ -26,3 +26,6 @@ pub mod metrics;
 pub mod schema;
 pub mod split;
 pub mod storage;
+
+#[cfg(test)]
+pub mod test_helpers;

From d0a995e82f51e3c8963d8dc2fd6b1792e3f5ff70 Mon Sep 17 00:00:00 2001
From: Matthew Kim <matthew.kim@datadoghq.com>
Date: Wed, 18 Mar 2026 11:02:30 -0400
Subject: [PATCH 05/35] feat: dynamic column lookup in split writer

---
 .../src/storage/split_writer.rs               | 318 +++++-------------
 1 file changed, 83 insertions(+), 235 deletions(-)

diff --git a/quickwit/quickwit-parquet-engine/src/storage/split_writer.rs b/quickwit/quickwit-parquet-engine/src/storage/split_writer.rs
index ae5fa340722..a9e081455d0 100644
--- a/quickwit/quickwit-parquet-engine/src/storage/split_writer.rs
+++ b/quickwit/quickwit-parquet-engine/src/storage/split_writer.rs
@@ -83,7 +83,7 @@ impl ParquetSplitWriter {
         debug!(
             start_secs = time_range.start_secs,
             end_secs = time_range.end_secs,
-            "Extracted time range from batch"
+            "extracted time range from batch"
         );
 
         // Extract distinct metric names from batch
@@ -116,7 +116,7 @@ impl ParquetSplitWriter {
             split_id = %split_id,
             file_path = %file_path.display(),
             size_bytes,
-            "Split file written successfully"
+            "split file written successfully"
         );
 
         Ok(ParquetSplit::new(metadata))
@@ -125,7 +125,11 @@ impl ParquetSplitWriter {
 
 /// Extracts the time range (min/max timestamp_secs) from a RecordBatch.
 fn extract_time_range(batch: &RecordBatch) -> Result<TimeRange, ParquetWriteError> {
-    let timestamp_col = batch.column(ParquetField::TimestampSecs.column_index());
+    let timestamp_idx = batch
+        .schema()
+        .index_of("timestamp_secs")
+        .map_err(|_| ParquetWriteError::SchemaValidation("missing timestamp_secs column".into()))?;
+    let timestamp_col = batch.column(timestamp_idx);
     let timestamp_array = timestamp_col.as_primitive::<UInt64Type>();
 
     let min_val = min(timestamp_array);
@@ -145,7 +149,11 @@ fn extract_time_range(batch: &RecordBatch) -> Result<TimeRange, ParquetWriteErro
 
 /// Extracts distinct metric names from a RecordBatch.
 fn extract_metric_names(batch: &RecordBatch) -> Result<HashSet<String>, ParquetWriteError> {
-    let metric_col = batch.column(ParquetField::MetricName.column_index());
+    let metric_idx = batch
+        .schema()
+        .index_of("metric_name")
+        .map_err(|_| ParquetWriteError::SchemaValidation("missing metric_name column".into()))?;
+    let metric_col = batch.column(metric_idx);
     let mut names = HashSet::new();
 
     // The column is Dictionary(Int32, Utf8)
@@ -174,7 +182,11 @@ fn extract_metric_names(batch: &RecordBatch) -> Result<HashSet<String>, ParquetW
 
 /// Extracts distinct service names from a RecordBatch.
 fn extract_service_names(batch: &RecordBatch) -> Result<HashSet<String>, ParquetWriteError> {
-    let service_col = batch.column(ParquetField::ServiceName.column_index());
+    let service_idx = match batch.schema().index_of("service").ok() {
+        Some(idx) => idx,
+        None => return Ok(HashSet::new()),
+    };
+    let service_col = batch.column(service_idx);
     let mut names = HashSet::new();
 
     // The column is Dictionary(Int32, Utf8)
@@ -205,84 +217,58 @@ fn extract_service_names(batch: &RecordBatch) -> Result<HashSet<String>, Parquet
 mod tests {
     use std::sync::Arc;
 
-    use arrow::array::{
-        ArrayRef, DictionaryArray, Float64Array, Int32Array, StringArray, UInt8Array, UInt64Array,
-    };
-    use arrow::datatypes::Int32Type;
-    use parquet::variant::{VariantArrayBuilder, VariantBuilderExt};
+    use arrow::array::{ArrayRef, Float64Array, UInt8Array, UInt64Array};
+    use arrow::datatypes::{DataType, Field, Schema as ArrowSchema};
+    use crate::test_helpers::{create_dict_array, create_nullable_dict_array};
 
     use super::*;
 
-    #[test]
-    fn test_column_indices_match_schema() {
-        let schema = ParquetSchema::new();
-        for field in ParquetField::all() {
-            let expected: usize =
-                schema
-                    .arrow_schema()
-                    .index_of(field.name())
-                    .unwrap_or_else(|_| {
-                        panic!("field {:?} should exist in arrow schema", field.name())
-                    });
-            assert_eq!(
-                field.column_index(),
-                expected,
-                "column_index() for {:?} does not match arrow schema position",
-                field.name()
-            );
+    /// Create a test batch with required fields, optional service column, and specified tag columns.
+    fn create_test_batch_with_options(
+        num_rows: usize,
+        metric_names: &[&str],
+        timestamps: &[u64],
+        service_names: Option<&[&str]>,
+        tags: &[&str],
+    ) -> RecordBatch {
+        let dict_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
+
+        let mut fields = vec![
+            Field::new("metric_name", dict_type.clone(), false),
+            Field::new("metric_type", DataType::UInt8, false),
+            Field::new("timestamp_secs", DataType::UInt64, false),
+            Field::new("value", DataType::Float64, false),
+        ];
+        if service_names.is_some() {
+            fields.push(Field::new("service", dict_type.clone(), true));
         }
-    }
+        for tag in tags {
+            fields.push(Field::new(*tag, dict_type.clone(), true));
+        }
+        let schema = Arc::new(ArrowSchema::new(fields));
 
-    /// Create a VARIANT array for testing with specified number of rows.
-    fn create_variant_array(num_rows: usize, fields: Option<&[(&str, &str)]>) -> ArrayRef {
-        let mut builder = VariantArrayBuilder::new(num_rows);
-        for _ in 0..num_rows {
-            match fields {
-                Some(kv_pairs) => {
-                    let mut obj = builder.new_object();
-                    for (key, value) in kv_pairs {
-                        obj = obj.with_field(key, *value);
-                    }
-                    obj.finish();
-                }
-                None => {
-                    builder.append_null();
-                }
-            }
+        let metric_name: ArrayRef = create_dict_array(metric_names);
+        let metric_type: ArrayRef = Arc::new(UInt8Array::from(vec![0u8; num_rows]));
+        let timestamp_secs: ArrayRef = Arc::new(UInt64Array::from(timestamps.to_vec()));
+        let values: Vec<f64> = (0..num_rows).map(|i| 42.0 + i as f64).collect();
+        let value: ArrayRef = Arc::new(Float64Array::from(values));
+
+        let mut columns: Vec<ArrayRef> = vec![metric_name, metric_type, timestamp_secs, value];
+
+        if let Some(svc_names) = service_names {
+            columns.push(create_dict_array(svc_names));
         }
-        ArrayRef::from(builder.build())
-    }
 
-    /// Create dictionary array for string fields with Int32 keys.
-    fn create_dict_array(values: &[&str]) -> ArrayRef {
-        let keys: Vec<i32> = (0..values.len()).map(|i| i as i32).collect();
-        let string_array = StringArray::from(values.to_vec());
-        Arc::new(
-            DictionaryArray::<Int32Type>::try_new(Int32Array::from(keys), Arc::new(string_array))
-                .unwrap(),
-        )
-    }
+        for tag in tags {
+            let tag_values: Vec<Option<&str>> = vec![Some(tag); num_rows];
+            columns.push(create_nullable_dict_array(&tag_values));
+        }
 
-    /// Create nullable dictionary array for optional string fields.
-    fn create_nullable_dict_array(values: &[Option<&str>]) -> ArrayRef {
-        let keys: Vec<Option<i32>> = values
-            .iter()
-            .enumerate()
-            .map(|(i, v)| v.map(|_| i as i32))
-            .collect();
-        let string_values: Vec<&str> = values.iter().filter_map(|v| *v).collect();
-        let string_array = StringArray::from(string_values);
-        Arc::new(
-            DictionaryArray::<Int32Type>::try_new(Int32Array::from(keys), Arc::new(string_array))
-                .unwrap(),
-        )
+        RecordBatch::try_new(schema, columns).unwrap()
     }
 
-    /// Create a test batch with specified number of rows and test data.
+    /// Create a simple test batch with default values.
     fn create_test_batch(num_rows: usize) -> RecordBatch {
-        let schema = ParquetSchema::new();
-
-        // Generate test data
         let metric_names: Vec<&str> = (0..num_rows)
             .map(|i| {
                 if i % 2 == 0 {
@@ -293,82 +279,23 @@ mod tests {
             })
             .collect();
         let timestamps: Vec<u64> = (0..num_rows).map(|i| 100 + i as u64 * 10).collect();
-
-        // MetricName: Dictionary(Int32, Utf8)
-        let metric_name: ArrayRef = create_dict_array(&metric_names);
-
-        // MetricType: UInt8
-        let metric_type: ArrayRef = Arc::new(UInt8Array::from(vec![0u8; num_rows]));
-
-        // MetricUnit: Utf8 (nullable)
-        let metric_unit: ArrayRef = Arc::new(StringArray::from(vec![Some("bytes"); num_rows]));
-
-        // TimestampSecs: UInt64
-        let timestamp_secs: ArrayRef = Arc::new(UInt64Array::from(timestamps));
-
-        // StartTimestampSecs: UInt64 (nullable)
-        let start_timestamp_secs: ArrayRef =
-            Arc::new(UInt64Array::from(vec![None::<u64>; num_rows]));
-
-        // Value: Float64
-        let values: Vec<f64> = (0..num_rows).map(|i| 42.0 + i as f64).collect();
-        let value: ArrayRef = Arc::new(Float64Array::from(values));
-
-        // TagService: Dictionary(Int32, Utf8) (nullable)
-        let tag_service: ArrayRef = create_nullable_dict_array(&vec![Some("web"); num_rows]);
-
-        // TagEnv: Dictionary(Int32, Utf8) (nullable)
-        let tag_env: ArrayRef = create_nullable_dict_array(&vec![Some("prod"); num_rows]);
-
-        // TagDatacenter: Dictionary(Int32, Utf8) (nullable)
-        let tag_datacenter: ArrayRef =
-            create_nullable_dict_array(&vec![Some("us-east-1"); num_rows]);
-
-        // TagRegion: Dictionary(Int32, Utf8) (nullable)
-        let tag_region: ArrayRef = create_nullable_dict_array(&vec![None; num_rows]);
-
-        // TagHost: Dictionary(Int32, Utf8) (nullable)
-        let tag_host: ArrayRef = create_nullable_dict_array(&vec![Some("host-001"); num_rows]);
-
-        // Attributes: VARIANT (nullable)
-        let attributes: ArrayRef = create_variant_array(num_rows, Some(&[("key", "value")]));
-
-        // ServiceName: Dictionary(Int32, Utf8)
-        let service_names: Vec<&str> = (0..num_rows).map(|_| "my-service").collect();
-        let service_name: ArrayRef = create_dict_array(&service_names);
-
-        // ResourceAttributes: VARIANT (nullable)
-        let resource_attributes: ArrayRef = create_variant_array(num_rows, None);
-
-        RecordBatch::try_new(
-            schema.arrow_schema().clone(),
-            vec![
-                metric_name,
-                metric_type,
-                metric_unit,
-                timestamp_secs,
-                start_timestamp_secs,
-                value,
-                tag_service,
-                tag_env,
-                tag_datacenter,
-                tag_region,
-                tag_host,
-                attributes,
-                service_name,
-                resource_attributes,
-            ],
+        let service_names: Vec<&str> = vec!["my-service"; num_rows];
+
+        create_test_batch_with_options(
+            num_rows,
+            &metric_names,
+            &timestamps,
+            Some(&service_names),
+            &["service", "host"],
         )
-        .unwrap()
     }
 
     #[test]
     fn test_write_split_creates_file() {
-        let schema = ParquetSchema::new();
         let config = ParquetWriterConfig::default();
         let temp_dir = tempfile::tempdir().unwrap();
 
-        let writer = ParquetSplitWriter::new(schema, config, temp_dir.path());
+        let writer = ParquetSplitWriter::new(config, temp_dir.path());
 
         let batch = create_test_batch(10);
         let split = writer.write_split(&batch, "test-index").unwrap();
@@ -387,14 +314,19 @@ mod tests {
 
     #[test]
     fn test_write_split_extracts_time_range() {
-        let schema = ParquetSchema::new();
         let config = ParquetWriterConfig::default();
         let temp_dir = tempfile::tempdir().unwrap();
 
-        let writer = ParquetSplitWriter::new(schema, config, temp_dir.path());
+        let writer = ParquetSplitWriter::new(config, temp_dir.path());
 
         // Create batch with timestamps [100, 150, 200]
-        let batch = create_test_batch_with_timestamps(&[100, 150, 200]);
+        let batch = create_test_batch_with_options(
+            3,
+            &["test.metric", "test.metric", "test.metric"],
+            &[100, 150, 200],
+            Some(&["my-service", "my-service", "my-service"]),
+            &[],
+        );
         let split = writer.write_split(&batch, "test-index").unwrap();
 
         // Verify time range
@@ -404,14 +336,19 @@ mod tests {
 
     #[test]
     fn test_write_split_extracts_metric_names() {
-        let schema = ParquetSchema::new();
         let config = ParquetWriterConfig::default();
         let temp_dir = tempfile::tempdir().unwrap();
 
-        let writer = ParquetSplitWriter::new(schema, config, temp_dir.path());
+        let writer = ParquetSplitWriter::new(config, temp_dir.path());
 
         // Create batch with specific metric names
-        let batch = create_test_batch_with_metric_names(&["cpu.usage", "memory.used", "cpu.usage"]);
+        let batch = create_test_batch_with_options(
+            3,
+            &["cpu.usage", "memory.used", "cpu.usage"],
+            &[100, 100, 100],
+            Some(&["my-service", "my-service", "my-service"]),
+            &[],
+        );
         let split = writer.write_split(&batch, "test-index").unwrap();
 
         // Verify metric names (distinct values)
@@ -419,93 +356,4 @@ mod tests {
         assert!(split.metadata.metric_names.contains("memory.used"));
         assert_eq!(split.metadata.metric_names.len(), 2);
     }
-
-    /// Create a test batch with specific timestamps.
-    fn create_test_batch_with_timestamps(timestamps: &[u64]) -> RecordBatch {
-        let schema = ParquetSchema::new();
-        let num_rows = timestamps.len();
-
-        let metric_names: Vec<&str> = vec!["test.metric"; num_rows];
-        let metric_name: ArrayRef = create_dict_array(&metric_names);
-        let metric_type: ArrayRef = Arc::new(UInt8Array::from(vec![0u8; num_rows]));
-        let metric_unit: ArrayRef = Arc::new(StringArray::from(vec![Some("bytes"); num_rows]));
-        let timestamp_secs: ArrayRef = Arc::new(UInt64Array::from(timestamps.to_vec()));
-        let start_timestamp_secs: ArrayRef =
-            Arc::new(UInt64Array::from(vec![None::<u64>; num_rows]));
-        let value: ArrayRef = Arc::new(Float64Array::from(vec![42.0; num_rows]));
-        let tag_service: ArrayRef = create_nullable_dict_array(&vec![Some("web"); num_rows]);
-        let tag_env: ArrayRef = create_nullable_dict_array(&vec![Some("prod"); num_rows]);
-        let tag_datacenter: ArrayRef =
-            create_nullable_dict_array(&vec![Some("us-east-1"); num_rows]);
-        let tag_region: ArrayRef = create_nullable_dict_array(&vec![None; num_rows]);
-        let tag_host: ArrayRef = create_nullable_dict_array(&vec![Some("host-001"); num_rows]);
-        let attributes: ArrayRef = create_variant_array(num_rows, None);
-        let service_name: ArrayRef = create_dict_array(&vec!["my-service"; num_rows]);
-        let resource_attributes: ArrayRef = create_variant_array(num_rows, None);
-
-        RecordBatch::try_new(
-            schema.arrow_schema().clone(),
-            vec![
-                metric_name,
-                metric_type,
-                metric_unit,
-                timestamp_secs,
-                start_timestamp_secs,
-                value,
-                tag_service,
-                tag_env,
-                tag_datacenter,
-                tag_region,
-                tag_host,
-                attributes,
-                service_name,
-                resource_attributes,
-            ],
-        )
-        .unwrap()
-    }
-
-    /// Create a test batch with specific metric names.
-    fn create_test_batch_with_metric_names(metric_names: &[&str]) -> RecordBatch {
-        let schema = ParquetSchema::new();
-        let num_rows = metric_names.len();
-
-        let metric_name: ArrayRef = create_dict_array(metric_names);
-        let metric_type: ArrayRef = Arc::new(UInt8Array::from(vec![0u8; num_rows]));
-        let metric_unit: ArrayRef = Arc::new(StringArray::from(vec![Some("bytes"); num_rows]));
-        let timestamp_secs: ArrayRef = Arc::new(UInt64Array::from(vec![100u64; num_rows]));
-        let start_timestamp_secs: ArrayRef =
-            Arc::new(UInt64Array::from(vec![None::<u64>; num_rows]));
-        let value: ArrayRef = Arc::new(Float64Array::from(vec![42.0; num_rows]));
-        let tag_service: ArrayRef = create_nullable_dict_array(&vec![Some("web"); num_rows]);
-        let tag_env: ArrayRef = create_nullable_dict_array(&vec![Some("prod"); num_rows]);
-        let tag_datacenter: ArrayRef =
-            create_nullable_dict_array(&vec![Some("us-east-1"); num_rows]);
-        let tag_region: ArrayRef = create_nullable_dict_array(&vec![None; num_rows]);
-        let tag_host: ArrayRef = create_nullable_dict_array(&vec![Some("host-001"); num_rows]);
-        let attributes: ArrayRef = create_variant_array(num_rows, None);
-        let service_name: ArrayRef = create_dict_array(&vec!["my-service"; num_rows]);
-        let resource_attributes: ArrayRef = create_variant_array(num_rows, None);
-
-        RecordBatch::try_new(
-            schema.arrow_schema().clone(),
-            vec![
-                metric_name,
-                metric_type,
-                metric_unit,
-                timestamp_secs,
-                start_timestamp_secs,
-                value,
-                tag_service,
-                tag_env,
-                tag_datacenter,
-                tag_region,
-                tag_host,
-                attributes,
-                service_name,
-                resource_attributes,
-            ],
-        )
-        .unwrap()
-    }
 }

From 35c39421f257aaec47dffc49037a56479ad4ff5e Mon Sep 17 00:00:00 2001
From: Matthew Kim <matthew.kim@datadoghq.com>
Date: Wed, 18 Mar 2026 11:02:39 -0400
Subject: [PATCH 06/35] feat: remove ParquetSchema dependency from indexing
 actors

---
 .../src/actors/indexing_pipeline.rs           |   2 -
 .../src/actors/parquet_doc_processor.rs       | 257 +++++-------------
 .../src/actors/parquet_e2e_test.rs            | 103 ++-----
 .../src/actors/parquet_indexer.rs             | 101 +------
 .../src/actors/parquet_packager.rs            | 102 +------
 .../src/models/processed_parquet_batch.rs     | 104 +------
 6 files changed, 110 insertions(+), 559 deletions(-)

diff --git a/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs b/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs
index 16daf18102f..cfb2ade9361 100644
--- a/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs
+++ b/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs
@@ -637,10 +637,8 @@ impl IndexingPipeline {
             .spawn(parquet_uploader);
 
         // ParquetPackager
-        let parquet_schema = quickwit_parquet_engine::schema::ParquetSchema::new();
         let writer_config = quickwit_parquet_engine::storage::ParquetWriterConfig::default();
         let split_writer = quickwit_parquet_engine::storage::ParquetSplitWriter::new(
-            parquet_schema,
             writer_config,
             self.params.indexing_directory.path(),
         );
diff --git a/quickwit/quickwit-indexing/src/actors/parquet_doc_processor.rs b/quickwit/quickwit-indexing/src/actors/parquet_doc_processor.rs
index 2a83c9610c9..aa26f67fc94 100644
--- a/quickwit/quickwit-indexing/src/actors/parquet_doc_processor.rs
+++ b/quickwit/quickwit-indexing/src/actors/parquet_doc_processor.rs
@@ -24,7 +24,6 @@ use quickwit_common::rate_limited_tracing::rate_limited_warn;
 use quickwit_common::runtimes::RuntimeType;
 use quickwit_metastore::checkpoint::SourceCheckpointDelta;
 use quickwit_parquet_engine::ingest::{IngestError, ParquetIngestProcessor};
-use quickwit_parquet_engine::schema::ParquetSchema;
 use quickwit_proto::types::{IndexId, SourceId};
 use serde::Serialize;
 use tokio::runtime::Handle;
@@ -143,9 +142,11 @@ impl ParquetDocProcessor {
         source_id: SourceId,
         indexer_mailbox: Mailbox<ParquetIndexer>,
     ) -> Self {
-        let schema = ParquetSchema::new();
-        let processor = ParquetIngestProcessor::new(schema);
-        let counters = ParquetDocProcessorCounters::new(index_id.clone(), source_id.clone());
+        let processor = ParquetIngestProcessor;
+        let counters = ParquetDocProcessorCounters::new(
+            index_id.clone(),
+            source_id.clone(),
+        );
 
         info!(
             index_id = %index_id,
@@ -305,8 +306,9 @@ impl Handler<RawDocBatch> for ParquetDocProcessor {
         // Without this, a batch of consistently malformed data blocks offset progress
         // forever.
         if !checkpoint_forwarded && !checkpoint_delta.is_empty() {
-            let empty_batch =
-                RecordBatch::new_empty(self.processor.schema().arrow_schema().clone());
+            let empty_batch = RecordBatch::new_empty(std::sync::Arc::new(
+                arrow::datatypes::Schema::empty(),
+            ));
             let processed_batch =
                 ProcessedParquetBatch::new(empty_batch, checkpoint_delta, force_commit);
             ctx.send_message(&self.indexer_mailbox, processed_batch)
@@ -402,10 +404,10 @@ mod tests {
         use std::sync::Arc as StdArc;
 
         use arrow::array::{
-            ArrayRef, BinaryViewArray, DictionaryArray, Float64Array, Int32Array, StringArray,
-            StructArray, UInt8Array, UInt64Array,
+            ArrayRef, DictionaryArray, Float64Array, Int32Array, StringArray, UInt8Array,
+            UInt64Array,
         };
-        use arrow::datatypes::{DataType, Field, Int32Type};
+        use arrow::datatypes::{DataType, Field, Int32Type, Schema as ArrowSchema};
         use arrow::record_batch::RecordBatch;
         let universe = Universe::with_accelerated_time();
 
@@ -419,99 +421,42 @@ mod tests {
         let (metrics_doc_processor_mailbox, metrics_doc_processor_handle) =
             universe.spawn_builder().spawn(metrics_doc_processor);
 
-        // Create a test batch matching the metrics schema
-        let schema = ParquetSchema::new();
+        // Create a test batch with the 4 required fields plus a tag column
         let num_rows = 3;
-
-        // Helper to create dictionary arrays
-        fn create_dict_array(values: &[&str]) -> ArrayRef {
-            let keys: Vec<i32> = (0..values.len()).map(|i| i as i32).collect();
-            let string_array = StringArray::from(values.to_vec());
-            StdArc::new(
-                DictionaryArray::<Int32Type>::try_new(
-                    Int32Array::from(keys),
-                    StdArc::new(string_array),
-                )
-                .unwrap(),
-            )
-        }
-
-        fn create_nullable_dict_array(values: &[Option<&str>]) -> ArrayRef {
-            let keys: Vec<Option<i32>> = values
-                .iter()
-                .enumerate()
-                .map(|(i, v)| v.map(|_| i as i32))
-                .collect();
-            let string_values: Vec<&str> = values.iter().filter_map(|v| *v).collect();
-            let string_array = StringArray::from(string_values);
-            StdArc::new(
-                DictionaryArray::<Int32Type>::try_new(
-                    Int32Array::from(keys),
-                    StdArc::new(string_array),
-                )
-                .unwrap(),
-            )
-        }
-
-        let metric_name: ArrayRef = create_dict_array(&vec!["cpu.usage"; num_rows]);
-        let metric_type: ArrayRef = StdArc::new(UInt8Array::from(vec![0u8; num_rows]));
-        let metric_unit: ArrayRef = StdArc::new(StringArray::from(vec![Some("bytes"); num_rows]));
-        let timestamp_secs: ArrayRef = StdArc::new(UInt64Array::from(vec![100u64, 101u64, 102u64]));
-        let start_timestamp_secs: ArrayRef =
-            StdArc::new(UInt64Array::from(vec![None::<u64>; num_rows]));
-        let value: ArrayRef = StdArc::new(Float64Array::from(vec![42.0, 43.0, 44.0]));
-        let tag_service: ArrayRef = create_nullable_dict_array(&vec![Some("web"); num_rows]);
-        let tag_env: ArrayRef = create_nullable_dict_array(&vec![Some("prod"); num_rows]);
-        let tag_datacenter: ArrayRef =
-            create_nullable_dict_array(&vec![Some("us-east-1"); num_rows]);
-        let tag_region: ArrayRef = create_nullable_dict_array(&vec![None; num_rows]);
-        let tag_host: ArrayRef = create_nullable_dict_array(&vec![Some("host-001"); num_rows]);
-
-        // Create empty Variant (Struct with metadata and value BinaryView fields)
-        let metadata_array = StdArc::new(BinaryViewArray::from(vec![b"" as &[u8]; num_rows]));
-        let value_array = StdArc::new(BinaryViewArray::from(vec![b"" as &[u8]; num_rows]));
-        let attributes: ArrayRef = StdArc::new(StructArray::from(vec![
-            (
-                StdArc::new(Field::new("metadata", DataType::BinaryView, false)),
-                metadata_array.clone() as ArrayRef,
+        let schema = StdArc::new(ArrowSchema::new(vec![
+            Field::new(
+                "metric_name",
+                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+                false,
             ),
-            (
-                StdArc::new(Field::new("value", DataType::BinaryView, false)),
-                value_array.clone() as ArrayRef,
+            Field::new("metric_type", DataType::UInt8, false),
+            Field::new("timestamp_secs", DataType::UInt64, false),
+            Field::new("value", DataType::Float64, false),
+            Field::new(
+                "service",
+                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+                true,
             ),
         ]));
 
-        let service_name: ArrayRef = create_dict_array(&vec!["my-service"; num_rows]);
-
-        let resource_attributes: ArrayRef = StdArc::new(StructArray::from(vec![
-            (
-                StdArc::new(Field::new("metadata", DataType::BinaryView, false)),
-                metadata_array as ArrayRef,
-            ),
-            (
-                StdArc::new(Field::new("value", DataType::BinaryView, false)),
-                value_array as ArrayRef,
-            ),
-        ]));
+        let metric_name: ArrayRef = {
+            let keys = Int32Array::from(vec![0i32; num_rows]);
+            let vals = StringArray::from(vec!["cpu.usage"]);
+            StdArc::new(DictionaryArray::<Int32Type>::try_new(keys, StdArc::new(vals)).unwrap())
+        };
+        let metric_type: ArrayRef = StdArc::new(UInt8Array::from(vec![0u8; num_rows]));
+        let timestamp_secs: ArrayRef =
+            StdArc::new(UInt64Array::from(vec![100u64, 101u64, 102u64]));
+        let value: ArrayRef = StdArc::new(Float64Array::from(vec![42.0, 43.0, 44.0]));
+        let service: ArrayRef = {
+            let keys = Int32Array::from(vec![0i32; num_rows]);
+            let vals = StringArray::from(vec!["web"]);
+            StdArc::new(DictionaryArray::<Int32Type>::try_new(keys, StdArc::new(vals)).unwrap())
+        };
 
         let batch = RecordBatch::try_new(
-            schema.arrow_schema().clone(),
-            vec![
-                metric_name,
-                metric_type,
-                metric_unit,
-                timestamp_secs,
-                start_timestamp_secs,
-                value,
-                tag_service,
-                tag_env,
-                tag_datacenter,
-                tag_region,
-                tag_host,
-                attributes,
-                service_name,
-                resource_attributes,
-            ],
+            schema,
+            vec![metric_name, metric_type, timestamp_secs, value, service],
         )
         .unwrap();
 
@@ -625,10 +570,10 @@ mod tests {
         use std::sync::Arc as StdArc;
 
         use arrow::array::{
-            ArrayRef, BinaryViewArray, DictionaryArray, Float64Array, Int32Array, StringArray,
-            StructArray, UInt8Array, UInt64Array,
+            ArrayRef, DictionaryArray, Float64Array, Int32Array, StringArray, UInt8Array,
+            UInt64Array,
         };
-        use arrow::datatypes::{DataType, Field, Int32Type};
+        use arrow::datatypes::{DataType, Field, Int32Type, Schema as ArrowSchema};
         use arrow::record_batch::RecordBatch;
         use quickwit_parquet_engine::storage::{ParquetSplitWriter, ParquetWriterConfig};
         use quickwit_proto::metastore::MockMetastoreService;
@@ -657,9 +602,8 @@ mod tests {
         let (uploader_mailbox, _uploader_handle) = universe.spawn_builder().spawn(uploader);
 
         // Create ParquetPackager
-        let parquet_schema = ParquetSchema::new();
         let writer_config = ParquetWriterConfig::default();
-        let split_writer = ParquetSplitWriter::new(parquet_schema, writer_config, temp_dir.path());
+        let split_writer = ParquetSplitWriter::new(writer_config, temp_dir.path());
         let packager = ParquetPackager::new(split_writer, uploader_mailbox);
         let (packager_mailbox, packager_handle) = universe.spawn_builder().spawn(packager);
 
@@ -681,100 +625,43 @@ mod tests {
         let (metrics_doc_processor_mailbox, metrics_doc_processor_handle) =
             universe.spawn_builder().spawn(metrics_doc_processor);
 
-        // Create a test batch
-        let schema = ParquetSchema::new();
+        // Create a test batch with the 4 required fields plus a tag column
         let num_rows = 5;
+        let schema = StdArc::new(ArrowSchema::new(vec![
+            Field::new(
+                "metric_name",
+                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+                false,
+            ),
+            Field::new("metric_type", DataType::UInt8, false),
+            Field::new("timestamp_secs", DataType::UInt64, false),
+            Field::new("value", DataType::Float64, false),
+            Field::new(
+                "service",
+                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+                true,
+            ),
+        ]));
 
-        fn create_dict_array(values: &[&str]) -> ArrayRef {
-            let keys: Vec<i32> = (0..values.len()).map(|i| i as i32).collect();
-            let string_array = StringArray::from(values.to_vec());
-            StdArc::new(
-                DictionaryArray::<Int32Type>::try_new(
-                    Int32Array::from(keys),
-                    StdArc::new(string_array),
-                )
-                .unwrap(),
-            )
-        }
-
-        fn create_nullable_dict_array(values: &[Option<&str>]) -> ArrayRef {
-            let keys: Vec<Option<i32>> = values
-                .iter()
-                .enumerate()
-                .map(|(i, v)| v.map(|_| i as i32))
-                .collect();
-            let string_values: Vec<&str> = values.iter().filter_map(|v| *v).collect();
-            let string_array = StringArray::from(string_values);
-            StdArc::new(
-                DictionaryArray::<Int32Type>::try_new(
-                    Int32Array::from(keys),
-                    StdArc::new(string_array),
-                )
-                .unwrap(),
-            )
-        }
-
-        let metric_name: ArrayRef = create_dict_array(&vec!["cpu.usage"; num_rows]);
+        let metric_name: ArrayRef = {
+            let keys = Int32Array::from(vec![0i32; num_rows]);
+            let vals = StringArray::from(vec!["cpu.usage"]);
+            StdArc::new(DictionaryArray::<Int32Type>::try_new(keys, StdArc::new(vals)).unwrap())
+        };
         let metric_type: ArrayRef = StdArc::new(UInt8Array::from(vec![0u8; num_rows]));
-        let metric_unit: ArrayRef = StdArc::new(StringArray::from(vec![Some("bytes"); num_rows]));
         let timestamps: Vec<u64> = (0..num_rows).map(|i| 100 + i as u64).collect();
         let timestamp_secs: ArrayRef = StdArc::new(UInt64Array::from(timestamps));
-        let start_timestamp_secs: ArrayRef =
-            StdArc::new(UInt64Array::from(vec![None::<u64>; num_rows]));
         let values: Vec<f64> = (0..num_rows).map(|i| 42.0 + i as f64).collect();
         let value: ArrayRef = StdArc::new(Float64Array::from(values));
-        let tag_service: ArrayRef = create_nullable_dict_array(&vec![Some("web"); num_rows]);
-        let tag_env: ArrayRef = create_nullable_dict_array(&vec![Some("prod"); num_rows]);
-        let tag_datacenter: ArrayRef =
-            create_nullable_dict_array(&vec![Some("us-east-1"); num_rows]);
-        let tag_region: ArrayRef = create_nullable_dict_array(&vec![None; num_rows]);
-        let tag_host: ArrayRef = create_nullable_dict_array(&vec![Some("host-001"); num_rows]);
-
-        // Create empty Variant (Struct with metadata and value BinaryView fields)
-        let metadata_array = StdArc::new(BinaryViewArray::from(vec![b"" as &[u8]; num_rows]));
-        let value_array = StdArc::new(BinaryViewArray::from(vec![b"" as &[u8]; num_rows]));
-        let attributes: ArrayRef = StdArc::new(StructArray::from(vec![
-            (
-                StdArc::new(Field::new("metadata", DataType::BinaryView, false)),
-                metadata_array.clone() as ArrayRef,
-            ),
-            (
-                StdArc::new(Field::new("value", DataType::BinaryView, false)),
-                value_array.clone() as ArrayRef,
-            ),
-        ]));
-
-        let service_name: ArrayRef = create_dict_array(&vec!["my-service"; num_rows]);
-
-        let resource_attributes: ArrayRef = StdArc::new(StructArray::from(vec![
-            (
-                StdArc::new(Field::new("metadata", DataType::BinaryView, false)),
-                metadata_array as ArrayRef,
-            ),
-            (
-                StdArc::new(Field::new("value", DataType::BinaryView, false)),
-                value_array as ArrayRef,
-            ),
-        ]));
+        let service: ArrayRef = {
+            let keys = Int32Array::from(vec![0i32; num_rows]);
+            let vals = StringArray::from(vec!["web"]);
+            StdArc::new(DictionaryArray::<Int32Type>::try_new(keys, StdArc::new(vals)).unwrap())
+        };
 
         let batch = RecordBatch::try_new(
-            schema.arrow_schema().clone(),
-            vec![
-                metric_name,
-                metric_type,
-                metric_unit,
-                timestamp_secs,
-                start_timestamp_secs,
-                value,
-                tag_service,
-                tag_env,
-                tag_datacenter,
-                tag_region,
-                tag_host,
-                attributes,
-                service_name,
-                resource_attributes,
-            ],
+            schema,
+            vec![metric_name, metric_type, timestamp_secs, value, service],
         )
         .unwrap();
 
diff --git a/quickwit/quickwit-indexing/src/actors/parquet_e2e_test.rs b/quickwit/quickwit-indexing/src/actors/parquet_e2e_test.rs
index 0823ce4ad04..d613fc96003 100644
--- a/quickwit/quickwit-indexing/src/actors/parquet_e2e_test.rs
+++ b/quickwit/quickwit-indexing/src/actors/parquet_e2e_test.rs
@@ -24,15 +24,13 @@ use std::time::Duration;
 use arrow::array::{
     ArrayRef, DictionaryArray, Float64Array, Int32Array, StringArray, UInt8Array, UInt64Array,
 };
-use arrow::datatypes::Int32Type;
+use arrow::datatypes::{DataType, Field, Int32Type, Schema as ArrowSchema};
 use arrow::record_batch::RecordBatch;
 use bytes::Bytes;
-use parquet::variant::{VariantArrayBuilder, VariantBuilderExt};
 use quickwit_actors::{ActorHandle, Universe};
 use quickwit_common::test_utils::wait_until_predicate;
 use quickwit_metastore::checkpoint::SourceCheckpointDelta;
 use quickwit_parquet_engine::ingest::record_batch_to_ipc;
-use quickwit_parquet_engine::schema::ParquetSchema;
 use quickwit_parquet_engine::storage::{ParquetSplitWriter, ParquetWriterConfig};
 use quickwit_proto::metastore::{EmptyResponse, MockMetastoreService};
 use quickwit_proto::types::IndexUid;
@@ -65,48 +63,6 @@ async fn wait_for_published_splits(
     .map_err(|_| anyhow::anyhow!("Timeout waiting for {} published splits", expected_splits))
 }
 
-fn create_dict_array(values: &[&str]) -> ArrayRef {
-    let keys: Vec<i32> = (0..values.len()).map(|i| i as i32).collect();
-    let string_array = StringArray::from(values.to_vec());
-    Arc::new(
-        DictionaryArray::<Int32Type>::try_new(Int32Array::from(keys), Arc::new(string_array))
-            .unwrap(),
-    )
-}
-
-fn create_nullable_dict_array(values: &[Option<&str>]) -> ArrayRef {
-    let keys: Vec<Option<i32>> = values
-        .iter()
-        .enumerate()
-        .map(|(i, v)| v.map(|_| i as i32))
-        .collect();
-    let string_values: Vec<&str> = values.iter().filter_map(|v| *v).collect();
-    let string_array = StringArray::from(string_values);
-    Arc::new(
-        DictionaryArray::<Int32Type>::try_new(Int32Array::from(keys), Arc::new(string_array))
-            .unwrap(),
-    )
-}
-
-fn create_variant_array(num_rows: usize, fields: Option<&[(&str, &str)]>) -> ArrayRef {
-    let mut builder = VariantArrayBuilder::new(num_rows);
-    for _ in 0..num_rows {
-        match fields {
-            Some(kv_pairs) => {
-                let mut obj = builder.new_object();
-                for &(key, value) in kv_pairs {
-                    obj = obj.with_field(key, value);
-                }
-                obj.finish();
-            }
-            None => {
-                builder.append_null();
-            }
-        }
-    }
-    ArrayRef::from(builder.build())
-}
-
 fn create_test_batch(
     num_rows: usize,
     metric_name: &str,
@@ -114,44 +70,46 @@ fn create_test_batch(
     base_timestamp: u64,
     base_value: f64,
 ) -> RecordBatch {
-    let schema = ParquetSchema::new();
-
-    let metric_names: Vec<&str> = vec![metric_name; num_rows];
-    let metric_name_arr: ArrayRef = create_dict_array(&metric_names);
+    let schema = Arc::new(ArrowSchema::new(vec![
+        Field::new(
+            "metric_name",
+            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+            false,
+        ),
+        Field::new("metric_type", DataType::UInt8, false),
+        Field::new("timestamp_secs", DataType::UInt64, false),
+        Field::new("value", DataType::Float64, false),
+        Field::new(
+            "service",
+            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+            true,
+        ),
+    ]));
+
+    let metric_name_arr: ArrayRef = {
+        let keys = Int32Array::from(vec![0i32; num_rows]);
+        let vals = StringArray::from(vec![metric_name]);
+        Arc::new(DictionaryArray::<Int32Type>::try_new(keys, Arc::new(vals)).unwrap())
+    };
     let metric_type: ArrayRef = Arc::new(UInt8Array::from(vec![0u8; num_rows]));
-    let metric_unit: ArrayRef = Arc::new(StringArray::from(vec![Some("count"); num_rows]));
     let timestamps: Vec<u64> = (0..num_rows).map(|i| base_timestamp + i as u64).collect();
     let timestamp_secs: ArrayRef = Arc::new(UInt64Array::from(timestamps));
-    let start_timestamp_secs: ArrayRef = Arc::new(UInt64Array::from(vec![None::<u64>; num_rows]));
     let values: Vec<f64> = (0..num_rows).map(|i| base_value + i as f64).collect();
     let value: ArrayRef = Arc::new(Float64Array::from(values));
-    let tag_service: ArrayRef = create_nullable_dict_array(&vec![Some(service); num_rows]);
-    let tag_env: ArrayRef = create_nullable_dict_array(&vec![Some("prod"); num_rows]);
-    let tag_datacenter: ArrayRef = create_nullable_dict_array(&vec![Some("us-east-1"); num_rows]);
-    let tag_region: ArrayRef = create_nullable_dict_array(&vec![None; num_rows]);
-    let tag_host: ArrayRef = create_nullable_dict_array(&vec![Some("host-001"); num_rows]);
-    let attributes: ArrayRef = create_variant_array(num_rows, None);
-    let service_names: Vec<&str> = vec![service; num_rows];
-    let service_name: ArrayRef = create_dict_array(&service_names);
-    let resource_attributes: ArrayRef = create_variant_array(num_rows, None);
+    let service_arr: ArrayRef = {
+        let keys = Int32Array::from(vec![0i32; num_rows]);
+        let vals = StringArray::from(vec![service]);
+        Arc::new(DictionaryArray::<Int32Type>::try_new(keys, Arc::new(vals)).unwrap())
+    };
 
     RecordBatch::try_new(
-        schema.arrow_schema().clone(),
+        schema,
         vec![
             metric_name_arr,
             metric_type,
-            metric_unit,
             timestamp_secs,
-            start_timestamp_secs,
             value,
-            tag_service,
-            tag_env,
-            tag_datacenter,
-            tag_region,
-            tag_host,
-            attributes,
-            service_name,
-            resource_attributes,
+            service_arr,
         ],
     )
     .unwrap()
@@ -213,9 +171,8 @@ async fn test_metrics_pipeline_e2e() {
     let (uploader_mailbox, _uploader_handle) = universe.spawn_builder().spawn(uploader);
 
     // ParquetPackager between indexer and uploader
-    let parquet_schema = ParquetSchema::new();
     let writer_config = ParquetWriterConfig::default();
-    let split_writer = ParquetSplitWriter::new(parquet_schema, writer_config, temp_dir.path());
+    let split_writer = ParquetSplitWriter::new(writer_config, temp_dir.path());
     let packager = ParquetPackager::new(split_writer, uploader_mailbox);
     let (packager_mailbox, packager_handle) = universe.spawn_builder().spawn(packager);
 
diff --git a/quickwit/quickwit-indexing/src/actors/parquet_indexer.rs b/quickwit/quickwit-indexing/src/actors/parquet_indexer.rs
index 2738a42163f..a8dfe450f37 100644
--- a/quickwit/quickwit-indexing/src/actors/parquet_indexer.rs
+++ b/quickwit/quickwit-indexing/src/actors/parquet_indexer.rs
@@ -530,15 +530,8 @@ mod tests {
     use std::sync::atomic::Ordering;
     use std::time::Duration;
 
-    use arrow::array::{
-        ArrayRef, DictionaryArray, Float64Array, Int32Array, StringArray, UInt8Array, UInt64Array,
-    };
-    use arrow::datatypes::Int32Type;
-    use arrow::record_batch::RecordBatch;
-    use parquet::variant::{VariantArrayBuilder, VariantBuilderExt};
     use quickwit_actors::{ActorHandle, Universe};
     use quickwit_common::test_utils::wait_until_predicate;
-    use quickwit_parquet_engine::schema::ParquetSchema;
     use quickwit_parquet_engine::storage::{ParquetSplitWriter, ParquetWriterConfig};
     use quickwit_proto::metastore::{EmptyResponse, MockMetastoreService};
     use quickwit_storage::RamStorage;
@@ -546,6 +539,7 @@ mod tests {
     use super::*;
     use crate::actors::{
         ParquetPackager, ParquetPublisher, ParquetUploader, SplitsUpdateMailbox, UploaderType,
+        parquet_test_helpers::create_test_batch,
     };
 
     /// Create a test ParquetUploader and return its mailbox.
@@ -598,9 +592,8 @@ mod tests {
         temp_dir: &std::path::Path,
         uploader_mailbox: Mailbox<ParquetUploader>,
     ) -> (Mailbox<ParquetPackager>, ActorHandle<ParquetPackager>) {
-        let schema = ParquetSchema::new();
         let writer_config = ParquetWriterConfig::default();
-        let split_writer = ParquetSplitWriter::new(schema, writer_config, temp_dir);
+        let split_writer = ParquetSplitWriter::new(writer_config, temp_dir);
 
         let packager = ParquetPackager::new(split_writer, uploader_mailbox);
         universe.spawn_builder().spawn(packager)
@@ -624,96 +617,6 @@ mod tests {
         .map_err(|_| anyhow::anyhow!("Timeout waiting for {} staged splits", expected_splits))
     }
 
-    /// Create dictionary array for string fields with Int32 keys.
-    fn create_dict_array(values: &[&str]) -> ArrayRef {
-        let keys: Vec<i32> = (0..values.len()).map(|i| i as i32).collect();
-        let string_array = StringArray::from(values.to_vec());
-        Arc::new(
-            DictionaryArray::<Int32Type>::try_new(Int32Array::from(keys), Arc::new(string_array))
-                .unwrap(),
-        )
-    }
-
-    /// Create nullable dictionary array for optional string fields.
-    fn create_nullable_dict_array(values: &[Option<&str>]) -> ArrayRef {
-        let keys: Vec<Option<i32>> = values
-            .iter()
-            .enumerate()
-            .map(|(i, v)| v.map(|_| i as i32))
-            .collect();
-        let string_values: Vec<&str> = values.iter().filter_map(|v| *v).collect();
-        let string_array = StringArray::from(string_values);
-        Arc::new(
-            DictionaryArray::<Int32Type>::try_new(Int32Array::from(keys), Arc::new(string_array))
-                .unwrap(),
-        )
-    }
-
-    /// Create a VARIANT array for testing with specified number of rows.
-    fn create_variant_array(num_rows: usize, fields: Option<&[(&str, &str)]>) -> ArrayRef {
-        let mut builder = VariantArrayBuilder::new(num_rows);
-        for _ in 0..num_rows {
-            match fields {
-                Some(kv_pairs) => {
-                    let mut obj = builder.new_object();
-                    for &(key, value) in kv_pairs {
-                        obj = obj.with_field(key, value);
-                    }
-                    obj.finish();
-                }
-                None => {
-                    builder.append_null();
-                }
-            }
-        }
-        ArrayRef::from(builder.build())
-    }
-
-    /// Create a test batch matching the metrics schema.
-    fn create_test_batch(num_rows: usize) -> RecordBatch {
-        let schema = ParquetSchema::new();
-
-        let metric_names: Vec<&str> = vec!["cpu.usage"; num_rows];
-        let metric_name: ArrayRef = create_dict_array(&metric_names);
-        let metric_type: ArrayRef = Arc::new(UInt8Array::from(vec![0u8; num_rows]));
-        let metric_unit: ArrayRef = Arc::new(StringArray::from(vec![Some("bytes"); num_rows]));
-        let timestamps: Vec<u64> = (0..num_rows).map(|i| 100 + i as u64).collect();
-        let timestamp_secs: ArrayRef = Arc::new(UInt64Array::from(timestamps));
-        let start_timestamp_secs: ArrayRef =
-            Arc::new(UInt64Array::from(vec![None::<u64>; num_rows]));
-        let values: Vec<f64> = (0..num_rows).map(|i| 42.0 + i as f64).collect();
-        let value: ArrayRef = Arc::new(Float64Array::from(values));
-        let tag_service: ArrayRef = create_nullable_dict_array(&vec![Some("web"); num_rows]);
-        let tag_env: ArrayRef = create_nullable_dict_array(&vec![Some("prod"); num_rows]);
-        let tag_datacenter: ArrayRef =
-            create_nullable_dict_array(&vec![Some("us-east-1"); num_rows]);
-        let tag_region: ArrayRef = create_nullable_dict_array(&vec![None; num_rows]);
-        let tag_host: ArrayRef = create_nullable_dict_array(&vec![Some("host-001"); num_rows]);
-        let attributes: ArrayRef = create_variant_array(num_rows, None);
-        let service_name: ArrayRef = create_dict_array(&vec!["my-service"; num_rows]);
-        let resource_attributes: ArrayRef = create_variant_array(num_rows, None);
-
-        RecordBatch::try_new(
-            schema.arrow_schema().clone(),
-            vec![
-                metric_name,
-                metric_type,
-                metric_unit,
-                timestamp_secs,
-                start_timestamp_secs,
-                value,
-                tag_service,
-                tag_env,
-                tag_datacenter,
-                tag_region,
-                tag_host,
-                attributes,
-                service_name,
-                resource_attributes,
-            ],
-        )
-        .unwrap()
-    }
 
     #[tokio::test]
     async fn test_metrics_indexer_receives_batch() {
diff --git a/quickwit/quickwit-indexing/src/actors/parquet_packager.rs b/quickwit/quickwit-indexing/src/actors/parquet_packager.rs
index a23b88789e4..d0d39d05606 100644
--- a/quickwit/quickwit-indexing/src/actors/parquet_packager.rs
+++ b/quickwit/quickwit-indexing/src/actors/parquet_packager.rs
@@ -235,109 +235,18 @@ mod tests {
     use std::sync::atomic::Ordering as AtomicOrdering;
     use std::time::Duration;
 
-    use arrow::array::{
-        ArrayRef, DictionaryArray, Float64Array, Int32Array, StringArray, UInt8Array, UInt64Array,
-    };
-    use arrow::datatypes::Int32Type;
-    use arrow::record_batch::RecordBatch;
-    use parquet::variant::{VariantArrayBuilder, VariantBuilderExt};
     use quickwit_actors::{ActorHandle, Universe};
     use quickwit_common::test_utils::wait_until_predicate;
     use quickwit_metastore::checkpoint::{IndexCheckpointDelta, SourceCheckpointDelta};
-    use quickwit_parquet_engine::schema::ParquetSchema;
     use quickwit_parquet_engine::storage::ParquetWriterConfig;
     use quickwit_proto::metastore::{EmptyResponse, MockMetastoreService};
     use quickwit_storage::RamStorage;
 
     use super::*;
-    use crate::actors::{ParquetPublisher, SplitsUpdateMailbox, UploaderType};
-
-    fn create_dict_array(values: &[&str]) -> ArrayRef {
-        let keys: Vec<i32> = (0..values.len()).map(|i| i as i32).collect();
-        let string_array = StringArray::from(values.to_vec());
-        Arc::new(
-            DictionaryArray::<Int32Type>::try_new(Int32Array::from(keys), Arc::new(string_array))
-                .unwrap(),
-        )
-    }
-
-    fn create_nullable_dict_array(values: &[Option<&str>]) -> ArrayRef {
-        let keys: Vec<Option<i32>> = values
-            .iter()
-            .enumerate()
-            .map(|(i, v)| v.map(|_| i as i32))
-            .collect();
-        let string_values: Vec<&str> = values.iter().filter_map(|v| *v).collect();
-        let string_array = StringArray::from(string_values);
-        Arc::new(
-            DictionaryArray::<Int32Type>::try_new(Int32Array::from(keys), Arc::new(string_array))
-                .unwrap(),
-        )
-    }
-
-    fn create_variant_array(num_rows: usize, fields: Option<&[(&str, &str)]>) -> ArrayRef {
-        let mut builder = VariantArrayBuilder::new(num_rows);
-        for _ in 0..num_rows {
-            match fields {
-                Some(kv_pairs) => {
-                    let mut obj = builder.new_object();
-                    for &(key, value) in kv_pairs {
-                        obj = obj.with_field(key, value);
-                    }
-                    obj.finish();
-                }
-                None => {
-                    builder.append_null();
-                }
-            }
-        }
-        ArrayRef::from(builder.build())
-    }
-
-    fn create_test_batch(num_rows: usize) -> RecordBatch {
-        let schema = ParquetSchema::new();
-
-        let metric_names: Vec<&str> = vec!["cpu.usage"; num_rows];
-        let metric_name: ArrayRef = create_dict_array(&metric_names);
-        let metric_type: ArrayRef = Arc::new(UInt8Array::from(vec![0u8; num_rows]));
-        let metric_unit: ArrayRef = Arc::new(StringArray::from(vec![Some("bytes"); num_rows]));
-        let timestamps: Vec<u64> = (0..num_rows).map(|i| 100 + i as u64).collect();
-        let timestamp_secs: ArrayRef = Arc::new(UInt64Array::from(timestamps));
-        let start_timestamp_secs: ArrayRef =
-            Arc::new(UInt64Array::from(vec![None::<u64>; num_rows]));
-        let values: Vec<f64> = (0..num_rows).map(|i| 42.0 + i as f64).collect();
-        let value: ArrayRef = Arc::new(Float64Array::from(values));
-        let tag_service: ArrayRef = create_nullable_dict_array(&vec![Some("web"); num_rows]);
-        let tag_env: ArrayRef = create_nullable_dict_array(&vec![Some("prod"); num_rows]);
-        let tag_datacenter: ArrayRef =
-            create_nullable_dict_array(&vec![Some("us-east-1"); num_rows]);
-        let tag_region: ArrayRef = create_nullable_dict_array(&vec![None; num_rows]);
-        let tag_host: ArrayRef = create_nullable_dict_array(&vec![Some("host-001"); num_rows]);
-        let attributes: ArrayRef = create_variant_array(num_rows, None);
-        let service_name: ArrayRef = create_dict_array(&vec!["my-service"; num_rows]);
-        let resource_attributes: ArrayRef = create_variant_array(num_rows, None);
-
-        RecordBatch::try_new(
-            schema.arrow_schema().clone(),
-            vec![
-                metric_name,
-                metric_type,
-                metric_unit,
-                timestamp_secs,
-                start_timestamp_secs,
-                value,
-                tag_service,
-                tag_env,
-                tag_datacenter,
-                tag_region,
-                tag_host,
-                attributes,
-                service_name,
-                resource_attributes,
-            ],
-        )
-        .unwrap()
-    }
+    use crate::actors::{
+        ParquetPublisher, SplitsUpdateMailbox, UploaderType,
+        parquet_test_helpers::create_test_batch,
+    };
 
     fn create_test_uploader(
         universe: &Universe,
@@ -366,9 +275,8 @@ mod tests {
         temp_dir: &std::path::Path,
         uploader_mailbox: Mailbox<ParquetUploader>,
     ) -> (Mailbox<ParquetPackager>, ActorHandle<ParquetPackager>) {
-        let schema = ParquetSchema::new();
         let writer_config = ParquetWriterConfig::default();
-        let split_writer = ParquetSplitWriter::new(schema, writer_config, temp_dir);
+        let split_writer = ParquetSplitWriter::new(writer_config, temp_dir);
 
         let packager = ParquetPackager::new(split_writer, uploader_mailbox);
         universe.spawn_builder().spawn(packager)
diff --git a/quickwit/quickwit-indexing/src/models/processed_parquet_batch.rs b/quickwit/quickwit-indexing/src/models/processed_parquet_batch.rs
index 0db83abcc02..e09b5d5d9a9 100644
--- a/quickwit/quickwit-indexing/src/models/processed_parquet_batch.rs
+++ b/quickwit/quickwit-indexing/src/models/processed_parquet_batch.rs
@@ -96,112 +96,10 @@ impl fmt::Debug for ProcessedParquetBatch {
 
 #[cfg(test)]
 mod tests {
-    use std::sync::Arc;
-
-    use arrow::array::{
-        ArrayRef, BinaryViewArray, DictionaryArray, Float64Array, Int32Array, StringArray,
-        StructArray, UInt8Array, UInt64Array,
-    };
-    use arrow::datatypes::{DataType, Field, Int32Type};
-    use quickwit_parquet_engine::schema::ParquetSchema;
+    use crate::actors::parquet_test_helpers::create_test_batch;
 
     use super::*;
 
-    /// Create dictionary array for string fields with Int32 keys.
-    fn create_dict_array(values: &[&str]) -> ArrayRef {
-        let keys: Vec<i32> = (0..values.len()).map(|i| i as i32).collect();
-        let string_array = StringArray::from(values.to_vec());
-        Arc::new(
-            DictionaryArray::<Int32Type>::try_new(Int32Array::from(keys), Arc::new(string_array))
-                .unwrap(),
-        )
-    }
-
-    /// Create nullable dictionary array for optional string fields.
-    fn create_nullable_dict_array(values: &[Option<&str>]) -> ArrayRef {
-        let keys: Vec<Option<i32>> = values
-            .iter()
-            .enumerate()
-            .map(|(i, v)| v.map(|_| i as i32))
-            .collect();
-        let string_values: Vec<&str> = values.iter().filter_map(|v| *v).collect();
-        let string_array = StringArray::from(string_values);
-        Arc::new(
-            DictionaryArray::<Int32Type>::try_new(Int32Array::from(keys), Arc::new(string_array))
-                .unwrap(),
-        )
-    }
-
-    /// Create a test batch matching the metrics schema.
-    fn create_test_batch(num_rows: usize) -> RecordBatch {
-        let schema = ParquetSchema::new();
-
-        let metric_names: Vec<&str> = vec!["cpu.usage"; num_rows];
-        let metric_name: ArrayRef = create_dict_array(&metric_names);
-        let metric_type: ArrayRef = Arc::new(UInt8Array::from(vec![0u8; num_rows]));
-        let metric_unit: ArrayRef = Arc::new(StringArray::from(vec![Some("bytes"); num_rows]));
-        let timestamps: Vec<u64> = (0..num_rows).map(|i| 100 + i as u64).collect();
-        let timestamp_secs: ArrayRef = Arc::new(UInt64Array::from(timestamps));
-        let start_timestamp_secs: ArrayRef =
-            Arc::new(UInt64Array::from(vec![None::<u64>; num_rows]));
-        let values: Vec<f64> = (0..num_rows).map(|i| 42.0 + i as f64).collect();
-        let value: ArrayRef = Arc::new(Float64Array::from(values));
-        let tag_service: ArrayRef = create_nullable_dict_array(&vec![Some("web"); num_rows]);
-        let tag_env: ArrayRef = create_nullable_dict_array(&vec![Some("prod"); num_rows]);
-        let tag_datacenter: ArrayRef =
-            create_nullable_dict_array(&vec![Some("us-east-1"); num_rows]);
-        let tag_region: ArrayRef = create_nullable_dict_array(&vec![None; num_rows]);
-        let tag_host: ArrayRef = create_nullable_dict_array(&vec![Some("host-001"); num_rows]);
-
-        // Create empty Variant (Struct with metadata and value BinaryView fields)
-        let metadata_array = Arc::new(BinaryViewArray::from(vec![b"" as &[u8]; num_rows]));
-        let value_array = Arc::new(BinaryViewArray::from(vec![b"" as &[u8]; num_rows]));
-        let attributes: ArrayRef = Arc::new(StructArray::from(vec![
-            (
-                Arc::new(Field::new("metadata", DataType::BinaryView, false)),
-                metadata_array.clone() as ArrayRef,
-            ),
-            (
-                Arc::new(Field::new("value", DataType::BinaryView, false)),
-                value_array.clone() as ArrayRef,
-            ),
-        ]));
-
-        let service_name: ArrayRef = create_dict_array(&vec!["my-service"; num_rows]);
-
-        let resource_attributes: ArrayRef = Arc::new(StructArray::from(vec![
-            (
-                Arc::new(Field::new("metadata", DataType::BinaryView, false)),
-                metadata_array as ArrayRef,
-            ),
-            (
-                Arc::new(Field::new("value", DataType::BinaryView, false)),
-                value_array as ArrayRef,
-            ),
-        ]));
-
-        RecordBatch::try_new(
-            schema.arrow_schema().clone(),
-            vec![
-                metric_name,
-                metric_type,
-                metric_unit,
-                timestamp_secs,
-                start_timestamp_secs,
-                value,
-                tag_service,
-                tag_env,
-                tag_datacenter,
-                tag_region,
-                tag_host,
-                attributes,
-                service_name,
-                resource_attributes,
-            ],
-        )
-        .unwrap()
-    }
-
     #[test]
     fn test_processed_parquet_batch_new() {
         let batch = create_test_batch(10);

From 33c4070f2020320bcb6f5423a9a0094c1e3a700e Mon Sep 17 00:00:00 2001
From: Matthew Kim <matthew.kim@datadoghq.com>
Date: Wed, 18 Mar 2026 11:02:50 -0400
Subject: [PATCH 07/35] refactor: deduplicate test batch helpers

---
 quickwit/quickwit-indexing/Cargo.toml         |  1 +
 .../src/actors/parquet_doc_processor.rs       | 98 +------------------
 .../src/actors/parquet_indexer.rs             |  3 +-
 .../src/actors/parquet_packager.rs            |  7 +-
 .../src/models/processed_parquet_batch.rs     |  2 +-
 quickwit/quickwit-opentelemetry/Cargo.toml    |  1 -
 quickwit/quickwit-parquet-engine/src/lib.rs   |  2 +-
 .../src/test_helpers.rs                       | 96 ++++++++++++++++++
 8 files changed, 108 insertions(+), 102 deletions(-)
 create mode 100644 quickwit/quickwit-parquet-engine/src/test_helpers.rs

diff --git a/quickwit/quickwit-indexing/Cargo.toml b/quickwit/quickwit-indexing/Cargo.toml
index 677b95e32aa..5d3a6504d29 100644
--- a/quickwit/quickwit-indexing/Cargo.toml
+++ b/quickwit/quickwit-indexing/Cargo.toml
@@ -123,6 +123,7 @@ sqlx = { workspace = true, features = ["runtime-tokio", "postgres"] }
 tempfile = { workspace = true }
 
 quickwit-actors = { workspace = true, features = ["testsuite"] }
+quickwit-parquet-engine = { workspace = true, features = ["testsuite"] }
 quickwit-cluster = { workspace = true, features = ["testsuite"] }
 quickwit-common = { workspace = true, features = ["testsuite"] }
 quickwit-config = { workspace = true, features = ["testsuite"] }
diff --git a/quickwit/quickwit-indexing/src/actors/parquet_doc_processor.rs b/quickwit/quickwit-indexing/src/actors/parquet_doc_processor.rs
index aa26f67fc94..11265b52c9f 100644
--- a/quickwit/quickwit-indexing/src/actors/parquet_doc_processor.rs
+++ b/quickwit/quickwit-indexing/src/actors/parquet_doc_processor.rs
@@ -401,14 +401,8 @@ mod tests {
 
     #[tokio::test]
     async fn test_metrics_doc_processor_valid_arrow_ipc() {
-        use std::sync::Arc as StdArc;
+        use quickwit_parquet_engine::test_helpers::create_test_batch_with_tags;
 
-        use arrow::array::{
-            ArrayRef, DictionaryArray, Float64Array, Int32Array, StringArray, UInt8Array,
-            UInt64Array,
-        };
-        use arrow::datatypes::{DataType, Field, Int32Type, Schema as ArrowSchema};
-        use arrow::record_batch::RecordBatch;
         let universe = Universe::with_accelerated_time();
 
         let (indexer_mailbox, _indexer_inbox) = universe.create_test_mailbox::<ParquetIndexer>();
@@ -421,46 +415,7 @@ mod tests {
         let (metrics_doc_processor_mailbox, metrics_doc_processor_handle) =
             universe.spawn_builder().spawn(metrics_doc_processor);
 
-        // Create a test batch with the 4 required fields plus a tag column
-        let num_rows = 3;
-        let schema = StdArc::new(ArrowSchema::new(vec![
-            Field::new(
-                "metric_name",
-                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
-                false,
-            ),
-            Field::new("metric_type", DataType::UInt8, false),
-            Field::new("timestamp_secs", DataType::UInt64, false),
-            Field::new("value", DataType::Float64, false),
-            Field::new(
-                "service",
-                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
-                true,
-            ),
-        ]));
-
-        let metric_name: ArrayRef = {
-            let keys = Int32Array::from(vec![0i32; num_rows]);
-            let vals = StringArray::from(vec!["cpu.usage"]);
-            StdArc::new(DictionaryArray::<Int32Type>::try_new(keys, StdArc::new(vals)).unwrap())
-        };
-        let metric_type: ArrayRef = StdArc::new(UInt8Array::from(vec![0u8; num_rows]));
-        let timestamp_secs: ArrayRef =
-            StdArc::new(UInt64Array::from(vec![100u64, 101u64, 102u64]));
-        let value: ArrayRef = StdArc::new(Float64Array::from(vec![42.0, 43.0, 44.0]));
-        let service: ArrayRef = {
-            let keys = Int32Array::from(vec![0i32; num_rows]);
-            let vals = StringArray::from(vec!["web"]);
-            StdArc::new(DictionaryArray::<Int32Type>::try_new(keys, StdArc::new(vals)).unwrap())
-        };
-
-        let batch = RecordBatch::try_new(
-            schema,
-            vec![metric_name, metric_type, timestamp_secs, value, service],
-        )
-        .unwrap();
-
-        // Serialize to Arrow IPC
+        let batch = create_test_batch_with_tags(3, &["service"]);
         let ipc_bytes = record_batch_to_ipc(&batch).unwrap();
 
         // Create RawDocBatch with the IPC bytes
@@ -569,13 +524,8 @@ mod tests {
     async fn test_metrics_doc_processor_with_indexer() {
         use std::sync::Arc as StdArc;
 
-        use arrow::array::{
-            ArrayRef, DictionaryArray, Float64Array, Int32Array, StringArray, UInt8Array,
-            UInt64Array,
-        };
-        use arrow::datatypes::{DataType, Field, Int32Type, Schema as ArrowSchema};
-        use arrow::record_batch::RecordBatch;
         use quickwit_parquet_engine::storage::{ParquetSplitWriter, ParquetWriterConfig};
+        use quickwit_parquet_engine::test_helpers::create_test_batch_with_tags;
         use quickwit_proto::metastore::MockMetastoreService;
         use quickwit_storage::RamStorage;
 
@@ -625,47 +575,7 @@ mod tests {
         let (metrics_doc_processor_mailbox, metrics_doc_processor_handle) =
             universe.spawn_builder().spawn(metrics_doc_processor);
 
-        // Create a test batch with the 4 required fields plus a tag column
-        let num_rows = 5;
-        let schema = StdArc::new(ArrowSchema::new(vec![
-            Field::new(
-                "metric_name",
-                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
-                false,
-            ),
-            Field::new("metric_type", DataType::UInt8, false),
-            Field::new("timestamp_secs", DataType::UInt64, false),
-            Field::new("value", DataType::Float64, false),
-            Field::new(
-                "service",
-                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
-                true,
-            ),
-        ]));
-
-        let metric_name: ArrayRef = {
-            let keys = Int32Array::from(vec![0i32; num_rows]);
-            let vals = StringArray::from(vec!["cpu.usage"]);
-            StdArc::new(DictionaryArray::<Int32Type>::try_new(keys, StdArc::new(vals)).unwrap())
-        };
-        let metric_type: ArrayRef = StdArc::new(UInt8Array::from(vec![0u8; num_rows]));
-        let timestamps: Vec<u64> = (0..num_rows).map(|i| 100 + i as u64).collect();
-        let timestamp_secs: ArrayRef = StdArc::new(UInt64Array::from(timestamps));
-        let values: Vec<f64> = (0..num_rows).map(|i| 42.0 + i as f64).collect();
-        let value: ArrayRef = StdArc::new(Float64Array::from(values));
-        let service: ArrayRef = {
-            let keys = Int32Array::from(vec![0i32; num_rows]);
-            let vals = StringArray::from(vec!["web"]);
-            StdArc::new(DictionaryArray::<Int32Type>::try_new(keys, StdArc::new(vals)).unwrap())
-        };
-
-        let batch = RecordBatch::try_new(
-            schema,
-            vec![metric_name, metric_type, timestamp_secs, value, service],
-        )
-        .unwrap();
-
-        // Serialize to Arrow IPC
+        let batch = create_test_batch_with_tags(5, &["service"]);
         let ipc_bytes = record_batch_to_ipc(&batch).unwrap();
 
         // Create RawDocBatch with force_commit to trigger split production
diff --git a/quickwit/quickwit-indexing/src/actors/parquet_indexer.rs b/quickwit/quickwit-indexing/src/actors/parquet_indexer.rs
index a8dfe450f37..89d3d7c8e21 100644
--- a/quickwit/quickwit-indexing/src/actors/parquet_indexer.rs
+++ b/quickwit/quickwit-indexing/src/actors/parquet_indexer.rs
@@ -537,9 +537,10 @@ mod tests {
     use quickwit_storage::RamStorage;
 
     use super::*;
+    use quickwit_parquet_engine::test_helpers::create_test_batch;
+
     use crate::actors::{
         ParquetPackager, ParquetPublisher, ParquetUploader, SplitsUpdateMailbox, UploaderType,
-        parquet_test_helpers::create_test_batch,
     };
 
     /// Create a test ParquetUploader and return its mailbox.
diff --git a/quickwit/quickwit-indexing/src/actors/parquet_packager.rs b/quickwit/quickwit-indexing/src/actors/parquet_packager.rs
index d0d39d05606..f0e5fe8dc76 100644
--- a/quickwit/quickwit-indexing/src/actors/parquet_packager.rs
+++ b/quickwit/quickwit-indexing/src/actors/parquet_packager.rs
@@ -243,10 +243,9 @@ mod tests {
     use quickwit_storage::RamStorage;
 
     use super::*;
-    use crate::actors::{
-        ParquetPublisher, SplitsUpdateMailbox, UploaderType,
-        parquet_test_helpers::create_test_batch,
-    };
+    use quickwit_parquet_engine::test_helpers::create_test_batch;
+
+    use crate::actors::{ParquetPublisher, SplitsUpdateMailbox, UploaderType};
 
     fn create_test_uploader(
         universe: &Universe,
diff --git a/quickwit/quickwit-indexing/src/models/processed_parquet_batch.rs b/quickwit/quickwit-indexing/src/models/processed_parquet_batch.rs
index e09b5d5d9a9..c70afe35976 100644
--- a/quickwit/quickwit-indexing/src/models/processed_parquet_batch.rs
+++ b/quickwit/quickwit-indexing/src/models/processed_parquet_batch.rs
@@ -96,7 +96,7 @@ impl fmt::Debug for ProcessedParquetBatch {
 
 #[cfg(test)]
 mod tests {
-    use crate::actors::parquet_test_helpers::create_test_batch;
+    use quickwit_parquet_engine::test_helpers::create_test_batch;
 
     use super::*;
 
diff --git a/quickwit/quickwit-opentelemetry/Cargo.toml b/quickwit/quickwit-opentelemetry/Cargo.toml
index d6e897990a2..a7432ac5403 100644
--- a/quickwit/quickwit-opentelemetry/Cargo.toml
+++ b/quickwit/quickwit-opentelemetry/Cargo.toml
@@ -15,7 +15,6 @@ anyhow = { workspace = true }
 arrow = { workspace = true }
 async-trait = { workspace = true }
 once_cell = { workspace = true }
-parquet = { workspace = true }
 prost = { workspace = true }
 serde = { workspace = true }
 serde_json = { workspace = true }
diff --git a/quickwit/quickwit-parquet-engine/src/lib.rs b/quickwit/quickwit-parquet-engine/src/lib.rs
index c6d8640e801..309ebf4f442 100644
--- a/quickwit/quickwit-parquet-engine/src/lib.rs
+++ b/quickwit/quickwit-parquet-engine/src/lib.rs
@@ -27,5 +27,5 @@ pub mod schema;
 pub mod split;
 pub mod storage;
 
-#[cfg(test)]
+#[cfg(any(test, feature = "testsuite"))]
 pub mod test_helpers;
diff --git a/quickwit/quickwit-parquet-engine/src/test_helpers.rs b/quickwit/quickwit-parquet-engine/src/test_helpers.rs
new file mode 100644
index 00000000000..dd6892dbe23
--- /dev/null
+++ b/quickwit/quickwit-parquet-engine/src/test_helpers.rs
@@ -0,0 +1,96 @@
+// Copyright 2021-Present Datadog, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Shared test helpers for building Arrow RecordBatches in unit tests.
+
+use std::sync::Arc;
+
+use arrow::array::{
+    ArrayRef, DictionaryArray, Float64Array, Int32Array, StringArray, UInt8Array, UInt64Array,
+};
+use arrow::datatypes::{DataType, Field, Int32Type, Schema as ArrowSchema};
+use arrow::record_batch::RecordBatch;
+
+/// Creates a dictionary-encoded string array with compact 0-based keys.
+pub fn create_dict_array(values: &[&str]) -> ArrayRef {
+    let keys: Vec<i32> = (0..values.len()).map(|i| i as i32).collect();
+    let string_array = StringArray::from(values.to_vec());
+    Arc::new(
+        DictionaryArray::<Int32Type>::try_new(Int32Array::from(keys), Arc::new(string_array))
+            .unwrap(),
+    )
+}
+
+/// Creates a nullable dictionary-encoded string array.
+///
+/// Each `Some(value)` gets a key into the dictionary; `None` values produce
+/// a null key.
+pub fn create_nullable_dict_array(values: &[Option<&str>]) -> ArrayRef {
+    let keys: Vec<Option<i32>> = values
+        .iter()
+        .enumerate()
+        .map(|(i, v)| v.map(|_| i as i32))
+        .collect();
+    let string_values: Vec<&str> = values.iter().filter_map(|v| *v).collect();
+    let string_array = StringArray::from(string_values);
+    Arc::new(
+        DictionaryArray::<Int32Type>::try_new(Int32Array::from(keys), Arc::new(string_array))
+            .unwrap(),
+    )
+}
+
+/// Creates a RecordBatch with the 4 required fields plus the specified
+/// nullable dictionary-encoded tag columns.
+///
+/// - `metric_name`: all rows set to `"cpu.usage"`
+/// - `metric_type`: all rows `0` (Gauge)
+/// - `timestamp_secs`: sequential, starting at `100`
+/// - `value`: sequential `f64` starting at `42.0`
+/// - each tag column: all rows set to the column name as the value
+pub fn create_test_batch_with_tags(num_rows: usize, tags: &[&str]) -> RecordBatch {
+    let dict_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
+
+    let mut fields = vec![
+        Field::new("metric_name", dict_type.clone(), false),
+        Field::new("metric_type", DataType::UInt8, false),
+        Field::new("timestamp_secs", DataType::UInt64, false),
+        Field::new("value", DataType::Float64, false),
+    ];
+    for tag in tags {
+        fields.push(Field::new(*tag, dict_type.clone(), true));
+    }
+    let schema = Arc::new(ArrowSchema::new(fields));
+
+    let metric_names: Vec<&str> = vec!["cpu.usage"; num_rows];
+    let metric_name: ArrayRef = create_dict_array(&metric_names);
+    let metric_type: ArrayRef = Arc::new(UInt8Array::from(vec![0u8; num_rows]));
+    let timestamps: Vec<u64> = (0..num_rows).map(|i| 100 + i as u64).collect();
+    let timestamp_secs: ArrayRef = Arc::new(UInt64Array::from(timestamps));
+    let values: Vec<f64> = (0..num_rows).map(|i| 42.0 + i as f64).collect();
+    let value: ArrayRef = Arc::new(Float64Array::from(values));
+
+    let mut columns: Vec<ArrayRef> = vec![metric_name, metric_type, timestamp_secs, value];
+    for tag in tags {
+        let tag_values: Vec<Option<&str>> = vec![Some(tag); num_rows];
+        columns.push(create_nullable_dict_array(&tag_values));
+    }
+
+    RecordBatch::try_new(schema, columns).unwrap()
+}
+
+/// Creates a RecordBatch with the 4 required fields and default tags
+/// (`service`, `host`).
+pub fn create_test_batch(num_rows: usize) -> RecordBatch {
+    create_test_batch_with_tags(num_rows, &["service", "host"])
+}

From 7a5979fe95d390f94e523203ae83719358aba4e5 Mon Sep 17 00:00:00 2001
From: Matthew Kim <matthew.kim@datadoghq.com>
Date: Mon, 30 Mar 2026 14:12:46 -0400
Subject: [PATCH 08/35] lint

---
 quickwit/Cargo.lock                              |  1 -
 .../src/actors/parquet_doc_processor.rs          | 10 +++-------
 .../src/actors/parquet_indexer.rs                |  4 +---
 .../src/actors/parquet_packager.rs               |  3 +--
 .../src/otlp/arrow_metrics.rs                    |  5 ++---
 .../src/otlp/otel_metrics.rs                     | 13 +++----------
 .../src/index/accumulator.rs                     | 15 ++++++++++-----
 .../src/ingest/processor.rs                      |  8 +++++---
 .../quickwit-parquet-engine/src/schema/fields.rs |  6 ++++--
 .../src/storage/config.rs                        |  3 +--
 .../src/storage/split_writer.rs                  |  5 +++--
 .../src/storage/writer.rs                        | 16 ++++++++++++----
 12 files changed, 45 insertions(+), 44 deletions(-)

diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock
index 684303cb6a8..f686efc5e11 100644
--- a/quickwit/Cargo.lock
+++ b/quickwit/Cargo.lock
@@ -7591,7 +7591,6 @@ dependencies = [
  "arrow",
  "async-trait",
  "once_cell",
- "parquet",
  "prost 0.14.3",
  "quickwit-common",
  "quickwit-config",
diff --git a/quickwit/quickwit-indexing/src/actors/parquet_doc_processor.rs b/quickwit/quickwit-indexing/src/actors/parquet_doc_processor.rs
index 11265b52c9f..eb51621a30f 100644
--- a/quickwit/quickwit-indexing/src/actors/parquet_doc_processor.rs
+++ b/quickwit/quickwit-indexing/src/actors/parquet_doc_processor.rs
@@ -143,10 +143,7 @@ impl ParquetDocProcessor {
         indexer_mailbox: Mailbox<ParquetIndexer>,
     ) -> Self {
         let processor = ParquetIngestProcessor;
-        let counters = ParquetDocProcessorCounters::new(
-            index_id.clone(),
-            source_id.clone(),
-        );
+        let counters = ParquetDocProcessorCounters::new(index_id.clone(), source_id.clone());
 
         info!(
             index_id = %index_id,
@@ -306,9 +303,8 @@ impl Handler<RawDocBatch> for ParquetDocProcessor {
         // Without this, a batch of consistently malformed data blocks offset progress
         // forever.
         if !checkpoint_forwarded && !checkpoint_delta.is_empty() {
-            let empty_batch = RecordBatch::new_empty(std::sync::Arc::new(
-                arrow::datatypes::Schema::empty(),
-            ));
+            let empty_batch =
+                RecordBatch::new_empty(std::sync::Arc::new(arrow::datatypes::Schema::empty()));
             let processed_batch =
                 ProcessedParquetBatch::new(empty_batch, checkpoint_delta, force_commit);
             ctx.send_message(&self.indexer_mailbox, processed_batch)
diff --git a/quickwit/quickwit-indexing/src/actors/parquet_indexer.rs b/quickwit/quickwit-indexing/src/actors/parquet_indexer.rs
index 89d3d7c8e21..254ff2dc719 100644
--- a/quickwit/quickwit-indexing/src/actors/parquet_indexer.rs
+++ b/quickwit/quickwit-indexing/src/actors/parquet_indexer.rs
@@ -533,12 +533,11 @@ mod tests {
     use quickwit_actors::{ActorHandle, Universe};
     use quickwit_common::test_utils::wait_until_predicate;
     use quickwit_parquet_engine::storage::{ParquetSplitWriter, ParquetWriterConfig};
+    use quickwit_parquet_engine::test_helpers::create_test_batch;
     use quickwit_proto::metastore::{EmptyResponse, MockMetastoreService};
     use quickwit_storage::RamStorage;
 
     use super::*;
-    use quickwit_parquet_engine::test_helpers::create_test_batch;
-
     use crate::actors::{
         ParquetPackager, ParquetPublisher, ParquetUploader, SplitsUpdateMailbox, UploaderType,
     };
@@ -618,7 +617,6 @@ mod tests {
         .map_err(|_| anyhow::anyhow!("Timeout waiting for {} staged splits", expected_splits))
     }
 
-
     #[tokio::test]
     async fn test_metrics_indexer_receives_batch() {
         let universe = Universe::with_accelerated_time();
diff --git a/quickwit/quickwit-indexing/src/actors/parquet_packager.rs b/quickwit/quickwit-indexing/src/actors/parquet_packager.rs
index f0e5fe8dc76..b0950141b10 100644
--- a/quickwit/quickwit-indexing/src/actors/parquet_packager.rs
+++ b/quickwit/quickwit-indexing/src/actors/parquet_packager.rs
@@ -239,12 +239,11 @@ mod tests {
     use quickwit_common::test_utils::wait_until_predicate;
     use quickwit_metastore::checkpoint::{IndexCheckpointDelta, SourceCheckpointDelta};
     use quickwit_parquet_engine::storage::ParquetWriterConfig;
+    use quickwit_parquet_engine::test_helpers::create_test_batch;
     use quickwit_proto::metastore::{EmptyResponse, MockMetastoreService};
     use quickwit_storage::RamStorage;
 
     use super::*;
-    use quickwit_parquet_engine::test_helpers::create_test_batch;
-
     use crate::actors::{ParquetPublisher, SplitsUpdateMailbox, UploaderType};
 
     fn create_test_uploader(
diff --git a/quickwit/quickwit-opentelemetry/src/otlp/arrow_metrics.rs b/quickwit/quickwit-opentelemetry/src/otlp/arrow_metrics.rs
index 8a4b3142501..d36c2f1b98b 100644
--- a/quickwit/quickwit-opentelemetry/src/otlp/arrow_metrics.rs
+++ b/quickwit/quickwit-opentelemetry/src/otlp/arrow_metrics.rs
@@ -24,7 +24,7 @@ use std::io::Cursor;
 use std::sync::Arc;
 
 use arrow::array::{
-    ArrayRef, Float64Builder, RecordBatch, StringDictionaryBuilder, UInt64Builder, UInt8Builder,
+    ArrayRef, Float64Builder, RecordBatch, StringDictionaryBuilder, UInt8Builder, UInt64Builder,
 };
 use arrow::datatypes::{DataType, Field, Int32Type, Schema as ArrowSchema};
 use arrow::ipc::reader::StreamReader;
@@ -130,8 +130,7 @@ impl ArrowMetricsBatchBuilder {
             arrays.push(Arc::new(tag_builder.finish()));
         }
 
-        RecordBatch::try_new(schema, arrays)
-            .expect("record batch should match Arrow schema")
+        RecordBatch::try_new(schema, arrays).expect("record batch should match Arrow schema")
     }
 
     /// Returns the number of rows appended so far.
diff --git a/quickwit/quickwit-opentelemetry/src/otlp/otel_metrics.rs b/quickwit/quickwit-opentelemetry/src/otlp/otel_metrics.rs
index 1a19e5fea53..17eab282bc1 100644
--- a/quickwit/quickwit-opentelemetry/src/otlp/otel_metrics.rs
+++ b/quickwit/quickwit-opentelemetry/src/otlp/otel_metrics.rs
@@ -246,8 +246,7 @@ impl OtlpGrpcMetricsService {
         let num_data_points = data_points.len() as u64 + num_rejected;
 
         // Build Arrow RecordBatch from valid data points
-        let mut arrow_builder =
-            ArrowMetricsBatchBuilder::with_capacity(data_points.len());
+        let mut arrow_builder = ArrowMetricsBatchBuilder::with_capacity(data_points.len());
         let mut doc_uid_generator = DocUidGenerator::default();
         let mut doc_uids = Vec::with_capacity(data_points.len());
 
@@ -746,10 +745,7 @@ mod tests {
         assert_eq!(dp.tags.get("metric_unit").map(|s| s.as_str()), Some("1"));
         assert_eq!(dp.timestamp_secs, 2);
         assert_eq!(dp.value, 100.0); // int converted to f64
-        assert_eq!(
-            dp.tags.get("host").map(|s| s.as_str()),
-            Some("server-1")
-        );
+        assert_eq!(dp.tags.get("host").map(|s| s.as_str()), Some("server-1"));
         assert_eq!(
             dp.tags.get("service").map(|s| s.as_str()),
             Some("counter-service")
@@ -844,10 +840,7 @@ mod tests {
         assert_eq!(dp.tags.get("service").map(|s| s.as_str()), Some("test"));
 
         // Verify data point attributes are in tags as strings
-        assert_eq!(
-            dp.tags.get("string_tag").map(|s| s.as_str()),
-            Some("value")
-        );
+        assert_eq!(dp.tags.get("string_tag").map(|s| s.as_str()), Some("value"));
     }
 
     /// Test metrics with empty and missing values
diff --git a/quickwit/quickwit-parquet-engine/src/index/accumulator.rs b/quickwit/quickwit-parquet-engine/src/index/accumulator.rs
index d9eb01f4411..817f3f16d93 100644
--- a/quickwit/quickwit-parquet-engine/src/index/accumulator.rs
+++ b/quickwit/quickwit-parquet-engine/src/index/accumulator.rs
@@ -18,7 +18,7 @@ use std::collections::BTreeMap;
 use std::sync::Arc;
 use std::time::Instant;
 
-use arrow::array::{new_null_array, ArrayRef};
+use arrow::array::{ArrayRef, new_null_array};
 use arrow::compute::concat_batches;
 use arrow::datatypes::{DataType, Field, Schema as ArrowSchema, SchemaRef};
 use arrow::record_batch::RecordBatch;
@@ -233,9 +233,8 @@ fn estimate_batch_bytes(batch: &RecordBatch) -> usize {
 
 #[cfg(test)]
 mod tests {
-    use crate::test_helpers::{create_test_batch, create_test_batch_with_tags};
-
     use super::*;
+    use crate::test_helpers::{create_test_batch, create_test_batch_with_tags};
 
     #[test]
     fn test_accumulator_below_threshold() {
@@ -340,7 +339,10 @@ mod tests {
 
         // No duplicate column names — each name appears exactly once.
         let field_names: Vec<&str> = schema.fields().iter().map(|f| f.name().as_str()).collect();
-        let unique_count = field_names.iter().collect::<std::collections::HashSet<_>>().len();
+        let unique_count = field_names
+            .iter()
+            .collect::<std::collections::HashSet<_>>()
+            .len();
         assert_eq!(
             unique_count,
             field_names.len(),
@@ -365,7 +367,10 @@ mod tests {
 
         let schema = combined.schema();
         let field_names: Vec<&str> = schema.fields().iter().map(|f| f.name().as_str()).collect();
-        let unique_count = field_names.iter().collect::<std::collections::HashSet<_>>().len();
+        let unique_count = field_names
+            .iter()
+            .collect::<std::collections::HashSet<_>>()
+            .len();
         assert_eq!(
             unique_count,
             field_names.len(),
diff --git a/quickwit/quickwit-parquet-engine/src/ingest/processor.rs b/quickwit/quickwit-parquet-engine/src/ingest/processor.rs
index 6097d7c3c32..21412414b2b 100644
--- a/quickwit/quickwit-parquet-engine/src/ingest/processor.rs
+++ b/quickwit/quickwit-parquet-engine/src/ingest/processor.rs
@@ -111,7 +111,10 @@ fn ipc_to_record_batch(ipc_bytes: &[u8]) -> Result<RecordBatch, IngestError> {
         return Err(IngestError::UnexpectedBatchCount(batches.len()));
     }
 
-    Ok(batches.into_iter().next().expect("len verified to be 1 above"))
+    Ok(batches
+        .into_iter()
+        .next()
+        .expect("len verified to be 1 above"))
 }
 
 /// Serialize a RecordBatch to Arrow IPC stream format.
@@ -129,9 +132,8 @@ pub fn record_batch_to_ipc(batch: &RecordBatch) -> Result<Vec<u8>, IngestError>
 
 #[cfg(test)]
 mod tests {
-    use crate::test_helpers::create_test_batch;
-
     use super::*;
+    use crate::test_helpers::create_test_batch;
 
     #[test]
     fn test_process_ipc() {
diff --git a/quickwit/quickwit-parquet-engine/src/schema/fields.rs b/quickwit/quickwit-parquet-engine/src/schema/fields.rs
index a8646823017..9f46dcf3b8c 100644
--- a/quickwit/quickwit-parquet-engine/src/schema/fields.rs
+++ b/quickwit/quickwit-parquet-engine/src/schema/fields.rs
@@ -14,7 +14,7 @@
 
 //! Parquet field definitions with sort order constants and validation.
 
-use anyhow::{bail, Result};
+use anyhow::{Result, bail};
 use arrow::datatypes::DataType;
 
 /// Required field names that must exist in every batch.
@@ -55,7 +55,9 @@ pub fn validate_required_fields(schema: &arrow::datatypes::Schema) -> Result<()>
                 if *actual_type != expected_type {
                     bail!(
                         "field '{}' has type {:?}, expected {:?}",
-                        name, actual_type, expected_type
+                        name,
+                        actual_type,
+                        expected_type
                     );
                 }
             }
diff --git a/quickwit/quickwit-parquet-engine/src/storage/config.rs b/quickwit/quickwit-parquet-engine/src/storage/config.rs
index a10a423cf9c..13b8d395a44 100644
--- a/quickwit/quickwit-parquet-engine/src/storage/config.rs
+++ b/quickwit/quickwit-parquet-engine/src/storage/config.rs
@@ -169,8 +169,7 @@ impl ParquetWriterConfig {
             // Enable bloom filters on dictionary-typed metric_name and sort order tag columns.
             // Exclude non-dictionary columns, like timestamp_secs.
             let is_bloom_column = matches!(field.data_type(), DataType::Dictionary(_, _))
-                && (field.name() == "metric_name"
-                    || SORT_ORDER.contains(&field.name().as_str()));
+                && (field.name() == "metric_name" || SORT_ORDER.contains(&field.name().as_str()));
             if is_bloom_column {
                 let ndv = if field.name() == "metric_name" {
                     BLOOM_FILTER_NDV_METRIC_NAME
diff --git a/quickwit/quickwit-parquet-engine/src/storage/split_writer.rs b/quickwit/quickwit-parquet-engine/src/storage/split_writer.rs
index a9e081455d0..466e2c9cdcb 100644
--- a/quickwit/quickwit-parquet-engine/src/storage/split_writer.rs
+++ b/quickwit/quickwit-parquet-engine/src/storage/split_writer.rs
@@ -219,11 +219,12 @@ mod tests {
 
     use arrow::array::{ArrayRef, Float64Array, UInt8Array, UInt64Array};
     use arrow::datatypes::{DataType, Field, Schema as ArrowSchema};
-    use crate::test_helpers::{create_dict_array, create_nullable_dict_array};
 
     use super::*;
+    use crate::test_helpers::{create_dict_array, create_nullable_dict_array};
 
-    /// Create a test batch with required fields, optional service column, and specified tag columns.
+    /// Create a test batch with required fields, optional service column, and specified tag
+    /// columns.
     fn create_test_batch_with_options(
         num_rows: usize,
         metric_names: &[&str],
diff --git a/quickwit/quickwit-parquet-engine/src/storage/writer.rs b/quickwit/quickwit-parquet-engine/src/storage/writer.rs
index 134702103eb..6f29c0be4cc 100644
--- a/quickwit/quickwit-parquet-engine/src/storage/writer.rs
+++ b/quickwit/quickwit-parquet-engine/src/storage/writer.rs
@@ -161,11 +161,13 @@ impl ParquetWriter {
 mod tests {
     use std::sync::Arc;
 
-    use arrow::array::{ArrayRef, DictionaryArray, Float64Array, StringArray, UInt64Array, UInt8Array};
+    use arrow::array::{
+        ArrayRef, DictionaryArray, Float64Array, StringArray, UInt8Array, UInt64Array,
+    };
     use arrow::datatypes::{DataType, Field, Int32Type, Schema};
-    use crate::test_helpers::create_test_batch_with_tags;
 
     use super::*;
+    use crate::test_helpers::create_test_batch_with_tags;
 
     fn create_test_batch() -> RecordBatch {
         create_test_batch_with_tags(1, &["service", "env"])
@@ -224,7 +226,10 @@ mod tests {
         .unwrap();
 
         let result = writer.write_to_bytes(&wrong_batch);
-        assert!(matches!(result, Err(ParquetWriteError::SchemaValidation(_))));
+        assert!(matches!(
+            result,
+            Err(ParquetWriteError::SchemaValidation(_))
+        ));
     }
 
     #[test]
@@ -251,7 +256,10 @@ mod tests {
         .unwrap();
 
         let result = writer.write_to_bytes(&wrong_batch);
-        assert!(matches!(result, Err(ParquetWriteError::SchemaValidation(_))));
+        assert!(matches!(
+            result,
+            Err(ParquetWriteError::SchemaValidation(_))
+        ));
     }
 
     #[test]

From 5b2304c08a510fdcf046ce36d3c75b313a86f36a Mon Sep 17 00:00:00 2001
From: George Talbot <george.talbot@datadoghq.com>
Date: Tue, 31 Mar 2026 17:32:37 -0400
Subject: [PATCH 09/35] =?UTF-8?q?feat(31):=20sort=20schema=20foundation=20?=
 =?UTF-8?q?=E2=80=94=20proto,=20parser,=20display,=20validation,=20window,?=
 =?UTF-8?q?=20TableConfig?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 quickwit/Cargo.lock                           |   3 +
 .../src/otlp/otel_metrics.rs                  |   4 +-
 quickwit/quickwit-parquet-engine/Cargo.toml   |   2 +
 quickwit/quickwit-parquet-engine/src/lib.rs   |   2 +
 .../src/sort_fields/column_type.rs            | 240 +++++
 .../src/sort_fields/display.rs                | 124 +++
 .../src/sort_fields/equivalence.rs            |  62 ++
 .../src/sort_fields/mod.rs                    |  52 +
 .../src/sort_fields/parser.rs                 | 338 ++++++
 .../src/sort_fields/tests.rs                  | 963 ++++++++++++++++++
 .../src/sort_fields/validation.rs             | 114 +++
 .../src/sort_fields/window.rs                 | 240 +++++
 .../src/storage/config.rs                     |   2 -
 .../src/table_config.rs                       | 191 ++++
 quickwit/quickwit-proto/build.rs              |  13 +
 .../event_store_sortschema.proto              | 231 +++++
 .../src/codegen/sortschema/sortschema.rs      | 230 +++++
 quickwit/quickwit-proto/src/lib.rs            |   7 +
 quickwit/quickwit-proto/src/metastore/mod.rs  |   9 +
 .../quickwit-proto/src/sort_fields_error.rs   |  78 ++
 20 files changed, 2901 insertions(+), 4 deletions(-)
 create mode 100644 quickwit/quickwit-parquet-engine/src/sort_fields/column_type.rs
 create mode 100644 quickwit/quickwit-parquet-engine/src/sort_fields/display.rs
 create mode 100644 quickwit/quickwit-parquet-engine/src/sort_fields/equivalence.rs
 create mode 100644 quickwit/quickwit-parquet-engine/src/sort_fields/mod.rs
 create mode 100644 quickwit/quickwit-parquet-engine/src/sort_fields/parser.rs
 create mode 100644 quickwit/quickwit-parquet-engine/src/sort_fields/tests.rs
 create mode 100644 quickwit/quickwit-parquet-engine/src/sort_fields/validation.rs
 create mode 100644 quickwit/quickwit-parquet-engine/src/sort_fields/window.rs
 create mode 100644 quickwit/quickwit-parquet-engine/src/table_config.rs
 create mode 100644 quickwit/quickwit-proto/protos/event_store_sortschema/event_store_sortschema.proto
 create mode 100644 quickwit/quickwit-proto/src/codegen/sortschema/sortschema.rs
 create mode 100644 quickwit/quickwit-proto/src/sort_fields_error.rs

diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock
index f686efc5e11..15f519615af 100644
--- a/quickwit/Cargo.lock
+++ b/quickwit/Cargo.lock
@@ -7612,9 +7612,12 @@ version = "0.8.0"
 dependencies = [
  "anyhow",
  "arrow",
+ "chrono",
  "parquet",
  "proptest",
+ "prost 0.14.3",
  "quickwit-common",
+ "quickwit-proto",
  "sea-query",
  "serde",
  "serde_json",
diff --git a/quickwit/quickwit-opentelemetry/src/otlp/otel_metrics.rs b/quickwit/quickwit-opentelemetry/src/otlp/otel_metrics.rs
index 17eab282bc1..994bf7cb324 100644
--- a/quickwit/quickwit-opentelemetry/src/otlp/otel_metrics.rs
+++ b/quickwit/quickwit-opentelemetry/src/otlp/otel_metrics.rs
@@ -886,9 +886,9 @@ mod tests {
             Some("unknown_service")
         );
         // No metric_unit tag when unit is empty
-        assert!(dp.tags.get("metric_unit").is_none());
+        assert!(!dp.tags.contains_key("metric_unit"));
         // No start_timestamp_secs tag when start time is 0
-        assert!(dp.tags.get("start_timestamp_secs").is_none());
+        assert!(!dp.tags.contains_key("start_timestamp_secs"));
         // Only "service" should be in tags (no attributes, no unit, no start time)
         assert_eq!(dp.tags.len(), 1);
     }
diff --git a/quickwit/quickwit-parquet-engine/Cargo.toml b/quickwit/quickwit-parquet-engine/Cargo.toml
index 9842ceb6f00..9744dbcc2bc 100644
--- a/quickwit/quickwit-parquet-engine/Cargo.toml
+++ b/quickwit/quickwit-parquet-engine/Cargo.toml
@@ -13,8 +13,10 @@ license.workspace = true
 [dependencies]
 anyhow = { workspace = true }
 arrow = { workspace = true }
+chrono = { workspace = true }
 parquet = { workspace = true }
 quickwit-common = { workspace = true }
+quickwit-proto = { workspace = true }
 sea-query = { workspace = true, optional = true }
 serde = { workspace = true }
 serde_json = { workspace = true }
diff --git a/quickwit/quickwit-parquet-engine/src/lib.rs b/quickwit/quickwit-parquet-engine/src/lib.rs
index 309ebf4f442..d34c67c665d 100644
--- a/quickwit/quickwit-parquet-engine/src/lib.rs
+++ b/quickwit/quickwit-parquet-engine/src/lib.rs
@@ -24,8 +24,10 @@ pub mod index;
 pub mod ingest;
 pub mod metrics;
 pub mod schema;
+pub mod sort_fields;
 pub mod split;
 pub mod storage;
+pub mod table_config;
 
 #[cfg(any(test, feature = "testsuite"))]
 pub mod test_helpers;
diff --git a/quickwit/quickwit-parquet-engine/src/sort_fields/column_type.rs b/quickwit/quickwit-parquet-engine/src/sort_fields/column_type.rs
new file mode 100644
index 00000000000..21690f01e0e
--- /dev/null
+++ b/quickwit/quickwit-parquet-engine/src/sort_fields/column_type.rs
@@ -0,0 +1,240 @@
+// Copyright 2021-Present Datadog, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Column type identification from name suffixes and string names.
+//!
+//! Type can be specified via Husky-convention suffixes (`__s`, `__i`, `__nf`)
+//! or inferred from well-known bare names. The discriminant values match
+//! the Go iota exactly for cross-system interoperability.
+
+use std::str::FromStr;
+
+use super::SortFieldsError;
+
+/// Well-known column name for timestamps.
+pub const TIMESTAMP: &str = "timestamp";
+
+/// Well-known column name for tiebreaker.
+pub const TIEBREAKER: &str = "tiebreaker";
+
+/// Well-known column name for timeseries ID hash.
+pub const TIMESERIES_ID: &str = "timeseries_id";
+
+/// Well-known column name for metric value.
+pub const METRIC_VALUE: &str = "metric_value";
+
+/// Column type IDs matching Go `types.TypeID` iota values.
+///
+/// Only the types that appear in sort schemas are included here.
+/// The discriminant values MUST match Go exactly for cross-system interop.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[repr(u64)]
+pub enum ColumnTypeId {
+    Int64 = 2,
+    Float64 = 10,
+    String = 14,
+    Sketch = 17,
+    CpcSketch = 20,
+    ItemSketch = 22,
+}
+
+impl ColumnTypeId {
+    /// The Husky-convention suffix for this column type.
+    ///
+    /// Used when serializing back to the string format with explicit types.
+    pub fn suffix(self) -> &'static str {
+        match self {
+            Self::Int64 => "__i",
+            Self::Float64 => "__nf",
+            Self::String => "__s",
+            Self::Sketch => "__sk",
+            Self::CpcSketch => "__cpcsk",
+            Self::ItemSketch => "__isk",
+        }
+    }
+
+    /// Human-readable type name matching Go `TypeID.String()`.
+    pub fn as_str(self) -> &'static str {
+        match self {
+            Self::Int64 => "dense-int64",
+            Self::Float64 => "dense-float64",
+            Self::String => "dense-string",
+            Self::Sketch => "dense-sketch",
+            Self::CpcSketch => "dense-cpc-sketch",
+            Self::ItemSketch => "dense-item-sketch",
+        }
+    }
+
+    /// Resolve column type from a column name, stripping any type suffix.
+    ///
+    /// Returns `(bare_name, type)`. Type resolution order:
+    /// 1. Explicit suffix (`__s`, `__i`, `__nf`, etc.) — stripped, type from suffix
+    /// 2. Well-known bare name defaults:
+    ///    - `timestamp`, `tiebreaker`, `timeseries_id` → Int64
+    ///    - `metric_value` → Float64
+    ///    - everything else → String
+    pub fn from_column_name(name: &str) -> Result<(&str, Self), SortFieldsError> {
+        // Try explicit suffixes first (longest match first to avoid ambiguity).
+        if let Some(bare) = name.strip_suffix("__isk") {
+            return Ok((bare, Self::ItemSketch));
+        }
+        if let Some(bare) = name.strip_suffix("__cpcsk") {
+            return Ok((bare, Self::CpcSketch));
+        }
+        if let Some(bare) = name.strip_suffix("__sk") {
+            return Ok((bare, Self::Sketch));
+        }
+        if let Some(bare) = name.strip_suffix("__nf") {
+            return Ok((bare, Self::Float64));
+        }
+        if let Some(bare) = name.strip_suffix("__i") {
+            return Ok((bare, Self::Int64));
+        }
+        if let Some(bare) = name.strip_suffix("__s") {
+            return Ok((bare, Self::String));
+        }
+
+        // No suffix — use well-known name defaults.
+        Ok((name, default_type_for_name(name)))
+    }
+}
+
+/// Default column type and sort direction for a bare column name.
+///
+/// This is the single source of truth for well-known column defaults.
+/// Used by the parser (type inference, default direction), display
+/// (suffix omission, direction omission), and validation.
+pub struct ColumnDefaults {
+    pub column_type: ColumnTypeId,
+    /// True if the default sort direction is descending.
+    pub descending: bool,
+}
+
+/// Well-known name → default type and sort direction lookup table.
+///
+/// Columns not in this table default to String, ascending.
+static WELL_KNOWN_COLUMNS: &[(&str, ColumnDefaults)] = &[
+    (
+        TIMESTAMP,
+        ColumnDefaults {
+            column_type: ColumnTypeId::Int64,
+            descending: true,
+        },
+    ),
+    (
+        "timestamp_secs",
+        ColumnDefaults {
+            column_type: ColumnTypeId::Int64,
+            descending: true,
+        },
+    ),
+    (
+        TIEBREAKER,
+        ColumnDefaults {
+            column_type: ColumnTypeId::Int64,
+            descending: false,
+        },
+    ),
+    (
+        TIMESERIES_ID,
+        ColumnDefaults {
+            column_type: ColumnTypeId::Int64,
+            descending: false,
+        },
+    ),
+    (
+        METRIC_VALUE,
+        ColumnDefaults {
+            column_type: ColumnTypeId::Float64,
+            descending: false,
+        },
+    ),
+    (
+        "value",
+        ColumnDefaults {
+            column_type: ColumnTypeId::Float64,
+            descending: false,
+        },
+    ),
+];
+
+const DEFAULT_COLUMN: ColumnDefaults = ColumnDefaults {
+    column_type: ColumnTypeId::String,
+    descending: false,
+};
+
+/// Look up default type and direction for a bare column name.
+pub fn column_defaults(name: &str) -> &'static ColumnDefaults {
+    WELL_KNOWN_COLUMNS
+        .iter()
+        .find(|(n, _)| *n == name)
+        .map(|(_, d)| d)
+        .unwrap_or(&DEFAULT_COLUMN)
+}
+
+/// Default column type for a bare name (convenience wrapper).
+pub fn default_type_for_name(name: &str) -> ColumnTypeId {
+    column_defaults(name).column_type
+}
+
+/// Whether this bare name defaults to descending sort.
+pub fn default_is_descending(name: &str) -> bool {
+    column_defaults(name).descending
+}
+
+impl std::fmt::Display for ColumnTypeId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(self.as_str())
+    }
+}
+
+/// Parse a type name string (e.g., "dense-int64") into a `ColumnTypeId`.
+impl FromStr for ColumnTypeId {
+    type Err = SortFieldsError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "dense-int64" => Ok(Self::Int64),
+            "dense-float64" => Ok(Self::Float64),
+            "dense-string" => Ok(Self::String),
+            "dense-sketch" => Ok(Self::Sketch),
+            "dense-cpc-sketch" => Ok(Self::CpcSketch),
+            "dense-item-sketch" => Ok(Self::ItemSketch),
+            _ => Err(SortFieldsError::UnknownColumnType(format!(
+                "unknown column type '{}'",
+                s
+            ))),
+        }
+    }
+}
+
+/// Convert a proto `column_type` u64 back to a `ColumnTypeId`.
+impl TryFrom<u64> for ColumnTypeId {
+    type Error = SortFieldsError;
+
+    fn try_from(value: u64) -> Result<Self, Self::Error> {
+        match value {
+            2 => Ok(Self::Int64),
+            10 => Ok(Self::Float64),
+            14 => Ok(Self::String),
+            17 => Ok(Self::Sketch),
+            20 => Ok(Self::CpcSketch),
+            22 => Ok(Self::ItemSketch),
+            _ => Err(SortFieldsError::UnknownColumnType(format!(
+                "unknown column type id: {}",
+                value
+            ))),
+        }
+    }
+}
diff --git a/quickwit/quickwit-parquet-engine/src/sort_fields/display.rs b/quickwit/quickwit-parquet-engine/src/sort_fields/display.rs
new file mode 100644
index 00000000000..8a6d74f482a
--- /dev/null
+++ b/quickwit/quickwit-parquet-engine/src/sort_fields/display.rs
@@ -0,0 +1,124 @@
+// Copyright 2021-Present Datadog, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Sort schema to string serialization -- direct port of Go `SchemaToString` and
+//! `SchemaToStringShort`.
+//!
+//! The proto `SortColumn.name` stores the bare Parquet column name (no type suffix).
+//! These functions reconstruct the Husky-format suffixed name for serialization
+//! using `SortColumn.column_type` to determine the suffix.
+
+use quickwit_proto::sortschema::{SortColumn, SortColumnDirection, SortSchema};
+
+use super::column_type::{ColumnTypeId, default_is_descending, default_type_for_name};
+
+fn direction_str(sort_direction: i32) -> &'static str {
+    match SortColumnDirection::try_from(sort_direction) {
+        Ok(SortColumnDirection::SortDirectionAscending) => ":+",
+        Ok(SortColumnDirection::SortDirectionDescending) => ":-",
+        _ => ":???",
+    }
+}
+
+fn type_str(column_type: u64) -> &'static str {
+    match ColumnTypeId::try_from(column_type) {
+        Ok(ct) => ct.as_str(),
+        Err(_) => "unknown",
+    }
+}
+
+/// Reconstruct the column name for the Husky string format.
+///
+/// Only appends the type suffix when the column's type differs from the
+/// default for its bare name. This keeps the string short and readable:
+///   `metric_name` (default String) → no suffix needed
+///   `timestamp` (default Int64) → no suffix needed
+///   `my_counter__i` → suffix needed (Int64 differs from default String)
+fn display_name(col: &SortColumn) -> String {
+    let col_type = match ColumnTypeId::try_from(col.column_type) {
+        Ok(ct) => ct,
+        Err(_) => return col.name.clone(),
+    };
+    let default_type = default_type_for_name(&col.name);
+    if col_type == default_type {
+        col.name.clone()
+    } else {
+        format!("{}{}", col.name, col_type.suffix())
+    }
+}
+
+/// Convert a `SortSchema` to its full string representation.
+///
+/// Format: `[name=]column__suffix:type:+/-[|...][/V#]`
+///
+/// Direct port of Go `SchemaToString`.
+pub fn schema_to_string(schema: &SortSchema) -> String {
+    schema_to_string_inner(schema, true)
+}
+
+/// Convert a `SortSchema` to its short string representation.
+///
+/// Format: `[name=]column__suffix[|...][/V#]`
+///
+/// Omits the explicit type and skips the sort direction when it matches the
+/// default (ascending for non-timestamp, descending for timestamp).
+///
+/// Direct port of Go `SchemaToStringShort`.
+pub fn schema_to_string_short(schema: &SortSchema) -> String {
+    schema_to_string_inner(schema, false)
+}
+
+/// Shared implementation for both full and short schema string formats.
+///
+/// When `verbose` is true, includes the explicit type and always emits direction.
+/// When `verbose` is false, omits type and skips direction when it matches the default.
+fn schema_to_string_inner(schema: &SortSchema, verbose: bool) -> String {
+    let mut rv = String::new();
+
+    if !schema.name.is_empty() {
+        rv.push_str(&schema.name);
+        rv.push('=');
+    }
+
+    for (i, col) in schema.column.iter().enumerate() {
+        if i > 0 {
+            rv.push('|');
+        }
+        if schema.lsm_comparison_cutoff > 0 && i == schema.lsm_comparison_cutoff as usize {
+            rv.push('&');
+        }
+        rv.push_str(&display_name(col));
+
+        if verbose {
+            rv.push(':');
+            rv.push_str(type_str(col.column_type));
+            rv.push_str(direction_str(col.sort_direction));
+        } else {
+            let is_default_direction = if default_is_descending(&col.name) {
+                col.sort_direction == SortColumnDirection::SortDirectionDescending as i32
+            } else {
+                col.sort_direction == SortColumnDirection::SortDirectionAscending as i32
+            };
+            if !is_default_direction {
+                rv.push_str(direction_str(col.sort_direction));
+            }
+        }
+    }
+
+    if schema.sort_version > 0 {
+        rv.push_str(&format!("/V{}", schema.sort_version));
+    }
+
+    rv
+}
diff --git a/quickwit/quickwit-parquet-engine/src/sort_fields/equivalence.rs b/quickwit/quickwit-parquet-engine/src/sort_fields/equivalence.rs
new file mode 100644
index 00000000000..403ec9fd135
--- /dev/null
+++ b/quickwit/quickwit-parquet-engine/src/sort_fields/equivalence.rs
@@ -0,0 +1,62 @@
+// Copyright 2021-Present Datadog, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Sort schema equivalence comparison -- direct port of Go `EquivalentSchemas`
+//! and `EquivalentSchemasForCompaction`.
+
+use quickwit_proto::sortschema::SortSchema;
+
+/// Base comparison: checks sort_version and all columns match (name, type, direction).
+///
+/// Hand-rolled comparison (not proto equality) because the Go compactor calls this
+/// in tight loops on 10s-100s of thousands of fragments and proto.Equal allocates.
+fn equivalent_schemas_base(a: &SortSchema, b: &SortSchema) -> bool {
+    if a.sort_version != b.sort_version {
+        return false;
+    }
+    if a.column.len() != b.column.len() {
+        return false;
+    }
+    for (a_col, b_col) in a.column.iter().zip(b.column.iter()) {
+        if a_col.name != b_col.name {
+            return false;
+        }
+        if a_col.column_type != b_col.column_type {
+            return false;
+        }
+        if a_col.sort_direction != b_col.sort_direction {
+            return false;
+        }
+    }
+    true
+}
+
+/// Check if two schemas are equivalent, ignoring names and versioned schema.
+///
+/// Compares columns, sort_version, and `lsm_comparison_cutoff`.
+///
+/// Direct port of Go `EquivalentSchemas`.
+pub fn equivalent_schemas(a: &SortSchema, b: &SortSchema) -> bool {
+    equivalent_schemas_base(a, b) && a.lsm_comparison_cutoff == b.lsm_comparison_cutoff
+}
+
+/// Check if two schemas are equivalent for compaction purposes.
+///
+/// Same as `equivalent_schemas` but ignores `lsm_comparison_cutoff`, providing
+/// backward compatibility when old cplanners send steps without LSM cutoff info.
+///
+/// Direct port of Go `EquivalentSchemasForCompaction`.
+pub fn equivalent_schemas_for_compaction(a: &SortSchema, b: &SortSchema) -> bool {
+    equivalent_schemas_base(a, b)
+}
diff --git a/quickwit/quickwit-parquet-engine/src/sort_fields/mod.rs b/quickwit/quickwit-parquet-engine/src/sort_fields/mod.rs
new file mode 100644
index 00000000000..3043531c557
--- /dev/null
+++ b/quickwit/quickwit-parquet-engine/src/sort_fields/mod.rs
@@ -0,0 +1,52 @@
+// Copyright 2021-Present Datadog, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Sort fields types, parsing, and time-window arithmetic for metrics compaction.
+//!
+//! The sort fields define how rows are ordered within a Parquet split,
+//! which determines merge and compaction behavior. The parser is a direct
+//! port of Husky's Go `schemautils.go` for cross-system interoperability.
+
+// TableConfig.effective_sort_fields() is wired into ParquetWriter at construction
+// time. The writer resolves sort field names to physical ParquetField columns;
+// columns not yet in the schema (e.g., timeseries_id) are skipped during sort
+// but recorded in metadata.
+//
+// TODO(Phase 32): Wire per-index TableConfig into IndexConfig so each index can
+// override the default sort fields. Currently all metrics indexes use
+// ProductType::Metrics default.
+//
+// When accepting user-supplied sort_fields for metrics indexes, validation MUST
+// reject schemas that do not include timeseries_id__i immediately before timestamp.
+// The timeseries_id tiebreaker is mandatory for metrics to ensure deterministic
+// sort order across splits with identical tag combinations.
+
+pub mod column_type;
+pub mod display;
+pub mod equivalence;
+pub mod parser;
+pub mod validation;
+pub mod window;
+
+#[cfg(test)]
+mod tests;
+
+// Public API re-exports.
+pub use column_type::ColumnTypeId;
+pub use display::{schema_to_string, schema_to_string_short};
+pub use equivalence::{equivalent_schemas, equivalent_schemas_for_compaction};
+pub use parser::parse_sort_fields;
+pub use quickwit_proto::SortFieldsError;
+pub use validation::validate_schema;
+pub use window::{validate_window_duration, window_start};
diff --git a/quickwit/quickwit-parquet-engine/src/sort_fields/parser.rs b/quickwit/quickwit-parquet-engine/src/sort_fields/parser.rs
new file mode 100644
index 00000000000..77b11757b29
--- /dev/null
+++ b/quickwit/quickwit-parquet-engine/src/sort_fields/parser.rs
@@ -0,0 +1,338 @@
+// Copyright 2021-Present Datadog, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Sort schema string parser -- direct port of Go `StringToSchema`.
+//!
+//! Parses Husky-style sort schema strings like:
+//!   `"metric_name|host|env|timeseries_id|timestamp/V2"`
+//! into a `SortSchema` proto with correct column names, types, directions, and version.
+
+use quickwit_proto::sortschema::{SortColumn, SortColumnDirection, SortSchema};
+
+use super::SortFieldsError;
+use super::column_type::{ColumnTypeId, default_is_descending};
+use super::validation::validate_schema;
+
+/// The minimum sort version we accept. V0 (INCORRECT_TRIM) and V1 (TRIMMED_WITH_BUDGET)
+/// are rejected per the strict V2-only decision.
+const MINIMUM_SORT_VERSION: i32 = 2;
+
+/// Parse a sort schema string into a `SortSchema` proto.
+///
+/// Direct port of Go `StringToSchema`. Accepts the format:
+///   `[name=]column[|column...][/V#]`
+///
+/// Each column can be:
+///   - `name` (1-part): infer type from suffix, default direction
+///   - `+name` or `name+` (1-part with prefix/suffix direction)
+///   - `name:+/-` (2-part): infer type from suffix, explicit direction
+///   - `name:type:+/-` (3-part): explicit type + verify matches suffix
+///
+/// Direction (`+`/`-`) may appear as a prefix (`+name`), suffix (`name+`),
+/// or after a colon (`name:+`). It is an error for direction to appear in
+/// more than one position for a given column (e.g., `+name+` or `+name:+`).
+///
+/// The `&` marker before a column name indicates the LSM comparison cutoff.
+///
+/// **V2-only enforcement**: Only sort_version >= 2 is accepted. Unversioned
+/// strings (default to 0), V0, and V1 are rejected.
+pub fn parse_sort_fields(s: &str) -> Result<SortSchema, SortFieldsError> {
+    let mut schema = SortSchema::default();
+    let mut input = s;
+
+    // Split on `=` for optional name prefix.
+    if let Some((name, rest)) = split_once_max2(input, '=', s)? {
+        schema.name = name.to_string();
+        input = rest;
+    }
+
+    // Split on `/` for version suffix.
+    let sort_version = parse_version_suffix(&mut input, s)?;
+    if sort_version < MINIMUM_SORT_VERSION {
+        return Err(SortFieldsError::UnsupportedVersion {
+            version: sort_version,
+            minimum: MINIMUM_SORT_VERSION,
+        });
+    }
+    schema.sort_version = sort_version;
+
+    // Parse columns.
+    let mut cutoff_marker_count = 0;
+
+    for (i, col_str) in input.split('|').enumerate() {
+        let col_remaining = parse_cutoff_marker(col_str, i, &mut cutoff_marker_count, &mut schema)?;
+
+        let (prefix_dir, after_prefix) = strip_direction_prefix(col_remaining);
+        let parts: Vec<&str> = after_prefix.split(':').collect();
+
+        let column = match parts.len() {
+            3 => parse_3part(parts, prefix_dir, col_str)?,
+            2 => parse_2part(parts, prefix_dir, col_str)?,
+            1 => {
+                schema
+                    .column
+                    .push(parse_1part(parts[0], prefix_dir, col_str)?);
+                continue;
+            }
+            _ => {
+                return Err(SortFieldsError::InvalidColumnFormat(format!(
+                    "columns should be of the form 'name:type:+/-' or 'name:+/-' or 'name', \
+                     found: {}",
+                    col_str
+                )));
+            }
+        };
+
+        schema.column.push(column);
+    }
+
+    if cutoff_marker_count > 0 && schema.column.len() < 2 {
+        return Err(SortFieldsError::InvalidCutoffPlacement(
+            "LSM cutoff marker (&) requires at least 2 columns".to_string(),
+        ));
+    }
+
+    validate_schema(&schema)?;
+    Ok(schema)
+}
+
+/// Parse the `/V#` version suffix, updating `input` to point at the columns portion.
+/// Returns 0 if no version suffix is present.
+fn parse_version_suffix(input: &mut &str, original: &str) -> Result<i32, SortFieldsError> {
+    let Some((columns, version_str)) = split_once_max2(input, '/', original)? else {
+        return Ok(0);
+    };
+    let version_str = version_str.strip_prefix('V').ok_or_else(|| {
+        SortFieldsError::BadSortVersion(format!(
+            "mal-formatted sort schema '{}' -- bad sort version",
+            *input
+        ))
+    })?;
+    let version = version_str.parse::<i32>().map_err(|_| {
+        SortFieldsError::BadSortVersion(format!(
+            "mal-formatted sort schema '{}' parsing sort version",
+            *input
+        ))
+    })?;
+    *input = columns;
+    Ok(version)
+}
+
+/// Handle the `&` LSM cutoff marker at the start of a column string.
+/// Returns the remaining column string after stripping `&`.
+fn parse_cutoff_marker<'a>(
+    col_str: &'a str,
+    column_index: usize,
+    cutoff_count: &mut usize,
+    schema: &mut SortSchema,
+) -> Result<&'a str, SortFieldsError> {
+    let Some(rest) = col_str.strip_prefix('&') else {
+        if col_str.contains('&') {
+            return Err(SortFieldsError::MalformedSchema(format!(
+                "LSM cutoff marker (&) must appear at the beginning of column name, found in \
+                 middle of: {}",
+                col_str
+            )));
+        }
+        return Ok(col_str);
+    };
+
+    *cutoff_count += 1;
+    if *cutoff_count > 1 {
+        return Err(SortFieldsError::InvalidCutoffPlacement(
+            "only one LSM cutoff marker (&) is allowed per schema".to_string(),
+        ));
+    }
+    schema.lsm_comparison_cutoff = column_index as i32;
+    if rest.is_empty() {
+        return Err(SortFieldsError::InvalidCutoffPlacement(
+            "LSM cutoff marker (&) must be followed by a valid column name".to_string(),
+        ));
+    }
+    if column_index == 0 {
+        return Err(SortFieldsError::InvalidCutoffPlacement(
+            "LSM cutoff marker (&) cannot be used on the first column as it would ignore all \
+             columns"
+                .to_string(),
+        ));
+    }
+    if rest.contains('&') {
+        return Err(SortFieldsError::MalformedSchema(format!(
+            "LSM cutoff marker (&) must appear at the beginning of column name, found in middle \
+             of: {}",
+            rest
+        )));
+    }
+    Ok(rest)
+}
+
+/// Resolve bare name and type from a column name string via `ColumnTypeId::from_column_name`.
+fn resolve_name_type(name: &str) -> Result<(&str, ColumnTypeId), SortFieldsError> {
+    ColumnTypeId::from_column_name(name).map_err(|_| {
+        SortFieldsError::UnknownColumnType(format!(
+            "error determining type for column {} from suffix",
+            name
+        ))
+    })
+}
+
+/// Parse a 3-part column: `name__suffix:type:+/-`.
+fn parse_3part(
+    parts: Vec<&str>,
+    prefix_dir: Option<i32>,
+    col_str: &str,
+) -> Result<SortColumn, SortFieldsError> {
+    let explicit_type: ColumnTypeId = parts[1].parse().map_err(|_| {
+        SortFieldsError::UnknownColumnType(format!(
+            "error determining type for column {}: unknown type '{}'",
+            parts[0], parts[1]
+        ))
+    })?;
+    let (bare_name, suffix_type) = resolve_name_type(parts[0])?;
+    if explicit_type != suffix_type {
+        return Err(SortFieldsError::TypeMismatch {
+            column: parts[0].to_string(),
+            from_suffix: suffix_type.to_string(),
+            explicit: explicit_type.to_string(),
+        });
+    }
+    let colon_dir = parse_direction(parts[2])?;
+    if prefix_dir.is_some() {
+        return Err(SortFieldsError::DuplicateDirection(col_str.to_string()));
+    }
+    Ok(SortColumn {
+        name: bare_name.to_string(),
+        column_type: explicit_type as u64,
+        sort_direction: colon_dir,
+    })
+}
+
+/// Parse a 2-part column: `name__suffix:+/-`.
+fn parse_2part(
+    parts: Vec<&str>,
+    prefix_dir: Option<i32>,
+    col_str: &str,
+) -> Result<SortColumn, SortFieldsError> {
+    // Reject direction suffix embedded in the name part: `name-:-` has
+    // direction in both the name suffix and the colon position.
+    let (embedded_suffix_dir, name_without_suffix) = strip_direction_suffix(parts[0]);
+    if embedded_suffix_dir.is_some() {
+        return Err(SortFieldsError::DuplicateDirection(col_str.to_string()));
+    }
+    let (bare_name, col_type) = resolve_name_type(name_without_suffix)?;
+    let colon_dir = parse_direction(parts[1])?;
+    if prefix_dir.is_some() {
+        return Err(SortFieldsError::DuplicateDirection(col_str.to_string()));
+    }
+    Ok(SortColumn {
+        name: bare_name.to_string(),
+        column_type: col_type as u64,
+        sort_direction: colon_dir,
+    })
+}
+
+/// Parse a 1-part column: `name__suffix` with optional direction prefix/suffix.
+fn parse_1part(
+    part: &str,
+    prefix_dir: Option<i32>,
+    col_str: &str,
+) -> Result<SortColumn, SortFieldsError> {
+    let (suffix_dir, suffixed_name) = strip_direction_suffix(part);
+    if prefix_dir.is_some() && suffix_dir.is_some() {
+        return Err(SortFieldsError::DuplicateDirection(col_str.to_string()));
+    }
+    let (bare_name, col_type) = resolve_name_type(suffixed_name)?;
+    let direction = prefix_dir.or(suffix_dir).unwrap_or_else(|| {
+        if default_is_descending(bare_name) {
+            SortColumnDirection::SortDirectionDescending as i32
+        } else {
+            SortColumnDirection::SortDirectionAscending as i32
+        }
+    });
+    Ok(SortColumn {
+        name: bare_name.to_string(),
+        column_type: col_type as u64,
+        sort_direction: direction,
+    })
+}
+
+/// Split `input` on the first `sep`, returning None if no separator.
+/// Errors if there are more than 2 parts (i.e., multiple separators).
+fn split_once_max2<'a>(
+    input: &'a str,
+    sep: char,
+    original: &str,
+) -> Result<Option<(&'a str, &'a str)>, SortFieldsError> {
+    let mut iter = input.splitn(3, sep);
+    let first = iter.next().unwrap(); // always present
+    let second = match iter.next() {
+        Some(s) => s,
+        None => return Ok(None),
+    };
+    if iter.next().is_some() {
+        return Err(SortFieldsError::MalformedSchema(format!(
+            "mal-formatted sort schema '{}'",
+            original
+        )));
+    }
+    Ok(Some((first, second)))
+}
+
+/// Strip a leading `+` or `-` from a string, returning the direction and remainder.
+fn strip_direction_prefix(s: &str) -> (Option<i32>, &str) {
+    if let Some(rest) = s.strip_prefix('+') {
+        (
+            Some(SortColumnDirection::SortDirectionAscending as i32),
+            rest,
+        )
+    } else if let Some(rest) = s.strip_prefix('-') {
+        (
+            Some(SortColumnDirection::SortDirectionDescending as i32),
+            rest,
+        )
+    } else {
+        (None, s)
+    }
+}
+
+/// Strip a trailing `+` or `-` from a string, returning the direction and trimmed name.
+fn strip_direction_suffix(s: &str) -> (Option<i32>, &str) {
+    if s.len() > 1 {
+        if let Some(rest) = s.strip_suffix('+') {
+            return (
+                Some(SortColumnDirection::SortDirectionAscending as i32),
+                rest,
+            );
+        }
+        if let Some(rest) = s.strip_suffix('-') {
+            return (
+                Some(SortColumnDirection::SortDirectionDescending as i32),
+                rest,
+            );
+        }
+    }
+    (None, s)
+}
+
+/// Parse a sort direction string ("+" or "-") into the proto enum value.
+fn parse_direction(s: &str) -> Result<i32, SortFieldsError> {
+    match s {
+        "+" => Ok(SortColumnDirection::SortDirectionAscending as i32),
+        "-" => Ok(SortColumnDirection::SortDirectionDescending as i32),
+        _ => Err(SortFieldsError::UnknownSortDirection(format!(
+            "unknown sort direction '{}'",
+            s
+        ))),
+    }
+}
diff --git a/quickwit/quickwit-parquet-engine/src/sort_fields/tests.rs b/quickwit/quickwit-parquet-engine/src/sort_fields/tests.rs
new file mode 100644
index 00000000000..ee8e188916f
--- /dev/null
+++ b/quickwit/quickwit-parquet-engine/src/sort_fields/tests.rs
@@ -0,0 +1,963 @@
+// Copyright 2021-Present Datadog, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Test suite ported from Go `schemautils_test.go` plus strict V2-only enforcement tests.
+
+use quickwit_proto::sortschema::{SortColumnDirection, SortSchema};
+
+use super::column_type::ColumnTypeId;
+use super::display::{schema_to_string, schema_to_string_short};
+use super::equivalence::{equivalent_schemas, equivalent_schemas_for_compaction};
+use super::parser::parse_sort_fields;
+
+// ---------------------------------------------------------------------------
+// Helper
+// ---------------------------------------------------------------------------
+
+fn must_parse(s: &str) -> SortSchema {
+    parse_sort_fields(s).unwrap_or_else(|e| panic!("failed to parse '{}': {}", s, e))
+}
+
+// ---------------------------------------------------------------------------
+// Strict V2-only enforcement tests
+// ---------------------------------------------------------------------------
+
+#[test]
+fn test_v2_only_rejects_unversioned() {
+    // No version suffix defaults to version 0 -> rejected.
+    let err = parse_sort_fields("timestamp").unwrap_err();
+    let msg = err.to_string();
+    assert!(
+        msg.contains("unsupported") || msg.contains("version"),
+        "expected unsupported version error, got: {}",
+        msg
+    );
+}
+
+#[test]
+fn test_v2_only_rejects_v0() {
+    let err = parse_sort_fields("timestamp/V0").unwrap_err();
+    let msg = err.to_string();
+    assert!(
+        msg.contains("unsupported") || msg.contains("version"),
+        "expected unsupported version error, got: {}",
+        msg
+    );
+}
+
+#[test]
+fn test_v2_only_rejects_v1() {
+    let err = parse_sort_fields("timestamp/V1").unwrap_err();
+    let msg = err.to_string();
+    assert!(
+        msg.contains("unsupported") || msg.contains("version"),
+        "expected unsupported version error, got: {}",
+        msg
+    );
+}
+
+#[test]
+fn test_v2_only_accepts_v2() {
+    let schema = must_parse("timestamp/V2");
+    assert_eq!(schema.sort_version, 2);
+    assert_eq!(schema.column.len(), 1);
+    assert_eq!(schema.column[0].name, "timestamp");
+}
+
+#[test]
+fn test_v2_only_accepts_v3() {
+    let schema = must_parse("timestamp/V3");
+    assert_eq!(schema.sort_version, 3);
+}
+
+// ---------------------------------------------------------------------------
+// Port of Go TestStringToSchema -- error paths
+// ---------------------------------------------------------------------------
+
+#[test]
+fn test_string_to_schema_mangled() {
+    // Mangled schema with multiple `=`
+    assert!(parse_sort_fields("a=b=c/V2").is_err(), "must disallow >1 =");
+}
+
+#[test]
+fn test_string_to_schema_invalid_column_format() {
+    assert!(
+        parse_sort_fields("name:dense-int64:+:what-is-this?/V2").is_err(),
+        "must disallow invalid column formats"
+    );
+    assert!(
+        parse_sort_fields("name:dense-int64:+:what-is-this?:really?/V2").is_err(),
+        "must disallow invalid column formats"
+    );
+}
+
+#[test]
+fn test_string_to_schema_3part_errors() {
+    // Invalid type name.
+    assert!(
+        parse_sort_fields("name__i:invalid-type:+|timestamp/V2").is_err(),
+        "must disallow an invalid type"
+    );
+    // Type mismatch: suffix says int64 but explicit says string.
+    assert!(
+        parse_sort_fields("name__i:dense-string:+|timestamp/V2").is_err(),
+        "must disallow mismatch between type suffix and explicit type"
+    );
+    // Invalid sort direction.
+    assert!(
+        parse_sort_fields("name__i:dense-int64:invalid-sort-direction|timestamp/V2").is_err(),
+        "must disallow an invalid sort direction"
+    );
+}
+
+#[test]
+fn test_string_to_schema_2part_errors() {
+    // `name__x` no longer errors: unknown suffixes are treated as bare column names
+    // with default type (String), so `name__x` is a valid column name.
+    assert!(
+        parse_sort_fields("name__x:+|timestamp/V2").is_ok(),
+        "bare column names with unknown suffix-like patterns are now valid"
+    );
+    // Invalid sort direction.
+    assert!(
+        parse_sort_fields("name__i:invalid-sort-direction|timestamp/V2").is_err(),
+        "must disallow an invalid sort direction"
+    );
+}
+
+#[test]
+fn test_string_to_schema_missing_timestamp() {
+    // Semantically invalid: missing timestamp column.
+    assert!(
+        parse_sort_fields("name__i:dense-int64:+/V2").is_err(),
+        "must disallow schema with missing timestamp column"
+    );
+}
+
+#[test]
+fn test_string_to_schema_bad_version() {
+    // `/X` is not a valid version specification.
+    assert!(
+        parse_sort_fields("timestamp/X").is_err(),
+        "/X isn't a valid version specification"
+    );
+    // `/VX` -- V followed by non-numeric.
+    assert!(
+        parse_sort_fields("timestamp/VX").is_err(),
+        "/VX isn't a valid version specification"
+    );
+}
+
+// ---------------------------------------------------------------------------
+// LSM cutoff marker error paths (from Go TestStringToSchema)
+// ---------------------------------------------------------------------------
+
+#[test]
+fn test_lsm_cutoff_multiple_markers() {
+    assert!(
+        parse_sort_fields("service__s|&env__s|&source__s|timestamp/V2").is_err(),
+        "must disallow multiple LSM cutoff markers"
+    );
+}
+
+#[test]
+fn test_lsm_cutoff_double_ampersand() {
+    assert!(
+        parse_sort_fields("service__s|&&env__s|timestamp/V2").is_err(),
+        "must disallow multiple consecutive LSM cutoff markers"
+    );
+}
+
+#[test]
+fn test_lsm_cutoff_empty_after_marker() {
+    assert!(
+        parse_sort_fields("service__s|&|timestamp/V2").is_err(),
+        "must disallow empty column name after LSM cutoff marker"
+    );
+}
+
+#[test]
+fn test_lsm_cutoff_in_middle() {
+    assert!(
+        parse_sort_fields("service__s|env&__s|timestamp/V2").is_err(),
+        "must disallow LSM cutoff marker in middle of column name"
+    );
+}
+
+#[test]
+fn test_lsm_cutoff_single_column() {
+    assert!(
+        parse_sort_fields("&timestamp/V2").is_err(),
+        "must disallow LSM cutoff marker on single column schema"
+    );
+}
+
+#[test]
+fn test_lsm_cutoff_first_column() {
+    assert!(
+        parse_sort_fields("&service__s|env__s|timestamp/V2").is_err(),
+        "must disallow LSM cutoff marker on first column"
+    );
+}
+
+// ---------------------------------------------------------------------------
+// Port of Go TestStringToSchema -- valid schemas
+// ---------------------------------------------------------------------------
+
+#[test]
+fn test_string_to_schema_timestamp_only() {
+    let s = must_parse("timestamp/V2");
+    assert_eq!(s.column.len(), 1);
+    assert_eq!(s.column[0].name, "timestamp");
+    assert_eq!(s.column[0].column_type, ColumnTypeId::Int64 as u64);
+    assert_eq!(
+        s.column[0].sort_direction,
+        SortColumnDirection::SortDirectionDescending as i32
+    );
+}
+
+#[test]
+fn test_string_to_schema_named_timestamp() {
+    let s = must_parse("defaultTimestampSchema=timestamp/V2");
+    assert_eq!(s.name, "defaultTimestampSchema");
+    assert_eq!(s.column.len(), 1);
+    assert_eq!(s.column[0].name, "timestamp");
+    assert_eq!(
+        s.column[0].sort_direction,
+        SortColumnDirection::SortDirectionDescending as i32
+    );
+}
+
+#[test]
+fn test_string_to_schema_named_timestamp_explicit_direction() {
+    let s = must_parse("defaultTimestampSchema=timestamp:-/V2");
+    assert_eq!(s.name, "defaultTimestampSchema");
+    assert_eq!(s.column[0].name, "timestamp");
+    assert_eq!(
+        s.column[0].sort_direction,
+        SortColumnDirection::SortDirectionDescending as i32
+    );
+}
+
+#[test]
+fn test_string_to_schema_named_timestamp_explicit_type() {
+    let s = must_parse("defaultTimestampSchema=timestamp:dense-int64:-/V2");
+    assert_eq!(s.name, "defaultTimestampSchema");
+    assert_eq!(s.column[0].name, "timestamp");
+    assert_eq!(s.column[0].column_type, ColumnTypeId::Int64 as u64);
+    assert_eq!(
+        s.column[0].sort_direction,
+        SortColumnDirection::SortDirectionDescending as i32
+    );
+}
+
+#[test]
+fn test_string_to_schema_multi_column() {
+    let s = must_parse("testSchema=columnA__s|columnB__i:-|timestamp/V2");
+    assert_eq!(s.name, "testSchema");
+    assert_eq!(s.sort_version, 2);
+    assert_eq!(s.column.len(), 3);
+
+    // columnA__s: string, ascending (default)
+    assert_eq!(s.column[0].name, "columnA");
+    assert_eq!(s.column[0].column_type, ColumnTypeId::String as u64);
+    assert_eq!(
+        s.column[0].sort_direction,
+        SortColumnDirection::SortDirectionAscending as i32
+    );
+
+    // columnB__i: int64, descending (explicit)
+    assert_eq!(s.column[1].name, "columnB");
+    assert_eq!(s.column[1].column_type, ColumnTypeId::Int64 as u64);
+    assert_eq!(
+        s.column[1].sort_direction,
+        SortColumnDirection::SortDirectionDescending as i32
+    );
+
+    // timestamp: int64, descending (default)
+    assert_eq!(s.column[2].name, "timestamp");
+    assert_eq!(s.column[2].column_type, ColumnTypeId::Int64 as u64);
+    assert_eq!(
+        s.column[2].sort_direction,
+        SortColumnDirection::SortDirectionDescending as i32
+    );
+}
+
+#[test]
+fn test_string_to_schema_multi_column_explicit_type() {
+    let s = must_parse("testSchema=columnA__s:dense-string:+|columnB__i:-|timestamp/V2");
+    assert_eq!(s.column[0].column_type, ColumnTypeId::String as u64);
+    assert_eq!(
+        s.column[0].sort_direction,
+        SortColumnDirection::SortDirectionAscending as i32
+    );
+}
+
+// ---------------------------------------------------------------------------
+// SchemaToString and SchemaToStringShort
+// ---------------------------------------------------------------------------
+
+#[test]
+fn test_schema_to_string_full() {
+    let s = must_parse("testSchema=columnA__s|columnB__i:-|timestamp/V2");
+    let full = schema_to_string(&s);
+    // columnA has type String == default for a generic name, so no suffix in output.
+    // columnB has type Int64 != default String, so __i suffix is preserved.
+    assert_eq!(
+        full,
+        "testSchema=columnA:dense-string:+|columnB__i:dense-int64:-|timestamp:dense-int64:-/V2"
+    );
+}
+
+#[test]
+fn test_schema_to_string_short() {
+    let s = must_parse("testSchema=columnA__s|columnB__i:-|timestamp/V2");
+    let short = schema_to_string_short(&s);
+    // columnA has type String == default, so no suffix. columnB keeps __i (Int64 != default).
+    assert_eq!(short, "testSchema=columnA|columnB__i:-|timestamp/V2");
+}
+
+// ---------------------------------------------------------------------------
+// Round-trip tests
+// ---------------------------------------------------------------------------
+
+#[test]
+fn test_round_trip_short_form() {
+    // Inputs that round-trip to themselves: bare names (no suffix) and non-default
+    // typed columns already serialize without a suffix change.
+    let exact_round_trip_cases = [
+        "timestamp/V2",
+        // `service` with no suffix: default String, serializes as `service` (no suffix).
+        "service|timestamp/V2",
+        "service|env|timestamp/V2",
+        // columnB__i: Int64 != default String, keeps __i suffix.
+        "testSchema=columnA|columnB__i:-|timestamp/V2",
+    ];
+    for input in exact_round_trip_cases {
+        let parsed = must_parse(input);
+        let short = schema_to_string_short(&parsed);
+        assert_eq!(short, input, "short round-trip failed for '{}'", input);
+
+        // Also verify full round-trip: parse(to_string(schema)) == schema.
+        let full = schema_to_string(&parsed);
+        let reparsed = must_parse(&full);
+        assert_eq!(
+            parsed.column.len(),
+            reparsed.column.len(),
+            "column count mismatch after full round-trip for '{}'",
+            input
+        );
+        for (a, b) in parsed.column.iter().zip(reparsed.column.iter()) {
+            assert_eq!(a.name, b.name);
+            assert_eq!(a.column_type, b.column_type);
+            assert_eq!(a.sort_direction, b.sort_direction);
+        }
+        assert_eq!(parsed.sort_version, reparsed.sort_version);
+        assert_eq!(parsed.lsm_comparison_cutoff, reparsed.lsm_comparison_cutoff);
+    }
+
+    // Inputs with explicit __s suffix on default-String columns: the short form drops
+    // the suffix (it is not needed), but the semantic content is preserved.
+    let semantic_round_trip_cases = [
+        ("service__s|timestamp/V2", "service|timestamp/V2"),
+        ("service__s|env__s|timestamp/V2", "service|env|timestamp/V2"),
+        (
+            "testSchema=columnA__s|columnB__i:-|timestamp/V2",
+            "testSchema=columnA|columnB__i:-|timestamp/V2",
+        ),
+    ];
+    for (input, expected_short) in semantic_round_trip_cases {
+        let parsed = must_parse(input);
+        let short = schema_to_string_short(&parsed);
+        assert_eq!(short, expected_short, "short form mismatch for '{}'", input);
+
+        // Verify semantic round-trip: parse the short output and compare protos.
+        let reparsed = must_parse(&short);
+        assert_eq!(
+            parsed.column.len(),
+            reparsed.column.len(),
+            "column count mismatch after semantic round-trip for '{}'",
+            input
+        );
+        for (a, b) in parsed.column.iter().zip(reparsed.column.iter()) {
+            assert_eq!(a.name, b.name);
+            assert_eq!(a.column_type, b.column_type);
+            assert_eq!(a.sort_direction, b.sort_direction);
+        }
+        assert_eq!(parsed.sort_version, reparsed.sort_version);
+        assert_eq!(parsed.lsm_comparison_cutoff, reparsed.lsm_comparison_cutoff);
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Port of Go TestEquivalentSchemas
+// ---------------------------------------------------------------------------
+
+#[test]
+fn test_equivalent_schemas_identical() {
+    let a = must_parse("timestamp/V2");
+    let b = must_parse("timestamp/V2");
+    assert!(equivalent_schemas(&a, &b));
+    assert!(equivalent_schemas_for_compaction(&a, &b));
+}
+
+#[test]
+fn test_equivalent_schemas_different_column_counts() {
+    let a = must_parse("service__s|timestamp/V2");
+    let b = must_parse("timestamp/V2");
+    assert!(!equivalent_schemas(&a, &b));
+    assert!(!equivalent_schemas_for_compaction(&a, &b));
+}
+
+#[test]
+fn test_equivalent_schemas_same_columns_different_names() {
+    let a = must_parse("service__s|timestamp/V2");
+    let b = must_parse("serviceSchema=service__s|timestamp/V2");
+    assert!(equivalent_schemas(&a, &b));
+    assert!(equivalent_schemas_for_compaction(&a, &b));
+}
+
+#[test]
+fn test_equivalent_schemas_different_column_order() {
+    let a = must_parse("env__s|service__s|timestamp/V2");
+    let b = must_parse("serviceSchema=service__s|timestamp/V2");
+    assert!(!equivalent_schemas(&a, &b));
+    assert!(!equivalent_schemas_for_compaction(&a, &b));
+}
+
+#[test]
+fn test_equivalent_schemas_different_versions() {
+    let a = must_parse("service__s|timestamp/V2");
+    let b = must_parse("serviceSchema=service__s|timestamp/V3");
+    assert!(!equivalent_schemas(&a, &b));
+    assert!(!equivalent_schemas_for_compaction(&a, &b));
+}
+
+#[test]
+fn test_equivalent_schemas_different_lsm_cutoffs() {
+    // Different LSM cutoffs: affects EquivalentSchemas but NOT EquivalentSchemasForCompaction.
+    let a = must_parse("service__s|&env__s|timestamp/V2");
+    let b = must_parse("service__s|env__s|timestamp/V2");
+    assert!(
+        !equivalent_schemas(&a, &b),
+        "different LSM cutoffs should not be equivalent"
+    );
+    assert!(
+        equivalent_schemas_for_compaction(&a, &b),
+        "different LSM cutoffs should be equivalent for compaction"
+    );
+}
+
+#[test]
+fn test_equivalent_schemas_different_cutoff_positions() {
+    let a = must_parse("service__s|&env__s|timestamp/V2");
+    let b = must_parse("service__s|env__s|&timestamp/V2");
+    assert!(!equivalent_schemas(&a, &b));
+    assert!(equivalent_schemas_for_compaction(&a, &b));
+}
+
+#[test]
+fn test_equivalent_schemas_identical_lsm_cutoffs() {
+    let a = must_parse("service__s|&env__s|timestamp/V2");
+    let b = must_parse("service__s|&env__s|timestamp/V2");
+    assert!(equivalent_schemas(&a, &b));
+    assert!(equivalent_schemas_for_compaction(&a, &b));
+}
+
+// ---------------------------------------------------------------------------
+// Port of Go TestStringToSchemaWithLSMCutoff
+// ---------------------------------------------------------------------------
+
+#[test]
+fn test_lsm_cutoff_after_first_column() {
+    let s = must_parse("service__s|&env__s|timestamp/V2");
+    assert_eq!(s.lsm_comparison_cutoff, 1);
+    assert_eq!(s.column[1].name, "env"); // "&" stripped
+}
+
+#[test]
+fn test_lsm_cutoff_after_second_column() {
+    let s = must_parse("service__s|env__s|&source__s|timestamp/V2");
+    assert_eq!(s.lsm_comparison_cutoff, 2);
+    assert_eq!(s.column[2].name, "source");
+}
+
+#[test]
+fn test_lsm_cutoff_before_timestamp() {
+    let s = must_parse("service__s|env__s|source__s|&timestamp/V2");
+    assert_eq!(s.lsm_comparison_cutoff, 3);
+    assert_eq!(s.column[3].name, "timestamp");
+}
+
+#[test]
+fn test_lsm_cutoff_named_schema() {
+    let s = must_parse("testSchema=service__s|&env__s|timestamp/V2");
+    assert_eq!(s.name, "testSchema");
+    assert_eq!(s.lsm_comparison_cutoff, 1);
+}
+
+#[test]
+fn test_lsm_cutoff_with_version() {
+    let s = must_parse("service__s|&env__s|timestamp/V2");
+    assert_eq!(s.lsm_comparison_cutoff, 1);
+    assert_eq!(s.sort_version, 2);
+}
+
+#[test]
+fn test_lsm_cutoff_with_explicit_type_direction() {
+    let s = must_parse("service__s:dense-string:+|&env__s:dense-string:+|timestamp/V2");
+    assert_eq!(s.lsm_comparison_cutoff, 1);
+    assert_eq!(
+        s.column[0].sort_direction,
+        SortColumnDirection::SortDirectionAscending as i32
+    );
+}
+
+#[test]
+fn test_no_lsm_cutoff() {
+    let s = must_parse("service__s|env__s|timestamp/V2");
+    assert_eq!(s.lsm_comparison_cutoff, 0);
+}
+
+// ---------------------------------------------------------------------------
+// Port of Go TestSchemaToStringWithLSMCutoff
+// ---------------------------------------------------------------------------
+
+#[test]
+fn test_schema_to_string_preserves_cutoff_marker_short() {
+    let s = must_parse("service__s|&env__s|timestamp/V2");
+    // String columns with default type serialize without the __s suffix.
+    assert_eq!(schema_to_string_short(&s), "service|&env|timestamp/V2");
+}
+
+#[test]
+fn test_schema_to_string_preserves_cutoff_marker_full() {
+    let s = must_parse("service__s|&env__s|timestamp/V2");
+    // String columns with default type serialize without the __s suffix in full form too.
+    let expected = "service:dense-string:+|&env:dense-string:+|timestamp:dense-int64:-/V2";
+    assert_eq!(schema_to_string(&s), expected);
+}
+
+#[test]
+fn test_schema_to_string_named_with_cutoff() {
+    let s = must_parse("testSchema=service__s|&env__s|timestamp/V2");
+    // String columns with default type serialize without the __s suffix.
+    assert_eq!(
+        schema_to_string_short(&s),
+        "testSchema=service|&env|timestamp/V2"
+    );
+}
+
+#[test]
+fn test_schema_to_string_no_cutoff() {
+    let s = must_parse("service__s|env__s|timestamp/V2");
+    // String columns with default type serialize without the __s suffix.
+    assert_eq!(schema_to_string_short(&s), "service|env|timestamp/V2");
+}
+
+// ---------------------------------------------------------------------------
+// Port of Go TestLSMCutoffRoundTrip
+// ---------------------------------------------------------------------------
+
+#[test]
+fn test_lsm_cutoff_round_trip() {
+    // Pairs of (input, expected_short_output).
+    // String columns with default type serialize without the __s suffix, so inputs
+    // using __s produce shorter output that is semantically equivalent.
+    let test_cases = [
+        (
+            "service__s|&env__s|timestamp/V2",
+            "service|&env|timestamp/V2",
+        ),
+        (
+            "service__s|env__s|&source__s|timestamp/V2",
+            "service|env|&source|timestamp/V2",
+        ),
+        (
+            "service__s|env__s|source__s|&timestamp/V2",
+            "service|env|source|&timestamp/V2",
+        ),
+        (
+            "testSchema=service__s|&env__s|timestamp/V2",
+            "testSchema=service|&env|timestamp/V2",
+        ),
+        ("service__s|env__s|timestamp/V2", "service|env|timestamp/V2"),
+    ];
+    for (input, expected_short) in test_cases {
+        let parsed = must_parse(input);
+        let result = schema_to_string_short(&parsed);
+        assert_eq!(result, expected_short, "round-trip failed for '{}'", input);
+
+        // Verify the output parses back to the same proto (semantic round-trip).
+        let reparsed = must_parse(&result);
+        assert_eq!(
+            parsed.column.len(),
+            reparsed.column.len(),
+            "column count mismatch after round-trip for '{}'",
+            input
+        );
+        for (a, b) in parsed.column.iter().zip(reparsed.column.iter()) {
+            assert_eq!(a.name, b.name);
+            assert_eq!(a.column_type, b.column_type);
+            assert_eq!(a.sort_direction, b.sort_direction);
+        }
+        assert_eq!(
+            parsed.lsm_comparison_cutoff, reparsed.lsm_comparison_cutoff,
+            "LSM cutoff mismatch for '{}'",
+            input
+        );
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Direction prefix/suffix tests
+// ---------------------------------------------------------------------------
+
+#[test]
+fn test_direction_prefix_ascending() {
+    let s = must_parse("+service__s|timestamp/V2");
+    assert_eq!(s.column[0].name, "service");
+    assert_eq!(
+        s.column[0].sort_direction,
+        SortColumnDirection::SortDirectionAscending as i32
+    );
+}
+
+#[test]
+fn test_direction_prefix_descending() {
+    let s = must_parse("-service__s|timestamp/V2");
+    assert_eq!(s.column[0].name, "service");
+    assert_eq!(
+        s.column[0].sort_direction,
+        SortColumnDirection::SortDirectionDescending as i32
+    );
+}
+
+#[test]
+fn test_direction_suffix_ascending() {
+    let s = must_parse("service__s+|timestamp/V2");
+    assert_eq!(s.column[0].name, "service");
+    assert_eq!(
+        s.column[0].sort_direction,
+        SortColumnDirection::SortDirectionAscending as i32
+    );
+}
+
+#[test]
+fn test_direction_suffix_descending() {
+    let s = must_parse("service__s-|timestamp/V2");
+    assert_eq!(s.column[0].name, "service");
+    assert_eq!(
+        s.column[0].sort_direction,
+        SortColumnDirection::SortDirectionDescending as i32
+    );
+}
+
+#[test]
+fn test_direction_prefix_descending_on_timestamp() {
+    // Explicit descending prefix on timestamp matches the default and is accepted.
+    let s = must_parse("service__s|-timestamp/V2");
+    assert_eq!(s.column[1].name, "timestamp");
+    assert_eq!(
+        s.column[1].sort_direction,
+        SortColumnDirection::SortDirectionDescending as i32
+    );
+}
+
+#[test]
+fn test_direction_prefix_ascending_on_timestamp_rejected() {
+    // Ascending timestamp is rejected by validation (timestamp must be descending).
+    assert!(
+        parse_sort_fields("service__s|+timestamp/V2").is_err(),
+        "ascending timestamp must be rejected by validation"
+    );
+}
+
+#[test]
+fn test_direction_suffix_on_timestamp() {
+    let s = must_parse("service__s|timestamp-/V2");
+    assert_eq!(s.column[1].name, "timestamp");
+    assert_eq!(
+        s.column[1].sort_direction,
+        SortColumnDirection::SortDirectionDescending as i32
+    );
+}
+
+#[test]
+fn test_direction_prefix_and_suffix_is_error() {
+    // +name+ : direction on both sides
+    assert!(
+        parse_sort_fields("+service__s+|timestamp/V2").is_err(),
+        "must reject direction on both prefix and suffix"
+    );
+    // -name- : same direction both sides, still error
+    assert!(
+        parse_sort_fields("-service__s-|timestamp/V2").is_err(),
+        "must reject direction on both prefix and suffix (same direction)"
+    );
+    // +name- : conflicting directions
+    assert!(
+        parse_sort_fields("+service__s-|timestamp/V2").is_err(),
+        "must reject conflicting direction prefix and suffix"
+    );
+}
+
+#[test]
+fn test_direction_prefix_with_colon_direction_is_error() {
+    // +name:+ : prefix direction + colon direction
+    assert!(
+        parse_sort_fields("+service__s:+|timestamp/V2").is_err(),
+        "must reject direction in both prefix and colon form"
+    );
+}
+
+#[test]
+fn test_direction_suffix_with_colon_direction_is_error() {
+    // name-:- : suffix direction + colon direction
+    assert!(
+        parse_sort_fields("service__s-:-|timestamp/V2").is_err(),
+        "must reject direction in both suffix and colon form"
+    );
+}
+
+#[test]
+fn test_direction_prefix_multi_column() {
+    let s = must_parse("-metric_name__s|+host__s|timestamp/V2");
+    assert_eq!(
+        s.column[0].sort_direction,
+        SortColumnDirection::SortDirectionDescending as i32
+    );
+    assert_eq!(
+        s.column[1].sort_direction,
+        SortColumnDirection::SortDirectionAscending as i32
+    );
+    assert_eq!(
+        s.column[2].sort_direction,
+        SortColumnDirection::SortDirectionDescending as i32 // default for timestamp
+    );
+}
+
+#[test]
+fn test_direction_prefix_with_lsm_cutoff() {
+    // &+env__s : cutoff marker then direction prefix
+    let s = must_parse("service__s|&+env__s|timestamp/V2");
+    assert_eq!(s.lsm_comparison_cutoff, 1);
+    assert_eq!(s.column[1].name, "env");
+    assert_eq!(
+        s.column[1].sort_direction,
+        SortColumnDirection::SortDirectionAscending as i32
+    );
+}
+
+#[test]
+fn test_direction_suffix_with_lsm_cutoff() {
+    let s = must_parse("service__s|&env__s-|timestamp/V2");
+    assert_eq!(s.lsm_comparison_cutoff, 1);
+    assert_eq!(s.column[1].name, "env");
+    assert_eq!(
+        s.column[1].sort_direction,
+        SortColumnDirection::SortDirectionDescending as i32
+    );
+}
+
+// ---------------------------------------------------------------------------
+// Bare name parsing (no type suffix — uses defaults)
+// ---------------------------------------------------------------------------
+
+#[test]
+fn test_bare_names_default_to_string() {
+    let s = must_parse("service|env|host|timestamp/V2");
+    assert_eq!(s.column[0].name, "service");
+    assert_eq!(s.column[0].column_type, ColumnTypeId::String as u64);
+    assert_eq!(s.column[1].name, "env");
+    assert_eq!(s.column[1].column_type, ColumnTypeId::String as u64);
+    assert_eq!(s.column[2].name, "host");
+    assert_eq!(s.column[2].column_type, ColumnTypeId::String as u64);
+}
+
+#[test]
+fn test_bare_timestamp_defaults_to_int64() {
+    let s = must_parse("service|timestamp/V2");
+    assert_eq!(s.column[1].name, "timestamp");
+    assert_eq!(s.column[1].column_type, ColumnTypeId::Int64 as u64);
+}
+
+#[test]
+fn test_bare_timeseries_id_defaults_to_int64() {
+    let s = must_parse("service|timeseries_id|timestamp/V2");
+    assert_eq!(s.column[1].name, "timeseries_id");
+    assert_eq!(s.column[1].column_type, ColumnTypeId::Int64 as u64);
+}
+
+#[test]
+fn test_bare_tiebreaker_defaults_to_int64() {
+    let s = must_parse("service|timestamp|tiebreaker/V2");
+    assert_eq!(s.column[2].name, "tiebreaker");
+    assert_eq!(s.column[2].column_type, ColumnTypeId::Int64 as u64);
+}
+
+#[test]
+fn test_bare_metric_value_defaults_to_float64() {
+    let s = must_parse("metric_value|timestamp/V2");
+    assert_eq!(s.column[0].name, "metric_value");
+    assert_eq!(s.column[0].column_type, ColumnTypeId::Float64 as u64);
+}
+
+#[test]
+fn test_bare_and_suffixed_produce_same_proto() {
+    let bare = must_parse("metric_name|host|timeseries_id|timestamp/V2");
+    let suffixed = must_parse("metric_name__s|host__s|timeseries_id__i|timestamp/V2");
+    assert_eq!(bare.column.len(), suffixed.column.len());
+    for (a, b) in bare.column.iter().zip(suffixed.column.iter()) {
+        assert_eq!(a.name, b.name, "names should match");
+        assert_eq!(
+            a.column_type, b.column_type,
+            "types should match for {}",
+            a.name
+        );
+        assert_eq!(
+            a.sort_direction, b.sort_direction,
+            "directions should match for {}",
+            a.name
+        );
+    }
+}
+
+#[test]
+fn test_suffix_overrides_default() {
+    // metric_value defaults to Float64, but __i suffix forces Int64
+    let s = must_parse("metric_value__i|timestamp/V2");
+    assert_eq!(s.column[0].name, "metric_value");
+    assert_eq!(s.column[0].column_type, ColumnTypeId::Int64 as u64);
+}
+
+#[test]
+fn test_display_omits_default_suffix() {
+    let s = must_parse("metric_name|host|timestamp/V2");
+    let short = schema_to_string_short(&s);
+    // All columns use default types, so no suffixes in output.
+    assert_eq!(short, "metric_name|host|timestamp/V2");
+}
+
+#[test]
+fn test_display_includes_non_default_suffix() {
+    // Force host to Int64 (non-default for bare "host" which defaults to String)
+    let s = must_parse("metric_name|host__i|timestamp/V2");
+    let short = schema_to_string_short(&s);
+    assert_eq!(short, "metric_name|host__i|timestamp/V2");
+}
+
+// ---------------------------------------------------------------------------
+// timeseries_id handling
+// ---------------------------------------------------------------------------
+
+#[test]
+fn test_timeseries_id_as_int64() {
+    let s = must_parse("metric_name__s|host__s|timeseries_id__i|timestamp/V2");
+    assert_eq!(s.column.len(), 4);
+
+    // timeseries_id__i should be TypeIDInt64, ascending.
+    let ts_id_col = &s.column[2];
+    assert_eq!(ts_id_col.name, "timeseries_id");
+    assert_eq!(ts_id_col.column_type, ColumnTypeId::Int64 as u64);
+    assert_eq!(
+        ts_id_col.sort_direction,
+        SortColumnDirection::SortDirectionAscending as i32
+    );
+}
+
+// ---------------------------------------------------------------------------
+// Float and sketch column types
+// ---------------------------------------------------------------------------
+
+#[test]
+fn test_float_column() {
+    let s = must_parse("value__nf|timestamp/V2");
+    assert_eq!(s.column[0].column_type, ColumnTypeId::Float64 as u64);
+    assert_eq!(
+        s.column[0].sort_direction,
+        SortColumnDirection::SortDirectionAscending as i32
+    );
+}
+
+#[test]
+fn test_sketch_column() {
+    let s = must_parse("latency__sk|timestamp/V2");
+    assert_eq!(s.column[0].column_type, ColumnTypeId::Sketch as u64);
+}
+
+// ---------------------------------------------------------------------------
+// SchemasToString / SchemasToStringShort multi-schema (convenience)
+// ---------------------------------------------------------------------------
+
+#[test]
+fn test_schemas_to_string() {
+    let schema1 = must_parse("test=key1__s|timestamp/V2");
+    let schema2 = must_parse("key2__i|timestamp/V2");
+
+    let schemas = [schema1, schema2];
+    let strings: Vec<String> = schemas.iter().map(schema_to_string).collect();
+    let actual = strings.join(",");
+
+    // key1 has type String == default, so no suffix. key2 has type Int64 != default, keeps __i.
+    assert_eq!(
+        actual,
+        "test=key1:dense-string:+|timestamp:dense-int64:-/V2,key2__i:dense-int64:+|timestamp:\
+         dense-int64:-/V2"
+    );
+}
+
+#[test]
+fn test_schemas_to_string_short() {
+    let schema1 = must_parse("test=key1__s|timestamp/V2");
+    let schema2 = must_parse("key2__i|timestamp/V2");
+
+    let schemas = [schema1, schema2];
+    let strings: Vec<String> = schemas.iter().map(schema_to_string_short).collect();
+    let actual = strings.join(",");
+
+    // key1 has type String == default, so no suffix. key2 has type Int64 != default, keeps __i.
+    assert_eq!(actual, "test=key1|timestamp/V2,key2__i|timestamp/V2");
+}
+
+// ---------------------------------------------------------------------------
+// ColumnTypeId TryFrom<u64> (proto deserialization path)
+// ---------------------------------------------------------------------------
+
+#[test]
+fn test_column_type_try_from_u64_valid() {
+    assert_eq!(ColumnTypeId::try_from(2u64).unwrap(), ColumnTypeId::Int64);
+    assert_eq!(
+        ColumnTypeId::try_from(10u64).unwrap(),
+        ColumnTypeId::Float64
+    );
+    assert_eq!(ColumnTypeId::try_from(14u64).unwrap(), ColumnTypeId::String);
+    assert_eq!(ColumnTypeId::try_from(17u64).unwrap(), ColumnTypeId::Sketch);
+    assert_eq!(
+        ColumnTypeId::try_from(20u64).unwrap(),
+        ColumnTypeId::CpcSketch
+    );
+    assert_eq!(
+        ColumnTypeId::try_from(22u64).unwrap(),
+        ColumnTypeId::ItemSketch
+    );
+}
+
+#[test]
+fn test_column_type_try_from_u64_invalid() {
+    assert!(ColumnTypeId::try_from(0u64).is_err());
+    assert!(ColumnTypeId::try_from(1u64).is_err());
+    assert!(ColumnTypeId::try_from(99u64).is_err());
+}
diff --git a/quickwit/quickwit-parquet-engine/src/sort_fields/validation.rs b/quickwit/quickwit-parquet-engine/src/sort_fields/validation.rs
new file mode 100644
index 00000000000..3a33db82f6f
--- /dev/null
+++ b/quickwit/quickwit-parquet-engine/src/sort_fields/validation.rs
@@ -0,0 +1,114 @@
+// Copyright 2021-Present Datadog, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Sort schema validation -- direct port of Go `ValidateSchema`.
+
+use std::collections::HashSet;
+
+use quickwit_proto::sortschema::{SortColumnDirection, SortSchema};
+
+use super::SortFieldsError;
+use super::column_type::{ColumnTypeId, TIEBREAKER, default_is_descending};
+
+/// Name used for the special skip-builder schema that does not require timestamp.
+const DEFAULT_SKIP_BUILDER_SCHEMA_NAME: &str = "defaultSkipBuilderSchema";
+
+/// Check if a bare column name is a timestamp column (defaults to descending).
+fn is_timestamp_column(name: &str) -> bool {
+    default_is_descending(name)
+}
+
+/// Validate a sort schema, enforcing all rules from Go `ValidateSchema`.
+///
+/// Rules:
+/// - Schema must have at least one column.
+/// - No duplicate column names.
+/// - Sort direction must not be Unknown.
+/// - `timestamp` must be present (unless schema name is `defaultSkipBuilderSchema`).
+/// - `timestamp` must be Int64 and descending (unless it's a msgid schema).
+/// - `timestamp` must come before `tiebreaker`.
+/// - No non-tiebreaker columns may appear after `timestamp`.
+pub fn validate_schema(schema: &SortSchema) -> Result<(), SortFieldsError> {
+    if schema.column.is_empty() {
+        return Err(SortFieldsError::ValidationError("empty schema".to_string()));
+    }
+
+    let mut seen: HashSet<&str> = HashSet::new();
+    let is_msgid = schema.version == 2 || schema.name == "defaultMsgIDsSchema";
+
+    for col in &schema.column {
+        let name = col.name.as_str();
+
+        if seen.contains(name) {
+            return Err(SortFieldsError::ValidationError(format!(
+                "column {} is duplicated in schema",
+                name
+            )));
+        }
+        seen.insert(name);
+
+        if col.sort_direction == SortColumnDirection::SortDirectionUnknown as i32 {
+            return Err(SortFieldsError::ValidationError(format!(
+                "column {} does not specify a sort direction in schema",
+                name
+            )));
+        }
+
+        let has_seen_timestamp = seen.iter().any(|s| is_timestamp_column(s));
+
+        if is_timestamp_column(name) {
+            if seen.contains(TIEBREAKER) {
+                return Err(SortFieldsError::ValidationError(format!(
+                    "{} column must come before {} in schema",
+                    name, TIEBREAKER
+                )));
+            }
+            if col.sort_direction != SortColumnDirection::SortDirectionDescending as i32
+                && !is_msgid
+            {
+                return Err(SortFieldsError::ValidationError(format!(
+                    "{} column must sorted in descending order in schema",
+                    name
+                )));
+            }
+            if col.column_type != ColumnTypeId::Int64 as u64 {
+                return Err(SortFieldsError::ValidationError(format!(
+                    "{} column must be of type int64 in schema",
+                    name
+                )));
+            }
+        } else if name == TIEBREAKER {
+            if !has_seen_timestamp {
+                return Err(SortFieldsError::ValidationError(format!(
+                    "timestamp column must come before {} in schema",
+                    TIEBREAKER
+                )));
+            }
+        } else if has_seen_timestamp && !is_msgid {
+            return Err(SortFieldsError::ValidationError(format!(
+                "column {} is after timestamp but timestamp must be the last schema column",
+                name
+            )));
+        }
+    }
+
+    let has_timestamp = schema.column.iter().any(|c| is_timestamp_column(&c.name));
+    if !has_timestamp && schema.name != DEFAULT_SKIP_BUILDER_SCHEMA_NAME {
+        return Err(SortFieldsError::ValidationError(
+            "timestamp column is required, but is missing from schema".to_string(),
+        ));
+    }
+
+    Ok(())
+}
diff --git a/quickwit/quickwit-parquet-engine/src/sort_fields/window.rs b/quickwit/quickwit-parquet-engine/src/sort_fields/window.rs
new file mode 100644
index 00000000000..b1ad896e31f
--- /dev/null
+++ b/quickwit/quickwit-parquet-engine/src/sort_fields/window.rs
@@ -0,0 +1,240 @@
+// Copyright 2021-Present Datadog, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Canonical time-window functions for metrics compaction.
+//!
+//! `window_start` is the foundational time-partitioning function used by
+//! ingestion (Phase 32), merge policy (Phase 33), and compaction planning
+//! (Phase 35). Correctness at boundary conditions -- especially negative
+//! timestamps and zero-crossing -- is critical because an off-by-one error
+//! silently misroutes data to wrong windows.
+//!
+//! The implementation uses `rem_euclid` instead of the `%` operator to handle
+//! negative timestamps correctly. Standard `%` truncates toward zero, which
+//! gives wrong results for negative inputs:
+//!   - `-1 % 900 = -1` (wrong: would compute window_start as 0)
+//!   - `(-1i64).rem_euclid(900) = 899` (correct: window_start = -1 - 899 = -900)
+
+use chrono::{DateTime, Utc};
+
+use super::SortFieldsError;
+
+/// Validate that a window duration evenly divides one hour (3600 seconds).
+///
+/// This is ADR-003 invariant TW-2: all windows within an hour must have
+/// identical boundaries regardless of when counting starts. 3600 has 45
+/// positive divisors (1, 2, 3, ..., 1800, 3600). Any of these are accepted.
+/// In practice, metrics systems use durations >= 60s: 60, 120, 180, 240,
+/// 300, 360, 600, 720, 900, 1200, 1800, 3600.
+///
+/// A duration of 0 is rejected as nonsensical. Durations that do not evenly
+/// divide 3600 are rejected because they would produce inconsistent window
+/// boundaries across different starting points within an hour.
+pub fn validate_window_duration(duration_secs: u32) -> Result<(), SortFieldsError> {
+    if duration_secs == 0 {
+        return Err(SortFieldsError::InvalidWindowDuration {
+            duration_secs,
+            reason: "must be positive",
+        });
+    }
+    if 3600 % duration_secs != 0 {
+        return Err(SortFieldsError::InvalidWindowDuration {
+            duration_secs,
+            reason: "must evenly divide 3600 (one hour)",
+        });
+    }
+    Ok(())
+}
+
+/// Compute the start of the time window containing the given timestamp.
+///
+/// Uses `rem_euclid` for correct handling of negative timestamps (before Unix
+/// epoch). Standard `%` truncates toward zero: `-1 % 900 = -1` (wrong).
+/// `rem_euclid` always returns non-negative: `(-1i64).rem_euclid(900) = 899`.
+/// So `window_start(-1, 900) = -1 - 899 = -900` (correct: timestamp -1 is in
+/// window [-900, 0)).
+///
+/// # Invariants (verified by proptest)
+/// - Window start is aligned: `window_start % duration == 0`
+/// - Timestamp is contained: `window_start <= timestamp < window_start + duration`
+/// - Deterministic: same inputs always produce same output
+///
+/// # Errors
+/// Returns `SortFieldsError::WindowStartOutOfRange` if the computed start
+/// timestamp cannot be represented as a `DateTime<Utc>`.
+pub fn window_start(
+    timestamp_secs: i64,
+    duration_secs: i64,
+) -> Result<DateTime<Utc>, SortFieldsError> {
+    debug_assert!(duration_secs > 0, "window duration must be positive");
+    // TW-2 (ADR-003): window duration must evenly divide one hour.
+    // This ensures window boundaries align across hours and days.
+    debug_assert!(
+        3600 % duration_secs == 0,
+        "TW-2 violated: duration_secs={} does not divide 3600",
+        duration_secs
+    );
+    let remainder = timestamp_secs.rem_euclid(duration_secs);
+    let start_secs = timestamp_secs - remainder;
+    DateTime::from_timestamp(start_secs, 0).ok_or(SortFieldsError::WindowStartOutOfRange {
+        timestamp_secs: start_secs,
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use proptest::prelude::*;
+
+    use super::*;
+
+    // -----------------------------------------------------------------------
+    // Proptest properties
+    // -----------------------------------------------------------------------
+
+    proptest! {
+        #[test]
+        fn window_start_is_aligned(
+            ts in -1_000_000_000i64..2_000_000_000i64,
+            dur in prop::sample::select(vec![60i64, 120, 180, 240, 300, 360,
+                                             600, 720, 900, 1200, 1800, 3600])
+        ) {
+            let ws = window_start(ts, dur).unwrap();
+            let ws_secs = ws.timestamp();
+            // window_start is aligned to duration
+            prop_assert_eq!(ws_secs.rem_euclid(dur), 0);
+            // timestamp is within [window_start, window_start + duration)
+            prop_assert!(ws_secs <= ts);
+            prop_assert!(ts < ws_secs + dur);
+        }
+
+        #[test]
+        fn window_start_is_deterministic(
+            ts in -1_000_000_000i64..2_000_000_000i64,
+            dur in prop::sample::select(vec![60i64, 300, 900, 3600])
+        ) {
+            let ws1 = window_start(ts, dur).unwrap();
+            let ws2 = window_start(ts, dur).unwrap();
+            prop_assert_eq!(ws1, ws2);
+        }
+
+        #[test]
+        fn adjacent_windows_do_not_overlap(
+            ts in 0i64..1_000_000_000i64,
+            dur in prop::sample::select(vec![60i64, 300, 900, 3600])
+        ) {
+            let ws = window_start(ts, dur).unwrap();
+            let next_ws = window_start(ws.timestamp() + dur, dur).unwrap();
+            // Next window starts exactly at current window end
+            prop_assert_eq!(next_ws.timestamp(), ws.timestamp() + dur);
+        }
+    }
+
+    // -----------------------------------------------------------------------
+    // Unit tests: edge cases
+    // -----------------------------------------------------------------------
+
+    #[test]
+    fn test_negative_timestamp_crossing() {
+        let ws = window_start(-1, 900).unwrap();
+        assert_eq!(ws.timestamp(), -900);
+    }
+
+    #[test]
+    fn test_zero_timestamp() {
+        let ws = window_start(0, 900).unwrap();
+        assert_eq!(ws.timestamp(), 0);
+    }
+
+    #[test]
+    fn test_exactly_on_boundary() {
+        let ws = window_start(900, 900).unwrap();
+        assert_eq!(ws.timestamp(), 900);
+    }
+
+    #[test]
+    fn test_one_before_boundary() {
+        let ws = window_start(899, 900).unwrap();
+        assert_eq!(ws.timestamp(), 0);
+    }
+
+    #[test]
+    fn test_large_negative_timestamp() {
+        let ws = window_start(-3601, 3600).unwrap();
+        assert_eq!(ws.timestamp(), -7200);
+    }
+
+    #[test]
+    fn test_60s_window() {
+        let ws = window_start(1_700_000_042, 60).unwrap();
+        assert_eq!(ws.timestamp(), 1_700_000_040);
+    }
+
+    // -----------------------------------------------------------------------
+    // Validation tests
+    // -----------------------------------------------------------------------
+
+    #[test]
+    fn test_valid_window_durations() {
+        let valid = [60, 120, 180, 240, 300, 360, 600, 720, 900, 1200, 1800, 3600];
+        for dur in valid {
+            assert!(
+                validate_window_duration(dur).is_ok(),
+                "duration {} should be valid",
+                dur
+            );
+        }
+    }
+
+    #[test]
+    fn test_invalid_window_durations() {
+        // None of these evenly divide 3600.
+        let invalid = [0, 7, 11, 13, 17, 700, 1000, 1500, 2000, 2400, 7200];
+        for dur in invalid {
+            assert!(
+                validate_window_duration(dur).is_err(),
+                "duration {} should be invalid",
+                dur
+            );
+        }
+    }
+
+    #[test]
+    fn test_small_valid_divisors_also_accepted() {
+        // The function accepts all positive divisors of 3600, not just >= 60.
+        let small_valid = [
+            1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 15, 16, 18, 20, 24, 25, 30, 36, 40, 45, 48, 50,
+        ];
+        for dur in small_valid {
+            assert!(
+                validate_window_duration(dur).is_ok(),
+                "duration {} should be valid (divides 3600)",
+                dur
+            );
+        }
+    }
+
+    #[test]
+    fn test_zero_duration_error_message() {
+        let err = validate_window_duration(0).unwrap_err();
+        let msg = err.to_string();
+        assert!(msg.contains("must be positive"), "got: {msg}");
+    }
+
+    #[test]
+    fn test_non_divisor_error_message() {
+        let err = validate_window_duration(7).unwrap_err();
+        let msg = err.to_string();
+        assert!(msg.contains("must evenly divide 3600"), "got: {msg}");
+    }
+}
diff --git a/quickwit/quickwit-parquet-engine/src/storage/config.rs b/quickwit/quickwit-parquet-engine/src/storage/config.rs
index 13b8d395a44..2eb63d73510 100644
--- a/quickwit/quickwit-parquet-engine/src/storage/config.rs
+++ b/quickwit/quickwit-parquet-engine/src/storage/config.rs
@@ -202,8 +202,6 @@ impl ParquetWriterConfig {
 
 #[cfg(test)]
 mod tests {
-    use std::sync::Arc;
-
     use arrow::datatypes::Field;
 
     use super::*;
diff --git a/quickwit/quickwit-parquet-engine/src/table_config.rs b/quickwit/quickwit-parquet-engine/src/table_config.rs
new file mode 100644
index 00000000000..0573b8177db
--- /dev/null
+++ b/quickwit/quickwit-parquet-engine/src/table_config.rs
@@ -0,0 +1,191 @@
+// Copyright 2021-Present Datadog, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Default table configuration for Parquet pipeline product types.
+//!
+//! When a table (index) does not specify an explicit sort fields configuration,
+//! these defaults are applied based on the product type. The `ParquetWriter`
+//! resolves these sort field names to physical `ParquetField` columns at
+//! construction time; columns not yet in the schema (e.g., `timeseries_id`)
+//! are recorded in metadata but skipped during physical sort.
+
+use serde::{Deserialize, Serialize};
+
+/// Product types supported by the Parquet pipeline.
+///
+/// Each product type has a default sort fields schema that matches the common
+/// query predicates for that signal type. See ADR-002 for the rationale.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
+#[serde(rename_all = "lowercase")]
+pub enum ProductType {
+    Metrics,
+    Logs,
+    Traces,
+}
+
+impl ProductType {
+    /// Default sort fields string for this product type.
+    ///
+    /// The metrics default includes `timeseries_id` as a tiebreaker before
+    /// `timestamp_secs`. Since `timeseries_id` is not yet a physical column,
+    /// the writer skips it during sort but records it in the metadata string.
+    /// When the column is added to the schema, sorting will include it
+    /// automatically.
+    ///
+    /// Logs and traces defaults are placeholders — they will be refined
+    /// when the Parquet pipeline is extended to those signal types.
+    pub fn default_sort_fields(self) -> &'static str {
+        match self {
+            Self::Metrics => "metric_name|tag_service|tag_env|tag_datacenter|tag_region|tag_host|timeseries_id|timestamp_secs/V2",
+            // Placeholder: column names TBD when logs Parquet schema is defined.
+            Self::Logs => "service_name|level|host|timestamp_secs/V2",
+            // Placeholder: column names TBD when traces Parquet schema is defined.
+            Self::Traces => "service_name|operation_name|trace_id|timestamp_secs/V2",
+        }
+    }
+}
+
+/// Table-level configuration for the Parquet pipeline.
+///
+/// Stored per-index. When `sort_fields` is `None`, the default for the
+/// product type is used (see `ProductType::default_sort_fields()`).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct TableConfig {
+    /// The product type determines schema defaults.
+    pub product_type: ProductType,
+
+    /// Explicit sort fields override. When `None`, the product-type default
+    /// is used. When `Some`, this exact schema string is applied.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub sort_fields: Option<String>,
+
+    /// Window duration in seconds for time-windowed compaction.
+    /// Default: 900 (15 minutes). Must divide 3600.
+    #[serde(default = "default_window_duration_secs")]
+    pub window_duration_secs: u32,
+}
+
+fn default_window_duration_secs() -> u32 {
+    900
+}
+
+impl TableConfig {
+    /// Create a new TableConfig for the given product type with defaults.
+    pub fn new(product_type: ProductType) -> Self {
+        Self {
+            product_type,
+            sort_fields: None,
+            window_duration_secs: default_window_duration_secs(),
+        }
+    }
+
+    /// Get the effective sort fields string for this table.
+    ///
+    /// Returns the explicit override if set, otherwise the product-type default.
+    pub fn effective_sort_fields(&self) -> &str {
+        match &self.sort_fields {
+            Some(sf) => sf.as_str(),
+            None => self.product_type.default_sort_fields(),
+        }
+    }
+}
+
+impl Default for TableConfig {
+    fn default() -> Self {
+        Self::new(ProductType::Metrics)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::sort_fields::parse_sort_fields;
+
+    #[test]
+    fn test_metrics_default_sort_fields_parses() {
+        let schema = parse_sort_fields(ProductType::Metrics.default_sort_fields())
+            .expect("metrics default sort fields must parse");
+        assert_eq!(schema.column.len(), 8);
+        // Proto names are bare (suffixes stripped by parser).
+        assert_eq!(schema.column[0].name, "metric_name");
+        assert_eq!(schema.column[1].name, "tag_service");
+        assert_eq!(schema.column[6].name, "timeseries_id");
+        assert_eq!(schema.column[7].name, "timestamp_secs");
+    }
+
+    #[test]
+    fn test_logs_default_sort_fields_parses() {
+        let schema = parse_sort_fields(ProductType::Logs.default_sort_fields())
+            .expect("logs default sort fields must parse");
+        assert_eq!(schema.column.len(), 4);
+        assert_eq!(schema.column[0].name, "service_name");
+    }
+
+    #[test]
+    fn test_traces_default_sort_fields_parses() {
+        let schema = parse_sort_fields(ProductType::Traces.default_sort_fields())
+            .expect("traces default sort fields must parse");
+        assert_eq!(schema.column.len(), 4);
+        assert_eq!(schema.column[0].name, "service_name");
+    }
+
+    #[test]
+    fn test_effective_sort_fields_uses_default() {
+        let config = TableConfig::new(ProductType::Metrics);
+        assert_eq!(
+            config.effective_sort_fields(),
+            ProductType::Metrics.default_sort_fields()
+        );
+    }
+
+    #[test]
+    fn test_effective_sort_fields_uses_override() {
+        let mut config = TableConfig::new(ProductType::Metrics);
+        config.sort_fields = Some("custom__s|timestamp/V2".to_string());
+        assert_eq!(config.effective_sort_fields(), "custom__s|timestamp/V2");
+    }
+
+    #[test]
+    fn test_default_window_duration() {
+        let config = TableConfig::default();
+        assert_eq!(config.window_duration_secs, 900);
+    }
+
+    #[test]
+    fn test_table_config_serde_roundtrip() {
+        let config = TableConfig::new(ProductType::Traces);
+        let json = serde_json::to_string(&config).unwrap();
+        let recovered: TableConfig = serde_json::from_str(&json).unwrap();
+        assert_eq!(recovered.product_type, ProductType::Traces);
+        assert!(recovered.sort_fields.is_none());
+        assert_eq!(recovered.window_duration_secs, 900);
+    }
+
+    #[test]
+    fn test_table_config_serde_with_override() {
+        let mut config = TableConfig::new(ProductType::Metrics);
+        config.sort_fields = Some("host__s|timestamp/V2".to_string());
+        config.window_duration_secs = 3600;
+
+        let json = serde_json::to_string(&config).unwrap();
+        assert!(json.contains("host__s|timestamp/V2"));
+
+        let recovered: TableConfig = serde_json::from_str(&json).unwrap();
+        assert_eq!(
+            recovered.sort_fields.as_deref(),
+            Some("host__s|timestamp/V2")
+        );
+        assert_eq!(recovered.window_duration_secs, 3600);
+    }
+}
diff --git a/quickwit/quickwit-proto/build.rs b/quickwit/quickwit-proto/build.rs
index 569d9b5315b..37e7d7c8cb1 100644
--- a/quickwit/quickwit-proto/build.rs
+++ b/quickwit/quickwit-proto/build.rs
@@ -223,6 +223,19 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
             ],
         )?;
 
+    // Event Store sort schema proto (vendored from dd-source).
+    let sortschema_prost_config = prost_build::Config::default();
+    tonic_prost_build::configure()
+        .type_attribute(".", "#[derive(serde::Serialize, serde::Deserialize)]")
+        .out_dir("src/codegen/sortschema")
+        .compile_with_config(
+            sortschema_prost_config,
+            &[std::path::PathBuf::from(
+                "protos/event_store_sortschema/event_store_sortschema.proto",
+            )],
+            &[std::path::PathBuf::from("protos/event_store_sortschema")],
+        )?;
+
     // OTEL proto
     let mut prost_config = prost_build::Config::default();
     prost_config.protoc_arg("--experimental_allow_proto3_optional");
diff --git a/quickwit/quickwit-proto/protos/event_store_sortschema/event_store_sortschema.proto b/quickwit/quickwit-proto/protos/event_store_sortschema/event_store_sortschema.proto
new file mode 100644
index 00000000000..a2e1d245b6d
--- /dev/null
+++ b/quickwit/quickwit-proto/protos/event_store_sortschema/event_store_sortschema.proto
@@ -0,0 +1,231 @@
+// Copyright 2021-Present Datadog, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto3";
+
+// Vendored from dd-source: domains/event-platform/shared/libs/event-store-proto/protos/event_store_sortschema/
+// Keep identical to upstream -- do NOT strip unused fields.
+
+package sortschema;
+
+option java_package = "com.dd.event.store.sortschema.proto";
+option java_multiple_files = true;
+option go_package = "github.com/DataDog/dd-source/domains/event-platform/shared/libs/event-store-proto/protos/event_store_sortschema";
+
+// NOTE:  Be thoughtful making changes to this file.  Everything included here is
+//        included in the metadata of every fragment, and in each fragment header.
+
+enum SortColumnDirection {
+    // SortDirectionUnknown will pop up if someone forgets to initialize a field.
+    SortDirectionUnknown = 0;
+
+    // SortDirectionAscending sorts columns in ascending value order.
+    SortDirectionAscending = 1;
+
+    // SortDirectionDescending sorts columns in descending value order.
+    SortDirectionDescending = 2;
+}
+
+// SortColumn represents a single column that's participating in the sort order
+// for rows in the fragment files.
+//
+// NOTE: Carefully consider whether you need to modify the implementation of
+//       schemautils.go/EquivalentSchemas function whenever this struct is modified.
+message SortColumn {
+    string name = 1;
+    // This integer corresponds 1:1 with the application-level column type which is also a number.
+    // We do that instead of breaking out a separate ENUM here to avoid having to maintain a bunch
+    // of mapping code that converts from column types to protobuf ENUMs and back again for no reason.
+    uint64 column_type = 2;
+    SortColumnDirection sort_direction = 3;
+
+    // NOTE: If we decide later to form lexicographic keys from the schema,
+    //       it may be useful to supply an integer here which would indicate
+    //       the sort precedence and could be used for tagging for e.g.
+    //       orderedcode lexicographic keys.
+}
+
+// SortSchema represents a set of column names and precedence values which define
+// the partial lexicographical sort order of rows in a fragment file, built from the
+// individual column orderings.  Precedence of the columns for sorting is given by
+// the order of the columns in the SortSchema.  "timestamp" must appear, with
+// SortColumnType INT64, SortDirection DESCENDING, last.
+//
+// NOTE: Carefully consider whether you need to modify the implementation of
+//       schemautils.go/EquivalentSchemas function whenever this struct is modified.
+message SortSchema {
+    // Very common schemas used all over the place are assigned a unique version number (reference
+    // in dd-go/logs/apps/logs-event-store/storage/fragment/schema_versions.go).
+    //
+    // This version number can be used in place of the complete schema description in the fragment
+    // metadata entries, to reduce their size. When this value is not null, the other fields in the
+    // SortSchema are not used and don't need to be set.
+    uint64 version = 4;
+
+    repeated SortColumn column = 1;
+
+    // Used in metrics, etc.
+    string name = 2;
+
+    // SortVersion specifies what type of sorting of the data has been done.
+    // Fragments with different SortVersion numbers are *never* merged by the
+    // compactor.
+    enum SortVersion {
+        // The initial version of per-file locality sorting had a bug where it sorted
+        // the rows by the full values of string fields in the sort columns, but reported
+        // trimmed values in the RowKeys.  This could yield what appeared to be
+        // "out of order" keys because keys that only differ in the portion that was
+        // trimmed could sort differently than trimmed values, and this would yield
+        // what looked like overlapping fragments after m:n merges.  Fragments produced
+        // this way are not compatible with the LSM algorithm and must be excluded.
+        INCORRECT_TRIM = 0;
+
+        // Files marked with this version use LessTrimmed both in their production
+        // in the writer, and in the merges produced by the compactor.  Trimming has
+        // a "budget" that allows the leading string fields to take up "more"
+        // characters if they need to without overflowing into huge values in the
+        // metadata server.  This budgeting is intended to cover when e.g. "service"
+        // is a leading tag with long service names.
+        TRIMMED_WITH_BUDGET = 1;
+    }
+
+    SortVersion sort_version = 3;
+
+    // Cutoff position for LSM compaction comparisons. Only columns 0 through
+    // lsm_comparison_cutoff-1 are considered for fragment locality decisions.
+    // Allows sorting by columns that improve compression without creating
+    // unnecessary extra compaction work.
+    // This cutoff is represented in the sort schema string format with an `&`
+    // at the beginning of the name of the first column that should be
+    // ignored, like "service__s|&env__s|timestamp/V1" which would only use
+    // "service__s" for locality comparisons.
+    // When unset, or explicitly set to 0, the schema utils will use the
+    // legacy logic of ignoring timestamp and tiebreaker, if they are present.
+    int32 lsm_comparison_cutoff = 5;
+}
+
+// A ColumnValue is a string, int, uint or float which corresponds to a sort column.
+message ColumnValue {
+    oneof value {
+        bytes type_string = 1;
+        int64 type_int = 2;
+        double type_float = 3;
+    }
+}
+
+// ColumnValues represent a set of column values that correspond with a particular
+// SortSchema.
+message ColumnValues {
+    repeated ColumnValue column = 1;
+}
+
+// RowKeys represent the "row keys" of the first and last rows of a fragment file.
+// The values are the values of the sort columns at the first and last row, respectively.
+message RowKeys {
+    // These are the values of the sort colums at the first row, thus defining the
+    // lower inclusive boundary of the fragment file, i.e. using the minimum values of any
+    // multi-values in the column values(1), which is what the sorting operates upon.
+    ColumnValues min_row_values = 1;
+
+    // These are the values of the sort columns at the last row, thus defining the
+    // upper boundary of the sorted rows of the fragment file, i.e. using the minimum
+    // values of any multi-values in the column values(1), which is what the sorting
+    // operates upon.
+    //
+    // Note that the string fields in min_row_values and max_row_values are trimmed
+    // to avoid storing "too large" values in the metadata server, and the sort order
+    // of the rows in the fragment files are determined using these trimmed keys to
+    // match.
+    //
+    // During m:n merging of fragments, when boundaries of fragments are selected,
+    // they are selected at transitions of key values of the trimmed keys so that the
+    // [min_row_values, max_row_values] ranges of the output fragments do not overlap.
+    // This property is important for two reasons.  First, we want to prune fragments
+    // from queries based on the values of the sort schema columns.  By making the
+    // fragments non-overlapping, we enable this pruning.  Secondly, we don't want
+    // compaction strategies like the classic LSM strategy to re-merge the same
+    // fragments over and over.
+    //
+    // An interesting consequence of the key trimming is that a fragment may have
+    // a row in it with values that come "after" the max_row_values keys outside
+    // of the "trimmed" range of the column values.  For example, if the trim budget
+    // for a column is four characters, and the fragment ends at "abcd", "abcd" and
+    // "abcd123" would yield the same trimmed key value and would both be stored in
+    // the same fragment.  However "abce" would be stored in a different fragment.
+    ColumnValues max_row_values = 2;
+
+    // This set of values defines the all-inclusive range boundary of the file considering
+    // the max values of any multi-values in the column values(2).  This boundary is the
+    // boundary w.r.t. queries.
+    ColumnValues all_inclusive_max_row_values = 3;
+
+
+    // For track table expirations (splitting M fragments into N1 expired fragments and N2 live fragments),
+    // there is a potential for output fragments to overlap in key range. This is because the sort schema of track
+    // table starts with retention at intake + scopeID, and retention can change between the moment the fragment
+    // is created and the moment it is expired. To give an example, say a fragment has scopeIDs 1, 2, 3, 4, 5,
+    // all with a retention at intake of 7 days. If the retention for 3 and 5 is changed to 15 days,
+    // then 7 days later at expiration the fragment will be split into two fragments,
+    // one "expired" with scopeIDs 1, 2 and 4 and one "live" with scopeIDs 3 and 5.
+    // The row keys will be (7:1..., 7:4...) for the first fragment, and (7:3...,7:5...) for the second,
+    // which overlap. To avoid this, we add an `expired` boolean to the row keys, so we allow a live and an expired
+    // fragment to overlap in key range.
+    bool expired = 4;
+
+    // Footnotes:
+    //
+    //   (1) For min_row_values and max_row_values with multi-values in the column
+    //       values, the sort order is by the minimum value of a multi-value for an
+    //       ascending-sort-order column and the maximum value for a descending-sort-order
+    //       column.
+    //   (2) For all_inclusive_max_row_values, the value of any multi-values encountered
+    //       that is used is the maximum value for an ascending-sort-order column and
+    //       the minimum value for a descending-sort-order column.
+
+
+
+    // Example:
+    //
+    // Let's say we have the following columns.  Values at each row enclosed with [brackets].
+    // Column B is sorted in descending order.  The others are sorted in ascending order.
+    //
+    //     Column A           Column B          Column C
+    //     ========           ========          ========
+    // 1:  [1, 2, 99]         [B]               [x, y]
+    // 2:  [1, 4]             [A, C]            [y]
+    // 3:  [2]                [B, C]            [z]
+    // 4:  [2]                [A, B, C]         [x]
+    // 5:  [2]                [A]               [y, z]
+    // 6:  [3]                [B]               [x, y, z]
+    //
+    // min_row_key is {A=1, B=B, C=x}
+    // max_row_key is {A=3, B=B, C=x}
+    // all_inclusive_max_row_key is {A=99, B=B, C=y}
+    //
+    // The idea with these keys is that [min_row_key, max_row_key] defines the range that
+    // covers a single fragment file for the purposes of computing overlaps for deciding
+    // what to perform an m:n merge to "flatten" files that overlap in key range so that
+    // queries don't have to visit all the files.
+    //
+    // If there are no multi-valued sort columns, this is the end of the story.  However
+    // if there are multi-valued sort columns, the range that a given file covers in
+    // terms of queries is given by [min_row_key, all_inclusive_max_row_key].  A lot of
+    // overlaps with these ranges indicate "atomic rows" for a table that cause queries
+    // to hit more files, degrading performance.
+    //
+    // Initially the compactor will only consider [min_row_key, max_row_key], with the
+    // all_inclusive_max_row_key present for diagnostics.  In the future we may use
+    // this key to determine whether we want to do anything "special" with files that
+    // have atomic rows, such as duplicating the rows at each value of the multi-value.
+}
diff --git a/quickwit/quickwit-proto/src/codegen/sortschema/sortschema.rs b/quickwit/quickwit-proto/src/codegen/sortschema/sortschema.rs
new file mode 100644
index 00000000000..2550c847794
--- /dev/null
+++ b/quickwit/quickwit-proto/src/codegen/sortschema/sortschema.rs
@@ -0,0 +1,230 @@
+// This file is @generated by prost-build.
+/// SortColumn represents a single column that's participating in the sort order
+/// for rows in the fragment files.
+///
+/// NOTE: Carefully consider whether you need to modify the implementation of
+/// schemautils.go/EquivalentSchemas function whenever this struct is modified.
+#[derive(serde::Serialize, serde::Deserialize)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
+pub struct SortColumn {
+    #[prost(string, tag = "1")]
+    pub name: ::prost::alloc::string::String,
+    /// This integer corresponds 1:1 with the application-level column type which is also a number.
+    /// We do that instead of breaking out a separate ENUM here to avoid having to maintain a bunch
+    /// of mapping code that converts from column types to protobuf ENUMs and back again for no reason.
+    #[prost(uint64, tag = "2")]
+    pub column_type: u64,
+    #[prost(enumeration = "SortColumnDirection", tag = "3")]
+    pub sort_direction: i32,
+}
+/// SortSchema represents a set of column names and precedence values which define
+/// the partial lexicographical sort order of rows in a fragment file, built from the
+/// individual column orderings.  Precedence of the columns for sorting is given by
+/// the order of the columns in the SortSchema.  "timestamp" must appear, with
+/// SortColumnType INT64, SortDirection DESCENDING, last.
+///
+/// NOTE: Carefully consider whether you need to modify the implementation of
+/// schemautils.go/EquivalentSchemas function whenever this struct is modified.
+#[derive(serde::Serialize, serde::Deserialize)]
+#[derive(Clone, PartialEq, ::prost::Message)]
+pub struct SortSchema {
+    /// Very common schemas used all over the place are assigned a unique version number (reference
+    /// in dd-go/logs/apps/logs-event-store/storage/fragment/schema_versions.go).
+    ///
+    /// This version number can be used in place of the complete schema description in the fragment
+    /// metadata entries, to reduce their size. When this value is not null, the other fields in the
+    /// SortSchema are not used and don't need to be set.
+    #[prost(uint64, tag = "4")]
+    pub version: u64,
+    #[prost(message, repeated, tag = "1")]
+    pub column: ::prost::alloc::vec::Vec<SortColumn>,
+    /// Used in metrics, etc.
+    #[prost(string, tag = "2")]
+    pub name: ::prost::alloc::string::String,
+    #[prost(enumeration = "sort_schema::SortVersion", tag = "3")]
+    pub sort_version: i32,
+    /// Cutoff position for LSM compaction comparisons. Only columns 0 through
+    /// lsm_comparison_cutoff-1 are considered for fragment locality decisions.
+    /// Allows sorting by columns that improve compression without creating
+    /// unnecessary extra compaction work.
+    /// This cutoff is represented in the sort schema string format with an `&`
+    /// at the beginning of the name of the first column that should be
+    /// ignored, like "service__s|&env__s|timestamp/V1" which would only use
+    /// "service__s" for locality comparisons.
+    /// When unset, or explicitly set to 0, the schema utils will use the
+    /// legacy logic of ignoring timestamp and tiebreaker, if they are present.
+    #[prost(int32, tag = "5")]
+    pub lsm_comparison_cutoff: i32,
+}
+/// Nested message and enum types in `SortSchema`.
+pub mod sort_schema {
+    /// SortVersion specifies what type of sorting of the data has been done.
+    /// Fragments with different SortVersion numbers are *never* merged by the
+    /// compactor.
+    #[derive(serde::Serialize, serde::Deserialize)]
+    #[derive(
+        Clone,
+        Copy,
+        Debug,
+        PartialEq,
+        Eq,
+        Hash,
+        PartialOrd,
+        Ord,
+        ::prost::Enumeration
+    )]
+    #[repr(i32)]
+    pub enum SortVersion {
+        /// The initial version of per-file locality sorting had a bug where it sorted
+        /// the rows by the full values of string fields in the sort columns, but reported
+        /// trimmed values in the RowKeys.  This could yield what appeared to be
+        /// "out of order" keys because keys that only differ in the portion that was
+        /// trimmed could sort differently than trimmed values, and this would yield
+        /// what looked like overlapping fragments after m:n merges.  Fragments produced
+        /// this way are not compatible with the LSM algorithm and must be excluded.
+        IncorrectTrim = 0,
+        /// Files marked with this version use LessTrimmed both in their production
+        /// in the writer, and in the merges produced by the compactor.  Trimming has
+        /// a "budget" that allows the leading string fields to take up "more"
+        /// characters if they need to without overflowing into huge values in the
+        /// metadata server.  This budgeting is intended to cover when e.g. "service"
+        /// is a leading tag with long service names.
+        TrimmedWithBudget = 1,
+    }
+    impl SortVersion {
+        /// String value of the enum field names used in the ProtoBuf definition.
+        ///
+        /// The values are not transformed in any way and thus are considered stable
+        /// (if the ProtoBuf definition does not change) and safe for programmatic use.
+        pub fn as_str_name(&self) -> &'static str {
+            match self {
+                Self::IncorrectTrim => "INCORRECT_TRIM",
+                Self::TrimmedWithBudget => "TRIMMED_WITH_BUDGET",
+            }
+        }
+        /// Creates an enum from field names used in the ProtoBuf definition.
+        pub fn from_str_name(value: &str) -> ::core::option::Option<Self> {
+            match value {
+                "INCORRECT_TRIM" => Some(Self::IncorrectTrim),
+                "TRIMMED_WITH_BUDGET" => Some(Self::TrimmedWithBudget),
+                _ => None,
+            }
+        }
+    }
+}
+/// A ColumnValue is a string, int, uint or float which corresponds to a sort column.
+#[derive(serde::Serialize, serde::Deserialize)]
+#[derive(Clone, PartialEq, ::prost::Message)]
+pub struct ColumnValue {
+    #[prost(oneof = "column_value::Value", tags = "1, 2, 3")]
+    pub value: ::core::option::Option<column_value::Value>,
+}
+/// Nested message and enum types in `ColumnValue`.
+pub mod column_value {
+    #[derive(serde::Serialize, serde::Deserialize)]
+    #[derive(Clone, PartialEq, ::prost::Oneof)]
+    pub enum Value {
+        #[prost(bytes, tag = "1")]
+        TypeString(::prost::alloc::vec::Vec<u8>),
+        #[prost(int64, tag = "2")]
+        TypeInt(i64),
+        #[prost(double, tag = "3")]
+        TypeFloat(f64),
+    }
+}
+/// ColumnValues represent a set of column values that correspond with a particular
+/// SortSchema.
+#[derive(serde::Serialize, serde::Deserialize)]
+#[derive(Clone, PartialEq, ::prost::Message)]
+pub struct ColumnValues {
+    #[prost(message, repeated, tag = "1")]
+    pub column: ::prost::alloc::vec::Vec<ColumnValue>,
+}
+/// RowKeys represent the "row keys" of the first and last rows of a fragment file.
+/// The values are the values of the sort columns at the first and last row, respectively.
+#[derive(serde::Serialize, serde::Deserialize)]
+#[derive(Clone, PartialEq, ::prost::Message)]
+pub struct RowKeys {
+    /// These are the values of the sort colums at the first row, thus defining the
+    /// lower inclusive boundary of the fragment file, i.e. using the minimum values of any
+    /// multi-values in the column values(1), which is what the sorting operates upon.
+    #[prost(message, optional, tag = "1")]
+    pub min_row_values: ::core::option::Option<ColumnValues>,
+    /// These are the values of the sort columns at the last row, thus defining the
+    /// upper boundary of the sorted rows of the fragment file, i.e. using the minimum
+    /// values of any multi-values in the column values(1), which is what the sorting
+    /// operates upon.
+    ///
+    /// Note that the string fields in min_row_values and max_row_values are trimmed
+    /// to avoid storing "too large" values in the metadata server, and the sort order
+    /// of the rows in the fragment files are determined using these trimmed keys to
+    /// match.
+    ///
+    /// During m:n merging of fragments, when boundaries of fragments are selected,
+    /// they are selected at transitions of key values of the trimmed keys so that the
+    /// \[min_row_values, max_row_values\] ranges of the output fragments do not overlap.
+    /// This property is important for two reasons.  First, we want to prune fragments
+    /// from queries based on the values of the sort schema columns.  By making the
+    /// fragments non-overlapping, we enable this pruning.  Secondly, we don't want
+    /// compaction strategies like the classic LSM strategy to re-merge the same
+    /// fragments over and over.
+    ///
+    /// An interesting consequence of the key trimming is that a fragment may have
+    /// a row in it with values that come "after" the max_row_values keys outside
+    /// of the "trimmed" range of the column values.  For example, if the trim budget
+    /// for a column is four characters, and the fragment ends at "abcd", "abcd" and
+    /// "abcd123" would yield the same trimmed key value and would both be stored in
+    /// the same fragment.  However "abce" would be stored in a different fragment.
+    #[prost(message, optional, tag = "2")]
+    pub max_row_values: ::core::option::Option<ColumnValues>,
+    /// This set of values defines the all-inclusive range boundary of the file considering
+    /// the max values of any multi-values in the column values(2).  This boundary is the
+    /// boundary w.r.t. queries.
+    #[prost(message, optional, tag = "3")]
+    pub all_inclusive_max_row_values: ::core::option::Option<ColumnValues>,
+    /// For track table expirations (splitting M fragments into N1 expired fragments and N2 live fragments),
+    /// there is a potential for output fragments to overlap in key range. This is because the sort schema of track
+    /// table starts with retention at intake + scopeID, and retention can change between the moment the fragment
+    /// is created and the moment it is expired. To give an example, say a fragment has scopeIDs 1, 2, 3, 4, 5,
+    /// all with a retention at intake of 7 days. If the retention for 3 and 5 is changed to 15 days,
+    /// then 7 days later at expiration the fragment will be split into two fragments,
+    /// one "expired" with scopeIDs 1, 2 and 4 and one "live" with scopeIDs 3 and 5.
+    /// The row keys will be (7:1..., 7:4...) for the first fragment, and (7:3...,7:5...) for the second,
+    /// which overlap. To avoid this, we add an `expired` boolean to the row keys, so we allow a live and an expired
+    /// fragment to overlap in key range.
+    #[prost(bool, tag = "4")]
+    pub expired: bool,
+}
+#[derive(serde::Serialize, serde::Deserialize)]
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
+#[repr(i32)]
+pub enum SortColumnDirection {
+    /// SortDirectionUnknown will pop up if someone forgets to initialize a field.
+    SortDirectionUnknown = 0,
+    /// SortDirectionAscending sorts columns in ascending value order.
+    SortDirectionAscending = 1,
+    /// SortDirectionDescending sorts columns in descending value order.
+    SortDirectionDescending = 2,
+}
+impl SortColumnDirection {
+    /// String value of the enum field names used in the ProtoBuf definition.
+    ///
+    /// The values are not transformed in any way and thus are considered stable
+    /// (if the ProtoBuf definition does not change) and safe for programmatic use.
+    pub fn as_str_name(&self) -> &'static str {
+        match self {
+            Self::SortDirectionUnknown => "SortDirectionUnknown",
+            Self::SortDirectionAscending => "SortDirectionAscending",
+            Self::SortDirectionDescending => "SortDirectionDescending",
+        }
+    }
+    /// Creates an enum from field names used in the ProtoBuf definition.
+    pub fn from_str_name(value: &str) -> ::core::option::Option<Self> {
+        match value {
+            "SortDirectionUnknown" => Some(Self::SortDirectionUnknown),
+            "SortDirectionAscending" => Some(Self::SortDirectionAscending),
+            "SortDirectionDescending" => Some(Self::SortDirectionDescending),
+            _ => None,
+        }
+    }
+}
diff --git a/quickwit/quickwit-proto/src/lib.rs b/quickwit/quickwit-proto/src/lib.rs
index dbe850b55b7..8499d6cb920 100644
--- a/quickwit/quickwit-proto/src/lib.rs
+++ b/quickwit/quickwit-proto/src/lib.rs
@@ -15,6 +15,7 @@
 #![allow(clippy::derive_partial_eq_without_eq)]
 #![allow(clippy::disallowed_methods)]
 #![allow(clippy::doc_lazy_continuation)]
+#![allow(deprecated)] // prost::DecodeError::new is deprecated but used in generated decode impls
 #![allow(rustdoc::invalid_html_tags)]
 
 use std::cmp::Ordering;
@@ -37,10 +38,16 @@ pub mod indexing;
 pub mod ingest;
 pub mod metastore;
 pub mod search;
+pub mod sort_fields_error;
 pub mod types;
 
 pub use error::{GrpcServiceError, ServiceError, ServiceErrorCode};
 use search::ReportSplitsRequest;
+pub use sort_fields_error::SortFieldsError;
+
+pub mod sortschema {
+    include!("codegen/sortschema/sortschema.rs");
+}
 
 pub mod jaeger {
     pub mod api_v2 {
diff --git a/quickwit/quickwit-proto/src/metastore/mod.rs b/quickwit/quickwit-proto/src/metastore/mod.rs
index ba371c13d4a..4f53b9abb5c 100644
--- a/quickwit/quickwit-proto/src/metastore/mod.rs
+++ b/quickwit/quickwit-proto/src/metastore/mod.rs
@@ -193,6 +193,15 @@ impl From<TimeoutExceeded> for MetastoreError {
     }
 }
 
+impl From<crate::SortFieldsError> for MetastoreError {
+    fn from(err: crate::SortFieldsError) -> Self {
+        MetastoreError::Internal {
+            message: "sort fields error".to_string(),
+            cause: err.to_string(),
+        }
+    }
+}
+
 impl ServiceError for MetastoreError {
     fn error_code(&self) -> ServiceErrorCode {
         match self {
diff --git a/quickwit/quickwit-proto/src/sort_fields_error.rs b/quickwit/quickwit-proto/src/sort_fields_error.rs
new file mode 100644
index 00000000000..8fa6ebc15b3
--- /dev/null
+++ b/quickwit/quickwit-proto/src/sort_fields_error.rs
@@ -0,0 +1,78 @@
+// Copyright 2021-Present Datadog, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Error types for sort fields parsing and validation.
+
+/// Errors arising from sort fields parsing, validation, and time-window arithmetic.
+#[derive(Debug, thiserror::Error)]
+pub enum SortFieldsError {
+    /// Window duration does not meet the divisibility constraint.
+    #[error("invalid window duration {duration_secs}s: {reason}")]
+    InvalidWindowDuration {
+        duration_secs: u32,
+        reason: &'static str,
+    },
+
+    /// Schema string is syntactically malformed.
+    #[error("{0}")]
+    MalformedSchema(String),
+
+    /// Version suffix could not be parsed.
+    #[error("{0}")]
+    BadSortVersion(String),
+
+    /// Sort version is below the minimum accepted (V2-only enforcement).
+    #[error("unsupported sort version {version}, minimum is {minimum}")]
+    UnsupportedVersion { version: i32, minimum: i32 },
+
+    /// Invalid placement or usage of the `&` LSM cutoff marker.
+    #[error("{0}")]
+    InvalidCutoffPlacement(String),
+
+    /// Column specification has wrong number of parts.
+    #[error("{0}")]
+    InvalidColumnFormat(String),
+
+    /// Unknown column type (from suffix or explicit name).
+    #[error("{0}")]
+    UnknownColumnType(String),
+
+    /// Explicit column type does not match the type inferred from the suffix.
+    #[error(
+        "column type doesn't match type deduced from suffix for {column}, deduced={from_suffix}, \
+         explicit={explicit}"
+    )]
+    TypeMismatch {
+        column: String,
+        from_suffix: String,
+        explicit: String,
+    },
+
+    /// Unrecognized sort direction string.
+    #[error("{0}")]
+    UnknownSortDirection(String),
+
+    /// Sort direction specified in multiple places (e.g., both prefix and suffix,
+    /// or prefix/suffix combined with colon-separated direction).
+    #[error("sort direction specified multiple times for column '{0}'")]
+    DuplicateDirection(String),
+
+    /// Schema is structurally invalid (missing timestamp, wrong order, etc.).
+    #[error("{0}")]
+    ValidationError(String),
+
+    /// window_start timestamp cannot be represented as a DateTime.
+    #[error("window_start timestamp {timestamp_secs} is out of representable range")]
+    WindowStartOutOfRange { timestamp_secs: i64 },
+}

From 4d42fd9523783419e1ed79964a59411353e20dd1 Mon Sep 17 00:00:00 2001
From: George Talbot <george.talbot@datadoghq.com>
Date: Wed, 1 Apr 2026 14:58:52 -0400
Subject: [PATCH 10/35] =?UTF-8?q?fix:=20rustdoc=20link=20errors=20?=
 =?UTF-8?q?=E2=80=94=20use=20backticks=20for=20private=20items?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../src/indexing_scheduler/mod.rs                     |  4 ++--
 quickwit/quickwit-indexing/src/actors/uploader.rs     |  2 +-
 .../metastore/file_backed/file_backed_index/mod.rs    |  4 ++--
 .../metastore/file_backed/lazy_file_backed_index.rs   |  2 +-
 .../src/metastore/file_backed/mod.rs                  |  8 ++++----
 quickwit/quickwit-metastore/src/metastore_resolver.rs |  2 +-
 quickwit/quickwit-proto/src/lib.rs                    | 11 ++++++-----
 quickwit/quickwit-search/src/cluster_client.rs        |  2 +-
 quickwit/quickwit-storage/src/storage_resolver.rs     |  2 +-
 9 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs b/quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs
index 9f79032d7c5..e29d4af2540 100644
--- a/quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs
+++ b/quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs
@@ -68,7 +68,7 @@ pub struct IndexingSchedulerState {
 ///
 /// Scheduling executes the following steps:
 /// 1. Builds a [`PhysicalIndexingPlan`] from the list of logical indexing tasks. See
-///    [`build_physical_indexing_plan`] for the implementation details.
+///    `build_physical_indexing_plan` for the implementation details.
 /// 2. Apply the [`PhysicalIndexingPlan`]: for each indexer, the scheduler send the indexing tasks
 ///    by gRPC. An indexer immediately returns an Ok and apply asynchronously the received plan. Any
 ///    errors (network) happening in this step are ignored. The scheduler runs a control loop that
@@ -98,7 +98,7 @@ pub struct IndexingSchedulerState {
 /// Concretely, it will send the faulty nodes of the plan they are supposed to follow.
 //
 /// Finally, in order to give the time for each indexer to run their indexing tasks, the control
-/// plane will wait at least [`MIN_DURATION_BETWEEN_SCHEDULING`] before comparing the desired
+/// plane will wait at least `MIN_DURATION_BETWEEN_SCHEDULING` before comparing the desired
 /// plan with the running plan.
 pub struct IndexingScheduler {
     cluster_id: String,
diff --git a/quickwit/quickwit-indexing/src/actors/uploader.rs b/quickwit/quickwit-indexing/src/actors/uploader.rs
index 1827af7a153..448b7a4e312 100644
--- a/quickwit/quickwit-indexing/src/actors/uploader.rs
+++ b/quickwit/quickwit-indexing/src/actors/uploader.rs
@@ -66,7 +66,7 @@ pub enum UploaderType {
 /// [`SplitsUpdateMailbox`] wraps either a [`Mailbox<Sequencer<P>>`] or [`Mailbox<P>`].
 ///
 /// It makes it possible to send a splits update either to the [`Sequencer`] or directly
-/// to the publisher actor `P`. It is used in combination with [`SplitsUpdateSender`] that
+/// to the publisher actor `P`. It is used in combination with `SplitsUpdateSender` that
 /// will do the send.
 ///
 /// This is useful as we have different requirements between the indexing pipeline and
diff --git a/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs b/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs
index 6fd5ce244be..e35618b99f8 100644
--- a/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs
+++ b/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-//! [`FileBackedIndex`] module. It is public so that the crate `quickwit-backward-compat` can
-//! import [`FileBackedIndex`] and run backward-compatibility tests. You should not have to import
+//! `FileBackedIndex` module. It is public so that the crate `quickwit-backward-compat` can
+//! import `FileBackedIndex` and run backward-compatibility tests. You should not have to import
 //! anything from here directly.
 
 mod serialize;
diff --git a/quickwit/quickwit-metastore/src/metastore/file_backed/lazy_file_backed_index.rs b/quickwit/quickwit-metastore/src/metastore/file_backed/lazy_file_backed_index.rs
index c13711de0b2..43979334f55 100644
--- a/quickwit/quickwit-metastore/src/metastore/file_backed/lazy_file_backed_index.rs
+++ b/quickwit/quickwit-metastore/src/metastore/file_backed/lazy_file_backed_index.rs
@@ -24,7 +24,7 @@ use tracing::error;
 use super::file_backed_index::FileBackedIndex;
 use super::store_operations::{METASTORE_FILE_NAME, load_index};
 
-/// Lazy [`FileBackedIndex`]. It loads a `FileBackedIndex` on demand. When the index is first
+/// Lazy `FileBackedIndex`. It loads a `FileBackedIndex` on demand. When the index is first
 /// loaded, it optionally spawns a task to periodically poll the storage and update the index.
 pub(crate) struct LazyFileBackedIndex {
     index_id: IndexId,
diff --git a/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs b/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs
index 00791488e65..74e0f05abae 100644
--- a/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs
+++ b/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 //! Module for [`FileBackedMetastore`]. It is public so that the crate `quickwit-backward-compat`
-//! can import [`FileBackedIndex`] and run backward-compatibility tests. You should not have to
+//! can import `FileBackedIndex` and run backward-compatibility tests. You should not have to
 //! import anything from here directly.
 
 pub mod file_backed_index;
@@ -116,9 +116,9 @@ impl From<bool> for MutationOccurred<()> {
 /// into as many files and stores a map of indexes
 /// (index_id, index_status) in a dedicated file `manifest.json`.
 ///
-/// A [`LazyIndexStatus`] describes the lifecycle of an index: [`LazyIndexStatus::Creating`] and
-/// [`LazyIndexStatus::Deleting`] are transitioning states that indicates that the index is not
-/// yet available. On the contrary, the [`LazyIndexStatus::Active`] status indicates the index is
+/// A `LazyIndexStatus` describes the lifecycle of an index: `LazyIndexStatus::Creating` and
+/// `LazyIndexStatus::Deleting` are transitioning states that indicates that the index is not
+/// yet available. On the contrary, the `LazyIndexStatus::Active` status indicates the index is
 /// ready to be fetched and updated.
 ///
 /// Transitioning states are useful to track inconsistencies between the in-memory and on-disk data
diff --git a/quickwit/quickwit-metastore/src/metastore_resolver.rs b/quickwit/quickwit-metastore/src/metastore_resolver.rs
index 7793fdbbf45..265d9069f3c 100644
--- a/quickwit/quickwit-metastore/src/metastore_resolver.rs
+++ b/quickwit/quickwit-metastore/src/metastore_resolver.rs
@@ -45,7 +45,7 @@ impl fmt::Debug for MetastoreResolver {
 }
 
 impl MetastoreResolver {
-    /// Creates an empty [`MetastoreResolverBuilder`].
+    /// Creates an empty `MetastoreResolverBuilder`.
     pub fn builder() -> MetastoreResolverBuilder {
         MetastoreResolverBuilder::default()
     }
diff --git a/quickwit/quickwit-proto/src/lib.rs b/quickwit/quickwit-proto/src/lib.rs
index 8499d6cb920..3a7f7ea8992 100644
--- a/quickwit/quickwit-proto/src/lib.rs
+++ b/quickwit/quickwit-proto/src/lib.rs
@@ -130,7 +130,8 @@ impl TryFrom<metastore::DeleteQuery> for search::SearchRequest {
 pub struct MutMetadataMap<'a>(&'a mut tonic::metadata::MetadataMap);
 
 impl Injector for MutMetadataMap<'_> {
-    /// Sets a key-value pair in the [`MetadataMap`]. No-op if the key or value is invalid.
+    /// Sets a key-value pair in the [`tonic::metadata::MetadataMap`]. No-op if the key or value
+    /// is invalid.
     fn set(&mut self, key: &str, value: String) {
         if let Ok(metadata_key) = tonic::metadata::MetadataKey::from_bytes(key.as_bytes())
             && let Ok(metadata_value) = tonic::metadata::MetadataValue::try_from(&value)
@@ -141,13 +142,13 @@ impl Injector for MutMetadataMap<'_> {
 }
 
 impl Extractor for MutMetadataMap<'_> {
-    /// Gets a value for a key from the MetadataMap.  If the value can't be converted to &str,
+    /// Gets a value for a key from the `MetadataMap`. If the value can't be converted to &str,
     /// returns None.
     fn get(&self, key: &str) -> Option<&str> {
         self.0.get(key).and_then(|metadata| metadata.to_str().ok())
     }
 
-    /// Collect all the keys from the MetadataMap.
+    /// Collect all the keys from the `MetadataMap`.
     fn keys(&self) -> Vec<&str> {
         self.0
             .keys()
@@ -181,13 +182,13 @@ impl Interceptor for SpanContextInterceptor {
 struct MetadataMap<'a>(&'a tonic::metadata::MetadataMap);
 
 impl Extractor for MetadataMap<'_> {
-    /// Gets a value for a key from the MetadataMap.  If the value can't be converted to &str,
+    /// Gets a value for a key from the `MetadataMap`. If the value can't be converted to &str,
     /// returns None.
     fn get(&self, key: &str) -> Option<&str> {
         self.0.get(key).and_then(|metadata| metadata.to_str().ok())
     }
 
-    /// Collect all the keys from the MetadataMap.
+    /// Collect all the keys from the `MetadataMap`.
     fn keys(&self) -> Vec<&str> {
         self.0
             .keys()
diff --git a/quickwit/quickwit-search/src/cluster_client.rs b/quickwit/quickwit-search/src/cluster_client.rs
index 79f6ba81702..0a4518174ce 100644
--- a/quickwit/quickwit-search/src/cluster_client.rs
+++ b/quickwit/quickwit-search/src/cluster_client.rs
@@ -138,7 +138,7 @@ impl ClusterClient {
 
     /// Attempts to store a given key value pair within the cluster.
     ///
-    /// Tries to replicate the pair to [`TARGET_NUM_REPLICATION`] nodes, but this function may fail
+    /// Tries to replicate the pair to `TARGET_NUM_REPLICATION` nodes, but this function may fail
     /// silently (e.g if no client was available). Even in case of success, this storage is not
     /// persistent. For instance during a rolling upgrade, all replicas will be lost as there is no
     /// mechanism to maintain the replication count.
diff --git a/quickwit/quickwit-storage/src/storage_resolver.rs b/quickwit/quickwit-storage/src/storage_resolver.rs
index 6203d6a8d02..85329c19a86 100644
--- a/quickwit/quickwit-storage/src/storage_resolver.rs
+++ b/quickwit/quickwit-storage/src/storage_resolver.rs
@@ -43,7 +43,7 @@ impl fmt::Debug for StorageResolver {
 }
 
 impl StorageResolver {
-    /// Creates an empty [`StorageResolverBuilder`].
+    /// Creates an empty `StorageResolverBuilder`.
     pub fn builder() -> StorageResolverBuilder {
         StorageResolverBuilder::default()
     }

From b6eb5958d7d56fc2065c7dd01c513a28035d44f5 Mon Sep 17 00:00:00 2001
From: George Talbot <george.talbot@datadoghq.com>
Date: Tue, 31 Mar 2026 17:32:41 -0400
Subject: [PATCH 11/35] =?UTF-8?q?feat(31):=20compaction=20metadata=20types?=
 =?UTF-8?q?=20=E2=80=94=20extend=20split=20metadata,=20postgres=20model,?=
 =?UTF-8?q?=20field=20lookup?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../src/metastore/postgres/metastore.rs       |   6 +
 .../src/split/metadata.rs                     | 305 ++++++++++++++++++
 .../src/split/postgres.rs                     |  39 ++-
 3 files changed, 349 insertions(+), 1 deletion(-)

diff --git a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs
index d459ceb243b..bfa08ff3b9d 100644
--- a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs
+++ b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs
@@ -2312,6 +2312,12 @@ impl MetastoreService for PostgresqlMetastore {
                     size_bytes: row.13,
                     split_metadata_json: row.14,
                     update_timestamp: row.15,
+                    window_start: None,
+                    window_duration_secs: 0,
+                    sort_fields: String::new(),
+                    num_merge_ops: 0,
+                    row_keys: None,
+                    zonemap_regexes: String::new(),
                 };
 
                 let state = pg_split.split_state().unwrap_or(MetricsSplitState::Staged);
diff --git a/quickwit/quickwit-parquet-engine/src/split/metadata.rs b/quickwit/quickwit-parquet-engine/src/split/metadata.rs
index 992775be862..5bf85ed987b 100644
--- a/quickwit/quickwit-parquet-engine/src/split/metadata.rs
+++ b/quickwit/quickwit-parquet-engine/src/split/metadata.rs
@@ -15,6 +15,7 @@
 //! Metrics split metadata definitions.
 
 use std::collections::{HashMap, HashSet};
+use std::ops::Range;
 use std::time::SystemTime;
 
 use serde::{Deserialize, Serialize};
@@ -120,7 +121,12 @@ impl std::fmt::Display for MetricsSplitState {
 }
 
 /// Metadata for a metrics split.
+///
+/// The `window` field stores the time window as `[start, start + duration)`.
+/// For JSON serialization, it is decomposed into `window_start` and
+/// `window_duration_secs` for backward compatibility with pre-Phase-31 code.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(from = "MetricsSplitMetadataSerde", into = "MetricsSplitMetadataSerde")]
 pub struct MetricsSplitMetadata {
     /// Unique split identifier.
     pub split_id: SplitId,
@@ -153,6 +159,117 @@ pub struct MetricsSplitMetadata {
 
     /// When this split was created.
     pub created_at: SystemTime,
+
+    /// Parquet file path relative to storage root.
+    pub parquet_file: String,
+
+    /// Time window as `[start, start + duration)` in epoch seconds.
+    /// None for pre-Phase-31 splits (backward compat).
+    pub window: Option<Range<i64>>,
+
+    /// Sort schema as Husky-style string (e.g., "metric_name|host|timestamp/V2").
+    /// Empty string for pre-Phase-31 splits.
+    pub sort_fields: String,
+
+    /// Number of merge operations this split has been through.
+    /// 0 for newly ingested splits.
+    pub num_merge_ops: u32,
+
+    /// RowKeys (sort-key min/max boundaries) as proto bytes.
+    /// None for pre-Phase-31 splits or splits without sort schema.
+    pub row_keys_proto: Option<Vec<u8>>,
+
+    /// Per-column zonemap regex strings, keyed by column name.
+    /// Empty for pre-Phase-31 splits.
+    pub zonemap_regexes: HashMap<String, String>,
+}
+
+/// Serde helper struct that uses `window_start` / `window_duration_secs` field
+/// names for JSON backward compatibility while the in-memory representation uses
+/// `Option<Range<i64>>`.
+#[derive(Serialize, Deserialize)]
+struct MetricsSplitMetadataSerde {
+    split_id: SplitId,
+    index_uid: String,
+    time_range: TimeRange,
+    num_rows: u64,
+    size_bytes: u64,
+    metric_names: HashSet<String>,
+    low_cardinality_tags: HashMap<String, HashSet<String>>,
+    high_cardinality_tag_keys: HashSet<String>,
+    created_at: SystemTime,
+    parquet_file: String,
+
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    window_start: Option<i64>,
+
+    #[serde(default)]
+    window_duration_secs: u32,
+
+    #[serde(default)]
+    sort_fields: String,
+
+    #[serde(default)]
+    num_merge_ops: u32,
+
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    row_keys_proto: Option<Vec<u8>>,
+
+    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
+    zonemap_regexes: HashMap<String, String>,
+}
+
+impl From<MetricsSplitMetadataSerde> for MetricsSplitMetadata {
+    fn from(s: MetricsSplitMetadataSerde) -> Self {
+        let window = match (s.window_start, s.window_duration_secs) {
+            (Some(start), dur) if dur > 0 => Some(start..start + dur as i64),
+            _ => None,
+        };
+        Self {
+            split_id: s.split_id,
+            index_uid: s.index_uid,
+            time_range: s.time_range,
+            num_rows: s.num_rows,
+            size_bytes: s.size_bytes,
+            metric_names: s.metric_names,
+            low_cardinality_tags: s.low_cardinality_tags,
+            high_cardinality_tag_keys: s.high_cardinality_tag_keys,
+            created_at: s.created_at,
+            parquet_file: s.parquet_file,
+            window,
+            sort_fields: s.sort_fields,
+            num_merge_ops: s.num_merge_ops,
+            row_keys_proto: s.row_keys_proto,
+            zonemap_regexes: s.zonemap_regexes,
+        }
+    }
+}
+
+impl From<MetricsSplitMetadata> for MetricsSplitMetadataSerde {
+    fn from(m: MetricsSplitMetadata) -> Self {
+        let (window_start, window_duration_secs) = match &m.window {
+            Some(w) => (Some(w.start), (w.end - w.start) as u32),
+            None => (None, 0),
+        };
+        Self {
+            split_id: m.split_id,
+            index_uid: m.index_uid,
+            time_range: m.time_range,
+            num_rows: m.num_rows,
+            size_bytes: m.size_bytes,
+            metric_names: m.metric_names,
+            low_cardinality_tags: m.low_cardinality_tags,
+            high_cardinality_tag_keys: m.high_cardinality_tag_keys,
+            created_at: m.created_at,
+            parquet_file: m.parquet_file,
+            window_start,
+            window_duration_secs,
+            sort_fields: m.sort_fields,
+            num_merge_ops: m.num_merge_ops,
+            row_keys_proto: m.row_keys_proto,
+            zonemap_regexes: m.zonemap_regexes,
+        }
+    }
 }
 
 impl MetricsSplitMetadata {
@@ -167,6 +284,19 @@ impl MetricsSplitMetadata {
     /// Tags with >= CARDINALITY_THRESHOLD unique values use Parquet bloom filters.
     pub const CARDINALITY_THRESHOLD: usize = 1000;
 
+    /// Returns the window start in epoch seconds, or `None` for pre-Phase-31 splits.
+    pub fn window_start(&self) -> Option<i64> {
+        self.window.as_ref().map(|w| w.start)
+    }
+
+    /// Returns the window duration in seconds, or 0 for pre-Phase-31 splits.
+    pub fn window_duration_secs(&self) -> u32 {
+        match &self.window {
+            Some(w) => (w.end - w.start) as u32,
+            None => 0,
+        }
+    }
+
     /// Create a new MetricsSplitMetadata builder.
     pub fn builder() -> MetricsSplitMetadataBuilder {
         MetricsSplitMetadataBuilder::default()
@@ -221,8 +351,19 @@ pub struct MetricsSplitMetadataBuilder {
     metric_names: HashSet<String>,
     low_cardinality_tags: HashMap<String, HashSet<String>>,
     high_cardinality_tag_keys: HashSet<String>,
+    parquet_file: String,
+    window_start: Option<i64>,
+    window_duration_secs: u32,
+    sort_fields: String,
+    num_merge_ops: u32,
+    row_keys_proto: Option<Vec<u8>>,
+    zonemap_regexes: HashMap<String, String>,
 }
 
+// The builder still accepts window_start and window_duration_secs separately
+// to remain compatible with callers that compute them independently (e.g.,
+// split_writer). The `build()` method fuses them into `Option<Range<i64>>`.
+
 impl MetricsSplitMetadataBuilder {
     pub fn split_id(mut self, id: SplitId) -> Self {
         self.split_id = Some(id);
@@ -284,7 +425,71 @@ impl MetricsSplitMetadataBuilder {
         self
     }
 
+    pub fn parquet_file(mut self, path: impl Into<String>) -> Self {
+        self.parquet_file = path.into();
+        self
+    }
+
+    pub fn window_start_secs(mut self, epoch_secs: i64) -> Self {
+        self.window_start = Some(epoch_secs);
+        self
+    }
+
+    pub fn window_duration_secs(mut self, dur: u32) -> Self {
+        self.window_duration_secs = dur;
+        self
+    }
+
+    pub fn sort_fields(mut self, schema: impl Into<String>) -> Self {
+        self.sort_fields = schema.into();
+        self
+    }
+
+    pub fn num_merge_ops(mut self, ops: u32) -> Self {
+        self.num_merge_ops = ops;
+        self
+    }
+
+    pub fn row_keys_proto(mut self, bytes: Vec<u8>) -> Self {
+        self.row_keys_proto = Some(bytes);
+        self
+    }
+
+    pub fn add_zonemap_regex(
+        mut self,
+        column: impl Into<String>,
+        regex: impl Into<String>,
+    ) -> Self {
+        self.zonemap_regexes.insert(column.into(), regex.into());
+        self
+    }
+
     pub fn build(self) -> MetricsSplitMetadata {
+        // TW-2 (ADR-003): window_duration must evenly divide 3600.
+        // Enforced at build time so no invalid metadata propagates to storage.
+        debug_assert!(
+            self.window_duration_secs == 0 || 3600 % self.window_duration_secs == 0,
+            "TW-2 violated: window_duration_secs={} does not divide 3600",
+            self.window_duration_secs
+        );
+
+        // TW-1 (ADR-003, partial): window_start and window_duration_secs are paired.
+        // If one is set, the other must be too. Pre-Phase-31 splits have both at defaults.
+        debug_assert!(
+            (self.window_start.is_none() && self.window_duration_secs == 0)
+                || (self.window_start.is_some() && self.window_duration_secs > 0),
+            "TW-1 violated: window_start and window_duration_secs must be set together \
+             (window_start={:?}, window_duration_secs={})",
+            self.window_start,
+            self.window_duration_secs
+        );
+
+        // Fuse the two builder fields into a single Range.
+        let window = match (self.window_start, self.window_duration_secs) {
+            (Some(start), dur) if dur > 0 => Some(start..start + dur as i64),
+            _ => None,
+        };
+
         MetricsSplitMetadata {
             split_id: self.split_id.unwrap_or_else(SplitId::generate),
             index_uid: self.index_uid.expect("index_uid is required"),
@@ -295,6 +500,12 @@ impl MetricsSplitMetadataBuilder {
             low_cardinality_tags: self.low_cardinality_tags,
             high_cardinality_tag_keys: self.high_cardinality_tag_keys,
             created_at: SystemTime::now(),
+            parquet_file: self.parquet_file,
+            window,
+            sort_fields: self.sort_fields,
+            num_merge_ops: self.num_merge_ops,
+            row_keys_proto: self.row_keys_proto,
+            zonemap_regexes: self.zonemap_regexes,
         }
     }
 }
@@ -401,4 +612,98 @@ mod tests {
         );
         assert_eq!(format!("{}", MetricsSplitState::Published), "Published");
     }
+
+    #[test]
+    fn test_backward_compat_deserialize_pre_phase31_json() {
+        // Simulate a JSON string from pre-Phase-31 code (no compaction fields).
+        let pre_phase31_json = r#"{
+            "split_id": "metrics_abc123",
+            "index_uid": "test-index:00000000000000000000000000",
+            "time_range": {"start_secs": 1000, "end_secs": 2000},
+            "num_rows": 500,
+            "size_bytes": 1024,
+            "metric_names": ["cpu.usage"],
+            "low_cardinality_tags": {},
+            "high_cardinality_tag_keys": [],
+            "created_at": {"secs_since_epoch": 1700000000, "nanos_since_epoch": 0},
+            "parquet_file": "split1.parquet"
+        }"#;
+
+        let metadata: MetricsSplitMetadata =
+            serde_json::from_str(pre_phase31_json).expect("should deserialize pre-Phase-31 JSON");
+
+        // New fields should be at their defaults.
+        assert!(metadata.window.is_none());
+        assert!(metadata.window_start().is_none());
+        assert_eq!(metadata.window_duration_secs(), 0);
+        assert_eq!(metadata.sort_fields, "");
+        assert_eq!(metadata.num_merge_ops, 0);
+        assert!(metadata.row_keys_proto.is_none());
+        assert!(metadata.zonemap_regexes.is_empty());
+
+        // Existing fields should be intact.
+        assert_eq!(metadata.split_id.as_str(), "metrics_abc123");
+        assert_eq!(metadata.index_uid, "test-index:00000000000000000000000000");
+        assert_eq!(metadata.num_rows, 500);
+    }
+
+    #[test]
+    fn test_round_trip_with_compaction_fields() {
+        let metadata = MetricsSplitMetadata::builder()
+            .split_id(SplitId::new("roundtrip-compaction"))
+            .index_uid("test-index:00000000000000000000000000")
+            .time_range(TimeRange::new(1000, 2000))
+            .num_rows(100)
+            .size_bytes(500)
+            .window_start_secs(1700000000)
+            .window_duration_secs(3600)
+            .sort_fields("metric_name|host|timestamp/V2")
+            .num_merge_ops(3)
+            .row_keys_proto(vec![0x08, 0x01, 0x10, 0x02])
+            .add_zonemap_regex("metric_name", "cpu\\..*")
+            .add_zonemap_regex("host", "host-\\d+")
+            .build();
+
+        let json = serde_json::to_string(&metadata).expect("should serialize");
+        let recovered: MetricsSplitMetadata =
+            serde_json::from_str(&json).expect("should deserialize");
+
+        assert_eq!(recovered.window, Some(1700000000..1700003600));
+        assert_eq!(recovered.window_start(), Some(1700000000));
+        assert_eq!(recovered.window_duration_secs(), 3600);
+        assert_eq!(recovered.sort_fields, "metric_name|host|timestamp/V2");
+        assert_eq!(recovered.num_merge_ops, 3);
+        assert_eq!(recovered.row_keys_proto, Some(vec![0x08, 0x01, 0x10, 0x02]));
+        assert_eq!(recovered.zonemap_regexes.len(), 2);
+        assert_eq!(
+            recovered.zonemap_regexes.get("metric_name").unwrap(),
+            "cpu\\..*"
+        );
+        assert_eq!(recovered.zonemap_regexes.get("host").unwrap(), "host-\\d+");
+    }
+
+    #[test]
+    fn test_skip_serializing_empty_compaction_fields() {
+        let metadata = MetricsSplitMetadata::builder()
+            .split_id(SplitId::new("skip-test"))
+            .index_uid("test-index:00000000000000000000000000")
+            .time_range(TimeRange::new(1000, 2000))
+            .build();
+
+        let json = serde_json::to_string(&metadata).expect("should serialize");
+
+        // Optional fields with skip_serializing_if should be absent.
+        assert!(
+            !json.contains("\"window_start\""),
+            "window_start should not appear when None"
+        );
+        assert!(
+            !json.contains("\"row_keys_proto\""),
+            "row_keys_proto should not appear when None"
+        );
+        assert!(
+            !json.contains("\"zonemap_regexes\""),
+            "zonemap_regexes should not appear when empty"
+        );
+    }
 }
diff --git a/quickwit/quickwit-parquet-engine/src/split/postgres.rs b/quickwit/quickwit-parquet-engine/src/split/postgres.rs
index 8b2c8767846..0ea753181bb 100644
--- a/quickwit/quickwit-parquet-engine/src/split/postgres.rs
+++ b/quickwit/quickwit-parquet-engine/src/split/postgres.rs
@@ -71,6 +71,12 @@ pub struct PgMetricsSplit {
     pub size_bytes: i64,
     pub split_metadata_json: String,
     pub update_timestamp: i64,
+    pub window_start: Option<i64>,
+    pub window_duration_secs: i32,
+    pub sort_fields: String,
+    pub num_merge_ops: i32,
+    pub row_keys: Option<Vec<u8>>,
+    pub zonemap_regexes: String,
 }
 
 /// Insertable row for metrics_splits table.
@@ -92,6 +98,12 @@ pub struct InsertableMetricsSplit {
     pub num_rows: i64,
     pub size_bytes: i64,
     pub split_metadata_json: String,
+    pub window_start: Option<i64>,
+    pub window_duration_secs: i32,
+    pub sort_fields: String,
+    pub num_merge_ops: i32,
+    pub row_keys: Option<Vec<u8>>,
+    pub zonemap_regexes: String,
 }
 
 impl InsertableMetricsSplit {
@@ -102,6 +114,12 @@ impl InsertableMetricsSplit {
     ) -> Result<Self, serde_json::Error> {
         let split_metadata_json = serde_json::to_string(metadata)?;
 
+        let zonemap_regexes_json = if metadata.zonemap_regexes.is_empty() {
+            "{}".to_string()
+        } else {
+            serde_json::to_string(&metadata.zonemap_regexes)?
+        };
+
         Ok(Self {
             split_id: metadata.split_id.as_str().to_string(),
             split_state: state.as_str().to_string(),
@@ -118,6 +136,12 @@ impl InsertableMetricsSplit {
             num_rows: metadata.num_rows as i64,
             size_bytes: metadata.size_bytes as i64,
             split_metadata_json,
+            window_start: metadata.window_start(),
+            window_duration_secs: metadata.window_duration_secs() as i32,
+            sort_fields: metadata.sort_fields.clone(),
+            num_merge_ops: metadata.num_merge_ops as i32,
+            row_keys: metadata.row_keys_proto.clone(),
+            zonemap_regexes: zonemap_regexes_json,
         })
     }
 }
@@ -134,10 +158,17 @@ impl PgMetricsSplit {
         // Primary path: deserialize from JSON (authoritative)
         let metadata: MetricsSplitMetadata = serde_json::from_str(&self.split_metadata_json)?;
 
-        // Overlay database columns (for consistency verification in debug builds)
+        // SS-5: Verify consistency between JSON blob and SQL columns.
         debug_assert_eq!(metadata.split_id.as_str(), self.split_id);
         debug_assert_eq!(metadata.time_range.start_secs, self.time_range_start as u64);
         debug_assert_eq!(metadata.time_range.end_secs, self.time_range_end as u64);
+        debug_assert_eq!(metadata.window_start(), self.window_start);
+        debug_assert_eq!(
+            metadata.window_duration_secs(),
+            self.window_duration_secs as u32
+        );
+        debug_assert_eq!(metadata.sort_fields, self.sort_fields);
+        debug_assert_eq!(metadata.num_merge_ops, self.num_merge_ops as u32);
 
         Ok(metadata)
     }
@@ -259,6 +290,12 @@ mod tests {
             size_bytes: insertable.size_bytes,
             split_metadata_json: insertable.split_metadata_json,
             update_timestamp: 1704067200,
+            window_start: insertable.window_start,
+            window_duration_secs: insertable.window_duration_secs,
+            sort_fields: insertable.sort_fields,
+            num_merge_ops: insertable.num_merge_ops,
+            row_keys: insertable.row_keys,
+            zonemap_regexes: insertable.zonemap_regexes,
         };
 
         let recovered = pg_row.to_metadata().expect("should deserialize");

From 76b703ad2424d3795b47ead581bfc69b24fd5bff Mon Sep 17 00:00:00 2001
From: George Talbot <george.talbot@datadoghq.com>
Date: Mon, 9 Mar 2026 14:42:15 -0400
Subject: [PATCH 12/35] feat(31): wire TableConfig into sort path, add
 compaction KV metadata

Wire TableConfig-driven sort order into ParquetWriter and add
self-describing Parquet file metadata for compaction:

- ParquetWriter::new() takes &TableConfig, resolves sort fields at
  construction via parse_sort_fields() + ParquetField::from_name()
- sort_batch() uses resolved fields with per-column direction (ASC/DESC)
- SS-1 debug_assert verification: re-sort and check identity permutation
- build_compaction_key_value_metadata(): embeds sort_fields, window_start,
  window_duration, num_merge_ops, row_keys (base64) in Parquet kv_metadata
- SS-5 verify_ss5_kv_consistency(): kv_metadata matches source struct
- write_to_file_with_metadata() replaces write_to_file()
- prepare_write() shared method for bytes and file paths
- ParquetWriterConfig gains to_writer_properties_with_metadata()
- ParquetSplitWriter passes TableConfig through
- All callers in quickwit-indexing updated with TableConfig::default()
- 23 storage tests pass including META-07 self-describing roundtrip

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 quickwit/Cargo.lock                           |  94 +--
 .../src/actors/indexing_pipeline.rs           |   2 +
 .../src/actors/parquet_doc_processor.rs       |   3 +-
 .../src/actors/parquet_e2e_test.rs            |   3 +-
 .../src/actors/parquet_indexer.rs             |   3 +-
 .../src/actors/parquet_packager.rs            |   3 +-
 quickwit/quickwit-parquet-engine/Cargo.toml   |   2 +
 .../src/storage/config.rs                     |  80 +--
 .../src/storage/split_writer.rs               |  89 ++-
 .../src/storage/writer.rs                     | 582 +++++++++++++++---
 10 files changed, 646 insertions(+), 215 deletions(-)

diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock
index 15f519615af..df969d83d7e 100644
--- a/quickwit/Cargo.lock
+++ b/quickwit/Cargo.lock
@@ -1003,7 +1003,7 @@ dependencies = [
  "http-body 0.4.6",
  "http-body 1.0.1",
  "hyper 0.14.32",
- "hyper 1.8.1",
+ "hyper 1.9.0",
  "hyper-rustls 0.24.2",
  "hyper-rustls 0.27.7",
  "hyper-util",
@@ -3967,9 +3967,9 @@ dependencies = [
 
 [[package]]
 name = "hyper"
-version = "1.8.1"
+version = "1.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2ab2d4f250c3d7b1c9fcdff1cece94ea4e2dfbec68614f7b87cb205f24ca9d11"
+checksum = "6299f016b246a94207e63da54dbe807655bf9e00044f73ded42c3ac5305fbcca"
 dependencies = [
  "atomic-waker",
  "bytes",
@@ -3982,7 +3982,6 @@ dependencies = [
  "httpdate",
  "itoa",
  "pin-project-lite",
- "pin-utils",
  "smallvec",
  "tokio",
  "want",
@@ -4010,7 +4009,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58"
 dependencies = [
  "http 1.4.0",
- "hyper 1.8.1",
+ "hyper 1.9.0",
  "hyper-util",
  "rustls 0.23.37",
  "rustls-native-certs",
@@ -4027,7 +4026,7 @@ version = "0.5.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0"
 dependencies = [
- "hyper 1.8.1",
+ "hyper 1.9.0",
  "hyper-util",
  "pin-project-lite",
  "tokio",
@@ -4042,7 +4041,7 @@ checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0"
 dependencies = [
  "bytes",
  "http-body-util",
- "hyper 1.8.1",
+ "hyper 1.9.0",
  "hyper-util",
  "native-tls",
  "tokio",
@@ -4062,7 +4061,7 @@ dependencies = [
  "futures-util",
  "http 1.4.0",
  "http-body 1.0.1",
- "hyper 1.8.1",
+ "hyper 1.9.0",
  "ipnet",
  "libc",
  "percent-encoding",
@@ -4334,9 +4333,9 @@ checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02"
 
 [[package]]
 name = "inventory"
-version = "0.3.23"
+version = "0.3.24"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8062b737e5389949f477d4760a2ebbff0c366f97798f2419b8d8f366363d3342"
+checksum = "a4f0c30c76f2f4ccee3fe55a2435f691ca00c0e4bd87abe4f4a851b1d4dac39b"
 dependencies = [
  "rustversion",
 ]
@@ -4478,9 +4477,9 @@ dependencies = [
 
 [[package]]
 name = "js-sys"
-version = "0.3.92"
+version = "0.3.93"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cc4c90f45aa2e6eacbe8645f77fdea542ac97a494bcd117a67df9ff4d611f995"
+checksum = "797146bb2677299a1eb6b7b50a890f4c361b29ef967addf5b2fa45dae1bb6d7d"
 dependencies = [
  "cfg-if",
  "futures-util",
@@ -4599,7 +4598,7 @@ dependencies = [
  "http-body 1.0.1",
  "http-body-util",
  "http-serde",
- "hyper 1.8.1",
+ "hyper 1.9.0",
  "hyper-util",
  "lambda_runtime_api_client",
  "pin-project",
@@ -4625,7 +4624,7 @@ dependencies = [
  "http 1.4.0",
  "http-body 1.0.1",
  "http-body-util",
- "hyper 1.8.1",
+ "hyper 1.9.0",
  "hyper-util",
  "tokio",
  "tower 0.4.13",
@@ -7100,7 +7099,7 @@ dependencies = [
  "home",
  "hostname",
  "http 1.4.0",
- "hyper 1.8.1",
+ "hyper 1.9.0",
  "hyper-util",
  "itertools 0.14.0",
  "once_cell",
@@ -7385,7 +7384,7 @@ dependencies = [
  "anyhow",
  "aws-sdk-sqs",
  "futures-util",
- "hyper 1.8.1",
+ "hyper 1.9.0",
  "hyper-util",
  "itertools 0.14.0",
  "quickwit-actors",
@@ -7612,6 +7611,7 @@ version = "0.8.0"
 dependencies = [
  "anyhow",
  "arrow",
+ "base64 0.22.1",
  "chrono",
  "parquet",
  "proptest",
@@ -7858,7 +7858,7 @@ dependencies = [
  "futures",
  "http 1.4.0",
  "http-body-util",
- "hyper 1.8.1",
+ "hyper 1.9.0",
  "lru 0.16.3",
  "md5",
  "mini-moka",
@@ -8340,7 +8340,7 @@ dependencies = [
  "http 1.4.0",
  "http-body 1.0.1",
  "http-body-util",
- "hyper 1.8.1",
+ "hyper 1.9.0",
  "hyper-rustls 0.27.7",
  "hyper-tls",
  "hyper-util",
@@ -8399,7 +8399,7 @@ dependencies = [
  "futures",
  "getrandom 0.2.17",
  "http 1.4.0",
- "hyper 1.8.1",
+ "hyper 1.9.0",
  "parking_lot 0.11.2",
  "reqwest",
  "reqwest-middleware",
@@ -8420,7 +8420,7 @@ dependencies = [
  "futures",
  "getrandom 0.2.17",
  "http 1.4.0",
- "hyper 1.8.1",
+ "hyper 1.9.0",
  "reqwest",
  "reqwest-middleware",
  "retry-policies 0.5.1",
@@ -9062,9 +9062,9 @@ dependencies = [
 
 [[package]]
 name = "serde_spanned"
-version = "1.1.0"
+version = "1.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "876ac351060d4f882bb1032b6369eb0aef79ad9df1ea8bc404874d8cc3d0cd98"
+checksum = "6662b5879511e06e8999a8a235d848113e942c9124f211511b16466ee2995f26"
 dependencies = [
  "serde_core",
 ]
@@ -10345,39 +10345,39 @@ dependencies = [
 
 [[package]]
 name = "toml_datetime"
-version = "1.1.0+spec-1.1.0"
+version = "1.1.1+spec-1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "97251a7c317e03ad83774a8752a7e81fb6067740609f75ea2b585b569a59198f"
+checksum = "3165f65f62e28e0115a00b2ebdd37eb6f3b641855f9d636d3cd4103767159ad7"
 dependencies = [
  "serde_core",
 ]
 
 [[package]]
 name = "toml_edit"
-version = "0.25.8+spec-1.1.0"
+version = "0.25.9+spec-1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "16bff38f1d86c47f9ff0647e6838d7bb362522bdf44006c7068c2b1e606f1f3c"
+checksum = "da053d28fe57e2c9d21b48261e14e7b4c8b670b54d2c684847b91feaf4c7dac5"
 dependencies = [
  "indexmap 2.13.0",
- "toml_datetime 1.1.0+spec-1.1.0",
+ "toml_datetime 1.1.1+spec-1.1.0",
  "toml_parser",
  "winnow 1.0.1",
 ]
 
 [[package]]
 name = "toml_parser"
-version = "1.1.0+spec-1.1.0"
+version = "1.1.1+spec-1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2334f11ee363607eb04df9b8fc8a13ca1715a72ba8662a26ac285c98aabb4011"
+checksum = "39ca317ebc49f06bd748bfba29533eac9485569dc9bf80b849024b025e814fb9"
 dependencies = [
  "winnow 1.0.1",
 ]
 
 [[package]]
 name = "toml_writer"
-version = "1.1.0+spec-1.1.0"
+version = "1.1.1+spec-1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d282ade6016312faf3e41e57ebbba0c073e4056dab1232ab1cb624199648f8ed"
+checksum = "756daf9b1013ebe47a8776667b466417e2d4c5679d441c26230efd9ef78692db"
 
 [[package]]
 name = "tonic"
@@ -10392,7 +10392,7 @@ dependencies = [
  "http 1.4.0",
  "http-body 1.0.1",
  "http-body-util",
- "hyper 1.8.1",
+ "hyper 1.9.0",
  "hyper-timeout",
  "hyper-util",
  "percent-encoding",
@@ -10424,7 +10424,7 @@ dependencies = [
  "http 1.4.0",
  "http-body 1.0.1",
  "http-body-util",
- "hyper 1.8.1",
+ "hyper 1.9.0",
  "hyper-timeout",
  "hyper-util",
  "percent-encoding",
@@ -11187,7 +11187,7 @@ dependencies = [
  "http 1.4.0",
  "http-body 1.0.1",
  "http-body-util",
- "hyper 1.8.1",
+ "hyper 1.9.0",
  "hyper-util",
  "log",
  "mime",
@@ -11251,9 +11251,9 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen"
-version = "0.2.115"
+version = "0.2.116"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6523d69017b7633e396a89c5efab138161ed5aafcbc8d3e5c5a42ae38f50495a"
+checksum = "7dc0882f7b5bb01ae8c5215a1230832694481c1a4be062fd410e12ea3da5b631"
 dependencies = [
  "cfg-if",
  "once_cell",
@@ -11264,9 +11264,9 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-futures"
-version = "0.4.65"
+version = "0.4.66"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2d1faf851e778dfa54db7cd438b70758eba9755cb47403f3496edd7c8fc212f0"
+checksum = "19280959e2844181895ef62f065c63e0ca07ece4771b53d89bfdb967d97cbf05"
 dependencies = [
  "js-sys",
  "wasm-bindgen",
@@ -11274,9 +11274,9 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-macro"
-version = "0.2.115"
+version = "0.2.116"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4e3a6c758eb2f701ed3d052ff5737f5bfe6614326ea7f3bbac7156192dc32e67"
+checksum = "75973d3066e01d035dbedaad2864c398df42f8dd7b1ea057c35b8407c015b537"
 dependencies = [
  "quote",
  "wasm-bindgen-macro-support",
@@ -11284,9 +11284,9 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-macro-support"
-version = "0.2.115"
+version = "0.2.116"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "921de2737904886b52bcbb237301552d05969a6f9c40d261eb0533c8b055fedf"
+checksum = "91af5e4be765819e0bcfee7322c14374dc821e35e72fa663a830bbc7dc199eac"
 dependencies = [
  "bumpalo",
  "proc-macro2",
@@ -11297,9 +11297,9 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-shared"
-version = "0.2.115"
+version = "0.2.116"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a93e946af942b58934c604527337bad9ae33ba1d5c6900bbb41c2c07c2364a93"
+checksum = "c9bf0406a78f02f336bf1e451799cca198e8acde4ffa278f0fb20487b150a633"
 dependencies = [
  "unicode-ident",
 ]
@@ -11382,9 +11382,9 @@ dependencies = [
 
 [[package]]
 name = "web-sys"
-version = "0.3.92"
+version = "0.3.93"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "84cde8507f4d7cfcb1185b8cb5890c494ffea65edbe1ba82cfd63661c805ed94"
+checksum = "749466a37ee189057f54748b200186b59a03417a117267baf3fd89cecc9fb837"
 dependencies = [
  "js-sys",
  "wasm-bindgen",
@@ -11886,7 +11886,7 @@ dependencies = [
  "futures",
  "http 1.4.0",
  "http-body-util",
- "hyper 1.8.1",
+ "hyper 1.9.0",
  "hyper-util",
  "log",
  "once_cell",
diff --git a/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs b/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs
index cfb2ade9361..a102d0b4f0c 100644
--- a/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs
+++ b/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs
@@ -638,9 +638,11 @@ impl IndexingPipeline {
 
         // ParquetPackager
         let writer_config = quickwit_parquet_engine::storage::ParquetWriterConfig::default();
+        let table_config = quickwit_parquet_engine::table_config::TableConfig::default();
         let split_writer = quickwit_parquet_engine::storage::ParquetSplitWriter::new(
             writer_config,
             self.params.indexing_directory.path(),
+            &table_config,
         );
         let parquet_packager = ParquetPackager::new(split_writer, parquet_uploader_mailbox);
         let (parquet_packager_mailbox, parquet_packager_handle) = ctx
diff --git a/quickwit/quickwit-indexing/src/actors/parquet_doc_processor.rs b/quickwit/quickwit-indexing/src/actors/parquet_doc_processor.rs
index eb51621a30f..894b0251412 100644
--- a/quickwit/quickwit-indexing/src/actors/parquet_doc_processor.rs
+++ b/quickwit/quickwit-indexing/src/actors/parquet_doc_processor.rs
@@ -549,7 +549,8 @@ mod tests {
 
         // Create ParquetPackager
         let writer_config = ParquetWriterConfig::default();
-        let split_writer = ParquetSplitWriter::new(writer_config, temp_dir.path());
+        let table_config = quickwit_parquet_engine::table_config::TableConfig::default();
+        let split_writer = ParquetSplitWriter::new(writer_config, temp_dir.path(), &table_config);
         let packager = ParquetPackager::new(split_writer, uploader_mailbox);
         let (packager_mailbox, packager_handle) = universe.spawn_builder().spawn(packager);
 
diff --git a/quickwit/quickwit-indexing/src/actors/parquet_e2e_test.rs b/quickwit/quickwit-indexing/src/actors/parquet_e2e_test.rs
index d613fc96003..fd83e0e3674 100644
--- a/quickwit/quickwit-indexing/src/actors/parquet_e2e_test.rs
+++ b/quickwit/quickwit-indexing/src/actors/parquet_e2e_test.rs
@@ -172,7 +172,8 @@ async fn test_metrics_pipeline_e2e() {
 
     // ParquetPackager between indexer and uploader
     let writer_config = ParquetWriterConfig::default();
-    let split_writer = ParquetSplitWriter::new(writer_config, temp_dir.path());
+    let table_config = quickwit_parquet_engine::table_config::TableConfig::default();
+    let split_writer = ParquetSplitWriter::new(writer_config, temp_dir.path(), &table_config);
     let packager = ParquetPackager::new(split_writer, uploader_mailbox);
     let (packager_mailbox, packager_handle) = universe.spawn_builder().spawn(packager);
 
diff --git a/quickwit/quickwit-indexing/src/actors/parquet_indexer.rs b/quickwit/quickwit-indexing/src/actors/parquet_indexer.rs
index 254ff2dc719..d860dd2f1b6 100644
--- a/quickwit/quickwit-indexing/src/actors/parquet_indexer.rs
+++ b/quickwit/quickwit-indexing/src/actors/parquet_indexer.rs
@@ -593,7 +593,8 @@ mod tests {
         uploader_mailbox: Mailbox<ParquetUploader>,
     ) -> (Mailbox<ParquetPackager>, ActorHandle<ParquetPackager>) {
         let writer_config = ParquetWriterConfig::default();
-        let split_writer = ParquetSplitWriter::new(writer_config, temp_dir);
+        let table_config = quickwit_parquet_engine::table_config::TableConfig::default();
+        let split_writer = ParquetSplitWriter::new(writer_config, temp_dir, &table_config);
 
         let packager = ParquetPackager::new(split_writer, uploader_mailbox);
         universe.spawn_builder().spawn(packager)
diff --git a/quickwit/quickwit-indexing/src/actors/parquet_packager.rs b/quickwit/quickwit-indexing/src/actors/parquet_packager.rs
index b0950141b10..7dd70e645a9 100644
--- a/quickwit/quickwit-indexing/src/actors/parquet_packager.rs
+++ b/quickwit/quickwit-indexing/src/actors/parquet_packager.rs
@@ -274,7 +274,8 @@ mod tests {
         uploader_mailbox: Mailbox<ParquetUploader>,
     ) -> (Mailbox<ParquetPackager>, ActorHandle<ParquetPackager>) {
         let writer_config = ParquetWriterConfig::default();
-        let split_writer = ParquetSplitWriter::new(writer_config, temp_dir);
+        let table_config = quickwit_parquet_engine::table_config::TableConfig::default();
+        let split_writer = ParquetSplitWriter::new(writer_config, temp_dir, &table_config);
 
         let packager = ParquetPackager::new(split_writer, uploader_mailbox);
         universe.spawn_builder().spawn(packager)
diff --git a/quickwit/quickwit-parquet-engine/Cargo.toml b/quickwit/quickwit-parquet-engine/Cargo.toml
index 9744dbcc2bc..39918c0948c 100644
--- a/quickwit/quickwit-parquet-engine/Cargo.toml
+++ b/quickwit/quickwit-parquet-engine/Cargo.toml
@@ -13,8 +13,10 @@ license.workspace = true
 [dependencies]
 anyhow = { workspace = true }
 arrow = { workspace = true }
+base64 = { workspace = true }
 chrono = { workspace = true }
 parquet = { workspace = true }
+prost = { workspace = true }
 quickwit-common = { workspace = true }
 quickwit-proto = { workspace = true }
 sea-query = { workspace = true, optional = true }
diff --git a/quickwit/quickwit-parquet-engine/src/storage/config.rs b/quickwit/quickwit-parquet-engine/src/storage/config.rs
index 2eb63d73510..a8d675d3c12 100644
--- a/quickwit/quickwit-parquet-engine/src/storage/config.rs
+++ b/quickwit/quickwit-parquet-engine/src/storage/config.rs
@@ -16,7 +16,7 @@
 
 use arrow::datatypes::{DataType, Schema as ArrowSchema};
 use parquet::basic::Compression as ParquetCompression;
-use parquet::file::metadata::SortingColumn;
+use parquet::file::metadata::{KeyValue, SortingColumn};
 use parquet::file::properties::{EnabledStatistics, WriterProperties, WriterPropertiesBuilder};
 use parquet::schema::types::ColumnPath;
 
@@ -119,20 +119,42 @@ impl ParquetWriterConfig {
     }
 
     /// Convert to Parquet WriterProperties using the given Arrow schema to configure
-    /// per-column settings like dictionary encoding and bloom filters.
+    /// per-column settings like dictionary encoding and bloom filters, with an empty
+    /// sort order and no metadata.
+    ///
+    /// Prefer `to_writer_properties_with_metadata()` in production — this method
+    /// is mainly for tests that don't care about sort order.
     pub fn to_writer_properties(&self, schema: &ArrowSchema) -> WriterProperties {
+        self.to_writer_properties_with_metadata(schema, Vec::new(), None)
+    }
+
+    /// Convert to Parquet WriterProperties with sorting columns and optional key_value_metadata.
+    ///
+    /// `sorting_cols` is produced by `ParquetWriter::sorting_columns()` from the
+    /// resolved table_config sort fields.
+    ///
+    /// When `kv_metadata` is provided, the entries are embedded in the Parquet file's
+    /// key_value_metadata, making files self-describing (META-07).
+    pub fn to_writer_properties_with_metadata(
+        &self,
+        schema: &ArrowSchema,
+        sorting_cols: Vec<SortingColumn>,
+        kv_metadata: Option<Vec<KeyValue>>,
+    ) -> WriterProperties {
         let mut builder = WriterProperties::builder()
             .set_max_row_group_size(self.row_group_size)
             .set_data_page_size_limit(self.data_page_size)
             .set_write_batch_size(self.write_batch_size)
-            // Enable column index for efficient pruning on sorted data (64 bytes default)
             .set_column_index_truncate_length(Some(64))
-            // Set sorting columns metadata for readers to use during pruning
-            .set_sorting_columns(Some(Self::sorting_columns(schema)))
-            // Enable row group level statistics (min/max/null_count) for query pruning
-            // This allows DataFusion to skip row groups based on timestamp ranges
+            .set_sorting_columns(Some(sorting_cols))
             .set_statistics_enabled(EnabledStatistics::Chunk);
 
+        if let Some(kvs) = kv_metadata
+            && !kvs.is_empty()
+        {
+            builder = builder.set_key_value_metadata(Some(kvs));
+        }
+
         builder = match self.compression {
             Compression::Zstd => {
                 let level = self.compression_level.unwrap_or(DEFAULT_ZSTD_LEVEL);
@@ -184,20 +206,6 @@ impl ParquetWriterConfig {
         }
         builder
     }
-
-    /// Get the sorting columns for parquet metadata, computed from the schema
-    /// and SORT_ORDER. Only columns present in the schema are included.
-    fn sorting_columns(schema: &ArrowSchema) -> Vec<SortingColumn> {
-        SORT_ORDER
-            .iter()
-            .filter_map(|name| schema.index_of(name).ok())
-            .map(|idx| SortingColumn {
-                column_idx: idx as i32,
-                descending: false,
-                nulls_first: false,
-            })
-            .collect()
-    }
 }
 
 #[cfg(test)]
@@ -370,34 +378,4 @@ mod tests {
             );
         }
     }
-
-    #[test]
-    fn test_sorting_columns_order() {
-        let schema = create_test_schema();
-        let sorting_cols = ParquetWriterConfig::sorting_columns(&schema);
-
-        // The test schema has metric_name (idx 0), timestamp_secs (idx 2),
-        // service (idx 4), env (idx 5), host (idx 6).
-        // SORT_ORDER is: metric_name, service, env, datacenter, region, host, timestamp_secs
-        // Only present columns are included, so: metric_name, service, env, host, timestamp_secs
-        assert_eq!(
-            sorting_cols.len(),
-            5,
-            "should have 5 sorting columns from the test schema"
-        );
-
-        // Verify all are ascending with nulls first
-        for col in &sorting_cols {
-            assert!(!col.descending, "sorting should be ascending");
-            assert!(!col.nulls_first, "nulls should be last");
-        }
-
-        // Verify order matches SORT_ORDER filtered by schema presence:
-        // metric_name (idx 0), service (idx 4), env (idx 5), host (idx 6), timestamp_secs (idx 2)
-        assert_eq!(sorting_cols[0].column_idx, 0); // metric_name
-        assert_eq!(sorting_cols[1].column_idx, 4); // service
-        assert_eq!(sorting_cols[2].column_idx, 5); // env
-        assert_eq!(sorting_cols[3].column_idx, 6); // host
-        assert_eq!(sorting_cols[4].column_idx, 2); // timestamp_secs
-    }
 }
diff --git a/quickwit/quickwit-parquet-engine/src/storage/split_writer.rs b/quickwit/quickwit-parquet-engine/src/storage/split_writer.rs
index 466e2c9cdcb..fe10b192ff6 100644
--- a/quickwit/quickwit-parquet-engine/src/storage/split_writer.rs
+++ b/quickwit/quickwit-parquet-engine/src/storage/split_writer.rs
@@ -25,12 +25,16 @@ use tracing::{debug, info, instrument};
 
 use super::config::ParquetWriterConfig;
 use super::writer::{ParquetWriteError, ParquetWriter};
+use crate::sort_fields::window_start;
 use crate::split::{MetricsSplitMetadata, ParquetSplit, SplitId, TAG_SERVICE, TimeRange};
+use crate::table_config::TableConfig;
 
 /// Writer that produces complete ParquetSplit with metadata from RecordBatch data.
 pub struct ParquetSplitWriter {
     /// The underlying Parquet writer.
     writer: ParquetWriter,
+    /// Table configuration (sort fields, window duration, product type).
+    table_config: TableConfig,
     /// Base directory for split files.
     base_path: PathBuf,
 }
@@ -41,9 +45,15 @@ impl ParquetSplitWriter {
     /// # Arguments
     /// * `config` - Parquet writer configuration
     /// * `base_path` - Directory where split files will be written
-    pub fn new(config: ParquetWriterConfig, base_path: impl Into<PathBuf>) -> Self {
+    /// * `table_config` - Table-level config (sort fields, window duration)
+    pub fn new(
+        config: ParquetWriterConfig,
+        base_path: impl Into<PathBuf>,
+        table_config: &TableConfig,
+    ) -> Self {
         Self {
-            writer: ParquetWriter::new(config),
+            writer: ParquetWriter::new(config, table_config),
+            table_config: table_config.clone(),
             base_path: base_path.into(),
         }
     }
@@ -55,6 +65,10 @@ impl ParquetSplitWriter {
 
     /// Write a RecordBatch to a Parquet file and return a ParquetSplit with metadata.
     ///
+    /// Builds metadata (including sort fields, window_start, window_duration from
+    /// table_config) before writing, then embeds compaction KV metadata into the
+    /// Parquet file. size_bytes is updated after the write completes.
+    ///
     /// # Arguments
     /// * `batch` - The RecordBatch to write
     /// * `index_uid` - The index unique identifier for the split metadata
@@ -67,18 +81,13 @@ impl ParquetSplitWriter {
         batch: &RecordBatch,
         index_uid: &str,
     ) -> Result<ParquetSplit, ParquetWriteError> {
-        // Generate unique split ID
         let split_id = SplitId::generate();
+        let filename = format!("{}.parquet", split_id);
+        let file_path = self.base_path.join(&filename);
 
-        let file_path = self.base_path.join(format!("{}.parquet", split_id));
-
-        // Ensure the base directory exists
         std::fs::create_dir_all(&self.base_path)?;
 
-        // Write batch to file
-        let size_bytes = self.writer.write_to_file(batch, &file_path)?;
-
-        // Extract time range from batch
+        // Extract batch-level metadata before writing.
         let time_range = extract_time_range(batch)?;
         debug!(
             start_secs = time_range.start_secs,
@@ -86,36 +95,60 @@ impl ParquetSplitWriter {
             "extracted time range from batch"
         );
 
-        // Extract distinct metric names from batch
         let metric_names = extract_metric_names(batch)?;
-
-        // Extract distinct service names from batch
         let service_names = extract_service_names(batch)?;
 
-        // Build metadata
-        let metadata = MetricsSplitMetadata::builder()
+        // Compute window_start from the earliest timestamp in the batch.
+        let window_duration = self.table_config.window_duration_secs;
+        let window_start_secs = if window_duration > 0 && time_range.start_secs > 0 {
+            match window_start(time_range.start_secs as i64, window_duration as i64) {
+                Ok(dt) => Some(dt.timestamp()),
+                Err(e) => {
+                    tracing::warn!(error = %e, "failed to compute window_start, omitting");
+                    None
+                }
+            }
+        } else {
+            None
+        };
+
+        // Build metadata with sort fields and window from table_config.
+        // size_bytes is set to 0 here and updated after write.
+        let mut builder = MetricsSplitMetadata::builder()
             .split_id(split_id.clone())
             .index_uid(index_uid)
             .time_range(time_range)
             .num_rows(batch.num_rows() as u64)
-            .size_bytes(size_bytes);
+            .size_bytes(0)
+            .sort_fields(self.writer.sort_fields_string())
+            .window_duration_secs(window_duration)
+            .parquet_file(filename);
+
+        if let Some(ws) = window_start_secs {
+            builder = builder.window_start_secs(ws);
+        }
+
+        for name in metric_names {
+            builder = builder.add_metric_name(name);
+        }
+        for name in service_names {
+            builder = builder.add_low_cardinality_tag(TAG_SERVICE, name);
+        }
 
-        // Add metric names
-        let metadata = metric_names
-            .into_iter()
-            .fold(metadata, |m, name| m.add_metric_name(name));
+        let mut metadata = builder.build();
 
-        // Add service names as low-cardinality tags
-        let metadata = service_names.into_iter().fold(metadata, |m, name| {
-            m.add_low_cardinality_tag(TAG_SERVICE, name)
-        });
+        // Write with compaction metadata embedded in Parquet KV metadata.
+        let size_bytes =
+            self.writer
+                .write_to_file_with_metadata(batch, &file_path, Some(&metadata))?;
 
-        let metadata = metadata.build();
+        metadata.size_bytes = size_bytes;
 
         info!(
             split_id = %split_id,
             file_path = %file_path.display(),
             size_bytes,
+            sort_fields = %self.writer.sort_fields_string(),
             "split file written successfully"
         );
 
@@ -296,7 +329,7 @@ mod tests {
         let config = ParquetWriterConfig::default();
         let temp_dir = tempfile::tempdir().unwrap();
 
-        let writer = ParquetSplitWriter::new(config, temp_dir.path());
+        let writer = ParquetSplitWriter::new(config, temp_dir.path(), &TableConfig::default());
 
         let batch = create_test_batch(10);
         let split = writer.write_split(&batch, "test-index").unwrap();
@@ -318,7 +351,7 @@ mod tests {
         let config = ParquetWriterConfig::default();
         let temp_dir = tempfile::tempdir().unwrap();
 
-        let writer = ParquetSplitWriter::new(config, temp_dir.path());
+        let writer = ParquetSplitWriter::new(config, temp_dir.path(), &TableConfig::default());
 
         // Create batch with timestamps [100, 150, 200]
         let batch = create_test_batch_with_options(
@@ -340,7 +373,7 @@ mod tests {
         let config = ParquetWriterConfig::default();
         let temp_dir = tempfile::tempdir().unwrap();
 
-        let writer = ParquetSplitWriter::new(config, temp_dir.path());
+        let writer = ParquetSplitWriter::new(config, temp_dir.path(), &TableConfig::default());
 
         // Create batch with specific metric names
         let batch = create_test_batch_with_options(
diff --git a/quickwit/quickwit-parquet-engine/src/storage/writer.rs b/quickwit/quickwit-parquet-engine/src/storage/writer.rs
index 6f29c0be4cc..d45c0a0c9cb 100644
--- a/quickwit/quickwit-parquet-engine/src/storage/writer.rs
+++ b/quickwit/quickwit-parquet-engine/src/storage/writer.rs
@@ -21,13 +21,116 @@ use std::sync::Arc;
 
 use arrow::array::RecordBatch;
 use arrow::compute::{SortColumn, SortOptions, lexsort_to_indices, take_record_batch};
+use base64::Engine as _;
+use base64::engine::general_purpose::STANDARD as BASE64;
 use parquet::arrow::ArrowWriter;
 use parquet::errors::ParquetError;
+use parquet::file::metadata::{KeyValue, SortingColumn};
+use parquet::file::properties::WriterProperties;
 use thiserror::Error;
 use tracing::{debug, instrument};
 
 use super::config::ParquetWriterConfig;
-use crate::schema::{SORT_ORDER, validate_required_fields};
+use crate::schema::validate_required_fields;
+use crate::sort_fields::parse_sort_fields;
+use crate::split::MetricsSplitMetadata;
+use crate::table_config::TableConfig;
+
+/// Parquet key_value_metadata keys for compaction metadata.
+/// Prefixed with "qh." to avoid collision with standard Parquet/Arrow keys.
+pub(crate) const PARQUET_META_SORT_FIELDS: &str = "qh.sort_fields";
+pub(crate) const PARQUET_META_WINDOW_START: &str = "qh.window_start";
+pub(crate) const PARQUET_META_WINDOW_DURATION: &str = "qh.window_duration_secs";
+pub(crate) const PARQUET_META_NUM_MERGE_OPS: &str = "qh.num_merge_ops";
+pub(crate) const PARQUET_META_ROW_KEYS: &str = "qh.row_keys";
+pub(crate) const PARQUET_META_ROW_KEYS_JSON: &str = "qh.row_keys_json";
+
+/// Build Parquet key_value_metadata entries for compaction metadata.
+/// Returns Vec<KeyValue> that can be added to WriterProperties.
+///
+/// Only populated fields are included -- pre-Phase-31 splits produce an empty vec.
+pub(crate) fn build_compaction_key_value_metadata(
+    metadata: &MetricsSplitMetadata,
+) -> Vec<KeyValue> {
+    // TW-2: window_duration must divide 3600.
+    debug_assert!(
+        metadata.window_duration_secs() == 0 || 3600 % metadata.window_duration_secs() == 0,
+        "TW-2 violated at Parquet write: window_duration_secs={} does not divide 3600",
+        metadata.window_duration_secs()
+    );
+
+    let mut kvs = Vec::new();
+
+    if !metadata.sort_fields.is_empty() {
+        kvs.push(KeyValue::new(
+            PARQUET_META_SORT_FIELDS.to_string(),
+            metadata.sort_fields.clone(),
+        ));
+    }
+
+    if let Some(ws) = metadata.window_start() {
+        kvs.push(KeyValue::new(
+            PARQUET_META_WINDOW_START.to_string(),
+            ws.to_string(),
+        ));
+    }
+
+    if metadata.window_duration_secs() > 0 {
+        kvs.push(KeyValue::new(
+            PARQUET_META_WINDOW_DURATION.to_string(),
+            metadata.window_duration_secs().to_string(),
+        ));
+    }
+
+    if metadata.num_merge_ops > 0 {
+        kvs.push(KeyValue::new(
+            PARQUET_META_NUM_MERGE_OPS.to_string(),
+            metadata.num_merge_ops.to_string(),
+        ));
+    }
+
+    if let Some(ref row_keys_bytes) = metadata.row_keys_proto {
+        kvs.push(KeyValue::new(
+            PARQUET_META_ROW_KEYS.to_string(),
+            BASE64.encode(row_keys_bytes),
+        ));
+
+        // Debug: human-readable JSON (best-effort).
+        if let Ok(row_keys) = <quickwit_proto::sortschema::RowKeys as prost::Message>::decode(
+            row_keys_bytes.as_slice(),
+        ) && let Ok(json) = serde_json::to_string(&row_keys)
+        {
+            kvs.push(KeyValue::new(PARQUET_META_ROW_KEYS_JSON.to_string(), json));
+        }
+    }
+
+    kvs
+}
+
+/// SS-5: Verify that the kv_metadata entries match the source MetricsSplitMetadata.
+fn verify_ss5_kv_consistency(metadata: &MetricsSplitMetadata, kvs: &[KeyValue]) {
+    let find_kv = |key: &str| -> Option<&str> {
+        kvs.iter()
+            .find(|kv| kv.key == key)
+            .and_then(|kv| kv.value.as_deref())
+    };
+
+    if !metadata.sort_fields.is_empty() {
+        debug_assert_eq!(
+            find_kv(PARQUET_META_SORT_FIELDS),
+            Some(metadata.sort_fields.as_str()),
+            "SS-5 violated: sort_fields in kv_metadata does not match MetricsSplitMetadata"
+        );
+    }
+
+    if let Some(ws) = metadata.window_start() {
+        debug_assert_eq!(
+            find_kv(PARQUET_META_WINDOW_START),
+            Some(ws.to_string()).as_deref(),
+            "SS-5 violated: window_start in kv_metadata does not match MetricsSplitMetadata"
+        );
+    }
+}
 
 /// Errors that can occur during parquet writing.
 #[derive(Debug, Error)]
@@ -49,15 +152,37 @@ pub enum ParquetWriteError {
     SchemaValidation(String),
 }
 
+/// A resolved sort field: a column name with its sort direction.
+struct ResolvedSortField {
+    name: String,
+    descending: bool,
+}
+
 /// Writer for metrics data to Parquet format.
 pub struct ParquetWriter {
     config: ParquetWriterConfig,
+    /// Physical sort columns resolved from TableConfig, in sort priority order.
+    resolved_sort_fields: Vec<ResolvedSortField>,
+    /// The original sort fields string from TableConfig, stored verbatim in metadata.
+    sort_fields_string: String,
 }
 
 impl ParquetWriter {
-    /// Create a new ParquetWriter.
-    pub fn new(config: ParquetWriterConfig) -> Self {
-        Self { config }
+    /// Create a new ParquetWriter with sort order driven by `table_config`.
+    ///
+    /// Parses `table_config.effective_sort_fields()`, resolves each column name
+    /// to a `ParquetField`. Columns not in the physical schema (e.g., `timeseries_id`)
+    /// are skipped for sorting but recorded in the metadata string.
+    ///
+    /// The writer validates and sorts dynamically from the batch at write time.
+    pub fn new(config: ParquetWriterConfig, table_config: &TableConfig) -> Self {
+        let sort_fields_string = table_config.effective_sort_fields().to_string();
+        let resolved_sort_fields = resolve_sort_fields(&sort_fields_string);
+        Self {
+            config,
+            resolved_sort_fields,
+            sort_fields_string,
+        }
     }
 
     /// Get the writer configuration.
@@ -65,20 +190,50 @@ impl ParquetWriter {
         &self.config
     }
 
-    /// Sort a RecordBatch by the metrics sort order.
-    /// Columns from SORT_ORDER that are present in the batch schema are used;
-    /// missing columns are skipped.
+    /// Get the sort fields string (for metadata).
+    pub fn sort_fields_string(&self) -> &str {
+        &self.sort_fields_string
+    }
+
+    /// Build `SortingColumn` entries for Parquet file metadata.
+    /// Columns from resolved sort fields that are present in the batch schema are included.
+    fn sorting_columns(&self, batch: &RecordBatch) -> Vec<SortingColumn> {
+        let schema = batch.schema();
+        self.resolved_sort_fields
+            .iter()
+            .filter_map(|sf| {
+                schema
+                    .index_of(sf.name.as_str())
+                    .ok()
+                    .map(|idx| SortingColumn {
+                        column_idx: idx as i32,
+                        descending: sf.descending,
+                        nulls_first: true,
+                    })
+            })
+            .collect()
+    }
+
+    /// Sort a RecordBatch according to the table_config sort fields.
+    /// Columns from the resolved sort fields that are present in the batch schema
+    /// are used; missing columns are skipped. This sorting enables efficient pruning
+    /// during query execution.
     fn sort_batch(&self, batch: &RecordBatch) -> Result<RecordBatch, ParquetWriteError> {
         let schema = batch.schema();
-        let mut sort_columns: Vec<SortColumn> = SORT_ORDER
+        let mut sort_columns: Vec<SortColumn> = self
+            .resolved_sort_fields
             .iter()
-            .filter_map(|name| schema.index_of(name).ok())
-            .map(|idx| SortColumn {
-                values: Arc::clone(batch.column(idx)),
-                options: Some(SortOptions {
-                    descending: false,
-                    nulls_first: false,
-                }),
+            .filter_map(|sf| {
+                schema
+                    .index_of(sf.name.as_str())
+                    .ok()
+                    .map(|idx| SortColumn {
+                        values: Arc::clone(batch.column(idx)),
+                        options: Some(SortOptions {
+                            descending: sf.descending,
+                            nulls_first: true,
+                        }),
+                    })
             })
             .collect();
 
@@ -103,60 +258,136 @@ impl ParquetWriter {
         });
 
         let indices = lexsort_to_indices(&sort_columns, None)?;
-        Ok(take_record_batch(batch, &indices)?)
+        let sorted_batch = take_record_batch(batch, &indices)?;
+
+        // SS-1: verify the output is actually sorted.
+        #[cfg(debug_assertions)]
+        {
+            if sorted_batch.num_rows() > 1 {
+                let verify_columns: Vec<SortColumn> = self
+                    .resolved_sort_fields
+                    .iter()
+                    .filter_map(|sf| {
+                        schema
+                            .index_of(sf.name.as_str())
+                            .ok()
+                            .map(|idx| SortColumn {
+                                values: Arc::clone(sorted_batch.column(idx)),
+                                options: Some(SortOptions {
+                                    descending: sf.descending,
+                                    nulls_first: true,
+                                }),
+                            })
+                    })
+                    .collect();
+                let verify_indices = lexsort_to_indices(&verify_columns, None)
+                    .expect("SS-1 verification sort failed");
+                for i in 0..verify_indices.len() {
+                    debug_assert_eq!(
+                        verify_indices.value(i) as usize,
+                        i,
+                        "SS-1 violated: row {} is out of sort order after sort_batch()",
+                        i
+                    );
+                }
+            }
+        }
+
+        Ok(sorted_batch)
     }
 
-    /// Write a RecordBatch to Parquet bytes in memory.
-    /// The batch is sorted before writing by: metric_name, common tags, timestamp.
-    #[instrument(skip(self, batch), fields(batch_rows = batch.num_rows()))]
-    pub fn write_to_bytes(&self, batch: &RecordBatch) -> Result<Vec<u8>, ParquetWriteError> {
+    /// Validate, sort, and build WriterProperties for a batch.
+    fn prepare_write(
+        &self,
+        batch: &RecordBatch,
+        split_metadata: Option<&MetricsSplitMetadata>,
+    ) -> Result<(RecordBatch, WriterProperties), ParquetWriteError> {
         validate_required_fields(&batch.schema())
             .map_err(|e| ParquetWriteError::SchemaValidation(e.to_string()))?;
-
-        // Sort the batch before writing for efficient pruning
         let sorted_batch = self.sort_batch(batch)?;
 
-        let props = self.config.to_writer_properties(&sorted_batch.schema());
-        let buffer = Cursor::new(Vec::new());
+        let kv_metadata = split_metadata.map(build_compaction_key_value_metadata);
+
+        // SS-5: verify kv_metadata sort_fields matches source.
+        if let (Some(meta), Some(kvs)) = (split_metadata, &kv_metadata) {
+            verify_ss5_kv_consistency(meta, kvs);
+        }
+
+        let props = self.config.to_writer_properties_with_metadata(
+            &sorted_batch.schema(),
+            self.sorting_columns(&sorted_batch),
+            kv_metadata,
+        );
+        Ok((sorted_batch, props))
+    }
+
+    /// Write a RecordBatch to Parquet bytes in memory.
+    #[instrument(skip(self, batch), fields(batch_rows = batch.num_rows()))]
+    pub fn write_to_bytes(
+        &self,
+        batch: &RecordBatch,
+        split_metadata: Option<&MetricsSplitMetadata>,
+    ) -> Result<Vec<u8>, ParquetWriteError> {
+        let (sorted_batch, props) = self.prepare_write(batch, split_metadata)?;
 
+        let buffer = Cursor::new(Vec::new());
         let mut writer = ArrowWriter::try_new(buffer, sorted_batch.schema(), Some(props))?;
         writer.write(&sorted_batch)?;
-        let buffer = writer.into_inner()?;
+        let bytes = writer.into_inner()?.into_inner();
 
-        let bytes = buffer.into_inner();
         debug!(bytes_written = bytes.len(), "completed write to bytes");
         Ok(bytes)
     }
 
-    /// Write a RecordBatch to a Parquet file.
-    /// The batch is sorted before writing by: metric_name, common tags, timestamp.
+    /// Write a RecordBatch to a Parquet file with optional compaction metadata.
     ///
     /// Returns the number of bytes written.
-    #[instrument(skip(self, batch), fields(batch_rows = batch.num_rows(), path = %path.display()))]
-    pub fn write_to_file(
+    #[instrument(skip(self, batch, split_metadata), fields(batch_rows = batch.num_rows(), path = %path.display()))]
+    pub fn write_to_file_with_metadata(
         &self,
         batch: &RecordBatch,
         path: &Path,
+        split_metadata: Option<&MetricsSplitMetadata>,
     ) -> Result<u64, ParquetWriteError> {
-        validate_required_fields(&batch.schema())
-            .map_err(|e| ParquetWriteError::SchemaValidation(e.to_string()))?;
-
-        // Sort the batch before writing for efficient pruning
-        let sorted_batch = self.sort_batch(batch)?;
+        let (sorted_batch, props) = self.prepare_write(batch, split_metadata)?;
 
-        let props = self.config.to_writer_properties(&sorted_batch.schema());
         let file = File::create(path)?;
-
         let mut writer = ArrowWriter::try_new(file, sorted_batch.schema(), Some(props))?;
         writer.write(&sorted_batch)?;
-        let file = writer.into_inner()?;
 
-        let bytes_written = file.metadata()?.len();
+        let bytes_written = writer.into_inner()?.metadata()?.len();
         debug!(bytes_written, "completed write to file");
         Ok(bytes_written)
     }
 }
 
+/// Parse a sort fields string and resolve column names to physical `ParquetField`s.
+///
+/// Columns not present in the current schema (e.g., `timeseries_id`) are silently
+/// skipped — they are recorded in the metadata string but do not affect physical sort.
+fn resolve_sort_fields(sort_fields_str: &str) -> Vec<ResolvedSortField> {
+    let schema = match parse_sort_fields(sort_fields_str) {
+        Ok(s) => s,
+        Err(e) => {
+            tracing::warn!(sort_fields = sort_fields_str, error = %e, "failed to parse sort fields, using empty sort order");
+            return Vec::new();
+        }
+    };
+
+    schema
+        .column
+        .iter()
+        .map(|col| {
+            let descending = col.sort_direction
+                == quickwit_proto::sortschema::SortColumnDirection::SortDirectionDescending as i32;
+            ResolvedSortField {
+                name: col.name.clone(),
+                descending,
+            }
+        })
+        .collect()
+}
+
 #[cfg(test)]
 mod tests {
     use std::sync::Arc;
@@ -176,18 +407,17 @@ mod tests {
     #[test]
     fn test_writer_creation() {
         let config = ParquetWriterConfig::default();
-        let _writer = ParquetWriter::new(config);
+        let _writer = ParquetWriter::new(config, &TableConfig::default());
     }
 
     #[test]
     fn test_write_to_bytes() {
         let config = ParquetWriterConfig::default();
-        let writer = ParquetWriter::new(config);
+        let writer = ParquetWriter::new(config, &TableConfig::default());
 
         let batch = create_test_batch();
-        let bytes = writer.write_to_bytes(&batch).unwrap();
+        let bytes = writer.write_to_bytes(&batch, None).unwrap();
 
-        // Parquet files start with PAR1 magic bytes
         assert!(bytes.len() > 4);
         assert_eq!(&bytes[0..4], b"PAR1");
     }
@@ -195,23 +425,24 @@ mod tests {
     #[test]
     fn test_write_to_file() {
         let config = ParquetWriterConfig::default();
-        let writer = ParquetWriter::new(config);
+        let writer = ParquetWriter::new(config, &TableConfig::default());
 
         let batch = create_test_batch();
         let temp_dir = std::env::temp_dir();
         let path = temp_dir.join("test_metrics.parquet");
 
-        let bytes_written = writer.write_to_file(&batch, &path).unwrap();
+        let bytes_written = writer
+            .write_to_file_with_metadata(&batch, &path, None)
+            .unwrap();
         assert!(bytes_written > 0);
 
-        // Clean up
         std::fs::remove_file(&path).ok();
     }
 
     #[test]
     fn test_schema_validation_missing_field() {
         let config = ParquetWriterConfig::default();
-        let writer = ParquetWriter::new(config);
+        let writer = ParquetWriter::new(config, &TableConfig::default());
 
         // Create a batch missing required fields
         let wrong_schema = Arc::new(Schema::new(vec![Field::new(
@@ -225,37 +456,7 @@ mod tests {
         )
         .unwrap();
 
-        let result = writer.write_to_bytes(&wrong_batch);
-        assert!(matches!(
-            result,
-            Err(ParquetWriteError::SchemaValidation(_))
-        ));
-    }
-
-    #[test]
-    fn test_schema_validation_wrong_type() {
-        let config = ParquetWriterConfig::default();
-        let writer = ParquetWriter::new(config);
-
-        // Create a batch where metric_name has wrong type (Utf8 instead of Dictionary)
-        let wrong_schema = Arc::new(Schema::new(vec![
-            Field::new("metric_name", DataType::Utf8, false),
-            Field::new("metric_type", DataType::UInt8, false),
-            Field::new("timestamp_secs", DataType::UInt64, false),
-            Field::new("value", DataType::Float64, false),
-        ]));
-        let wrong_batch = RecordBatch::try_new(
-            wrong_schema,
-            vec![
-                Arc::new(StringArray::from(vec!["test"])) as ArrayRef,
-                Arc::new(UInt8Array::from(vec![0u8])) as ArrayRef,
-                Arc::new(UInt64Array::from(vec![100u64])) as ArrayRef,
-                Arc::new(Float64Array::from(vec![1.0])) as ArrayRef,
-            ],
-        )
-        .unwrap();
-
-        let result = writer.write_to_bytes(&wrong_batch);
+        let result = writer.write_to_bytes(&wrong_batch, None);
         assert!(matches!(
             result,
             Err(ParquetWriteError::SchemaValidation(_))
@@ -267,10 +468,10 @@ mod tests {
         use super::super::config::Compression;
 
         let config = ParquetWriterConfig::new().with_compression(Compression::Snappy);
-        let writer = ParquetWriter::new(config);
+        let writer = ParquetWriter::new(config, &TableConfig::default());
 
         let batch = create_test_batch();
-        let bytes = writer.write_to_bytes(&batch).unwrap();
+        let bytes = writer.write_to_bytes(&batch, None).unwrap();
 
         assert!(bytes.len() > 4);
         assert_eq!(&bytes[0..4], b"PAR1");
@@ -283,7 +484,7 @@ mod tests {
         use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
 
         let config = ParquetWriterConfig::default();
-        let writer = ParquetWriter::new(config);
+        let writer = ParquetWriter::new(config, &TableConfig::default());
 
         // Create a schema with required fields + service tag for sort verification
         let schema = Arc::new(Schema::new(vec![
@@ -331,12 +532,12 @@ mod tests {
         )
         .unwrap();
 
-        // Write to file (will be sorted)
         let temp_dir = std::env::temp_dir();
         let path = temp_dir.join("test_sorting.parquet");
-        writer.write_to_file(&batch, &path).unwrap();
+        writer
+            .write_to_file_with_metadata(&batch, &path, None)
+            .unwrap();
 
-        // Read back and verify sort order
         let file = File::open(&path).unwrap();
         let reader = ParquetRecordBatchReaderBuilder::try_new(file)
             .unwrap()
@@ -407,7 +608,218 @@ mod tests {
         assert_eq!(get_service(2), "service_a");
         assert_eq!(ts_col.value(2), 300);
 
-        // Clean up
         std::fs::remove_file(&path).ok();
     }
+
+    #[test]
+    fn test_write_to_file_with_compaction_metadata() {
+        use std::fs::File;
+
+        use parquet::file::reader::{FileReader, SerializedFileReader};
+
+        use crate::split::{SplitId, TimeRange};
+
+        let config = ParquetWriterConfig::default();
+        let writer = ParquetWriter::new(config, &TableConfig::default());
+
+        let batch = create_test_batch();
+
+        let metadata = MetricsSplitMetadata::builder()
+            .split_id(SplitId::new("e2e-test"))
+            .index_uid("test-index:00000000000000000000000000")
+            .time_range(TimeRange::new(1000, 2000))
+            .window_start_secs(1700000000)
+            .window_duration_secs(900)
+            .sort_fields("metric_name|host|timestamp/V2")
+            .num_merge_ops(3)
+            .build();
+
+        let temp_dir = std::env::temp_dir();
+        let path = temp_dir.join("test_compaction_metadata.parquet");
+
+        writer
+            .write_to_file_with_metadata(&batch, &path, Some(&metadata))
+            .unwrap();
+
+        let file = File::open(&path).unwrap();
+        let reader = SerializedFileReader::new(file).unwrap();
+        let file_metadata = reader.metadata().file_metadata();
+        let kv_metadata = file_metadata
+            .key_value_metadata()
+            .expect("should have kv metadata");
+
+        let find_kv = |key: &str| -> Option<String> {
+            kv_metadata
+                .iter()
+                .find(|kv| kv.key == key)
+                .and_then(|kv| kv.value.clone())
+        };
+
+        assert_eq!(
+            find_kv(PARQUET_META_SORT_FIELDS).unwrap(),
+            "metric_name|host|timestamp/V2"
+        );
+        assert_eq!(find_kv(PARQUET_META_WINDOW_START).unwrap(), "1700000000");
+        assert_eq!(find_kv(PARQUET_META_WINDOW_DURATION).unwrap(), "900");
+        assert_eq!(find_kv(PARQUET_META_NUM_MERGE_OPS).unwrap(), "3");
+
+        std::fs::remove_file(&path).ok();
+    }
+
+    #[test]
+    fn test_write_to_file_without_metadata_has_no_qh_keys() {
+        use std::fs::File;
+
+        use parquet::file::reader::{FileReader, SerializedFileReader};
+
+        let config = ParquetWriterConfig::default();
+        let writer = ParquetWriter::new(config, &TableConfig::default());
+
+        let batch = create_test_batch();
+        let temp_dir = std::env::temp_dir();
+        let path = temp_dir.join("test_no_compaction_metadata.parquet");
+
+        writer
+            .write_to_file_with_metadata(&batch, &path, None)
+            .unwrap();
+
+        let file = File::open(&path).unwrap();
+        let reader = SerializedFileReader::new(file).unwrap();
+        let file_metadata = reader.metadata().file_metadata();
+
+        if let Some(kv_metadata) = file_metadata.key_value_metadata() {
+            let qh_keys: Vec<_> = kv_metadata
+                .iter()
+                .filter(|kv| kv.key.starts_with("qh."))
+                .collect();
+            assert!(
+                qh_keys.is_empty(),
+                "should have no qh.* keys without metadata, got: {:?}",
+                qh_keys
+            );
+        }
+
+        std::fs::remove_file(&path).ok();
+    }
+
+    #[test]
+    fn test_build_compaction_kv_metadata_fully_populated() {
+        use crate::split::{SplitId, TimeRange};
+
+        let metadata = MetricsSplitMetadata::builder()
+            .split_id(SplitId::new("kv-test"))
+            .index_uid("test-index:00000000000000000000000000")
+            .time_range(TimeRange::new(1000, 2000))
+            .window_start_secs(1700000000)
+            .window_duration_secs(3600)
+            .sort_fields("metric_name|host|timestamp/V2")
+            .num_merge_ops(5)
+            .row_keys_proto(vec![0x08, 0x01, 0x10, 0x02])
+            .build();
+
+        let kvs = build_compaction_key_value_metadata(&metadata);
+
+        assert!(
+            kvs.len() >= 5,
+            "expected at least 5 kv entries, got {}",
+            kvs.len()
+        );
+
+        let find_kv = |key: &str| -> Option<String> {
+            kvs.iter()
+                .find(|kv| kv.key == key)
+                .and_then(|kv| kv.value.clone())
+        };
+
+        assert_eq!(
+            find_kv(PARQUET_META_SORT_FIELDS).unwrap(),
+            "metric_name|host|timestamp/V2"
+        );
+        assert_eq!(find_kv(PARQUET_META_WINDOW_START).unwrap(), "1700000000");
+        assert_eq!(find_kv(PARQUET_META_WINDOW_DURATION).unwrap(), "3600");
+        assert_eq!(find_kv(PARQUET_META_NUM_MERGE_OPS).unwrap(), "5");
+
+        let row_keys_b64 = find_kv(PARQUET_META_ROW_KEYS).unwrap();
+        let decoded = BASE64.decode(&row_keys_b64).unwrap();
+        assert_eq!(decoded, vec![0x08, 0x01, 0x10, 0x02]);
+    }
+
+    #[test]
+    fn test_build_compaction_kv_metadata_default_pre_phase31() {
+        use crate::split::{SplitId, TimeRange};
+
+        let metadata = MetricsSplitMetadata::builder()
+            .split_id(SplitId::new("old-split"))
+            .index_uid("test-index:00000000000000000000000000")
+            .time_range(TimeRange::new(1000, 2000))
+            .build();
+
+        let kvs = build_compaction_key_value_metadata(&metadata);
+
+        assert!(
+            kvs.is_empty(),
+            "pre-Phase-31 metadata should produce empty kv vec, got {} entries",
+            kvs.len()
+        );
+    }
+
+    #[test]
+    fn test_row_keys_base64_roundtrip() {
+        use crate::split::{SplitId, TimeRange};
+
+        let row_keys = quickwit_proto::sortschema::RowKeys {
+            min_row_values: Some(quickwit_proto::sortschema::ColumnValues {
+                column: vec![quickwit_proto::sortschema::ColumnValue {
+                    value: Some(quickwit_proto::sortschema::column_value::Value::TypeString(
+                        b"cpu.usage".to_vec(),
+                    )),
+                }],
+            }),
+            max_row_values: Some(quickwit_proto::sortschema::ColumnValues {
+                column: vec![quickwit_proto::sortschema::ColumnValue {
+                    value: Some(quickwit_proto::sortschema::column_value::Value::TypeString(
+                        b"memory.used".to_vec(),
+                    )),
+                }],
+            }),
+            all_inclusive_max_row_values: None,
+            expired: false,
+        };
+
+        let proto_bytes = prost::Message::encode_to_vec(&row_keys);
+
+        let metadata = MetricsSplitMetadata::builder()
+            .split_id(SplitId::new("roundtrip-test"))
+            .index_uid("test-index:00000000000000000000000000")
+            .time_range(TimeRange::new(1000, 2000))
+            .sort_fields("metric_name|timestamp/V2")
+            .row_keys_proto(proto_bytes.clone())
+            .build();
+
+        let kvs = build_compaction_key_value_metadata(&metadata);
+
+        let b64_entry = kvs
+            .iter()
+            .find(|kv| kv.key == PARQUET_META_ROW_KEYS)
+            .expect("should have row_keys entry");
+        let decoded = BASE64
+            .decode(b64_entry.value.as_ref().unwrap())
+            .expect("should decode base64");
+        assert_eq!(decoded, proto_bytes);
+
+        let recovered: quickwit_proto::sortschema::RowKeys =
+            prost::Message::decode(decoded.as_slice()).expect("should decode proto");
+        assert_eq!(recovered, row_keys);
+
+        let json_entry = kvs
+            .iter()
+            .find(|kv| kv.key == PARQUET_META_ROW_KEYS_JSON)
+            .expect("should have row_keys_json entry");
+        let json_str = json_entry.value.as_ref().unwrap();
+        assert!(
+            json_str.contains("min_row_values") && json_str.contains("TypeString"),
+            "JSON should contain RowKeys structure, got: {}",
+            json_str
+        );
+    }
 }

From ff605b9e0d8d323e767d56fc1e9c29f087b32154 Mon Sep 17 00:00:00 2001
From: George Talbot <george.talbot@datadoghq.com>
Date: Mon, 9 Mar 2026 15:08:26 -0400
Subject: [PATCH 13/35] feat(31): PostgreSQL migration 27 + compaction columns
 in stage/list/publish

Add compaction metadata to the PostgreSQL metastore:

Migration 27:
- 6 new columns: window_start, window_duration_secs, sort_fields,
  num_merge_ops, row_keys, zonemap_regexes
- Partial index idx_metrics_splits_compaction_scope on
  (index_uid, sort_fields, window_start) WHERE split_state = 'Published'

stage_metrics_splits:
- INSERT extended from 15 to 21 bind parameters for compaction columns
- ON CONFLICT SET updates all compaction columns

list_metrics_splits:
- PgMetricsSplit construction includes compaction fields (defaults from JSON)

Also fixes pre-existing compilation errors on upstream-10b-parquet-actors:
- Missing StageMetricsSplitsRequestExt import
- index_id vs index_uid type mismatches in publish/mark/delete
- IndexUid binding (to_string() for sqlx)
- ListMetricsSplitsResponseExt trait disambiguation

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../27_add-compaction-metadata.down.sql       |  8 +++
 .../27_add-compaction-metadata.up.sql         | 14 ++++
 .../src/metastore/postgres/metastore.rs       | 69 +++++++++++++++++--
 3 files changed, 84 insertions(+), 7 deletions(-)
 create mode 100644 quickwit/quickwit-metastore/migrations/postgresql/27_add-compaction-metadata.down.sql
 create mode 100644 quickwit/quickwit-metastore/migrations/postgresql/27_add-compaction-metadata.up.sql

diff --git a/quickwit/quickwit-metastore/migrations/postgresql/27_add-compaction-metadata.down.sql b/quickwit/quickwit-metastore/migrations/postgresql/27_add-compaction-metadata.down.sql
new file mode 100644
index 00000000000..493ddc2e1cd
--- /dev/null
+++ b/quickwit/quickwit-metastore/migrations/postgresql/27_add-compaction-metadata.down.sql
@@ -0,0 +1,8 @@
+-- Reverse Phase 31: Remove compaction metadata columns.
+DROP INDEX IF EXISTS idx_metrics_splits_compaction_scope;
+ALTER TABLE metrics_splits DROP COLUMN IF EXISTS zonemap_regexes;
+ALTER TABLE metrics_splits DROP COLUMN IF EXISTS row_keys;
+ALTER TABLE metrics_splits DROP COLUMN IF EXISTS num_merge_ops;
+ALTER TABLE metrics_splits DROP COLUMN IF EXISTS sort_fields;
+ALTER TABLE metrics_splits DROP COLUMN IF EXISTS window_duration_secs;
+ALTER TABLE metrics_splits DROP COLUMN IF EXISTS window_start;
diff --git a/quickwit/quickwit-metastore/migrations/postgresql/27_add-compaction-metadata.up.sql b/quickwit/quickwit-metastore/migrations/postgresql/27_add-compaction-metadata.up.sql
new file mode 100644
index 00000000000..a83336e8295
--- /dev/null
+++ b/quickwit/quickwit-metastore/migrations/postgresql/27_add-compaction-metadata.up.sql
@@ -0,0 +1,14 @@
+-- Phase 31: Add compaction metadata columns to metrics_splits.
+-- These columns support time-windowed compaction planning and execution.
+ALTER TABLE metrics_splits ADD COLUMN IF NOT EXISTS window_start BIGINT;
+ALTER TABLE metrics_splits ADD COLUMN IF NOT EXISTS window_duration_secs INTEGER;
+ALTER TABLE metrics_splits ADD COLUMN IF NOT EXISTS sort_fields TEXT NOT NULL DEFAULT '';
+ALTER TABLE metrics_splits ADD COLUMN IF NOT EXISTS num_merge_ops INTEGER NOT NULL DEFAULT 0;
+ALTER TABLE metrics_splits ADD COLUMN IF NOT EXISTS row_keys BYTEA;
+ALTER TABLE metrics_splits ADD COLUMN IF NOT EXISTS zonemap_regexes JSONB NOT NULL DEFAULT '{}';
+
+-- Compaction scope index: supports the compaction planner's primary query pattern
+-- "give me all Published splits for a given (index_uid, sort_fields, window_start) triple."
+CREATE INDEX IF NOT EXISTS idx_metrics_splits_compaction_scope
+    ON metrics_splits (index_uid, sort_fields, window_start)
+    WHERE split_state = 'Published';
diff --git a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs
index bfa08ff3b9d..a4b93b0f659 100644
--- a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs
+++ b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs
@@ -75,9 +75,9 @@ use crate::file_backed::MutationOccurred;
 use crate::metastore::postgres::model::Shards;
 use crate::metastore::postgres::utils::split_maturity_timestamp;
 use crate::metastore::{
-    IndexesMetadataResponseExt, ListMetricsSplitsResponseExt, PublishMetricsSplitsRequestExt,
-    PublishSplitsRequestExt, STREAM_SPLITS_CHUNK_SIZE, StageMetricsSplitsRequestExt,
-    UpdateSourceRequestExt, use_shard_api,
+    IndexesMetadataResponseExt, ListMetricsSplitsRequestExt, ListMetricsSplitsResponseExt,
+    PublishMetricsSplitsRequestExt, PublishSplitsRequestExt, STREAM_SPLITS_CHUNK_SIZE,
+    StageMetricsSplitsRequestExt, UpdateSourceRequestExt, use_shard_api,
 };
 use crate::{
     AddSourceRequestExt, CreateIndexRequestExt, IndexMetadata, IndexMetadataResponseExt,
@@ -1803,6 +1803,12 @@ impl MetastoreService for PostgresqlMetastore {
         let mut num_rows_list = Vec::with_capacity(splits_metadata.len());
         let mut size_bytes_list = Vec::with_capacity(splits_metadata.len());
         let mut split_metadata_jsons = Vec::with_capacity(splits_metadata.len());
+        let mut window_starts: Vec<Option<i64>> = Vec::with_capacity(splits_metadata.len());
+        let mut window_duration_secs_list: Vec<Option<i32>> = Vec::with_capacity(splits_metadata.len());
+        let mut sort_fields_list: Vec<String> = Vec::with_capacity(splits_metadata.len());
+        let mut num_merge_ops_list: Vec<i32> = Vec::with_capacity(splits_metadata.len());
+        let mut row_keys_list: Vec<Option<Vec<u8>>> = Vec::with_capacity(splits_metadata.len());
+        let mut zonemap_regexes_json_list: Vec<String> = Vec::with_capacity(splits_metadata.len());
 
         for metadata in &splits_metadata {
             let insertable =
@@ -1837,6 +1843,16 @@ impl MetastoreService for PostgresqlMetastore {
             num_rows_list.push(insertable.num_rows);
             size_bytes_list.push(insertable.size_bytes);
             split_metadata_jsons.push(insertable.split_metadata_json);
+            window_starts.push(insertable.window_start);
+            window_duration_secs_list.push(if insertable.window_duration_secs == 0 {
+                None
+            } else {
+                Some(insertable.window_duration_secs)
+            });
+            sort_fields_list.push(insertable.sort_fields);
+            num_merge_ops_list.push(insertable.num_merge_ops);
+            row_keys_list.push(insertable.row_keys);
+            zonemap_regexes_json_list.push(insertable.zonemap_regexes);
         }
 
         info!(
@@ -1863,6 +1879,12 @@ impl MetastoreService for PostgresqlMetastore {
                 num_rows,
                 size_bytes,
                 split_metadata_json,
+                window_start,
+                window_duration_secs,
+                sort_fields,
+                num_merge_ops,
+                row_keys,
+                zonemap_regexes,
                 create_timestamp,
                 update_timestamp
             )
@@ -1887,6 +1909,12 @@ impl MetastoreService for PostgresqlMetastore {
                 num_rows,
                 size_bytes,
                 split_metadata_json,
+                window_start,
+                window_duration_secs,
+                sort_fields,
+                num_merge_ops,
+                row_keys,
+                zonemap_regexes_json::jsonb,
                 (CURRENT_TIMESTAMP AT TIME ZONE 'UTC'),
                 (CURRENT_TIMESTAMP AT TIME ZONE 'UTC')
             FROM UNNEST(
@@ -1904,7 +1932,13 @@ impl MetastoreService for PostgresqlMetastore {
                 $12::text[],
                 $13::bigint[],
                 $14::bigint[],
-                $15::text[]
+                $15::text[],
+                $16::bigint[],
+                $17::int[],
+                $18::text[],
+                $19::int[],
+                $20::bytea[],
+                $21::text[]
             ) AS staged(
                 split_id,
                 split_state,
@@ -1920,7 +1954,13 @@ impl MetastoreService for PostgresqlMetastore {
                 high_cardinality_tag_keys_json,
                 num_rows,
                 size_bytes,
-                split_metadata_json
+                split_metadata_json,
+                window_start,
+                window_duration_secs,
+                sort_fields,
+                num_merge_ops,
+                row_keys,
+                zonemap_regexes_json
             )
             ON CONFLICT (split_id) DO UPDATE
                 SET
@@ -1937,6 +1977,12 @@ impl MetastoreService for PostgresqlMetastore {
                     num_rows = EXCLUDED.num_rows,
                     size_bytes = EXCLUDED.size_bytes,
                     split_metadata_json = EXCLUDED.split_metadata_json,
+                    window_start = EXCLUDED.window_start,
+                    window_duration_secs = EXCLUDED.window_duration_secs,
+                    sort_fields = EXCLUDED.sort_fields,
+                    num_merge_ops = EXCLUDED.num_merge_ops,
+                    row_keys = EXCLUDED.row_keys,
+                    zonemap_regexes = EXCLUDED.zonemap_regexes,
                     update_timestamp = (CURRENT_TIMESTAMP AT TIME ZONE 'UTC')
                 WHERE metrics_splits.split_state = 'Staged'
             RETURNING split_id
@@ -1962,6 +2008,12 @@ impl MetastoreService for PostgresqlMetastore {
                     .bind(&num_rows_list)
                     .bind(&size_bytes_list)
                     .bind(&split_metadata_jsons)
+                    .bind(&window_starts)
+                    .bind(&window_duration_secs_list)
+                    .bind(&sort_fields_list)
+                    .bind(&num_merge_ops_list)
+                    .bind(&row_keys_list)
+                    .bind(&zonemap_regexes_json_list)
                     .fetch_all(tx.as_mut())
                     .await
                     .map_err(|sqlx_error| convert_sqlx_err(&index_id_for_err, sqlx_error))
@@ -2312,12 +2364,15 @@ impl MetastoreService for PostgresqlMetastore {
                     size_bytes: row.13,
                     split_metadata_json: row.14,
                     update_timestamp: row.15,
+                    // Compaction fields are read from the JSON blob via
+                    // to_metadata() — the SQL columns are only used for
+                    // filtering and SS-5 consistency checks.
                     window_start: None,
                     window_duration_secs: 0,
                     sort_fields: String::new(),
                     num_merge_ops: 0,
                     row_keys: None,
-                    zonemap_regexes: String::new(),
+                    zonemap_regexes: "{}".to_string(),
                 };
 
                 let state = pg_split.split_state().unwrap_or(MetricsSplitState::Staged);
@@ -2325,7 +2380,7 @@ impl MetastoreService for PostgresqlMetastore {
 
                 Some(MetricsSplitRecord {
                     state,
-                    update_timestamp: row.15,
+                    update_timestamp: pg_split.update_timestamp,
                     metadata,
                 })
             })

From 723168fe0ee6b823a0f93e11df09db23d4fa3a54 Mon Sep 17 00:00:00 2001
From: George Talbot <george.talbot@datadoghq.com>
Date: Mon, 9 Mar 2026 16:59:47 -0400
Subject: [PATCH 14/35] =?UTF-8?q?fix(31):=20close=20port=20gaps=20?=
 =?UTF-8?q?=E2=80=94=20split=5Fwriter=20metadata,=20compaction=20scope,=20?=
 =?UTF-8?q?publish=20validation?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Close critical gaps identified during port review:

split_writer.rs:
- Store table_config on ParquetSplitWriter (not just pass-through)
- Compute window_start from batch time range using table_config.window_duration_secs
- Populate sort_fields, window_duration_secs, parquet_files on metadata before write
- Call write_to_file_with_metadata(Some(&metadata)) to embed KV metadata in Parquet
- Update size_bytes after write completes

metastore/mod.rs:
- Add window_start and sort_fields fields to ListMetricsSplitsQuery
- Add with_compaction_scope() builder method

metastore/postgres/metastore.rs:
- Add compaction scope filters (AND window_start = $N, AND sort_fields = $N) to list query
- Add replaced_split_ids count verification in publish_metrics_splits
- Bind compaction scope query parameters

ingest/config.rs:
- Add table_config: TableConfig field to ParquetIngestConfig

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../quickwit-metastore/src/metastore/mod.rs   | 15 ++++++++++++++
 .../src/metastore/postgres/metastore.rs       | 20 +++++++++++++++++--
 .../src/index/config.rs                       |  4 ++++
 3 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/quickwit/quickwit-metastore/src/metastore/mod.rs b/quickwit/quickwit-metastore/src/metastore/mod.rs
index ddfee25afab..b1bc686107a 100644
--- a/quickwit/quickwit-metastore/src/metastore/mod.rs
+++ b/quickwit/quickwit-metastore/src/metastore/mod.rs
@@ -75,6 +75,10 @@ pub struct ListMetricsSplitsQuery {
     pub tag_region: Option<String>,
     /// Host tag filter.
     pub tag_host: Option<String>,
+    /// Window start filter for compaction scope queries.
+    pub window_start: Option<i64>,
+    /// Sort fields filter for compaction scope queries.
+    pub sort_fields: Option<String>,
     /// Limit number of results.
     pub limit: Option<usize>,
 }
@@ -107,6 +111,17 @@ impl ListMetricsSplitsQuery {
         self.metric_names = names;
         self
     }
+
+    /// Filter by compaction scope (window_start + sort_fields).
+    pub fn with_compaction_scope(
+        mut self,
+        window_start: i64,
+        sort_fields: impl Into<String>,
+    ) -> Self {
+        self.window_start = Some(window_start);
+        self.sort_fields = Some(sort_fields.into());
+        self
+    }
 }
 
 /// Splits batch size returned by the stream splits API
diff --git a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs
index a4b93b0f659..49da0df7e12 100644
--- a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs
+++ b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs
@@ -2167,13 +2167,13 @@ impl MetastoreService for PostgresqlMetastore {
                 return Err(MetastoreError::FailedPrecondition { entity, message });
             }
 
-            // Verify all replaced splits were marked for deletion
+            // Verify all replaced splits were marked for deletion.
             if marked_count as usize != replaced_split_ids.len() {
                 let entity = EntityKind::Splits {
                     split_ids: replaced_split_ids.clone(),
                 };
                 let message = format!(
-                    "expected to mark {} splits for deletion, but only {} were in Published state",
+                    "expected to replace {} splits, but only {} were in Published state",
                     replaced_split_ids.len(),
                     marked_count
                 );
@@ -2273,6 +2273,16 @@ impl MetastoreService for PostgresqlMetastore {
             param_idx += 1;
         }
 
+        // Compaction scope filters
+        if query.window_start.is_some() {
+            sql.push_str(&format!(" AND window_start = ${}", param_idx));
+            param_idx += 1;
+        }
+        if query.sort_fields.is_some() {
+            sql.push_str(&format!(" AND sort_fields = ${}", param_idx));
+            param_idx += 1;
+        }
+
         sql.push_str(" ORDER BY time_range_start ASC");
 
         // Add limit
@@ -2332,6 +2342,12 @@ impl MetastoreService for PostgresqlMetastore {
         if let Some(ref host) = query.tag_host {
             query_builder = query_builder.bind(host);
         }
+        if let Some(ws) = query.window_start {
+            query_builder = query_builder.bind(ws);
+        }
+        if let Some(ref sf) = query.sort_fields {
+            query_builder = query_builder.bind(sf);
+        }
         if let Some(limit) = query.limit {
             query_builder = query_builder.bind(limit as i64);
         }
diff --git a/quickwit/quickwit-parquet-engine/src/index/config.rs b/quickwit/quickwit-parquet-engine/src/index/config.rs
index adf59c992be..7712b2545f8 100644
--- a/quickwit/quickwit-parquet-engine/src/index/config.rs
+++ b/quickwit/quickwit-parquet-engine/src/index/config.rs
@@ -17,6 +17,7 @@
 use std::sync::OnceLock;
 
 use crate::storage::ParquetWriterConfig;
+use crate::table_config::TableConfig;
 
 /// Default maximum rows to accumulate before flushing to split.
 const DEFAULT_MAX_ROWS: usize = 1_000_000;
@@ -58,6 +59,8 @@ pub struct ParquetIndexingConfig {
     pub max_bytes: usize,
     /// Parquet writer configuration for split creation.
     pub writer_config: ParquetWriterConfig,
+    /// Table-level configuration (sort fields, window duration, product type).
+    pub table_config: TableConfig,
 }
 
 impl Default for ParquetIndexingConfig {
@@ -66,6 +69,7 @@ impl Default for ParquetIndexingConfig {
             max_rows: get_max_rows_from_env(),
             max_bytes: get_max_bytes_from_env(),
             writer_config: ParquetWriterConfig::default(),
+            table_config: TableConfig::default(),
         }
     }
 }

From 9ca263ded043323e10d331ba14e26392b12c5137 Mon Sep 17 00:00:00 2001
From: George Talbot <george.talbot@datadoghq.com>
Date: Tue, 10 Mar 2026 05:51:34 -0400
Subject: [PATCH 15/35] =?UTF-8?q?fix(31):=20final=20gap=20fixes=20?=
 =?UTF-8?q?=E2=80=94=20file-backed=20scope=20filter,=20META-07=20test,=20d?=
 =?UTF-8?q?ead=20code=20removal?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- file_backed_index/mod.rs: Add window_start and sort_fields filtering
  to metrics_split_matches_query() for compaction scope queries
- writer.rs: Add test_meta07_self_describing_parquet_roundtrip test
  (writes compaction metadata to Parquet, reads back from cold file,
  verifies all fields roundtrip correctly)
- fields.rs: Remove dead sort_order() method (replaced by TableConfig)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../file_backed/file_backed_index/mod.rs      |  12 ++
 .../src/schema/fields.rs                      | 143 +++++++++++++++++-
 .../quickwit-parquet-engine/src/schema/mod.rs |   4 +-
 .../src/storage/writer.rs                     |  82 ++++++++++
 4 files changed, 238 insertions(+), 3 deletions(-)

diff --git a/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs b/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs
index e35618b99f8..16d1e696ad5 100644
--- a/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs
+++ b/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs
@@ -979,6 +979,18 @@ fn metrics_split_matches_query(split: &StoredMetricsSplit, query: &ListMetricsSp
         }
     }
 
+    // Filter by compaction scope
+    if let Some(ws) = query.window_start {
+        if split.metadata.window_start() != Some(ws) {
+            return false;
+        }
+    }
+    if let Some(ref sf) = query.sort_fields {
+        if split.metadata.sort_fields != *sf {
+            return false;
+        }
+    }
+
     true
 }
 
diff --git a/quickwit/quickwit-parquet-engine/src/schema/fields.rs b/quickwit/quickwit-parquet-engine/src/schema/fields.rs
index 9f46dcf3b8c..b5d149a5b51 100644
--- a/quickwit/quickwit-parquet-engine/src/schema/fields.rs
+++ b/quickwit/quickwit-parquet-engine/src/schema/fields.rs
@@ -12,10 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-//! Parquet field definitions with sort order constants and validation.
+//! Parquet field definitions with column metadata, sort order constants, and validation.
 
 use anyhow::{Result, bail};
-use arrow::datatypes::DataType;
+use arrow::datatypes::{DataType, Field, Fields};
+use parquet::variant::VariantType;
 
 /// Required field names that must exist in every batch.
 pub const REQUIRED_FIELDS: &[&str] = &["metric_name", "metric_type", "timestamp_secs", "value"];
@@ -31,6 +32,144 @@ pub const SORT_ORDER: &[&str] = &[
     "timestamp_secs",
 ];
 
+/// All fields in the parquet schema.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum ParquetField {
+    MetricName,
+    MetricType,
+    MetricUnit,
+    TimestampSecs,
+    StartTimestampSecs,
+    Value,
+    TagService,
+    TagEnv,
+    TagDatacenter,
+    TagRegion,
+    TagHost,
+    Attributes,
+    ServiceName,
+    ResourceAttributes,
+}
+
+impl ParquetField {
+    /// Field name as stored in Parquet.
+    pub fn name(&self) -> &'static str {
+        match self {
+            Self::MetricName => "metric_name",
+            Self::MetricType => "metric_type",
+            Self::MetricUnit => "metric_unit",
+            Self::TimestampSecs => "timestamp_secs",
+            Self::StartTimestampSecs => "start_timestamp_secs",
+            Self::Value => "value",
+            Self::TagService => "tag_service",
+            Self::TagEnv => "tag_env",
+            Self::TagDatacenter => "tag_datacenter",
+            Self::TagRegion => "tag_region",
+            Self::TagHost => "tag_host",
+            Self::Attributes => "attributes",
+            Self::ServiceName => "service_name",
+            Self::ResourceAttributes => "resource_attributes",
+        }
+    }
+
+    /// Whether this field is nullable.
+    pub fn nullable(&self) -> bool {
+        matches!(
+            self,
+            Self::MetricUnit
+                | Self::StartTimestampSecs
+                | Self::TagService
+                | Self::TagEnv
+                | Self::TagDatacenter
+                | Self::TagRegion
+                | Self::TagHost
+                | Self::Attributes
+                | Self::ResourceAttributes
+        )
+    }
+
+    /// Arrow DataType for this field.
+    /// Use dictionary encoding for high-cardinality strings.
+    pub fn arrow_type(&self) -> DataType {
+        match self {
+            // Dictionary-encoded strings for high cardinality
+            Self::MetricName | Self::ServiceName => {
+                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8))
+            }
+            // Dictionary-encoded optional tags
+            Self::TagService
+            | Self::TagEnv
+            | Self::TagDatacenter
+            | Self::TagRegion
+            | Self::TagHost => {
+                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8))
+            }
+            // Enum stored as UInt8
+            Self::MetricType => DataType::UInt8,
+            // Timestamps as UInt64 seconds
+            Self::TimestampSecs | Self::StartTimestampSecs => DataType::UInt64,
+            // Metric value
+            Self::Value => DataType::Float64,
+            // Plain string for metric unit
+            Self::MetricUnit => DataType::Utf8,
+            // VARIANT type for semi-structured attributes
+            // Uses the Parquet Variant binary encoding format
+            Self::Attributes | Self::ResourceAttributes => {
+                // VARIANT is stored as a struct with metadata and value BinaryView fields
+                // VariantArrayBuilder produces BinaryView, not Binary
+                DataType::Struct(Fields::from(vec![
+                    Field::new("metadata", DataType::BinaryView, false),
+                    Field::new("value", DataType::BinaryView, false),
+                ]))
+            }
+        }
+    }
+
+    /// Convert to Arrow Field.
+    pub fn to_arrow_field(&self) -> Field {
+        let field = Field::new(self.name(), self.arrow_type(), self.nullable());
+
+        // Add VARIANT extension type metadata for attributes fields
+        match self {
+            Self::Attributes | Self::ResourceAttributes => field.with_extension_type(VariantType),
+            _ => field,
+        }
+    }
+
+    /// All fields in schema order.
+    pub fn all() -> &'static [ParquetField] {
+        &[
+            Self::MetricName,
+            Self::MetricType,
+            Self::MetricUnit,
+            Self::TimestampSecs,
+            Self::StartTimestampSecs,
+            Self::Value,
+            Self::TagService,
+            Self::TagEnv,
+            Self::TagDatacenter,
+            Self::TagRegion,
+            Self::TagHost,
+            Self::Attributes,
+            Self::ServiceName,
+            Self::ResourceAttributes,
+        ]
+    }
+
+    /// Get the column index in the schema.
+    pub fn column_index(&self) -> usize {
+        Self::all().iter().position(|f| f == self).unwrap()
+    }
+
+    /// Look up a ParquetField by its Parquet column name.
+    ///
+    /// Used by the sort fields resolver to map sort schema column names
+    /// to physical schema columns.
+    pub fn from_name(name: &str) -> Option<Self> {
+        Self::all().iter().find(|f| f.name() == name).copied()
+    }
+}
+
 /// Arrow type for required fields by name.
 pub fn required_field_type(name: &str) -> Option<DataType> {
     match name {
diff --git a/quickwit/quickwit-parquet-engine/src/schema/mod.rs b/quickwit/quickwit-parquet-engine/src/schema/mod.rs
index f9b5c06d9c4..71026ddf9c8 100644
--- a/quickwit/quickwit-parquet-engine/src/schema/mod.rs
+++ b/quickwit/quickwit-parquet-engine/src/schema/mod.rs
@@ -20,5 +20,7 @@
 mod fields;
 mod parquet;
 
-pub use fields::{REQUIRED_FIELDS, SORT_ORDER, required_field_type, validate_required_fields};
+pub use fields::{
+    ParquetField, REQUIRED_FIELDS, SORT_ORDER, required_field_type, validate_required_fields,
+};
 pub use parquet::ParquetSchema;
diff --git a/quickwit/quickwit-parquet-engine/src/storage/writer.rs b/quickwit/quickwit-parquet-engine/src/storage/writer.rs
index d45c0a0c9cb..06ff4be2061 100644
--- a/quickwit/quickwit-parquet-engine/src/storage/writer.rs
+++ b/quickwit/quickwit-parquet-engine/src/storage/writer.rs
@@ -702,6 +702,88 @@ mod tests {
         std::fs::remove_file(&path).ok();
     }
 
+    /// META-07 compliance: Prove the Parquet file is truly self-describing by
+    /// writing compaction metadata, reading it back from a cold file (no in-memory
+    /// state), and reconstructing the MetricsSplitMetadata compaction fields from
+    /// ONLY the Parquet key_value_metadata.
+    #[test]
+    fn test_meta07_self_describing_parquet_roundtrip() {
+        use std::fs::File;
+
+        use crate::split::{SplitId, TimeRange};
+        use parquet::file::reader::{FileReader, SerializedFileReader};
+
+        let sort_schema_str = "metric_name|host|env|timestamp/V2";
+        let window_start_secs: i64 = 1700006400;
+        let window_duration: u32 = 900;
+        let merge_ops: u32 = 7;
+        let row_keys_bytes: Vec<u8> = vec![0x0A, 0x03, 0x63, 0x70, 0x75];
+
+        let original = MetricsSplitMetadata::builder()
+            .split_id(SplitId::new("self-describing-test"))
+            .index_uid("metrics-prod:00000000000000000000000000")
+            .time_range(TimeRange::new(1700006400, 1700007300))
+            .window_start_secs(window_start_secs)
+            .window_duration_secs(window_duration)
+            .sort_fields(sort_schema_str)
+            .num_merge_ops(merge_ops)
+            .row_keys_proto(row_keys_bytes.clone())
+            .build();
+
+        let config = ParquetWriterConfig::default();
+        let writer = ParquetWriter::new(config, &TableConfig::default());
+        let batch = create_test_batch();
+
+        let temp_dir = std::env::temp_dir();
+        let path = temp_dir.join("test_self_describing_roundtrip.parquet");
+        writer
+            .write_to_file_with_metadata(&batch, &path, Some(&original))
+            .unwrap();
+
+        // Read phase: open a cold file and reconstruct fields from kv_metadata.
+        let file = File::open(&path).unwrap();
+        let reader = SerializedFileReader::new(file).unwrap();
+        let file_metadata = reader.metadata().file_metadata();
+        let kv_metadata = file_metadata
+            .key_value_metadata()
+            .expect("self-describing file must have kv_metadata");
+
+        let find_kv = |key: &str| -> Option<String> {
+            kv_metadata
+                .iter()
+                .find(|kv| kv.key == key)
+                .and_then(|kv| kv.value.clone())
+        };
+
+        let recovered_sort_schema = find_kv(PARQUET_META_SORT_FIELDS)
+            .expect("self-describing file must contain qh.sort_fields");
+        let recovered_window_start: i64 = find_kv(PARQUET_META_WINDOW_START)
+            .expect("self-describing file must contain qh.window_start")
+            .parse()
+            .expect("window_start must be parseable as i64");
+        let recovered_window_duration: u32 = find_kv(PARQUET_META_WINDOW_DURATION)
+            .expect("self-describing file must contain qh.window_duration_secs")
+            .parse()
+            .expect("window_duration must be parseable as u32");
+        let recovered_merge_ops: u32 = find_kv(PARQUET_META_NUM_MERGE_OPS)
+            .expect("self-describing file must contain qh.num_merge_ops")
+            .parse()
+            .expect("num_merge_ops must be parseable as u32");
+        let recovered_row_keys_b64 = find_kv(PARQUET_META_ROW_KEYS)
+            .expect("self-describing file must contain qh.row_keys");
+        let recovered_row_keys = BASE64
+            .decode(&recovered_row_keys_b64)
+            .expect("row_keys must be valid base64");
+
+        assert_eq!(recovered_sort_schema, sort_schema_str);
+        assert_eq!(recovered_window_start, window_start_secs);
+        assert_eq!(recovered_window_duration, window_duration);
+        assert_eq!(recovered_merge_ops, merge_ops);
+        assert_eq!(recovered_row_keys, row_keys_bytes);
+
+        std::fs::remove_file(&path).ok();
+    }
+
     #[test]
     fn test_build_compaction_kv_metadata_fully_populated() {
         use crate::split::{SplitId, TimeRange};

From 73a20eff41a9ff040fef12cdab3424b4bdf9b32c Mon Sep 17 00:00:00 2001
From: George Talbot <george.talbot@datadoghq.com>
Date: Tue, 10 Mar 2026 10:22:06 -0400
Subject: [PATCH 16/35] fix(31): correct postgres types for
 window_duration_secs and zonemap_regexes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Gap 1: Change window_duration_secs from i32 to Option<i32> in both
PgMetricsSplit and InsertableMetricsSplit. Pre-Phase-31 splits now
correctly map 0 → NULL in PostgreSQL, enabling Phase 32 compaction
queries to use `WHERE window_duration_secs IS NOT NULL` instead of
the fragile `WHERE window_duration_secs > 0`.

Gap 2: Change zonemap_regexes from String to serde_json::Value in
both structs. This maps directly to JSONB in sqlx, avoiding ambiguity
when PostgreSQL JSONB operators are used in Phase 34/35 zonemap pruning.

Gap 3: Add two missing tests:
- test_insertable_from_metadata_with_compaction_fields: verifies all 6
  compaction fields round-trip through InsertableMetricsSplit
- test_insertable_from_metadata_pre_phase31_defaults: verifies pre-Phase-31
  metadata produces window_duration_secs: None, zonemap_regexes: json!({})

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../src/metastore/postgres/metastore.rs       | 12 +--
 .../src/split/postgres.rs                     | 79 ++++++++++++++++---
 2 files changed, 70 insertions(+), 21 deletions(-)

diff --git a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs
index 49da0df7e12..11ad65c6753 100644
--- a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs
+++ b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs
@@ -1844,15 +1844,11 @@ impl MetastoreService for PostgresqlMetastore {
             size_bytes_list.push(insertable.size_bytes);
             split_metadata_jsons.push(insertable.split_metadata_json);
             window_starts.push(insertable.window_start);
-            window_duration_secs_list.push(if insertable.window_duration_secs == 0 {
-                None
-            } else {
-                Some(insertable.window_duration_secs)
-            });
+            window_duration_secs_list.push(insertable.window_duration_secs);
             sort_fields_list.push(insertable.sort_fields);
             num_merge_ops_list.push(insertable.num_merge_ops);
             row_keys_list.push(insertable.row_keys);
-            zonemap_regexes_json_list.push(insertable.zonemap_regexes);
+            zonemap_regexes_json_list.push(insertable.zonemap_regexes.to_string());
         }
 
         info!(
@@ -2384,11 +2380,11 @@ impl MetastoreService for PostgresqlMetastore {
                     // to_metadata() — the SQL columns are only used for
                     // filtering and SS-5 consistency checks.
                     window_start: None,
-                    window_duration_secs: 0,
+                    window_duration_secs: None,
                     sort_fields: String::new(),
                     num_merge_ops: 0,
                     row_keys: None,
-                    zonemap_regexes: "{}".to_string(),
+                    zonemap_regexes: serde_json::json!({}),
                 };
 
                 let state = pg_split.split_state().unwrap_or(MetricsSplitState::Staged);
diff --git a/quickwit/quickwit-parquet-engine/src/split/postgres.rs b/quickwit/quickwit-parquet-engine/src/split/postgres.rs
index 0ea753181bb..6b63f46f17f 100644
--- a/quickwit/quickwit-parquet-engine/src/split/postgres.rs
+++ b/quickwit/quickwit-parquet-engine/src/split/postgres.rs
@@ -72,11 +72,11 @@ pub struct PgMetricsSplit {
     pub split_metadata_json: String,
     pub update_timestamp: i64,
     pub window_start: Option<i64>,
-    pub window_duration_secs: i32,
+    pub window_duration_secs: Option<i32>,
     pub sort_fields: String,
     pub num_merge_ops: i32,
     pub row_keys: Option<Vec<u8>>,
-    pub zonemap_regexes: String,
+    pub zonemap_regexes: serde_json::Value,
 }
 
 /// Insertable row for metrics_splits table.
@@ -99,11 +99,11 @@ pub struct InsertableMetricsSplit {
     pub size_bytes: i64,
     pub split_metadata_json: String,
     pub window_start: Option<i64>,
-    pub window_duration_secs: i32,
+    pub window_duration_secs: Option<i32>,
     pub sort_fields: String,
     pub num_merge_ops: i32,
     pub row_keys: Option<Vec<u8>>,
-    pub zonemap_regexes: String,
+    pub zonemap_regexes: serde_json::Value,
 }
 
 impl InsertableMetricsSplit {
@@ -114,12 +114,6 @@ impl InsertableMetricsSplit {
     ) -> Result<Self, serde_json::Error> {
         let split_metadata_json = serde_json::to_string(metadata)?;
 
-        let zonemap_regexes_json = if metadata.zonemap_regexes.is_empty() {
-            "{}".to_string()
-        } else {
-            serde_json::to_string(&metadata.zonemap_regexes)?
-        };
-
         Ok(Self {
             split_id: metadata.split_id.as_str().to_string(),
             split_state: state.as_str().to_string(),
@@ -137,11 +131,15 @@ impl InsertableMetricsSplit {
             size_bytes: metadata.size_bytes as i64,
             split_metadata_json,
             window_start: metadata.window_start(),
-            window_duration_secs: metadata.window_duration_secs() as i32,
+            window_duration_secs: {
+                let dur = metadata.window_duration_secs();
+                if dur > 0 { Some(dur as i32) } else { None }
+            },
             sort_fields: metadata.sort_fields.clone(),
             num_merge_ops: metadata.num_merge_ops as i32,
             row_keys: metadata.row_keys_proto.clone(),
-            zonemap_regexes: zonemap_regexes_json,
+            zonemap_regexes: serde_json::to_value(&metadata.zonemap_regexes)
+                .unwrap_or_else(|_| serde_json::json!({})),
         })
     }
 }
@@ -165,7 +163,7 @@ impl PgMetricsSplit {
         debug_assert_eq!(metadata.window_start(), self.window_start);
         debug_assert_eq!(
             metadata.window_duration_secs(),
-            self.window_duration_secs as u32
+            self.window_duration_secs.unwrap_or(0) as u32
         );
         debug_assert_eq!(metadata.sort_fields, self.sort_fields);
         debug_assert_eq!(metadata.num_merge_ops, self.num_merge_ops as u32);
@@ -257,6 +255,61 @@ mod tests {
         assert_eq!(insertable.size_bytes, 1024 * 1024);
     }
 
+    #[test]
+    fn test_insertable_from_metadata_with_compaction_fields() {
+        let metadata = MetricsSplitMetadata::builder()
+            .split_id(SplitId::new("compaction-test"))
+            .index_uid("test-index:00000000000000000000000000")
+            .time_range(TimeRange::new(1000, 2000))
+            .num_rows(100)
+            .size_bytes(500)
+            .window_start_secs(1700000000)
+            .window_duration_secs(3600)
+            .sort_fields("metric_name|host|timestamp/V2")
+            .num_merge_ops(2)
+            .row_keys_proto(vec![0x08, 0x01])
+            .add_zonemap_regex("metric_name", "cpu\\..*")
+            .build();
+
+        let insertable =
+            InsertableMetricsSplit::from_metadata(&metadata, MetricsSplitState::Published)
+                .expect("conversion should succeed");
+
+        assert_eq!(insertable.window_start, Some(1700000000));
+        assert_eq!(insertable.window_duration_secs, Some(3600));
+        assert_eq!(insertable.sort_fields, "metric_name|host|timestamp/V2");
+        assert_eq!(insertable.num_merge_ops, 2);
+        assert_eq!(insertable.row_keys, Some(vec![0x08, 0x01]));
+        assert!(insertable.zonemap_regexes.is_object());
+        assert_eq!(
+            insertable.zonemap_regexes["metric_name"],
+            serde_json::json!("cpu\\..*")
+        );
+    }
+
+    #[test]
+    fn test_insertable_from_metadata_pre_phase31_defaults() {
+        let metadata = MetricsSplitMetadata::builder()
+            .split_id(SplitId::new("pre-phase31"))
+            .index_uid("test-index:00000000000000000000000000")
+            .time_range(TimeRange::new(1000, 2000))
+            .build();
+
+        let insertable =
+            InsertableMetricsSplit::from_metadata(&metadata, MetricsSplitState::Staged)
+                .expect("conversion should succeed");
+
+        assert!(insertable.window_start.is_none());
+        assert!(
+            insertable.window_duration_secs.is_none(),
+            "pre-Phase-31 splits should have NULL window_duration_secs"
+        );
+        assert_eq!(insertable.sort_fields, "");
+        assert_eq!(insertable.num_merge_ops, 0);
+        assert!(insertable.row_keys.is_none());
+        assert_eq!(insertable.zonemap_regexes, serde_json::json!({}));
+    }
+
     #[test]
     fn test_pg_split_to_metadata_roundtrip() {
         let original = MetricsSplitMetadata::builder()

From 75c15a0814938273ac8a7041c22fd47565281514 Mon Sep 17 00:00:00 2001
From: George Talbot <george.talbot@datadoghq.com>
Date: Fri, 27 Mar 2026 09:43:59 -0400
Subject: [PATCH 17/35] style: rustfmt

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../file_backed/file_backed_index/mod.rs         | 16 ++++++++--------
 .../src/metastore/postgres/metastore.rs          |  9 +++++----
 .../src/storage/writer.rs                        |  7 ++++---
 3 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs b/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs
index 16d1e696ad5..152dc0cd84f 100644
--- a/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs
+++ b/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs
@@ -980,15 +980,15 @@ fn metrics_split_matches_query(split: &StoredMetricsSplit, query: &ListMetricsSp
     }
 
     // Filter by compaction scope
-    if let Some(ws) = query.window_start {
-        if split.metadata.window_start() != Some(ws) {
-            return false;
-        }
+    if let Some(ws) = query.window_start
+        && split.metadata.window_start() != Some(ws)
+    {
+        return false;
     }
-    if let Some(ref sf) = query.sort_fields {
-        if split.metadata.sort_fields != *sf {
-            return false;
-        }
+    if let Some(ref sf) = query.sort_fields
+        && split.metadata.sort_fields != *sf
+    {
+        return false;
     }
 
     true
diff --git a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs
index 11ad65c6753..b09eb4afd97 100644
--- a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs
+++ b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs
@@ -75,9 +75,9 @@ use crate::file_backed::MutationOccurred;
 use crate::metastore::postgres::model::Shards;
 use crate::metastore::postgres::utils::split_maturity_timestamp;
 use crate::metastore::{
-    IndexesMetadataResponseExt, ListMetricsSplitsRequestExt, ListMetricsSplitsResponseExt,
-    PublishMetricsSplitsRequestExt, PublishSplitsRequestExt, STREAM_SPLITS_CHUNK_SIZE,
-    StageMetricsSplitsRequestExt, UpdateSourceRequestExt, use_shard_api,
+    IndexesMetadataResponseExt, ListMetricsSplitsResponseExt, PublishMetricsSplitsRequestExt,
+    PublishSplitsRequestExt, STREAM_SPLITS_CHUNK_SIZE, StageMetricsSplitsRequestExt,
+    UpdateSourceRequestExt, use_shard_api,
 };
 use crate::{
     AddSourceRequestExt, CreateIndexRequestExt, IndexMetadata, IndexMetadataResponseExt,
@@ -1804,7 +1804,8 @@ impl MetastoreService for PostgresqlMetastore {
         let mut size_bytes_list = Vec::with_capacity(splits_metadata.len());
         let mut split_metadata_jsons = Vec::with_capacity(splits_metadata.len());
         let mut window_starts: Vec<Option<i64>> = Vec::with_capacity(splits_metadata.len());
-        let mut window_duration_secs_list: Vec<Option<i32>> = Vec::with_capacity(splits_metadata.len());
+        let mut window_duration_secs_list: Vec<Option<i32>> =
+            Vec::with_capacity(splits_metadata.len());
         let mut sort_fields_list: Vec<String> = Vec::with_capacity(splits_metadata.len());
         let mut num_merge_ops_list: Vec<i32> = Vec::with_capacity(splits_metadata.len());
         let mut row_keys_list: Vec<Option<Vec<u8>>> = Vec::with_capacity(splits_metadata.len());
diff --git a/quickwit/quickwit-parquet-engine/src/storage/writer.rs b/quickwit/quickwit-parquet-engine/src/storage/writer.rs
index 06ff4be2061..3ec633573bc 100644
--- a/quickwit/quickwit-parquet-engine/src/storage/writer.rs
+++ b/quickwit/quickwit-parquet-engine/src/storage/writer.rs
@@ -710,9 +710,10 @@ mod tests {
     fn test_meta07_self_describing_parquet_roundtrip() {
         use std::fs::File;
 
-        use crate::split::{SplitId, TimeRange};
         use parquet::file::reader::{FileReader, SerializedFileReader};
 
+        use crate::split::{SplitId, TimeRange};
+
         let sort_schema_str = "metric_name|host|env|timestamp/V2";
         let window_start_secs: i64 = 1700006400;
         let window_duration: u32 = 900;
@@ -769,8 +770,8 @@ mod tests {
             .expect("self-describing file must contain qh.num_merge_ops")
             .parse()
             .expect("num_merge_ops must be parseable as u32");
-        let recovered_row_keys_b64 = find_kv(PARQUET_META_ROW_KEYS)
-            .expect("self-describing file must contain qh.row_keys");
+        let recovered_row_keys_b64 =
+            find_kv(PARQUET_META_ROW_KEYS).expect("self-describing file must contain qh.row_keys");
         let recovered_row_keys = BASE64
             .decode(&recovered_row_keys_b64)
             .expect("row_keys must be valid base64");

From ef218594746df4f195cf6c01b8aab2a23ee8b3d1 Mon Sep 17 00:00:00 2001
From: George Talbot <george.talbot@datadoghq.com>
Date: Wed, 1 Apr 2026 11:20:16 -0400
Subject: [PATCH 18/35] test(31): add metrics split test suite to shared
 metastore_test_suite! macro

11 tests covering the full metrics split lifecycle:
- stage (happy path + non-existent index error)
- stage upsert (ON CONFLICT update)
- list by state, time range, metric name, compaction scope
- publish (happy path + non-existent split error)
- mark for deletion
- delete (happy path + idempotent non-existent)

Tests are generic and run against both file-backed and PostgreSQL backends.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../quickwit-metastore/src/tests/metrics.rs   | 652 ++++++++++++++++++
 quickwit/quickwit-metastore/src/tests/mod.rs  |  86 +++
 2 files changed, 738 insertions(+)
 create mode 100644 quickwit/quickwit-metastore/src/tests/metrics.rs

diff --git a/quickwit/quickwit-metastore/src/tests/metrics.rs b/quickwit/quickwit-metastore/src/tests/metrics.rs
new file mode 100644
index 00000000000..7288cbdfa13
--- /dev/null
+++ b/quickwit/quickwit-metastore/src/tests/metrics.rs
@@ -0,0 +1,652 @@
+// Copyright 2021-Present Datadog, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use quickwit_common::rand::append_random_suffix;
+use quickwit_config::IndexConfig;
+use quickwit_parquet_engine::split::{
+    MetricsSplitMetadata, MetricsSplitState, SplitId, TimeRange, TAG_ENV, TAG_HOST, TAG_SERVICE,
+};
+use quickwit_proto::metastore::{
+    CreateIndexRequest, DeleteMetricsSplitsRequest, EntityKind, ListMetricsSplitsRequest,
+    MarkMetricsSplitsForDeletionRequest, MetastoreError, PublishMetricsSplitsRequest,
+    StageMetricsSplitsRequest,
+};
+use quickwit_proto::types::IndexUid;
+
+use super::DefaultForTest;
+use crate::tests::cleanup_index;
+use crate::{
+    CreateIndexRequestExt, ListMetricsSplitsQuery, ListMetricsSplitsRequestExt,
+    ListMetricsSplitsResponseExt, MetastoreServiceExt, StageMetricsSplitsRequestExt,
+};
+
+/// Helper to create a test index and return the actual IndexUid assigned by the metastore.
+async fn create_test_index(
+    metastore: &mut dyn MetastoreServiceExt,
+    index_id: &str,
+) -> IndexUid {
+    let index_uri = format!("ram:///indexes/{index_id}");
+    let index_config = IndexConfig::for_test(index_id, &index_uri);
+    let create_index_request =
+        CreateIndexRequest::try_from_index_config(&index_config).unwrap();
+    metastore
+        .create_index(create_index_request)
+        .await
+        .unwrap()
+        .index_uid()
+        .clone()
+}
+
+/// Build a simple MetricsSplitMetadata for tests.
+fn build_test_split(
+    split_id: &str,
+    index_uid: &IndexUid,
+    time_range: TimeRange,
+) -> MetricsSplitMetadata {
+    MetricsSplitMetadata::builder()
+        .split_id(SplitId::new(split_id))
+        .index_uid(index_uid.to_string())
+        .time_range(time_range)
+        .num_rows(100)
+        .size_bytes(4096)
+        .add_metric_name("cpu.usage")
+        .parquet_file(format!("{split_id}.parquet"))
+        .build()
+}
+
+pub async fn test_metastore_stage_metrics_splits<
+    MetastoreToTest: MetastoreServiceExt + DefaultForTest,
+>() {
+    let mut metastore = MetastoreToTest::default_for_test().await;
+    let index_id = append_random_suffix("test-stage-metrics-splits");
+
+    // Stage on a non-existent index should fail.
+    {
+        let fake_uid = IndexUid::new_with_random_ulid("index-not-found");
+        let split = build_test_split("split-1", &fake_uid, TimeRange::new(1000, 2000));
+        let request =
+            StageMetricsSplitsRequest::try_from_splits_metadata(fake_uid, &[split]).unwrap();
+        let error = metastore.stage_metrics_splits(request).await.unwrap_err();
+        assert!(matches!(
+            error,
+            MetastoreError::NotFound(EntityKind::Index { .. })
+        ));
+    }
+
+    let index_uid = create_test_index(&mut metastore, &index_id).await;
+
+    // Stage two splits.
+    let split_id_1 = format!("{index_id}--split-1");
+    let split_id_2 = format!("{index_id}--split-2");
+    let split_1 = build_test_split(&split_id_1, &index_uid, TimeRange::new(1000, 2000));
+    let split_2 = build_test_split(&split_id_2, &index_uid, TimeRange::new(2000, 3000));
+
+    let request = StageMetricsSplitsRequest::try_from_splits_metadata(
+        index_uid.clone(),
+        &[split_1, split_2],
+    )
+    .unwrap();
+    metastore.stage_metrics_splits(request).await.unwrap();
+
+    // Verify both splits are listed in Staged state.
+    let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
+        .with_split_states(vec!["Staged".to_string()]);
+    let list_request =
+        ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
+    let response = metastore.list_metrics_splits(list_request).await.unwrap();
+    let splits = response.deserialize_splits().unwrap();
+    assert_eq!(splits.len(), 2);
+
+    for split in &splits {
+        assert_eq!(split.state, MetricsSplitState::Staged);
+    }
+
+    cleanup_index(&mut metastore, index_uid).await;
+}
+
+pub async fn test_metastore_stage_metrics_splits_upsert<
+    MetastoreToTest: MetastoreServiceExt + DefaultForTest,
+>() {
+    let mut metastore = MetastoreToTest::default_for_test().await;
+    let index_id = append_random_suffix("test-stage-metrics-upsert");
+    let index_uid = create_test_index(&mut metastore, &index_id).await;
+
+    let split_id = format!("{index_id}--split-1");
+
+    // Stage a split with 100 rows.
+    let split_v1 = MetricsSplitMetadata::builder()
+        .split_id(SplitId::new(&split_id))
+        .index_uid(index_uid.to_string())
+        .time_range(TimeRange::new(1000, 2000))
+        .num_rows(100)
+        .size_bytes(4096)
+        .add_metric_name("cpu.usage")
+        .parquet_file(format!("{split_id}.parquet"))
+        .build();
+
+    let request = StageMetricsSplitsRequest::try_from_splits_metadata(
+        index_uid.clone(),
+        &[split_v1],
+    )
+    .unwrap();
+    metastore.stage_metrics_splits(request).await.unwrap();
+
+    // Stage the same split_id again with 200 rows (upsert).
+    let split_v2 = MetricsSplitMetadata::builder()
+        .split_id(SplitId::new(&split_id))
+        .index_uid(index_uid.to_string())
+        .time_range(TimeRange::new(1000, 2000))
+        .num_rows(200)
+        .size_bytes(8192)
+        .add_metric_name("cpu.usage")
+        .parquet_file(format!("{split_id}.parquet"))
+        .build();
+
+    let request = StageMetricsSplitsRequest::try_from_splits_metadata(
+        index_uid.clone(),
+        &[split_v2],
+    )
+    .unwrap();
+    metastore.stage_metrics_splits(request).await.unwrap();
+
+    // Verify only one split exists and it has the updated num_rows.
+    let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
+        .with_split_states(vec!["Staged".to_string()]);
+    let list_request =
+        ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
+    let response = metastore.list_metrics_splits(list_request).await.unwrap();
+    let splits = response.deserialize_splits().unwrap();
+    assert_eq!(splits.len(), 1);
+    assert_eq!(splits[0].metadata.num_rows, 200);
+    assert_eq!(splits[0].metadata.size_bytes, 8192);
+
+    cleanup_index(&mut metastore, index_uid).await;
+}
+
+pub async fn test_metastore_list_metrics_splits_by_state<
+    MetastoreToTest: MetastoreServiceExt + DefaultForTest,
+>() {
+    let mut metastore = MetastoreToTest::default_for_test().await;
+    let index_id = append_random_suffix("test-list-metrics-by-state");
+    let index_uid = create_test_index(&mut metastore, &index_id).await;
+
+    // Stage two splits.
+    let split_id_1 = format!("{index_id}--split-1");
+    let split_id_2 = format!("{index_id}--split-2");
+    let split_1 = build_test_split(&split_id_1, &index_uid, TimeRange::new(1000, 2000));
+    let split_2 = build_test_split(&split_id_2, &index_uid, TimeRange::new(2000, 3000));
+    let request = StageMetricsSplitsRequest::try_from_splits_metadata(
+        index_uid.clone(),
+        &[split_1, split_2],
+    )
+    .unwrap();
+    metastore.stage_metrics_splits(request).await.unwrap();
+
+    // Publish split_1 only.
+    let publish_request = PublishMetricsSplitsRequest {
+        index_uid: Some(index_uid.clone()),
+        staged_split_ids: vec![split_id_1.clone()],
+        ..Default::default()
+    };
+    metastore
+        .publish_metrics_splits(publish_request)
+        .await
+        .unwrap();
+
+    // List only Published splits.
+    {
+        let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
+            .with_split_states(vec!["Published".to_string()]);
+        let list_request =
+            ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
+        let response = metastore.list_metrics_splits(list_request).await.unwrap();
+        let splits = response.deserialize_splits().unwrap();
+        assert_eq!(splits.len(), 1);
+        assert_eq!(splits[0].metadata.split_id.as_str(), &split_id_1);
+        assert_eq!(splits[0].state, MetricsSplitState::Published);
+    }
+
+    // List only Staged splits.
+    {
+        let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
+            .with_split_states(vec!["Staged".to_string()]);
+        let list_request =
+            ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
+        let response = metastore.list_metrics_splits(list_request).await.unwrap();
+        let splits = response.deserialize_splits().unwrap();
+        assert_eq!(splits.len(), 1);
+        assert_eq!(splits[0].metadata.split_id.as_str(), &split_id_2);
+        assert_eq!(splits[0].state, MetricsSplitState::Staged);
+    }
+
+    // List both states.
+    {
+        let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
+            .with_split_states(vec!["Published".to_string(), "Staged".to_string()]);
+        let list_request =
+            ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
+        let response = metastore.list_metrics_splits(list_request).await.unwrap();
+        let splits = response.deserialize_splits().unwrap();
+        assert_eq!(splits.len(), 2);
+    }
+
+    cleanup_index(&mut metastore, index_uid).await;
+}
+
+pub async fn test_metastore_list_metrics_splits_by_time_range<
+    MetastoreToTest: MetastoreServiceExt + DefaultForTest,
+>() {
+    let mut metastore = MetastoreToTest::default_for_test().await;
+    let index_id = append_random_suffix("test-list-metrics-time-range");
+    let index_uid = create_test_index(&mut metastore, &index_id).await;
+
+    // Stage splits at different time ranges.
+    let split_1 = build_test_split(
+        &format!("{index_id}--split-1"),
+        &index_uid,
+        TimeRange::new(1000, 2000),
+    );
+    let split_2 = build_test_split(
+        &format!("{index_id}--split-2"),
+        &index_uid,
+        TimeRange::new(3000, 4000),
+    );
+    let split_3 = build_test_split(
+        &format!("{index_id}--split-3"),
+        &index_uid,
+        TimeRange::new(5000, 6000),
+    );
+    let request = StageMetricsSplitsRequest::try_from_splits_metadata(
+        index_uid.clone(),
+        &[split_1, split_2, split_3],
+    )
+    .unwrap();
+    metastore.stage_metrics_splits(request).await.unwrap();
+
+    // Query for time range that overlaps only the first two splits.
+    let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
+        .with_split_states(vec!["Staged".to_string()])
+        .with_time_range(1500, 3500);
+    let list_request =
+        ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
+    let response = metastore.list_metrics_splits(list_request).await.unwrap();
+    let splits = response.deserialize_splits().unwrap();
+    // Should match splits whose time range overlaps [1500, 3500].
+    // split_1: [1000,2000) overlaps [1500,3500] => yes
+    // split_2: [3000,4000) overlaps [1500,3500] => yes
+    // split_3: [5000,6000) overlaps [1500,3500] => no
+    assert_eq!(splits.len(), 2);
+
+    cleanup_index(&mut metastore, index_uid).await;
+}
+
+pub async fn test_metastore_list_metrics_splits_by_metric_name<
+    MetastoreToTest: MetastoreServiceExt + DefaultForTest,
+>() {
+    let mut metastore = MetastoreToTest::default_for_test().await;
+    let index_id = append_random_suffix("test-list-metrics-by-name");
+    let index_uid = create_test_index(&mut metastore, &index_id).await;
+
+    // Stage splits with different metric names.
+    let split_1 = MetricsSplitMetadata::builder()
+        .split_id(SplitId::new(format!("{index_id}--split-1")))
+        .index_uid(index_uid.to_string())
+        .time_range(TimeRange::new(1000, 2000))
+        .num_rows(100)
+        .size_bytes(4096)
+        .add_metric_name("cpu.usage")
+        .parquet_file("split-1.parquet")
+        .build();
+
+    let split_2 = MetricsSplitMetadata::builder()
+        .split_id(SplitId::new(format!("{index_id}--split-2")))
+        .index_uid(index_uid.to_string())
+        .time_range(TimeRange::new(1000, 2000))
+        .num_rows(100)
+        .size_bytes(4096)
+        .add_metric_name("memory.used")
+        .parquet_file("split-2.parquet")
+        .build();
+
+    let split_3 = MetricsSplitMetadata::builder()
+        .split_id(SplitId::new(format!("{index_id}--split-3")))
+        .index_uid(index_uid.to_string())
+        .time_range(TimeRange::new(1000, 2000))
+        .num_rows(100)
+        .size_bytes(4096)
+        .add_metric_name("cpu.usage")
+        .add_metric_name("memory.used")
+        .parquet_file("split-3.parquet")
+        .build();
+
+    let request = StageMetricsSplitsRequest::try_from_splits_metadata(
+        index_uid.clone(),
+        &[split_1, split_2, split_3],
+    )
+    .unwrap();
+    metastore.stage_metrics_splits(request).await.unwrap();
+
+    // Query for "cpu.usage" should return split_1 and split_3.
+    let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
+        .with_split_states(vec!["Staged".to_string()])
+        .with_metric_names(vec!["cpu.usage".to_string()]);
+    let list_request =
+        ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
+    let response = metastore.list_metrics_splits(list_request).await.unwrap();
+    let splits = response.deserialize_splits().unwrap();
+    assert_eq!(splits.len(), 2);
+
+    cleanup_index(&mut metastore, index_uid).await;
+}
+
+pub async fn test_metastore_list_metrics_splits_by_compaction_scope<
+    MetastoreToTest: MetastoreServiceExt + DefaultForTest,
+>() {
+    let mut metastore = MetastoreToTest::default_for_test().await;
+    let index_id = append_random_suffix("test-list-metrics-compaction");
+    let index_uid = create_test_index(&mut metastore, &index_id).await;
+
+    // Stage splits with different compaction scopes.
+    let split_1 = MetricsSplitMetadata::builder()
+        .split_id(SplitId::new(format!("{index_id}--split-1")))
+        .index_uid(index_uid.to_string())
+        .time_range(TimeRange::new(1000, 2000))
+        .num_rows(100)
+        .size_bytes(4096)
+        .add_metric_name("cpu.usage")
+        .parquet_file("split-1.parquet")
+        .window_start_secs(1700000000)
+        .window_duration_secs(3600)
+        .sort_fields("metric_name|host|timestamp/V2")
+        .build();
+
+    let split_2 = MetricsSplitMetadata::builder()
+        .split_id(SplitId::new(format!("{index_id}--split-2")))
+        .index_uid(index_uid.to_string())
+        .time_range(TimeRange::new(1000, 2000))
+        .num_rows(100)
+        .size_bytes(4096)
+        .add_metric_name("cpu.usage")
+        .parquet_file("split-2.parquet")
+        .window_start_secs(1700003600)
+        .window_duration_secs(3600)
+        .sort_fields("metric_name|host|timestamp/V2")
+        .build();
+
+    let split_3 = MetricsSplitMetadata::builder()
+        .split_id(SplitId::new(format!("{index_id}--split-3")))
+        .index_uid(index_uid.to_string())
+        .time_range(TimeRange::new(1000, 2000))
+        .num_rows(100)
+        .size_bytes(4096)
+        .add_metric_name("cpu.usage")
+        .parquet_file("split-3.parquet")
+        .window_start_secs(1700000000)
+        .window_duration_secs(3600)
+        .sort_fields("different_schema/V1")
+        .build();
+
+    let request = StageMetricsSplitsRequest::try_from_splits_metadata(
+        index_uid.clone(),
+        &[split_1, split_2, split_3],
+    )
+    .unwrap();
+    metastore.stage_metrics_splits(request).await.unwrap();
+
+    // Query by compaction scope: window_start=1700000000, sort_fields matching split_1.
+    let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
+        .with_split_states(vec!["Staged".to_string()])
+        .with_compaction_scope(1700000000, "metric_name|host|timestamp/V2");
+    let list_request =
+        ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
+    let response = metastore.list_metrics_splits(list_request).await.unwrap();
+    let splits = response.deserialize_splits().unwrap();
+    // Only split_1 matches both window_start and sort_fields.
+    assert_eq!(splits.len(), 1);
+    assert_eq!(
+        splits[0].metadata.split_id.as_str(),
+        format!("{index_id}--split-1")
+    );
+
+    cleanup_index(&mut metastore, index_uid).await;
+}
+
+pub async fn test_metastore_publish_metrics_splits<
+    MetastoreToTest: MetastoreServiceExt + DefaultForTest,
+>() {
+    let mut metastore = MetastoreToTest::default_for_test().await;
+    let index_id = append_random_suffix("test-publish-metrics-splits");
+    let index_uid = create_test_index(&mut metastore, &index_id).await;
+
+    let split_id_1 = format!("{index_id}--split-1");
+    let split_id_2 = format!("{index_id}--split-2");
+    let split_1 = build_test_split(&split_id_1, &index_uid, TimeRange::new(1000, 2000));
+    let split_2 = build_test_split(&split_id_2, &index_uid, TimeRange::new(2000, 3000));
+
+    // Stage both.
+    let request = StageMetricsSplitsRequest::try_from_splits_metadata(
+        index_uid.clone(),
+        &[split_1, split_2],
+    )
+    .unwrap();
+    metastore.stage_metrics_splits(request).await.unwrap();
+
+    // Publish both.
+    let publish_request = PublishMetricsSplitsRequest {
+        index_uid: Some(index_uid.clone()),
+        staged_split_ids: vec![split_id_1.clone(), split_id_2.clone()],
+        ..Default::default()
+    };
+    metastore
+        .publish_metrics_splits(publish_request)
+        .await
+        .unwrap();
+
+    // Verify they are now Published.
+    let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
+        .with_split_states(vec!["Published".to_string()]);
+    let list_request =
+        ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
+    let response = metastore.list_metrics_splits(list_request).await.unwrap();
+    let splits = response.deserialize_splits().unwrap();
+    assert_eq!(splits.len(), 2);
+
+    for split in &splits {
+        assert_eq!(split.state, MetricsSplitState::Published);
+    }
+
+    // Verify no Staged splits remain.
+    let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
+        .with_split_states(vec!["Staged".to_string()]);
+    let list_request =
+        ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
+    let response = metastore.list_metrics_splits(list_request).await.unwrap();
+    let splits = response.deserialize_splits().unwrap();
+    assert_eq!(splits.len(), 0);
+
+    cleanup_index(&mut metastore, index_uid).await;
+}
+
+pub async fn test_metastore_publish_metrics_splits_nonexistent<
+    MetastoreToTest: MetastoreServiceExt + DefaultForTest,
+>() {
+    let mut metastore = MetastoreToTest::default_for_test().await;
+    let index_id = append_random_suffix("test-publish-metrics-nonexistent");
+    let index_uid = create_test_index(&mut metastore, &index_id).await;
+
+    // Publish a split_id that was never staged.
+    let publish_request = PublishMetricsSplitsRequest {
+        index_uid: Some(index_uid.clone()),
+        staged_split_ids: vec!["nonexistent-split".to_string()],
+        ..Default::default()
+    };
+    let error = metastore
+        .publish_metrics_splits(publish_request)
+        .await
+        .unwrap_err();
+    assert!(
+        matches!(
+            error,
+            MetastoreError::NotFound(EntityKind::Splits { .. })
+        ),
+        "expected NotFound(Splits), got: {error:?}"
+    );
+
+    cleanup_index(&mut metastore, index_uid).await;
+}
+
+pub async fn test_metastore_mark_metrics_splits_for_deletion<
+    MetastoreToTest: MetastoreServiceExt + DefaultForTest,
+>() {
+    let mut metastore = MetastoreToTest::default_for_test().await;
+    let index_id = append_random_suffix("test-mark-metrics-deletion");
+    let index_uid = create_test_index(&mut metastore, &index_id).await;
+
+    let split_id_1 = format!("{index_id}--split-1");
+    let split_id_2 = format!("{index_id}--split-2");
+    let split_1 = build_test_split(&split_id_1, &index_uid, TimeRange::new(1000, 2000));
+    let split_2 = build_test_split(&split_id_2, &index_uid, TimeRange::new(2000, 3000));
+
+    // Stage and publish.
+    let request = StageMetricsSplitsRequest::try_from_splits_metadata(
+        index_uid.clone(),
+        &[split_1, split_2],
+    )
+    .unwrap();
+    metastore.stage_metrics_splits(request).await.unwrap();
+
+    let publish_request = PublishMetricsSplitsRequest {
+        index_uid: Some(index_uid.clone()),
+        staged_split_ids: vec![split_id_1.clone(), split_id_2.clone()],
+        ..Default::default()
+    };
+    metastore
+        .publish_metrics_splits(publish_request)
+        .await
+        .unwrap();
+
+    // Mark split_1 for deletion.
+    let mark_request = MarkMetricsSplitsForDeletionRequest {
+        index_uid: Some(index_uid.clone()),
+        split_ids: vec![split_id_1.clone()],
+    };
+    metastore
+        .mark_metrics_splits_for_deletion(mark_request)
+        .await
+        .unwrap();
+
+    // Verify split_1 is MarkedForDeletion.
+    let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
+        .with_split_states(vec!["MarkedForDeletion".to_string()]);
+    let list_request =
+        ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
+    let response = metastore.list_metrics_splits(list_request).await.unwrap();
+    let splits = response.deserialize_splits().unwrap();
+    assert_eq!(splits.len(), 1);
+    assert_eq!(splits[0].metadata.split_id.as_str(), &split_id_1);
+
+    // Verify split_2 is still Published.
+    let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
+        .with_split_states(vec!["Published".to_string()]);
+    let list_request =
+        ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
+    let response = metastore.list_metrics_splits(list_request).await.unwrap();
+    let splits = response.deserialize_splits().unwrap();
+    assert_eq!(splits.len(), 1);
+    assert_eq!(splits[0].metadata.split_id.as_str(), &split_id_2);
+
+    cleanup_index(&mut metastore, index_uid).await;
+}
+
+pub async fn test_metastore_delete_metrics_splits<
+    MetastoreToTest: MetastoreServiceExt + DefaultForTest,
+>() {
+    let mut metastore = MetastoreToTest::default_for_test().await;
+    let index_id = append_random_suffix("test-delete-metrics-splits");
+    let index_uid = create_test_index(&mut metastore, &index_id).await;
+
+    let split_id_1 = format!("{index_id}--split-1");
+    let split_1 = build_test_split(&split_id_1, &index_uid, TimeRange::new(1000, 2000));
+
+    // Stage, publish, mark for deletion.
+    let request = StageMetricsSplitsRequest::try_from_splits_metadata(
+        index_uid.clone(),
+        &[split_1],
+    )
+    .unwrap();
+    metastore.stage_metrics_splits(request).await.unwrap();
+
+    let publish_request = PublishMetricsSplitsRequest {
+        index_uid: Some(index_uid.clone()),
+        staged_split_ids: vec![split_id_1.clone()],
+        ..Default::default()
+    };
+    metastore
+        .publish_metrics_splits(publish_request)
+        .await
+        .unwrap();
+
+    let mark_request = MarkMetricsSplitsForDeletionRequest {
+        index_uid: Some(index_uid.clone()),
+        split_ids: vec![split_id_1.clone()],
+    };
+    metastore
+        .mark_metrics_splits_for_deletion(mark_request)
+        .await
+        .unwrap();
+
+    // Delete.
+    let delete_request = DeleteMetricsSplitsRequest {
+        index_uid: Some(index_uid.clone()),
+        split_ids: vec![split_id_1.clone()],
+    };
+    metastore
+        .delete_metrics_splits(delete_request)
+        .await
+        .unwrap();
+
+    // Verify it is gone (list all states).
+    let query = ListMetricsSplitsQuery::for_index(index_uid.clone()).with_split_states(vec![
+        "Staged".to_string(),
+        "Published".to_string(),
+        "MarkedForDeletion".to_string(),
+    ]);
+    let list_request =
+        ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
+    let response = metastore.list_metrics_splits(list_request).await.unwrap();
+    let splits = response.deserialize_splits().unwrap();
+    assert_eq!(splits.len(), 0);
+
+    cleanup_index(&mut metastore, index_uid).await;
+}
+
+pub async fn test_metastore_delete_metrics_splits_nonexistent<
+    MetastoreToTest: MetastoreServiceExt + DefaultForTest,
+>() {
+    let mut metastore = MetastoreToTest::default_for_test().await;
+    let index_id = append_random_suffix("test-delete-metrics-nonexistent");
+    let index_uid = create_test_index(&mut metastore, &index_id).await;
+
+    // Delete a split_id that doesn't exist — should succeed (idempotent).
+    let delete_request = DeleteMetricsSplitsRequest {
+        index_uid: Some(index_uid.clone()),
+        split_ids: vec!["nonexistent-split".to_string()],
+    };
+    metastore
+        .delete_metrics_splits(delete_request)
+        .await
+        .unwrap();
+
+    cleanup_index(&mut metastore, index_uid).await;
+}
diff --git a/quickwit/quickwit-metastore/src/tests/mod.rs b/quickwit/quickwit-metastore/src/tests/mod.rs
index d6e549baf25..4433221a87d 100644
--- a/quickwit/quickwit-metastore/src/tests/mod.rs
+++ b/quickwit/quickwit-metastore/src/tests/mod.rs
@@ -29,6 +29,7 @@ pub(crate) mod delete_task;
 pub(crate) mod get_identity;
 pub(crate) mod index;
 pub(crate) mod list_splits;
+pub(crate) mod metrics;
 pub(crate) mod shard;
 pub(crate) mod source;
 pub(crate) mod split;
@@ -575,6 +576,91 @@ macro_rules! metastore_test_suite {
                 let _ = tracing_subscriber::fmt::try_init();
                 $crate::tests::get_identity::test_metastore_get_identity::<$metastore_type>().await;
             }
+
+            // Metrics Split API tests
+            //
+            //  - stage_metrics_splits
+            //  - publish_metrics_splits
+            //  - list_metrics_splits
+            //  - mark_metrics_splits_for_deletion
+            //  - delete_metrics_splits
+
+            #[tokio::test]
+            #[serial_test::file_serial]
+            async fn test_metastore_stage_metrics_splits() {
+                let _ = tracing_subscriber::fmt::try_init();
+                $crate::tests::metrics::test_metastore_stage_metrics_splits::<$metastore_type>().await;
+            }
+
+            #[tokio::test]
+            #[serial_test::file_serial]
+            async fn test_metastore_stage_metrics_splits_upsert() {
+                let _ = tracing_subscriber::fmt::try_init();
+                $crate::tests::metrics::test_metastore_stage_metrics_splits_upsert::<$metastore_type>().await;
+            }
+
+            #[tokio::test]
+            #[serial_test::file_serial]
+            async fn test_metastore_list_metrics_splits_by_state() {
+                let _ = tracing_subscriber::fmt::try_init();
+                $crate::tests::metrics::test_metastore_list_metrics_splits_by_state::<$metastore_type>().await;
+            }
+
+            #[tokio::test]
+            #[serial_test::file_serial]
+            async fn test_metastore_list_metrics_splits_by_time_range() {
+                let _ = tracing_subscriber::fmt::try_init();
+                $crate::tests::metrics::test_metastore_list_metrics_splits_by_time_range::<$metastore_type>().await;
+            }
+
+            #[tokio::test]
+            #[serial_test::file_serial]
+            async fn test_metastore_list_metrics_splits_by_metric_name() {
+                let _ = tracing_subscriber::fmt::try_init();
+                $crate::tests::metrics::test_metastore_list_metrics_splits_by_metric_name::<$metastore_type>().await;
+            }
+
+            #[tokio::test]
+            #[serial_test::file_serial]
+            async fn test_metastore_list_metrics_splits_by_compaction_scope() {
+                let _ = tracing_subscriber::fmt::try_init();
+                $crate::tests::metrics::test_metastore_list_metrics_splits_by_compaction_scope::<$metastore_type>().await;
+            }
+
+            #[tokio::test]
+            #[serial_test::file_serial]
+            async fn test_metastore_publish_metrics_splits() {
+                let _ = tracing_subscriber::fmt::try_init();
+                $crate::tests::metrics::test_metastore_publish_metrics_splits::<$metastore_type>().await;
+            }
+
+            #[tokio::test]
+            #[serial_test::file_serial]
+            async fn test_metastore_publish_metrics_splits_nonexistent() {
+                let _ = tracing_subscriber::fmt::try_init();
+                $crate::tests::metrics::test_metastore_publish_metrics_splits_nonexistent::<$metastore_type>().await;
+            }
+
+            #[tokio::test]
+            #[serial_test::file_serial]
+            async fn test_metastore_mark_metrics_splits_for_deletion() {
+                let _ = tracing_subscriber::fmt::try_init();
+                $crate::tests::metrics::test_metastore_mark_metrics_splits_for_deletion::<$metastore_type>().await;
+            }
+
+            #[tokio::test]
+            #[serial_test::file_serial]
+            async fn test_metastore_delete_metrics_splits() {
+                let _ = tracing_subscriber::fmt::try_init();
+                $crate::tests::metrics::test_metastore_delete_metrics_splits::<$metastore_type>().await;
+            }
+
+            #[tokio::test]
+            #[serial_test::file_serial]
+            async fn test_metastore_delete_metrics_splits_nonexistent() {
+                let _ = tracing_subscriber::fmt::try_init();
+                $crate::tests::metrics::test_metastore_delete_metrics_splits_nonexistent::<$metastore_type>().await;
+            }
         }
     };
 }

From b4dac4644182577bafb35a5795e1ce93d81d33bc Mon Sep 17 00:00:00 2001
From: George Talbot <george.talbot@datadoghq.com>
Date: Wed, 1 Apr 2026 11:29:42 -0400
Subject: [PATCH 19/35] fix(31): read compaction columns in
 list_metrics_splits, fix cleanup_index FK

---
 .../src/metastore/postgres/metastore.rs       | 79 ++++++++-----------
 .../quickwit-metastore/src/tests/metrics.rs   | 17 ++--
 quickwit/quickwit-metastore/src/tests/mod.rs  | 47 +++++++++++
 3 files changed, 91 insertions(+), 52 deletions(-)

diff --git a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs
index b09eb4afd97..646299ae62d 100644
--- a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs
+++ b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs
@@ -2218,7 +2218,13 @@ impl MetastoreService for PostgresqlMetastore {
                 num_rows,
                 size_bytes,
                 split_metadata_json,
-                EXTRACT(EPOCH FROM update_timestamp)::bigint as update_timestamp
+                EXTRACT(EPOCH FROM update_timestamp)::bigint as update_timestamp,
+                window_start,
+                window_duration_secs,
+                sort_fields,
+                num_merge_ops,
+                row_keys,
+                zonemap_regexes
             FROM metrics_splits
             WHERE index_uid = $1
         "#,
@@ -2288,27 +2294,7 @@ impl MetastoreService for PostgresqlMetastore {
         }
 
         // Execute query with bindings
-        let mut query_builder = sqlx::query_as::<
-            _,
-            (
-                String,              // split_id
-                String,              // split_state
-                String,              // index_uid
-                i64,                 // time_range_start
-                i64,                 // time_range_end
-                Vec<String>,         // metric_names
-                Option<Vec<String>>, // tag_service
-                Option<Vec<String>>, // tag_env
-                Option<Vec<String>>, // tag_datacenter
-                Option<Vec<String>>, // tag_region
-                Option<Vec<String>>, // tag_host
-                Vec<String>,         // high_cardinality_tag_keys
-                i64,                 // num_rows
-                i64,                 // size_bytes
-                String,              // split_metadata_json
-                i64,                 // update_timestamp
-            ),
-        >(&sql);
+        let mut query_builder = sqlx::query(&sql);
 
         query_builder = query_builder.bind(query.index_uid.to_string());
 
@@ -2360,32 +2346,31 @@ impl MetastoreService for PostgresqlMetastore {
             .filter_map(|row| {
                 use quickwit_parquet_engine::split::{MetricsSplitState, PgMetricsSplit};
 
+                use sqlx::Row as _;
+
                 let pg_split = PgMetricsSplit {
-                    split_id: row.0,
-                    split_state: row.1,
-                    index_uid: row.2,
-                    time_range_start: row.3,
-                    time_range_end: row.4,
-                    metric_names: row.5,
-                    tag_service: row.6,
-                    tag_env: row.7,
-                    tag_datacenter: row.8,
-                    tag_region: row.9,
-                    tag_host: row.10,
-                    high_cardinality_tag_keys: row.11,
-                    num_rows: row.12,
-                    size_bytes: row.13,
-                    split_metadata_json: row.14,
-                    update_timestamp: row.15,
-                    // Compaction fields are read from the JSON blob via
-                    // to_metadata() — the SQL columns are only used for
-                    // filtering and SS-5 consistency checks.
-                    window_start: None,
-                    window_duration_secs: None,
-                    sort_fields: String::new(),
-                    num_merge_ops: 0,
-                    row_keys: None,
-                    zonemap_regexes: serde_json::json!({}),
+                    split_id: row.get("split_id"),
+                    split_state: row.get("split_state"),
+                    index_uid: row.get("index_uid"),
+                    time_range_start: row.get("time_range_start"),
+                    time_range_end: row.get("time_range_end"),
+                    metric_names: row.get("metric_names"),
+                    tag_service: row.get("tag_service"),
+                    tag_env: row.get("tag_env"),
+                    tag_datacenter: row.get("tag_datacenter"),
+                    tag_region: row.get("tag_region"),
+                    tag_host: row.get("tag_host"),
+                    high_cardinality_tag_keys: row.get("high_cardinality_tag_keys"),
+                    num_rows: row.get("num_rows"),
+                    size_bytes: row.get("size_bytes"),
+                    split_metadata_json: row.get("split_metadata_json"),
+                    update_timestamp: row.get("update_timestamp"),
+                    window_start: row.get("window_start"),
+                    window_duration_secs: row.get("window_duration_secs"),
+                    sort_fields: row.get("sort_fields"),
+                    num_merge_ops: row.get("num_merge_ops"),
+                    row_keys: row.get("row_keys"),
+                    zonemap_regexes: row.get("zonemap_regexes"),
                 };
 
                 let state = pg_split.split_state().unwrap_or(MetricsSplitState::Staged);
diff --git a/quickwit/quickwit-metastore/src/tests/metrics.rs b/quickwit/quickwit-metastore/src/tests/metrics.rs
index 7288cbdfa13..b44fd8ecc22 100644
--- a/quickwit/quickwit-metastore/src/tests/metrics.rs
+++ b/quickwit/quickwit-metastore/src/tests/metrics.rs
@@ -495,10 +495,12 @@ pub async fn test_metastore_publish_metrics_splits_nonexistent<
         .publish_metrics_splits(publish_request)
         .await
         .unwrap_err();
+    // File-backed: NotFound. Postgres: FailedPrecondition (count mismatch).
     assert!(
         matches!(
             error,
             MetastoreError::NotFound(EntityKind::Splits { .. })
+                | MetastoreError::FailedPrecondition { .. }
         ),
         "expected NotFound(Splits), got: {error:?}"
     );
@@ -638,15 +640,20 @@ pub async fn test_metastore_delete_metrics_splits_nonexistent<
     let index_id = append_random_suffix("test-delete-metrics-nonexistent");
     let index_uid = create_test_index(&mut metastore, &index_id).await;
 
-    // Delete a split_id that doesn't exist — should succeed (idempotent).
+    // Delete a split_id that doesn't exist.
+    // File-backed: succeeds silently (idempotent).
+    // Postgres: may return FailedPrecondition ("not marked for deletion").
+    // Both behaviors are acceptable.
     let delete_request = DeleteMetricsSplitsRequest {
         index_uid: Some(index_uid.clone()),
         split_ids: vec!["nonexistent-split".to_string()],
     };
-    metastore
-        .delete_metrics_splits(delete_request)
-        .await
-        .unwrap();
+    let result = metastore.delete_metrics_splits(delete_request).await;
+    match &result {
+        Ok(_) => {} // file-backed: idempotent success
+        Err(MetastoreError::FailedPrecondition { .. }) => {} // postgres: not marked
+        Err(other) => panic!("unexpected error: {other:?}"),
+    }
 
     cleanup_index(&mut metastore, index_uid).await;
 }
diff --git a/quickwit/quickwit-metastore/src/tests/mod.rs b/quickwit/quickwit-metastore/src/tests/mod.rs
index 4433221a87d..fd384670438 100644
--- a/quickwit/quickwit-metastore/src/tests/mod.rs
+++ b/quickwit/quickwit-metastore/src/tests/mod.rs
@@ -22,6 +22,8 @@ use quickwit_proto::metastore::{
     DeleteIndexRequest, DeleteSplitsRequest, ListSplitsRequest, MarkSplitsForDeletionRequest,
     MetastoreServiceClient, MetastoreServiceGrpcClientAdapter,
 };
+
+use crate::metastore::{ListMetricsSplitsRequestExt, ListMetricsSplitsResponseExt};
 use quickwit_proto::tonic::transport::Channel;
 use quickwit_proto::types::IndexUid;
 
@@ -154,6 +156,51 @@ async fn cleanup_index(metastore: &mut dyn MetastoreServiceExt, index_uid: Index
             .await
             .unwrap();
     }
+    // Also clean up any metrics splits (they have a separate FK constraint).
+    let metrics_query =
+        crate::metastore::ListMetricsSplitsQuery::for_index(index_uid.clone()).with_split_states(
+            vec![
+                "Staged".to_string(),
+                "Published".to_string(),
+                "MarkedForDeletion".to_string(),
+            ],
+        );
+    if let Ok(list_request) = quickwit_proto::metastore::ListMetricsSplitsRequest::try_from_query(
+        index_uid.clone(),
+        &metrics_query,
+    ) {
+        if let Ok(response) = metastore.list_metrics_splits(list_request).await {
+            if let Ok(splits) = response.deserialize_splits() {
+                if !splits.is_empty() {
+                    let split_ids: Vec<String> = splits
+                        .iter()
+                        .map(|s| s.metadata.split_id.as_str().to_string())
+                        .collect();
+
+                    // Mark for deletion first.
+                    let _ = metastore
+                        .mark_metrics_splits_for_deletion(
+                            quickwit_proto::metastore::MarkMetricsSplitsForDeletionRequest {
+                                index_uid: Some(index_uid.clone()),
+                                split_ids: split_ids.clone(),
+                            },
+                        )
+                        .await;
+
+                    // Delete.
+                    let _ = metastore
+                        .delete_metrics_splits(
+                            quickwit_proto::metastore::DeleteMetricsSplitsRequest {
+                                index_uid: Some(index_uid.clone()),
+                                split_ids,
+                            },
+                        )
+                        .await;
+                }
+            }
+        }
+    }
+
     // Delete index.
     metastore
         .delete_index(DeleteIndexRequest {

From db51a96e063b019222dfb5d450ab01049cbfae58 Mon Sep 17 00:00:00 2001
From: George Talbot <george.talbot@datadoghq.com>
Date: Wed, 1 Apr 2026 11:34:09 -0400
Subject: [PATCH 20/35] fix(31): correct error types for non-existent metrics
 splits

- publish_metrics_splits: return NotFound (not FailedPrecondition) when
  staged splits don't exist
- delete_metrics_splits: succeed silently (idempotent) for non-existent
  splits instead of returning FailedPrecondition
- Tests now assert the correct error types on both backends

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../src/metastore/postgres/metastore.rs       | 25 +++++++------------
 .../quickwit-metastore/src/tests/metrics.rs   | 22 +++++-----------
 2 files changed, 15 insertions(+), 32 deletions(-)

diff --git a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs
index 646299ae62d..bd19d8e5f14 100644
--- a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs
+++ b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs
@@ -2151,17 +2151,13 @@ impl MetastoreService for PostgresqlMetastore {
                     .await
                     .map_err(|sqlx_error| convert_sqlx_err(&index_uid.index_id, sqlx_error))?;
 
-            // Verify all staged splits were published
+            // Verify all staged splits were published.
+            // If some splits are missing, it means they don't exist (NotFound),
+            // not that they're in the wrong state (FailedPrecondition).
             if published_count as usize != staged_split_ids.len() {
-                let entity = EntityKind::Splits {
+                return Err(MetastoreError::NotFound(EntityKind::Splits {
                     split_ids: staged_split_ids.clone(),
-                };
-                let message = format!(
-                    "expected to publish {} splits, but only {} were in Staged state",
-                    staged_split_ids.len(),
-                    published_count
-                );
-                return Err(MetastoreError::FailedPrecondition { entity, message });
+                }));
             }
 
             // Verify all replaced splits were marked for deletion.
@@ -2461,7 +2457,9 @@ impl MetastoreService for PostgresqlMetastore {
             .await
             .map_err(|sqlx_error| convert_sqlx_err(&request.index_uid().index_id, sqlx_error))?;
 
-        // Check if any splits could not be deleted
+        // Log if some splits were not deleted (either non-existent or not
+        // in MarkedForDeletion state). Delete is idempotent — we don't error
+        // for missing splits.
         if deleted_split_ids.len() != request.split_ids.len() {
             let not_deleted: Vec<String> = request
                 .split_ids
@@ -2474,13 +2472,8 @@ impl MetastoreService for PostgresqlMetastore {
                 warn!(
                     index_uid = %request.index_uid(),
                     not_deleted = ?not_deleted,
-                    "some metrics splits were not in MarkedForDeletion state"
+                    "some metrics splits were not deleted (non-existent or not marked for deletion)"
                 );
-                let entity = EntityKind::Splits {
-                    split_ids: not_deleted,
-                };
-                let message = "splits are not marked for deletion".to_string();
-                return Err(MetastoreError::FailedPrecondition { entity, message });
             }
         }
 
diff --git a/quickwit/quickwit-metastore/src/tests/metrics.rs b/quickwit/quickwit-metastore/src/tests/metrics.rs
index b44fd8ecc22..f478b386e8a 100644
--- a/quickwit/quickwit-metastore/src/tests/metrics.rs
+++ b/quickwit/quickwit-metastore/src/tests/metrics.rs
@@ -495,13 +495,8 @@ pub async fn test_metastore_publish_metrics_splits_nonexistent<
         .publish_metrics_splits(publish_request)
         .await
         .unwrap_err();
-    // File-backed: NotFound. Postgres: FailedPrecondition (count mismatch).
     assert!(
-        matches!(
-            error,
-            MetastoreError::NotFound(EntityKind::Splits { .. })
-                | MetastoreError::FailedPrecondition { .. }
-        ),
+        matches!(error, MetastoreError::NotFound(EntityKind::Splits { .. })),
         "expected NotFound(Splits), got: {error:?}"
     );
 
@@ -640,20 +635,15 @@ pub async fn test_metastore_delete_metrics_splits_nonexistent<
     let index_id = append_random_suffix("test-delete-metrics-nonexistent");
     let index_uid = create_test_index(&mut metastore, &index_id).await;
 
-    // Delete a split_id that doesn't exist.
-    // File-backed: succeeds silently (idempotent).
-    // Postgres: may return FailedPrecondition ("not marked for deletion").
-    // Both behaviors are acceptable.
+    // Delete a split_id that doesn't exist — should succeed (idempotent).
     let delete_request = DeleteMetricsSplitsRequest {
         index_uid: Some(index_uid.clone()),
         split_ids: vec!["nonexistent-split".to_string()],
     };
-    let result = metastore.delete_metrics_splits(delete_request).await;
-    match &result {
-        Ok(_) => {} // file-backed: idempotent success
-        Err(MetastoreError::FailedPrecondition { .. }) => {} // postgres: not marked
-        Err(other) => panic!("unexpected error: {other:?}"),
-    }
+    metastore
+        .delete_metrics_splits(delete_request)
+        .await
+        .unwrap();
 
     cleanup_index(&mut metastore, index_uid).await;
 }

From 605708eade513a158aced5566e3be4e8a09a92ed Mon Sep 17 00:00:00 2001
From: George Talbot <george.talbot@datadoghq.com>
Date: Wed, 1 Apr 2026 14:17:22 -0400
Subject: [PATCH 21/35] style: rustfmt metastore tests and postgres

---
 .../src/metastore/postgres/metastore.rs       |  1 -
 .../quickwit-metastore/src/tests/metrics.rs   | 97 +++++++------------
 quickwit/quickwit-metastore/src/tests/mod.rs  | 73 +++++++-------
 3 files changed, 66 insertions(+), 105 deletions(-)

diff --git a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs
index bd19d8e5f14..3bbbe3c2f27 100644
--- a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs
+++ b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs
@@ -2341,7 +2341,6 @@ impl MetastoreService for PostgresqlMetastore {
             .into_iter()
             .filter_map(|row| {
                 use quickwit_parquet_engine::split::{MetricsSplitState, PgMetricsSplit};
-
                 use sqlx::Row as _;
 
                 let pg_split = PgMetricsSplit {
diff --git a/quickwit/quickwit-metastore/src/tests/metrics.rs b/quickwit/quickwit-metastore/src/tests/metrics.rs
index f478b386e8a..60d3246a2a4 100644
--- a/quickwit/quickwit-metastore/src/tests/metrics.rs
+++ b/quickwit/quickwit-metastore/src/tests/metrics.rs
@@ -14,9 +14,7 @@
 
 use quickwit_common::rand::append_random_suffix;
 use quickwit_config::IndexConfig;
-use quickwit_parquet_engine::split::{
-    MetricsSplitMetadata, MetricsSplitState, SplitId, TimeRange, TAG_ENV, TAG_HOST, TAG_SERVICE,
-};
+use quickwit_parquet_engine::split::{MetricsSplitMetadata, MetricsSplitState, SplitId, TimeRange};
 use quickwit_proto::metastore::{
     CreateIndexRequest, DeleteMetricsSplitsRequest, EntityKind, ListMetricsSplitsRequest,
     MarkMetricsSplitsForDeletionRequest, MetastoreError, PublishMetricsSplitsRequest,
@@ -32,14 +30,10 @@ use crate::{
 };
 
 /// Helper to create a test index and return the actual IndexUid assigned by the metastore.
-async fn create_test_index(
-    metastore: &mut dyn MetastoreServiceExt,
-    index_id: &str,
-) -> IndexUid {
+async fn create_test_index(metastore: &mut dyn MetastoreServiceExt, index_id: &str) -> IndexUid {
     let index_uri = format!("ram:///indexes/{index_id}");
     let index_config = IndexConfig::for_test(index_id, &index_uri);
-    let create_index_request =
-        CreateIndexRequest::try_from_index_config(&index_config).unwrap();
+    let create_index_request = CreateIndexRequest::try_from_index_config(&index_config).unwrap();
     metastore
         .create_index(create_index_request)
         .await
@@ -92,18 +86,15 @@ pub async fn test_metastore_stage_metrics_splits<
     let split_1 = build_test_split(&split_id_1, &index_uid, TimeRange::new(1000, 2000));
     let split_2 = build_test_split(&split_id_2, &index_uid, TimeRange::new(2000, 3000));
 
-    let request = StageMetricsSplitsRequest::try_from_splits_metadata(
-        index_uid.clone(),
-        &[split_1, split_2],
-    )
-    .unwrap();
+    let request =
+        StageMetricsSplitsRequest::try_from_splits_metadata(index_uid.clone(), &[split_1, split_2])
+            .unwrap();
     metastore.stage_metrics_splits(request).await.unwrap();
 
     // Verify both splits are listed in Staged state.
     let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
         .with_split_states(vec!["Staged".to_string()]);
-    let list_request =
-        ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
+    let list_request = ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
     let response = metastore.list_metrics_splits(list_request).await.unwrap();
     let splits = response.deserialize_splits().unwrap();
     assert_eq!(splits.len(), 2);
@@ -135,11 +126,9 @@ pub async fn test_metastore_stage_metrics_splits_upsert<
         .parquet_file(format!("{split_id}.parquet"))
         .build();
 
-    let request = StageMetricsSplitsRequest::try_from_splits_metadata(
-        index_uid.clone(),
-        &[split_v1],
-    )
-    .unwrap();
+    let request =
+        StageMetricsSplitsRequest::try_from_splits_metadata(index_uid.clone(), &[split_v1])
+            .unwrap();
     metastore.stage_metrics_splits(request).await.unwrap();
 
     // Stage the same split_id again with 200 rows (upsert).
@@ -153,18 +142,15 @@ pub async fn test_metastore_stage_metrics_splits_upsert<
         .parquet_file(format!("{split_id}.parquet"))
         .build();
 
-    let request = StageMetricsSplitsRequest::try_from_splits_metadata(
-        index_uid.clone(),
-        &[split_v2],
-    )
-    .unwrap();
+    let request =
+        StageMetricsSplitsRequest::try_from_splits_metadata(index_uid.clone(), &[split_v2])
+            .unwrap();
     metastore.stage_metrics_splits(request).await.unwrap();
 
     // Verify only one split exists and it has the updated num_rows.
     let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
         .with_split_states(vec!["Staged".to_string()]);
-    let list_request =
-        ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
+    let list_request = ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
     let response = metastore.list_metrics_splits(list_request).await.unwrap();
     let splits = response.deserialize_splits().unwrap();
     assert_eq!(splits.len(), 1);
@@ -186,11 +172,9 @@ pub async fn test_metastore_list_metrics_splits_by_state<
     let split_id_2 = format!("{index_id}--split-2");
     let split_1 = build_test_split(&split_id_1, &index_uid, TimeRange::new(1000, 2000));
     let split_2 = build_test_split(&split_id_2, &index_uid, TimeRange::new(2000, 3000));
-    let request = StageMetricsSplitsRequest::try_from_splits_metadata(
-        index_uid.clone(),
-        &[split_1, split_2],
-    )
-    .unwrap();
+    let request =
+        StageMetricsSplitsRequest::try_from_splits_metadata(index_uid.clone(), &[split_1, split_2])
+            .unwrap();
     metastore.stage_metrics_splits(request).await.unwrap();
 
     // Publish split_1 only.
@@ -278,8 +262,7 @@ pub async fn test_metastore_list_metrics_splits_by_time_range<
     let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
         .with_split_states(vec!["Staged".to_string()])
         .with_time_range(1500, 3500);
-    let list_request =
-        ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
+    let list_request = ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
     let response = metastore.list_metrics_splits(list_request).await.unwrap();
     let splits = response.deserialize_splits().unwrap();
     // Should match splits whose time range overlaps [1500, 3500].
@@ -341,8 +324,7 @@ pub async fn test_metastore_list_metrics_splits_by_metric_name<
     let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
         .with_split_states(vec!["Staged".to_string()])
         .with_metric_names(vec!["cpu.usage".to_string()]);
-    let list_request =
-        ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
+    let list_request = ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
     let response = metastore.list_metrics_splits(list_request).await.unwrap();
     let splits = response.deserialize_splits().unwrap();
     assert_eq!(splits.len(), 2);
@@ -408,8 +390,7 @@ pub async fn test_metastore_list_metrics_splits_by_compaction_scope<
     let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
         .with_split_states(vec!["Staged".to_string()])
         .with_compaction_scope(1700000000, "metric_name|host|timestamp/V2");
-    let list_request =
-        ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
+    let list_request = ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
     let response = metastore.list_metrics_splits(list_request).await.unwrap();
     let splits = response.deserialize_splits().unwrap();
     // Only split_1 matches both window_start and sort_fields.
@@ -435,11 +416,9 @@ pub async fn test_metastore_publish_metrics_splits<
     let split_2 = build_test_split(&split_id_2, &index_uid, TimeRange::new(2000, 3000));
 
     // Stage both.
-    let request = StageMetricsSplitsRequest::try_from_splits_metadata(
-        index_uid.clone(),
-        &[split_1, split_2],
-    )
-    .unwrap();
+    let request =
+        StageMetricsSplitsRequest::try_from_splits_metadata(index_uid.clone(), &[split_1, split_2])
+            .unwrap();
     metastore.stage_metrics_splits(request).await.unwrap();
 
     // Publish both.
@@ -456,8 +435,7 @@ pub async fn test_metastore_publish_metrics_splits<
     // Verify they are now Published.
     let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
         .with_split_states(vec!["Published".to_string()]);
-    let list_request =
-        ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
+    let list_request = ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
     let response = metastore.list_metrics_splits(list_request).await.unwrap();
     let splits = response.deserialize_splits().unwrap();
     assert_eq!(splits.len(), 2);
@@ -469,8 +447,7 @@ pub async fn test_metastore_publish_metrics_splits<
     // Verify no Staged splits remain.
     let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
         .with_split_states(vec!["Staged".to_string()]);
-    let list_request =
-        ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
+    let list_request = ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
     let response = metastore.list_metrics_splits(list_request).await.unwrap();
     let splits = response.deserialize_splits().unwrap();
     assert_eq!(splits.len(), 0);
@@ -516,11 +493,9 @@ pub async fn test_metastore_mark_metrics_splits_for_deletion<
     let split_2 = build_test_split(&split_id_2, &index_uid, TimeRange::new(2000, 3000));
 
     // Stage and publish.
-    let request = StageMetricsSplitsRequest::try_from_splits_metadata(
-        index_uid.clone(),
-        &[split_1, split_2],
-    )
-    .unwrap();
+    let request =
+        StageMetricsSplitsRequest::try_from_splits_metadata(index_uid.clone(), &[split_1, split_2])
+            .unwrap();
     metastore.stage_metrics_splits(request).await.unwrap();
 
     let publish_request = PublishMetricsSplitsRequest {
@@ -546,8 +521,7 @@ pub async fn test_metastore_mark_metrics_splits_for_deletion<
     // Verify split_1 is MarkedForDeletion.
     let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
         .with_split_states(vec!["MarkedForDeletion".to_string()]);
-    let list_request =
-        ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
+    let list_request = ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
     let response = metastore.list_metrics_splits(list_request).await.unwrap();
     let splits = response.deserialize_splits().unwrap();
     assert_eq!(splits.len(), 1);
@@ -556,8 +530,7 @@ pub async fn test_metastore_mark_metrics_splits_for_deletion<
     // Verify split_2 is still Published.
     let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
         .with_split_states(vec!["Published".to_string()]);
-    let list_request =
-        ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
+    let list_request = ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
     let response = metastore.list_metrics_splits(list_request).await.unwrap();
     let splits = response.deserialize_splits().unwrap();
     assert_eq!(splits.len(), 1);
@@ -577,11 +550,8 @@ pub async fn test_metastore_delete_metrics_splits<
     let split_1 = build_test_split(&split_id_1, &index_uid, TimeRange::new(1000, 2000));
 
     // Stage, publish, mark for deletion.
-    let request = StageMetricsSplitsRequest::try_from_splits_metadata(
-        index_uid.clone(),
-        &[split_1],
-    )
-    .unwrap();
+    let request =
+        StageMetricsSplitsRequest::try_from_splits_metadata(index_uid.clone(), &[split_1]).unwrap();
     metastore.stage_metrics_splits(request).await.unwrap();
 
     let publish_request = PublishMetricsSplitsRequest {
@@ -619,8 +589,7 @@ pub async fn test_metastore_delete_metrics_splits<
         "Published".to_string(),
         "MarkedForDeletion".to_string(),
     ]);
-    let list_request =
-        ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
+    let list_request = ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
     let response = metastore.list_metrics_splits(list_request).await.unwrap();
     let splits = response.deserialize_splits().unwrap();
     assert_eq!(splits.len(), 0);
diff --git a/quickwit/quickwit-metastore/src/tests/mod.rs b/quickwit/quickwit-metastore/src/tests/mod.rs
index fd384670438..7c068d57113 100644
--- a/quickwit/quickwit-metastore/src/tests/mod.rs
+++ b/quickwit/quickwit-metastore/src/tests/mod.rs
@@ -22,11 +22,11 @@ use quickwit_proto::metastore::{
     DeleteIndexRequest, DeleteSplitsRequest, ListSplitsRequest, MarkSplitsForDeletionRequest,
     MetastoreServiceClient, MetastoreServiceGrpcClientAdapter,
 };
-
-use crate::metastore::{ListMetricsSplitsRequestExt, ListMetricsSplitsResponseExt};
 use quickwit_proto::tonic::transport::Channel;
 use quickwit_proto::types::IndexUid;
 
+use crate::metastore::{ListMetricsSplitsRequestExt, ListMetricsSplitsResponseExt};
+
 pub(crate) mod delete_task;
 pub(crate) mod get_identity;
 pub(crate) mod index;
@@ -157,48 +157,41 @@ async fn cleanup_index(metastore: &mut dyn MetastoreServiceExt, index_uid: Index
             .unwrap();
     }
     // Also clean up any metrics splits (they have a separate FK constraint).
-    let metrics_query =
-        crate::metastore::ListMetricsSplitsQuery::for_index(index_uid.clone()).with_split_states(
-            vec![
-                "Staged".to_string(),
-                "Published".to_string(),
-                "MarkedForDeletion".to_string(),
-            ],
-        );
+    let metrics_query = crate::metastore::ListMetricsSplitsQuery::for_index(index_uid.clone())
+        .with_split_states(vec![
+            "Staged".to_string(),
+            "Published".to_string(),
+            "MarkedForDeletion".to_string(),
+        ]);
     if let Ok(list_request) = quickwit_proto::metastore::ListMetricsSplitsRequest::try_from_query(
         index_uid.clone(),
         &metrics_query,
-    ) {
-        if let Ok(response) = metastore.list_metrics_splits(list_request).await {
-            if let Ok(splits) = response.deserialize_splits() {
-                if !splits.is_empty() {
-                    let split_ids: Vec<String> = splits
-                        .iter()
-                        .map(|s| s.metadata.split_id.as_str().to_string())
-                        .collect();
-
-                    // Mark for deletion first.
-                    let _ = metastore
-                        .mark_metrics_splits_for_deletion(
-                            quickwit_proto::metastore::MarkMetricsSplitsForDeletionRequest {
-                                index_uid: Some(index_uid.clone()),
-                                split_ids: split_ids.clone(),
-                            },
-                        )
-                        .await;
+    ) && let Ok(response) = metastore.list_metrics_splits(list_request).await
+        && let Ok(splits) = response.deserialize_splits()
+        && !splits.is_empty()
+    {
+        let split_ids: Vec<String> = splits
+            .iter()
+            .map(|s| s.metadata.split_id.as_str().to_string())
+            .collect();
 
-                    // Delete.
-                    let _ = metastore
-                        .delete_metrics_splits(
-                            quickwit_proto::metastore::DeleteMetricsSplitsRequest {
-                                index_uid: Some(index_uid.clone()),
-                                split_ids,
-                            },
-                        )
-                        .await;
-                }
-            }
-        }
+        // Mark for deletion first.
+        let _ = metastore
+            .mark_metrics_splits_for_deletion(
+                quickwit_proto::metastore::MarkMetricsSplitsForDeletionRequest {
+                    index_uid: Some(index_uid.clone()),
+                    split_ids: split_ids.clone(),
+                },
+            )
+            .await;
+
+        // Delete.
+        let _ = metastore
+            .delete_metrics_splits(quickwit_proto::metastore::DeleteMetricsSplitsRequest {
+                index_uid: Some(index_uid.clone()),
+                split_ids,
+            })
+            .await;
     }
 
     // Delete index.

From f21fff57a7074308dd45fd0e1c61a7d01c5d3c93 Mon Sep 17 00:00:00 2001
From: George Talbot <george.talbot@datadoghq.com>
Date: Mon, 6 Apr 2026 09:48:07 -0400
Subject: [PATCH 22/35] =?UTF-8?q?fix(31):=20address=20PR=20review=20?=
 =?UTF-8?q?=E2=80=94=20align=20metrics=5Fsplits=20with=20splits=20table?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Migration 27: add maturity_timestamp, delete_opstamp, node_id columns
  and publish_timestamp trigger to match the splits table (Paul's review)
- ListMetricsSplitsQuery: adopt FilterRange<i64> for time_range (matching
  log-side pattern), single time_range field for both read and compaction
  paths, add node_id/delete_opstamp/update_timestamp/create_timestamp/
  mature filters to close gaps with ListSplitsQuery
- Use SplitState enum instead of stringly-typed Vec<String> for split_states
- StoredMetricsSplit: add create_timestamp, node_id, delete_opstamp,
  maturity_timestamp so file-backed metastore can filter on them locally
- File-backed filter: use FilterRange::overlaps_with() for time range and
  window intersection, apply all new filters matching log-side predicate
- Postgres: intersection semantics for window queries, FilterRange-based
  SQL generation for all range filters
- Fix InsertableMetricsSplit.window_duration_secs from Option<i32> to i32
- Rename two-letter variables (ws, sf, dt) throughout

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../src/actors/parquet_e2e_test.rs            |  19 +-
 .../27_add-compaction-metadata.down.sql       |   7 +-
 .../27_add-compaction-metadata.up.sql         |  26 ++
 .../file_backed/file_backed_index/mod.rs      |  91 +++++--
 .../quickwit-metastore/src/metastore/mod.rs   |  72 ++++--
 .../src/metastore/postgres/metastore.rs       | 239 ++++++++++++++++--
 .../quickwit-metastore/src/tests/metrics.rs   |  39 +--
 quickwit/quickwit-metastore/src/tests/mod.rs  |  10 +-
 .../src/split/postgres.rs                     |  17 +-
 9 files changed, 422 insertions(+), 98 deletions(-)

diff --git a/quickwit/quickwit-indexing/src/actors/parquet_e2e_test.rs b/quickwit/quickwit-indexing/src/actors/parquet_e2e_test.rs
index fd83e0e3674..c24ea6ee920 100644
--- a/quickwit/quickwit-indexing/src/actors/parquet_e2e_test.rs
+++ b/quickwit/quickwit-indexing/src/actors/parquet_e2e_test.rs
@@ -261,7 +261,8 @@ async fn test_file_backed_metastore_metrics_operations() {
     use quickwit_config::IndexConfig;
     use quickwit_metastore::{
         CreateIndexRequestExt, FileBackedMetastore, ListMetricsSplitsQuery,
-        ListMetricsSplitsRequestExt, ListMetricsSplitsResponseExt, StageMetricsSplitsRequestExt,
+        ListMetricsSplitsRequestExt, ListMetricsSplitsResponseExt, SplitState,
+        StageMetricsSplitsRequestExt,
     };
     use quickwit_parquet_engine::split::{MetricsSplitMetadata, MetricsSplitRecord, TimeRange};
     use quickwit_proto::metastore::{
@@ -306,7 +307,7 @@ async fn test_file_backed_metastore_metrics_operations() {
 
     // Verify staged
     let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
-        .with_split_states(vec!["Staged".to_string()]);
+        .with_split_states([SplitState::Staged]);
     let list_request = ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
     let list_response = metastore.list_metrics_splits(list_request).await.unwrap();
     let staged: Vec<MetricsSplitRecord> = list_response.deserialize_splits().unwrap();
@@ -327,7 +328,7 @@ async fn test_file_backed_metastore_metrics_operations() {
 
     // Verify published
     let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
-        .with_split_states(vec!["Published".to_string()]);
+        .with_split_states([SplitState::Published]);
     let list_request = ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
     let list_response = metastore.list_metrics_splits(list_request).await.unwrap();
     let published: Vec<MetricsSplitRecord> = list_response.deserialize_splits().unwrap();
@@ -336,16 +337,18 @@ async fn test_file_backed_metastore_metrics_operations() {
 
     // Time range filtering
     let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
-        .with_split_states(vec!["Published".to_string()])
-        .with_time_range(1000, 1100);
+        .with_split_states([SplitState::Published])
+        .with_time_range_start_gte(1000)
+        .with_time_range_end_lte(1100);
     let list_request = ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
     let list_response = metastore.list_metrics_splits(list_request).await.unwrap();
     let in_range: Vec<MetricsSplitRecord> = list_response.deserialize_splits().unwrap();
     assert_eq!(in_range.len(), 1);
 
     let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
-        .with_split_states(vec!["Published".to_string()])
-        .with_time_range(5000, 5100);
+        .with_split_states([SplitState::Published])
+        .with_time_range_start_gte(5000)
+        .with_time_range_end_lte(5100);
     let list_request = ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
     let list_response = metastore.list_metrics_splits(list_request).await.unwrap();
     let out_of_range: Vec<MetricsSplitRecord> = list_response.deserialize_splits().unwrap();
@@ -353,7 +356,7 @@ async fn test_file_backed_metastore_metrics_operations() {
 
     // Metric name filtering
     let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
-        .with_split_states(vec!["Published".to_string()])
+        .with_split_states([SplitState::Published])
         .with_metric_names(vec!["cpu.usage".to_string()]);
     let list_request = ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
     let list_response = metastore.list_metrics_splits(list_request).await.unwrap();
diff --git a/quickwit/quickwit-metastore/migrations/postgresql/27_add-compaction-metadata.down.sql b/quickwit/quickwit-metastore/migrations/postgresql/27_add-compaction-metadata.down.sql
index 493ddc2e1cd..3a1ee3cfc36 100644
--- a/quickwit/quickwit-metastore/migrations/postgresql/27_add-compaction-metadata.down.sql
+++ b/quickwit/quickwit-metastore/migrations/postgresql/27_add-compaction-metadata.down.sql
@@ -1,5 +1,10 @@
--- Reverse Phase 31: Remove compaction metadata columns.
+-- Reverse Phase 31: Remove compaction metadata columns and triggers.
+DROP TRIGGER IF EXISTS set_publish_timestamp_on_metrics_split_publish ON metrics_splits CASCADE;
+DROP FUNCTION IF EXISTS set_publish_timestamp_for_metrics_split();
 DROP INDEX IF EXISTS idx_metrics_splits_compaction_scope;
+ALTER TABLE metrics_splits DROP COLUMN IF EXISTS node_id;
+ALTER TABLE metrics_splits DROP COLUMN IF EXISTS delete_opstamp;
+ALTER TABLE metrics_splits DROP COLUMN IF EXISTS maturity_timestamp;
 ALTER TABLE metrics_splits DROP COLUMN IF EXISTS zonemap_regexes;
 ALTER TABLE metrics_splits DROP COLUMN IF EXISTS row_keys;
 ALTER TABLE metrics_splits DROP COLUMN IF EXISTS num_merge_ops;
diff --git a/quickwit/quickwit-metastore/migrations/postgresql/27_add-compaction-metadata.up.sql b/quickwit/quickwit-metastore/migrations/postgresql/27_add-compaction-metadata.up.sql
index a83336e8295..8d7386fa445 100644
--- a/quickwit/quickwit-metastore/migrations/postgresql/27_add-compaction-metadata.up.sql
+++ b/quickwit/quickwit-metastore/migrations/postgresql/27_add-compaction-metadata.up.sql
@@ -7,6 +7,32 @@ ALTER TABLE metrics_splits ADD COLUMN IF NOT EXISTS num_merge_ops INTEGER NOT NU
 ALTER TABLE metrics_splits ADD COLUMN IF NOT EXISTS row_keys BYTEA;
 ALTER TABLE metrics_splits ADD COLUMN IF NOT EXISTS zonemap_regexes JSONB NOT NULL DEFAULT '{}';
 
+-- Columns present on the `splits` table that were missing from `metrics_splits`.
+-- maturity_timestamp: compaction planner needs this to restrict candidates to
+-- Published-and-immature splits, matching the logic the log-side merge planner uses.
+ALTER TABLE metrics_splits ADD COLUMN IF NOT EXISTS maturity_timestamp TIMESTAMP DEFAULT TO_TIMESTAMP(0);
+-- delete_opstamp: tracks which delete tasks have been applied to a split.
+ALTER TABLE metrics_splits ADD COLUMN IF NOT EXISTS delete_opstamp BIGINT CHECK (delete_opstamp >= 0) DEFAULT 0;
+-- node_id: identifies which node produced the split.
+ALTER TABLE metrics_splits ADD COLUMN IF NOT EXISTS node_id VARCHAR(253);
+
+-- Auto-set publish_timestamp when a split transitions Staged → Published,
+-- matching the trigger on the `splits` table (migration 3).
+CREATE OR REPLACE FUNCTION set_publish_timestamp_for_metrics_split() RETURNS trigger AS $$
+BEGIN
+    IF (TG_OP = 'UPDATE') AND (NEW.split_state = 'Published') AND (OLD.split_state = 'Staged') THEN
+        NEW.publish_timestamp := (CURRENT_TIMESTAMP AT TIME ZONE 'UTC');
+    END IF;
+    RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+
+DROP TRIGGER IF EXISTS set_publish_timestamp_on_metrics_split_publish ON metrics_splits CASCADE;
+CREATE TRIGGER set_publish_timestamp_on_metrics_split_publish
+    BEFORE UPDATE ON metrics_splits
+    FOR EACH ROW
+    EXECUTE PROCEDURE set_publish_timestamp_for_metrics_split();
+
 -- Compaction scope index: supports the compaction planner's primary query pattern
 -- "give me all Published splits for a given (index_uid, sort_fields, window_start) triple."
 CREATE INDEX IF NOT EXISTS idx_metrics_splits_compaction_scope
diff --git a/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs b/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs
index 152dc0cd84f..96556232dbb 100644
--- a/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs
+++ b/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs
@@ -56,6 +56,19 @@ pub(crate) struct StoredMetricsSplit {
     pub state: MetricsSplitState,
     /// Update timestamp (Unix epoch seconds).
     pub update_timestamp: i64,
+    /// Create timestamp (Unix epoch seconds).
+    pub create_timestamp: i64,
+    /// Node that produced this split.
+    #[serde(default)]
+    pub node_id: String,
+    /// Delete opstamp.
+    #[serde(default)]
+    pub delete_opstamp: u64,
+    /// Maturity timestamp (Unix epoch seconds). Splits with
+    /// maturity_timestamp <= now are considered mature.
+    /// Defaults to 0 (epoch), meaning mature immediately.
+    #[serde(default)]
+    pub maturity_timestamp: i64,
 }
 
 /// A `FileBackedIndex` object carries an index metadata and its split metadata.
@@ -759,6 +772,10 @@ impl FileBackedIndex {
                 metadata,
                 state: MetricsSplitState::Staged,
                 update_timestamp: now,
+                create_timestamp: now,
+                node_id: String::new(),
+                delete_opstamp: 0,
+                maturity_timestamp: 0,
             };
             self.metrics_splits.insert(split_id, stored);
         }
@@ -907,21 +924,37 @@ fn metrics_split_matches_query(split: &StoredMetricsSplit, query: &ListMetricsSp
     // Filter by state
     if !query.split_states.is_empty() {
         let state_str = split.state.as_str();
-        if !query.split_states.iter().any(|s| s == state_str) {
+        if !query.split_states.iter().any(|s| s.as_str() == state_str) {
             return false;
         }
     }
 
-    // Filter by time range
-    if let Some(start) = query.time_range_start
-        && (split.metadata.time_range.end_secs as i64) < start
-    {
-        return false;
-    }
-    if let Some(end) = query.time_range_end
-        && (split.metadata.time_range.start_secs as i64) > end
-    {
-        return false;
+    // Filter by time range.
+    // When sort_fields is set this is a compaction query and time_range
+    // refers to the compaction window; otherwise it refers to the data
+    // time range. Both use intersection semantics via FilterRange.
+    if !query.time_range.is_unbounded() {
+        if query.sort_fields.is_some() {
+            // Compaction path: intersect against the split's window.
+            let split_start = split.metadata.window_start();
+            let split_duration = split.metadata.window_duration_secs() as i64;
+            match split_start {
+                Some(split_start) if split_duration > 0 => {
+                    let split_end = split_start + split_duration - 1;
+                    if !query.time_range.overlaps_with(split_start..=split_end) {
+                        return false;
+                    }
+                }
+                _ => return false,
+            }
+        } else {
+            // Read path: intersect against the split's data time range.
+            let data_range = split.metadata.time_range.start_secs as i64
+                ..=split.metadata.time_range.end_secs as i64;
+            if !query.time_range.overlaps_with(data_range) {
+                return false;
+            }
+        }
     }
 
     // Filter by metric names
@@ -979,18 +1012,44 @@ fn metrics_split_matches_query(split: &StoredMetricsSplit, query: &ListMetricsSp
         }
     }
 
-    // Filter by compaction scope
-    if let Some(ws) = query.window_start
-        && split.metadata.window_start() != Some(ws)
+    if let Some(ref sort_fields) = query.sort_fields
+        && split.metadata.sort_fields != *sort_fields
     {
         return false;
     }
-    if let Some(ref sf) = query.sort_fields
-        && split.metadata.sort_fields != *sf
+
+    if let Some(node_id) = &query.node_id
+        && split.node_id != *node_id
     {
         return false;
     }
 
+    if !query.delete_opstamp.contains(&split.delete_opstamp) {
+        return false;
+    }
+
+    if !query.update_timestamp.contains(&split.update_timestamp) {
+        return false;
+    }
+
+    if !query.create_timestamp.contains(&split.create_timestamp) {
+        return false;
+    }
+
+    match &query.mature {
+        Bound::Included(evaluation_datetime) => {
+            if split.maturity_timestamp > evaluation_datetime.unix_timestamp() {
+                return false;
+            }
+        }
+        Bound::Excluded(evaluation_datetime) => {
+            if split.maturity_timestamp <= evaluation_datetime.unix_timestamp() {
+                return false;
+            }
+        }
+        Bound::Unbounded => {}
+    }
+
     true
 }
 
diff --git a/quickwit/quickwit-metastore/src/metastore/mod.rs b/quickwit/quickwit-metastore/src/metastore/mod.rs
index b1bc686107a..85a537b83ee 100644
--- a/quickwit/quickwit-metastore/src/metastore/mod.rs
+++ b/quickwit/quickwit-metastore/src/metastore/mod.rs
@@ -51,17 +51,15 @@ use crate::checkpoint::IndexCheckpointDelta;
 use crate::{Split, SplitMetadata, SplitState};
 
 /// Query parameters for listing metrics splits.
-#[derive(Debug, Clone, Default, Serialize, Deserialize)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct ListMetricsSplitsQuery {
     /// Index UID to filter by (required).
     pub index_uid: IndexUid,
     /// Split states to include.
     #[serde(default)]
-    pub split_states: Vec<String>,
-    /// Time range start (inclusive).
-    pub time_range_start: Option<i64>,
-    /// Time range end (inclusive).
-    pub time_range_end: Option<i64>,
+    pub split_states: Vec<SplitState>,
+    /// The time range to filter by.
+    pub time_range: FilterRange<i64>,
     /// Metric names to filter by (any match).
     #[serde(default)]
     pub metric_names: Vec<String>,
@@ -75,34 +73,70 @@ pub struct ListMetricsSplitsQuery {
     pub tag_region: Option<String>,
     /// Host tag filter.
     pub tag_host: Option<String>,
-    /// Window start filter for compaction scope queries.
-    pub window_start: Option<i64>,
     /// Sort fields filter for compaction scope queries.
     pub sort_fields: Option<String>,
+    /// A specific node ID to filter by.
+    pub node_id: Option<NodeId>,
+    /// The delete opstamp range to filter by.
+    pub delete_opstamp: FilterRange<u64>,
+    /// The update timestamp range to filter by.
+    pub update_timestamp: FilterRange<i64>,
+    /// The create timestamp range to filter by.
+    pub create_timestamp: FilterRange<i64>,
+    /// The datetime at which you include or exclude mature splits.
+    pub mature: Bound<OffsetDateTime>,
     /// Limit number of results.
     pub limit: Option<usize>,
 }
 
+impl Default for ListMetricsSplitsQuery {
+    fn default() -> Self {
+        Self {
+            index_uid: IndexUid::default(),
+            split_states: Vec::new(),
+            time_range: Default::default(),
+            metric_names: Vec::new(),
+            tag_service: None,
+            tag_env: None,
+            tag_datacenter: None,
+            tag_region: None,
+            tag_host: None,
+            sort_fields: None,
+            node_id: None,
+            delete_opstamp: Default::default(),
+            update_timestamp: Default::default(),
+            create_timestamp: Default::default(),
+            mature: Bound::Unbounded,
+            limit: None,
+        }
+    }
+}
+
 impl ListMetricsSplitsQuery {
     /// Creates a query for all splits in an index.
     pub fn for_index(index_uid: impl Into<IndexUid>) -> Self {
         Self {
             index_uid: index_uid.into(),
-            split_states: vec!["Published".to_string()],
+            split_states: vec![SplitState::Published],
             ..Default::default()
         }
     }
 
     /// Filter by split states.
-    pub fn with_split_states(mut self, states: Vec<String>) -> Self {
-        self.split_states = states;
+    pub fn with_split_states(mut self, states: impl AsRef<[SplitState]>) -> Self {
+        self.split_states = states.as_ref().to_vec();
+        self
+    }
+
+    /// Filter by time range (inclusive on both ends).
+    pub fn with_time_range_start_gte(mut self, v: i64) -> Self {
+        self.time_range.start = Bound::Included(v);
         self
     }
 
-    /// Filter by time range.
-    pub fn with_time_range(mut self, start: i64, end: i64) -> Self {
-        self.time_range_start = Some(start);
-        self.time_range_end = Some(end);
+    /// Filter by time range (inclusive on both ends).
+    pub fn with_time_range_end_lte(mut self, v: i64) -> Self {
+        self.time_range.end = Bound::Included(v);
         self
     }
 
@@ -112,13 +146,17 @@ impl ListMetricsSplitsQuery {
         self
     }
 
-    /// Filter by compaction scope (window_start + sort_fields).
+    /// Filter by compaction scope: splits whose window intersects
+    /// `[window_start, window_start + window_duration_secs)` and whose
+    /// sort_fields match exactly.
     pub fn with_compaction_scope(
         mut self,
         window_start: i64,
+        window_duration_secs: u32,
         sort_fields: impl Into<String>,
     ) -> Self {
-        self.window_start = Some(window_start);
+        self.time_range.start = Bound::Included(window_start);
+        self.time_range.end = Bound::Excluded(window_start + window_duration_secs as i64);
         self.sort_fields = Some(sort_fields.into());
         self
     }
diff --git a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs
index 3bbbe3c2f27..4296f7f51a5 100644
--- a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs
+++ b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs
@@ -14,6 +14,7 @@
 
 use std::collections::HashMap;
 use std::fmt::{self, Write};
+use std::ops::Bound;
 use std::str::FromStr;
 use std::time::Duration;
 
@@ -1804,7 +1805,7 @@ impl MetastoreService for PostgresqlMetastore {
         let mut size_bytes_list = Vec::with_capacity(splits_metadata.len());
         let mut split_metadata_jsons = Vec::with_capacity(splits_metadata.len());
         let mut window_starts: Vec<Option<i64>> = Vec::with_capacity(splits_metadata.len());
-        let mut window_duration_secs_list: Vec<Option<i32>> =
+        let mut window_duration_secs_list: Vec<i32> =
             Vec::with_capacity(splits_metadata.len());
         let mut sort_fields_list: Vec<String> = Vec::with_capacity(splits_metadata.len());
         let mut num_merge_ops_list: Vec<i32> = Vec::with_capacity(splits_metadata.len());
@@ -2234,14 +2235,39 @@ impl MetastoreService for PostgresqlMetastore {
             param_idx += 1;
         }
 
-        // Add time range filter
-        if query.time_range_start.is_some() {
-            sql.push_str(&format!(" AND time_range_end >= ${}", param_idx));
-            param_idx += 1;
+        // Time range filter.
+        // When sort_fields is set this is a compaction query and time_range
+        // refers to the compaction window columns; otherwise it refers to the
+        // data time_range_start/time_range_end columns.
+        let is_compaction_query = query.sort_fields.is_some();
+        let (range_col_start, range_col_end) = if is_compaction_query {
+            ("window_start", "window_start + window_duration_secs")
+        } else {
+            ("time_range_start", "time_range_end")
+        };
+        // overlap: split_end > query_start AND split_start < query_end
+        // (using >= / <= for Included bounds, > / < for Excluded)
+        match &query.time_range.start {
+            Bound::Included(_) => {
+                sql.push_str(&format!(" AND {} >= ${}", range_col_end, param_idx));
+                param_idx += 1;
+            }
+            Bound::Excluded(_) => {
+                sql.push_str(&format!(" AND {} > ${}", range_col_end, param_idx));
+                param_idx += 1;
+            }
+            Bound::Unbounded => {}
         }
-        if query.time_range_end.is_some() {
-            sql.push_str(&format!(" AND time_range_start <= ${}", param_idx));
-            param_idx += 1;
+        match &query.time_range.end {
+            Bound::Included(_) => {
+                sql.push_str(&format!(" AND {} <= ${}", range_col_start, param_idx));
+                param_idx += 1;
+            }
+            Bound::Excluded(_) => {
+                sql.push_str(&format!(" AND {} < ${}", range_col_start, param_idx));
+                param_idx += 1;
+            }
+            Bound::Unbounded => {}
         }
 
         // Add metric names filter (ANY overlap)
@@ -2272,16 +2298,131 @@ impl MetastoreService for PostgresqlMetastore {
             param_idx += 1;
         }
 
-        // Compaction scope filters
-        if query.window_start.is_some() {
-            sql.push_str(&format!(" AND window_start = ${}", param_idx));
-            param_idx += 1;
-        }
         if query.sort_fields.is_some() {
             sql.push_str(&format!(" AND sort_fields = ${}", param_idx));
             param_idx += 1;
         }
 
+        if query.node_id.is_some() {
+            sql.push_str(&format!(" AND node_id = ${}", param_idx));
+            param_idx += 1;
+        }
+
+        // delete_opstamp range filter
+        match &query.delete_opstamp.start {
+            Bound::Included(_) => {
+                sql.push_str(&format!(" AND delete_opstamp >= ${}", param_idx));
+                param_idx += 1;
+            }
+            Bound::Excluded(_) => {
+                sql.push_str(&format!(" AND delete_opstamp > ${}", param_idx));
+                param_idx += 1;
+            }
+            Bound::Unbounded => {}
+        }
+        match &query.delete_opstamp.end {
+            Bound::Included(_) => {
+                sql.push_str(&format!(" AND delete_opstamp <= ${}", param_idx));
+                param_idx += 1;
+            }
+            Bound::Excluded(_) => {
+                sql.push_str(&format!(" AND delete_opstamp < ${}", param_idx));
+                param_idx += 1;
+            }
+            Bound::Unbounded => {}
+        }
+
+        // update_timestamp range filter
+        match &query.update_timestamp.start {
+            Bound::Included(_) => {
+                sql.push_str(&format!(
+                    " AND EXTRACT(EPOCH FROM update_timestamp)::bigint >= ${}",
+                    param_idx
+                ));
+                param_idx += 1;
+            }
+            Bound::Excluded(_) => {
+                sql.push_str(&format!(
+                    " AND EXTRACT(EPOCH FROM update_timestamp)::bigint > ${}",
+                    param_idx
+                ));
+                param_idx += 1;
+            }
+            Bound::Unbounded => {}
+        }
+        match &query.update_timestamp.end {
+            Bound::Included(_) => {
+                sql.push_str(&format!(
+                    " AND EXTRACT(EPOCH FROM update_timestamp)::bigint <= ${}",
+                    param_idx
+                ));
+                param_idx += 1;
+            }
+            Bound::Excluded(_) => {
+                sql.push_str(&format!(
+                    " AND EXTRACT(EPOCH FROM update_timestamp)::bigint < ${}",
+                    param_idx
+                ));
+                param_idx += 1;
+            }
+            Bound::Unbounded => {}
+        }
+
+        // create_timestamp range filter
+        match &query.create_timestamp.start {
+            Bound::Included(_) => {
+                sql.push_str(&format!(
+                    " AND EXTRACT(EPOCH FROM create_timestamp)::bigint >= ${}",
+                    param_idx
+                ));
+                param_idx += 1;
+            }
+            Bound::Excluded(_) => {
+                sql.push_str(&format!(
+                    " AND EXTRACT(EPOCH FROM create_timestamp)::bigint > ${}",
+                    param_idx
+                ));
+                param_idx += 1;
+            }
+            Bound::Unbounded => {}
+        }
+        match &query.create_timestamp.end {
+            Bound::Included(_) => {
+                sql.push_str(&format!(
+                    " AND EXTRACT(EPOCH FROM create_timestamp)::bigint <= ${}",
+                    param_idx
+                ));
+                param_idx += 1;
+            }
+            Bound::Excluded(_) => {
+                sql.push_str(&format!(
+                    " AND EXTRACT(EPOCH FROM create_timestamp)::bigint < ${}",
+                    param_idx
+                ));
+                param_idx += 1;
+            }
+            Bound::Unbounded => {}
+        }
+
+        // maturity filter
+        match &query.mature {
+            Bound::Included(_) => {
+                sql.push_str(&format!(
+                    " AND maturity_timestamp <= TO_TIMESTAMP(${})",
+                    param_idx
+                ));
+                param_idx += 1;
+            }
+            Bound::Excluded(_) => {
+                sql.push_str(&format!(
+                    " AND maturity_timestamp > TO_TIMESTAMP(${})",
+                    param_idx
+                ));
+                param_idx += 1;
+            }
+            Bound::Unbounded => {}
+        }
+
         sql.push_str(" ORDER BY time_range_start ASC");
 
         // Add limit
@@ -2295,13 +2436,24 @@ impl MetastoreService for PostgresqlMetastore {
         query_builder = query_builder.bind(query.index_uid.to_string());
 
         if !query.split_states.is_empty() {
-            query_builder = query_builder.bind(&query.split_states);
+            let state_strings: Vec<String> = query
+                .split_states
+                .iter()
+                .map(|s| s.as_str().to_string())
+                .collect();
+            query_builder = query_builder.bind(state_strings);
         }
-        if let Some(start) = query.time_range_start {
-            query_builder = query_builder.bind(start);
+        match &query.time_range.start {
+            Bound::Included(v) | Bound::Excluded(v) => {
+                query_builder = query_builder.bind(*v);
+            }
+            Bound::Unbounded => {}
         }
-        if let Some(end) = query.time_range_end {
-            query_builder = query_builder.bind(end);
+        match &query.time_range.end {
+            Bound::Included(v) | Bound::Excluded(v) => {
+                query_builder = query_builder.bind(*v);
+            }
+            Bound::Unbounded => {}
         }
         if !query.metric_names.is_empty() {
             query_builder = query_builder.bind(&query.metric_names);
@@ -2321,11 +2473,54 @@ impl MetastoreService for PostgresqlMetastore {
         if let Some(ref host) = query.tag_host {
             query_builder = query_builder.bind(host);
         }
-        if let Some(ws) = query.window_start {
-            query_builder = query_builder.bind(ws);
+        if let Some(ref sort_fields) = query.sort_fields {
+            query_builder = query_builder.bind(sort_fields);
+        }
+        if let Some(ref node_id) = query.node_id {
+            query_builder = query_builder.bind(node_id.as_str());
         }
-        if let Some(ref sf) = query.sort_fields {
-            query_builder = query_builder.bind(sf);
+        match &query.delete_opstamp.start {
+            Bound::Included(v) | Bound::Excluded(v) => {
+                query_builder = query_builder.bind(*v as i64);
+            }
+            Bound::Unbounded => {}
+        }
+        match &query.delete_opstamp.end {
+            Bound::Included(v) | Bound::Excluded(v) => {
+                query_builder = query_builder.bind(*v as i64);
+            }
+            Bound::Unbounded => {}
+        }
+        match &query.update_timestamp.start {
+            Bound::Included(v) | Bound::Excluded(v) => {
+                query_builder = query_builder.bind(*v);
+            }
+            Bound::Unbounded => {}
+        }
+        match &query.update_timestamp.end {
+            Bound::Included(v) | Bound::Excluded(v) => {
+                query_builder = query_builder.bind(*v);
+            }
+            Bound::Unbounded => {}
+        }
+        match &query.create_timestamp.start {
+            Bound::Included(v) | Bound::Excluded(v) => {
+                query_builder = query_builder.bind(*v);
+            }
+            Bound::Unbounded => {}
+        }
+        match &query.create_timestamp.end {
+            Bound::Included(v) | Bound::Excluded(v) => {
+                query_builder = query_builder.bind(*v);
+            }
+            Bound::Unbounded => {}
+        }
+        match &query.mature {
+            Bound::Included(evaluation_datetime) | Bound::Excluded(evaluation_datetime) => {
+                query_builder =
+                    query_builder.bind(evaluation_datetime.unix_timestamp() as f64);
+            }
+            Bound::Unbounded => {}
         }
         if let Some(limit) = query.limit {
             query_builder = query_builder.bind(limit as i64);
diff --git a/quickwit/quickwit-metastore/src/tests/metrics.rs b/quickwit/quickwit-metastore/src/tests/metrics.rs
index 60d3246a2a4..f85ffe9588e 100644
--- a/quickwit/quickwit-metastore/src/tests/metrics.rs
+++ b/quickwit/quickwit-metastore/src/tests/metrics.rs
@@ -26,7 +26,7 @@ use super::DefaultForTest;
 use crate::tests::cleanup_index;
 use crate::{
     CreateIndexRequestExt, ListMetricsSplitsQuery, ListMetricsSplitsRequestExt,
-    ListMetricsSplitsResponseExt, MetastoreServiceExt, StageMetricsSplitsRequestExt,
+    ListMetricsSplitsResponseExt, MetastoreServiceExt, SplitState, StageMetricsSplitsRequestExt,
 };
 
 /// Helper to create a test index and return the actual IndexUid assigned by the metastore.
@@ -93,7 +93,7 @@ pub async fn test_metastore_stage_metrics_splits<
 
     // Verify both splits are listed in Staged state.
     let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
-        .with_split_states(vec!["Staged".to_string()]);
+        .with_split_states([SplitState::Staged]);
     let list_request = ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
     let response = metastore.list_metrics_splits(list_request).await.unwrap();
     let splits = response.deserialize_splits().unwrap();
@@ -149,7 +149,7 @@ pub async fn test_metastore_stage_metrics_splits_upsert<
 
     // Verify only one split exists and it has the updated num_rows.
     let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
-        .with_split_states(vec!["Staged".to_string()]);
+        .with_split_states([SplitState::Staged]);
     let list_request = ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
     let response = metastore.list_metrics_splits(list_request).await.unwrap();
     let splits = response.deserialize_splits().unwrap();
@@ -191,7 +191,7 @@ pub async fn test_metastore_list_metrics_splits_by_state<
     // List only Published splits.
     {
         let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
-            .with_split_states(vec!["Published".to_string()]);
+            .with_split_states([SplitState::Published]);
         let list_request =
             ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
         let response = metastore.list_metrics_splits(list_request).await.unwrap();
@@ -204,7 +204,7 @@ pub async fn test_metastore_list_metrics_splits_by_state<
     // List only Staged splits.
     {
         let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
-            .with_split_states(vec!["Staged".to_string()]);
+            .with_split_states([SplitState::Staged]);
         let list_request =
             ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
         let response = metastore.list_metrics_splits(list_request).await.unwrap();
@@ -217,7 +217,7 @@ pub async fn test_metastore_list_metrics_splits_by_state<
     // List both states.
     {
         let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
-            .with_split_states(vec!["Published".to_string(), "Staged".to_string()]);
+            .with_split_states([SplitState::Published, SplitState::Staged]);
         let list_request =
             ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
         let response = metastore.list_metrics_splits(list_request).await.unwrap();
@@ -260,8 +260,9 @@ pub async fn test_metastore_list_metrics_splits_by_time_range<
 
     // Query for time range that overlaps only the first two splits.
     let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
-        .with_split_states(vec!["Staged".to_string()])
-        .with_time_range(1500, 3500);
+        .with_split_states([SplitState::Staged])
+        .with_time_range_start_gte(1500)
+        .with_time_range_end_lte(3500);
     let list_request = ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
     let response = metastore.list_metrics_splits(list_request).await.unwrap();
     let splits = response.deserialize_splits().unwrap();
@@ -322,7 +323,7 @@ pub async fn test_metastore_list_metrics_splits_by_metric_name<
 
     // Query for "cpu.usage" should return split_1 and split_3.
     let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
-        .with_split_states(vec!["Staged".to_string()])
+        .with_split_states([SplitState::Staged])
         .with_metric_names(vec!["cpu.usage".to_string()]);
     let list_request = ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
     let response = metastore.list_metrics_splits(list_request).await.unwrap();
@@ -388,8 +389,8 @@ pub async fn test_metastore_list_metrics_splits_by_compaction_scope<
 
     // Query by compaction scope: window_start=1700000000, sort_fields matching split_1.
     let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
-        .with_split_states(vec!["Staged".to_string()])
-        .with_compaction_scope(1700000000, "metric_name|host|timestamp/V2");
+        .with_split_states([SplitState::Staged])
+        .with_compaction_scope(1700000000, 3600, "metric_name|host|timestamp/V2");
     let list_request = ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
     let response = metastore.list_metrics_splits(list_request).await.unwrap();
     let splits = response.deserialize_splits().unwrap();
@@ -434,7 +435,7 @@ pub async fn test_metastore_publish_metrics_splits<
 
     // Verify they are now Published.
     let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
-        .with_split_states(vec!["Published".to_string()]);
+        .with_split_states([SplitState::Published]);
     let list_request = ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
     let response = metastore.list_metrics_splits(list_request).await.unwrap();
     let splits = response.deserialize_splits().unwrap();
@@ -446,7 +447,7 @@ pub async fn test_metastore_publish_metrics_splits<
 
     // Verify no Staged splits remain.
     let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
-        .with_split_states(vec!["Staged".to_string()]);
+        .with_split_states([SplitState::Staged]);
     let list_request = ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
     let response = metastore.list_metrics_splits(list_request).await.unwrap();
     let splits = response.deserialize_splits().unwrap();
@@ -520,7 +521,7 @@ pub async fn test_metastore_mark_metrics_splits_for_deletion<
 
     // Verify split_1 is MarkedForDeletion.
     let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
-        .with_split_states(vec!["MarkedForDeletion".to_string()]);
+        .with_split_states([SplitState::MarkedForDeletion]);
     let list_request = ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
     let response = metastore.list_metrics_splits(list_request).await.unwrap();
     let splits = response.deserialize_splits().unwrap();
@@ -529,7 +530,7 @@ pub async fn test_metastore_mark_metrics_splits_for_deletion<
 
     // Verify split_2 is still Published.
     let query = ListMetricsSplitsQuery::for_index(index_uid.clone())
-        .with_split_states(vec!["Published".to_string()]);
+        .with_split_states([SplitState::Published]);
     let list_request = ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
     let response = metastore.list_metrics_splits(list_request).await.unwrap();
     let splits = response.deserialize_splits().unwrap();
@@ -584,10 +585,10 @@ pub async fn test_metastore_delete_metrics_splits<
         .unwrap();
 
     // Verify it is gone (list all states).
-    let query = ListMetricsSplitsQuery::for_index(index_uid.clone()).with_split_states(vec![
-        "Staged".to_string(),
-        "Published".to_string(),
-        "MarkedForDeletion".to_string(),
+    let query = ListMetricsSplitsQuery::for_index(index_uid.clone()).with_split_states([
+        SplitState::Staged,
+        SplitState::Published,
+        SplitState::MarkedForDeletion,
     ]);
     let list_request = ListMetricsSplitsRequest::try_from_query(index_uid.clone(), &query).unwrap();
     let response = metastore.list_metrics_splits(list_request).await.unwrap();
diff --git a/quickwit/quickwit-metastore/src/tests/mod.rs b/quickwit/quickwit-metastore/src/tests/mod.rs
index 7c068d57113..fd2194df598 100644
--- a/quickwit/quickwit-metastore/src/tests/mod.rs
+++ b/quickwit/quickwit-metastore/src/tests/mod.rs
@@ -38,7 +38,7 @@ pub(crate) mod split;
 pub(crate) mod template;
 
 use crate::metastore::MetastoreServiceStreamSplitsExt;
-use crate::{ListSplitsRequestExt, MetastoreServiceExt, Split};
+use crate::{ListSplitsRequestExt, MetastoreServiceExt, Split, SplitState};
 
 const MAX_GRPC_MESSAGE_SIZE: ByteSize = ByteSize::mib(1);
 
@@ -158,10 +158,10 @@ async fn cleanup_index(metastore: &mut dyn MetastoreServiceExt, index_uid: Index
     }
     // Also clean up any metrics splits (they have a separate FK constraint).
     let metrics_query = crate::metastore::ListMetricsSplitsQuery::for_index(index_uid.clone())
-        .with_split_states(vec![
-            "Staged".to_string(),
-            "Published".to_string(),
-            "MarkedForDeletion".to_string(),
+        .with_split_states([
+            SplitState::Staged,
+            SplitState::Published,
+            SplitState::MarkedForDeletion,
         ]);
     if let Ok(list_request) = quickwit_proto::metastore::ListMetricsSplitsRequest::try_from_query(
         index_uid.clone(),
diff --git a/quickwit/quickwit-parquet-engine/src/split/postgres.rs b/quickwit/quickwit-parquet-engine/src/split/postgres.rs
index 6b63f46f17f..0d1bbff184a 100644
--- a/quickwit/quickwit-parquet-engine/src/split/postgres.rs
+++ b/quickwit/quickwit-parquet-engine/src/split/postgres.rs
@@ -99,7 +99,7 @@ pub struct InsertableMetricsSplit {
     pub size_bytes: i64,
     pub split_metadata_json: String,
     pub window_start: Option<i64>,
-    pub window_duration_secs: Option<i32>,
+    pub window_duration_secs: i32,
     pub sort_fields: String,
     pub num_merge_ops: i32,
     pub row_keys: Option<Vec<u8>>,
@@ -131,10 +131,7 @@ impl InsertableMetricsSplit {
             size_bytes: metadata.size_bytes as i64,
             split_metadata_json,
             window_start: metadata.window_start(),
-            window_duration_secs: {
-                let dur = metadata.window_duration_secs();
-                if dur > 0 { Some(dur as i32) } else { None }
-            },
+            window_duration_secs: metadata.window_duration_secs() as i32,
             sort_fields: metadata.sort_fields.clone(),
             num_merge_ops: metadata.num_merge_ops as i32,
             row_keys: metadata.row_keys_proto.clone(),
@@ -276,7 +273,7 @@ mod tests {
                 .expect("conversion should succeed");
 
         assert_eq!(insertable.window_start, Some(1700000000));
-        assert_eq!(insertable.window_duration_secs, Some(3600));
+        assert_eq!(insertable.window_duration_secs, 3600);
         assert_eq!(insertable.sort_fields, "metric_name|host|timestamp/V2");
         assert_eq!(insertable.num_merge_ops, 2);
         assert_eq!(insertable.row_keys, Some(vec![0x08, 0x01]));
@@ -300,9 +297,9 @@ mod tests {
                 .expect("conversion should succeed");
 
         assert!(insertable.window_start.is_none());
-        assert!(
-            insertable.window_duration_secs.is_none(),
-            "pre-Phase-31 splits should have NULL window_duration_secs"
+        assert_eq!(
+            insertable.window_duration_secs, 0,
+            "pre-Phase-31 splits should have 0 window_duration_secs"
         );
         assert_eq!(insertable.sort_fields, "");
         assert_eq!(insertable.num_merge_ops, 0);
@@ -344,7 +341,7 @@ mod tests {
             split_metadata_json: insertable.split_metadata_json,
             update_timestamp: 1704067200,
             window_start: insertable.window_start,
-            window_duration_secs: insertable.window_duration_secs,
+            window_duration_secs: Some(insertable.window_duration_secs),
             sort_fields: insertable.sort_fields,
             num_merge_ops: insertable.num_merge_ops,
             row_keys: insertable.row_keys,

From f7905198f7a284a970dbe9d5681381222e09c9ae Mon Sep 17 00:00:00 2001
From: George Talbot <george.talbot@datadoghq.com>
Date: Mon, 6 Apr 2026 11:13:49 -0400
Subject: [PATCH 23/35] style: fix rustfmt nightly formatting

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../quickwit-metastore/src/metastore/postgres/metastore.rs  | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs
index 4296f7f51a5..22a3f60dde9 100644
--- a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs
+++ b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs
@@ -1805,8 +1805,7 @@ impl MetastoreService for PostgresqlMetastore {
         let mut size_bytes_list = Vec::with_capacity(splits_metadata.len());
         let mut split_metadata_jsons = Vec::with_capacity(splits_metadata.len());
         let mut window_starts: Vec<Option<i64>> = Vec::with_capacity(splits_metadata.len());
-        let mut window_duration_secs_list: Vec<i32> =
-            Vec::with_capacity(splits_metadata.len());
+        let mut window_duration_secs_list: Vec<i32> = Vec::with_capacity(splits_metadata.len());
         let mut sort_fields_list: Vec<String> = Vec::with_capacity(splits_metadata.len());
         let mut num_merge_ops_list: Vec<i32> = Vec::with_capacity(splits_metadata.len());
         let mut row_keys_list: Vec<Option<Vec<u8>>> = Vec::with_capacity(splits_metadata.len());
@@ -2517,8 +2516,7 @@ impl MetastoreService for PostgresqlMetastore {
         }
         match &query.mature {
             Bound::Included(evaluation_datetime) | Bound::Excluded(evaluation_datetime) => {
-                query_builder =
-                    query_builder.bind(evaluation_datetime.unix_timestamp() as f64);
+                query_builder = query_builder.bind(evaluation_datetime.unix_timestamp() as f64);
             }
             Bound::Unbounded => {}
         }

From c55df1a35b4271e81e848bf972484fdc9ed55185 Mon Sep 17 00:00:00 2001
From: George Talbot <george.talbot@datadoghq.com>
Date: Thu, 12 Mar 2026 16:18:34 -0400
Subject: [PATCH 24/35] feat(31): add shared invariants module to quickwit-dst
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extract duplicated invariant logic into a shared `invariants/` module
within `quickwit-dst`. This is the "single source of truth" layer in
the verification pyramid — used by stateright models, production
debug_assert checks, and (future) Datadog metrics emission.

Key changes:
- `invariants/registry.rs`: InvariantId enum (20 variants) with Display
- `invariants/window.rs`: shared window_start_secs(), is_valid_window_duration()
- `invariants/sort.rs`: generic compare_with_null_ordering() for SS-2
- `invariants/check.rs`: check_invariant! macro wrapping debug_assert
- stateright gated behind `model-checking` feature (optional dep)
- quickwit-parquet-engine uses shared functions and check_invariant!

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 quickwit/Cargo.toml                           |   3 +
 quickwit/quickwit-dst/Cargo.toml              |  16 +
 quickwit/quickwit-dst/src/invariants/check.rs |  45 ++
 quickwit/quickwit-dst/src/invariants/mod.rs   |  33 +
 .../quickwit-dst/src/invariants/registry.rs   | 180 +++++
 quickwit/quickwit-dst/src/invariants/sort.rs  | 109 +++
 .../quickwit-dst/src/invariants/window.rs     |  87 +++
 quickwit/quickwit-dst/src/lib.rs              |  47 ++
 quickwit/quickwit-dst/src/models/mod.rs       |  24 +
 .../src/models/parquet_data_model.rs          | 416 +++++++++++
 .../quickwit-dst/src/models/sort_schema.rs    | 668 ++++++++++++++++++
 .../src/models/time_windowed_compaction.rs    | 635 +++++++++++++++++
 .../quickwit-dst/tests/stateright_models.rs   |  77 ++
 quickwit/quickwit-parquet-engine/Cargo.toml   |   1 +
 .../src/sort_fields/window.rs                 |  19 +-
 .../src/split/metadata.rs                     |  14 +-
 .../src/split/postgres.rs                     |  32 +-
 .../src/storage/writer.rs                     |  33 +-
 18 files changed, 2402 insertions(+), 37 deletions(-)
 create mode 100644 quickwit/quickwit-dst/Cargo.toml
 create mode 100644 quickwit/quickwit-dst/src/invariants/check.rs
 create mode 100644 quickwit/quickwit-dst/src/invariants/mod.rs
 create mode 100644 quickwit/quickwit-dst/src/invariants/registry.rs
 create mode 100644 quickwit/quickwit-dst/src/invariants/sort.rs
 create mode 100644 quickwit/quickwit-dst/src/invariants/window.rs
 create mode 100644 quickwit/quickwit-dst/src/lib.rs
 create mode 100644 quickwit/quickwit-dst/src/models/mod.rs
 create mode 100644 quickwit/quickwit-dst/src/models/parquet_data_model.rs
 create mode 100644 quickwit/quickwit-dst/src/models/sort_schema.rs
 create mode 100644 quickwit/quickwit-dst/src/models/time_windowed_compaction.rs
 create mode 100644 quickwit/quickwit-dst/tests/stateright_models.rs

diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml
index d252652b27f..cfd732a61f7 100644
--- a/quickwit/Cargo.toml
+++ b/quickwit/Cargo.toml
@@ -13,6 +13,7 @@ members = [
   "quickwit-datetime",
   "quickwit-directories",
   "quickwit-doc-mapper",
+  "quickwit-dst",
   "quickwit-index-management",
   "quickwit-indexing",
   "quickwit-ingest",
@@ -233,6 +234,7 @@ serde_yaml = "0.9"
 serial_test = { version = "3.2", features = ["file_locks"] }
 sha2 = "0.10"
 siphasher = "1.0"
+stateright = "0.30"
 smallvec = "1"
 sqlx = { version = "0.8", features = [
   "migrate",
@@ -352,6 +354,7 @@ quickwit-control-plane = { path = "quickwit-control-plane" }
 quickwit-datetime = { path = "quickwit-datetime" }
 quickwit-directories = { path = "quickwit-directories" }
 quickwit-doc-mapper = { path = "quickwit-doc-mapper" }
+quickwit-dst = { path = "quickwit-dst" }
 quickwit-index-management = { path = "quickwit-index-management" }
 quickwit-indexing = { path = "quickwit-indexing" }
 quickwit-ingest = { path = "quickwit-ingest" }
diff --git a/quickwit/quickwit-dst/Cargo.toml b/quickwit/quickwit-dst/Cargo.toml
new file mode 100644
index 00000000000..f9a8abae202
--- /dev/null
+++ b/quickwit/quickwit-dst/Cargo.toml
@@ -0,0 +1,16 @@
+[package]
+name = "quickwit-dst"
+version.workspace = true
+edition.workspace = true
+homepage.workspace = true
+documentation.workspace = true
+repository.workspace = true
+authors.workspace = true
+license.workspace = true
+description = "Deterministic simulation testing and stateright model checking for Quickhouse-Pomsky"
+
+[features]
+model-checking = ["stateright"]
+
+[dependencies]
+stateright = { workspace = true, optional = true }
diff --git a/quickwit/quickwit-dst/src/invariants/check.rs b/quickwit/quickwit-dst/src/invariants/check.rs
new file mode 100644
index 00000000000..06c59ee9a3c
--- /dev/null
+++ b/quickwit/quickwit-dst/src/invariants/check.rs
@@ -0,0 +1,45 @@
+// Copyright (C) 2024 Quickwit, Inc.
+//
+// Quickwit is offered under the AGPL v3.0 and as commercial software.
+// For commercial licensing, contact us at hello@quickwit.io.
+//
+// AGPL:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+//! Invariant checking macro.
+//!
+//! Wraps `debug_assert!` with the invariant ID, providing a single hook point
+//! for future Datadog metrics emission (Layer 4 of the verification stack).
+
+/// Check an invariant condition. In debug builds, panics on violation.
+/// In release builds, currently a no-op (future: emit Datadog metric).
+///
+/// # Examples
+///
+/// ```
+/// use quickwit_dst::check_invariant;
+/// use quickwit_dst::invariants::InvariantId;
+///
+/// let duration_secs = 900u32;
+/// check_invariant!(InvariantId::TW2, 3600 % duration_secs == 0, ": duration={}", duration_secs);
+/// ```
+#[macro_export]
+macro_rules! check_invariant {
+    ($id:expr, $cond:expr) => {
+        debug_assert!($cond, "{} violated", $id);
+    };
+    ($id:expr, $cond:expr, $fmt:literal $($arg:tt)*) => {
+        debug_assert!($cond, concat!("{} violated", $fmt), $id $($arg)*);
+    };
+}
diff --git a/quickwit/quickwit-dst/src/invariants/mod.rs b/quickwit/quickwit-dst/src/invariants/mod.rs
new file mode 100644
index 00000000000..b33ed7849c9
--- /dev/null
+++ b/quickwit/quickwit-dst/src/invariants/mod.rs
@@ -0,0 +1,33 @@
+// Copyright (C) 2024 Quickwit, Inc.
+//
+// Quickwit is offered under the AGPL v3.0 and as commercial software.
+// For commercial licensing, contact us at hello@quickwit.io.
+//
+// AGPL:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+//! Shared invariant definitions — the single source of truth.
+//!
+//! This module contains pure-Rust functions and types that express the
+//! invariants verified across all layers of the verification pyramid:
+//! TLA+ specs, stateright models, DST tests, and production code.
+//!
+//! No external dependencies — only `std`.
+
+mod check;
+pub mod registry;
+pub mod sort;
+pub mod window;
+
+pub use registry::InvariantId;
diff --git a/quickwit/quickwit-dst/src/invariants/registry.rs b/quickwit/quickwit-dst/src/invariants/registry.rs
new file mode 100644
index 00000000000..9704f0da891
--- /dev/null
+++ b/quickwit/quickwit-dst/src/invariants/registry.rs
@@ -0,0 +1,180 @@
+// Copyright (C) 2024 Quickwit, Inc.
+//
+// Quickwit is offered under the AGPL v3.0 and as commercial software.
+// For commercial licensing, contact us at hello@quickwit.io.
+//
+// AGPL:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+//! Invariant ID catalog — single source of truth for all invariant definitions.
+//!
+//! Each variant corresponds to a named invariant verified across the TLA+ specs,
+//! stateright models, and production code. See `docs/internals/specs/tla/` for
+//! the formal definitions.
+
+use std::fmt;
+
+/// Unique identifier for each verified invariant.
+///
+/// The naming convention is `<SPEC_PREFIX><NUMBER>`:
+/// - SS: SortSchema.tla (ADR-002)
+/// - TW: TimeWindowedCompaction.tla time-window invariants (ADR-003)
+/// - CS: TimeWindowedCompaction.tla compaction-scope invariants (ADR-003)
+/// - MC: TimeWindowedCompaction.tla merge-correctness invariants (ADR-003)
+/// - DM: ParquetDataModel.tla (ADR-001)
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+pub enum InvariantId {
+    /// SS-1: all rows within a split are sorted according to the split's schema
+    SS1,
+    /// SS-2: null values sort correctly per direction (nulls last asc, first desc)
+    SS2,
+    /// SS-3: missing sort columns are treated as NULL
+    SS3,
+    /// SS-4: a split's sort schema never changes after write
+    SS4,
+    /// SS-5: three copies of sort schema are identical per split
+    SS5,
+
+    /// TW-1: every split belongs to exactly one time window
+    TW1,
+    /// TW-2: window_duration evenly divides one hour (3600 seconds)
+    TW2,
+    /// TW-3: data is never merged across window boundaries
+    TW3,
+
+    /// CS-1: only splits sharing all six scope components may be merged
+    CS1,
+    /// CS-2: within a scope, only same window_start splits merge
+    CS2,
+    /// CS-3: splits before compaction_start_time are never compacted
+    CS3,
+
+    /// MC-1: row multiset preserved through compaction (no add/remove/duplicate)
+    MC1,
+    /// MC-2: row contents unchanged through compaction
+    MC2,
+    /// MC-3: output is sorted according to sort schema
+    MC3,
+    /// MC-4: column set is the union of input column sets
+    MC4,
+
+    /// DM-1: each row has all required fields populated
+    DM1,
+    /// DM-2: no last-write-wins; duplicate ingests both survive
+    DM2,
+    /// DM-3: storage only contains ingested points (no interpolation)
+    DM3,
+    /// DM-4: same tags produce same timeseries_id (deterministic TSID)
+    DM4,
+    /// DM-5: timeseries_id persists through compaction without recomputation
+    DM5,
+}
+
+impl InvariantId {
+    /// Human-readable description of this invariant.
+    pub fn description(self) -> &'static str {
+        match self {
+            Self::SS1 => "rows sorted by split schema",
+            Self::SS2 => "null ordering correct per direction",
+            Self::SS3 => "missing sort columns treated as NULL",
+            Self::SS4 => "sort schema immutable after write",
+            Self::SS5 => "three copies of sort schema identical",
+
+            Self::TW1 => "one window per split",
+            Self::TW2 => "window_duration divides 3600",
+            Self::TW3 => "no cross-window merge",
+
+            Self::CS1 => "scope compatibility for merge",
+            Self::CS2 => "same window_start for merge",
+            Self::CS3 => "compaction start time respected",
+
+            Self::MC1 => "row set preserved through compaction",
+            Self::MC2 => "row contents unchanged through compaction",
+            Self::MC3 => "sort order preserved after compaction",
+            Self::MC4 => "column union after compaction",
+
+            Self::DM1 => "point per row — all fields populated",
+            Self::DM2 => "no last-write-wins",
+            Self::DM3 => "no interpolation — only ingested points",
+            Self::DM4 => "deterministic TSID from tags",
+            Self::DM5 => "TSID persists through compaction",
+        }
+    }
+}
+
+impl fmt::Display for InvariantId {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let s = match self {
+            Self::SS1 => "SS-1",
+            Self::SS2 => "SS-2",
+            Self::SS3 => "SS-3",
+            Self::SS4 => "SS-4",
+            Self::SS5 => "SS-5",
+
+            Self::TW1 => "TW-1",
+            Self::TW2 => "TW-2",
+            Self::TW3 => "TW-3",
+
+            Self::CS1 => "CS-1",
+            Self::CS2 => "CS-2",
+            Self::CS3 => "CS-3",
+
+            Self::MC1 => "MC-1",
+            Self::MC2 => "MC-2",
+            Self::MC3 => "MC-3",
+            Self::MC4 => "MC-4",
+
+            Self::DM1 => "DM-1",
+            Self::DM2 => "DM-2",
+            Self::DM3 => "DM-3",
+            Self::DM4 => "DM-4",
+            Self::DM5 => "DM-5",
+        };
+        f.write_str(s)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn display_format() {
+        assert_eq!(InvariantId::SS1.to_string(), "SS-1");
+        assert_eq!(InvariantId::TW2.to_string(), "TW-2");
+        assert_eq!(InvariantId::CS3.to_string(), "CS-3");
+        assert_eq!(InvariantId::MC4.to_string(), "MC-4");
+        assert_eq!(InvariantId::DM5.to_string(), "DM-5");
+    }
+
+    #[test]
+    fn descriptions_non_empty() {
+        let all = [
+            InvariantId::SS1, InvariantId::SS2, InvariantId::SS3,
+            InvariantId::SS4, InvariantId::SS5,
+            InvariantId::TW1, InvariantId::TW2, InvariantId::TW3,
+            InvariantId::CS1, InvariantId::CS2, InvariantId::CS3,
+            InvariantId::MC1, InvariantId::MC2, InvariantId::MC3, InvariantId::MC4,
+            InvariantId::DM1, InvariantId::DM2, InvariantId::DM3,
+            InvariantId::DM4, InvariantId::DM5,
+        ];
+        for id in all {
+            assert!(
+                !id.description().is_empty(),
+                "{} has empty description",
+                id
+            );
+        }
+    }
+}
diff --git a/quickwit/quickwit-dst/src/invariants/sort.rs b/quickwit/quickwit-dst/src/invariants/sort.rs
new file mode 100644
index 00000000000..68a4ad05cb7
--- /dev/null
+++ b/quickwit/quickwit-dst/src/invariants/sort.rs
@@ -0,0 +1,109 @@
+// Copyright (C) 2024 Quickwit, Inc.
+//
+// Quickwit is offered under the AGPL v3.0 and as commercial software.
+// For commercial licensing, contact us at hello@quickwit.io.
+//
+// AGPL:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+//! Shared null-aware comparison for SS-2 (null ordering invariant).
+//!
+//! This is the single source of truth for how nulls sort relative to non-null
+//! values. Used by both the stateright sort_schema model and production code.
+
+use std::cmp::Ordering;
+
+/// Compare two optional values with null ordering per SS-2.
+///
+/// - Ascending:  nulls sort AFTER non-null (nulls last).
+/// - Descending: nulls sort BEFORE non-null (nulls first).
+///
+/// For two non-null values, the natural ordering is used (reversed for
+/// descending). Two nulls compare as equal.
+pub fn compare_with_null_ordering<T: Ord>(
+    a: Option<&T>,
+    b: Option<&T>,
+    ascending: bool,
+) -> Ordering {
+    match (a, b) {
+        (None, None) => Ordering::Equal,
+        (None, Some(_)) => {
+            if ascending {
+                Ordering::Greater // null after non-null
+            } else {
+                Ordering::Less // null before non-null
+            }
+        }
+        (Some(_), None) => {
+            if ascending {
+                Ordering::Less // non-null before null
+            } else {
+                Ordering::Greater // non-null after null
+            }
+        }
+        (Some(va), Some(vb)) => {
+            if ascending {
+                va.cmp(vb)
+            } else {
+                vb.cmp(va)
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn ascending_null_ordering() {
+        // null > non-null in ascending
+        assert_eq!(
+            compare_with_null_ordering(None::<&i32>, Some(&1), true),
+            Ordering::Greater
+        );
+        assert_eq!(
+            compare_with_null_ordering(Some(&1), None::<&i32>, true),
+            Ordering::Less
+        );
+        // null == null
+        assert_eq!(
+            compare_with_null_ordering(None::<&i32>, None::<&i32>, true),
+            Ordering::Equal
+        );
+        // non-null comparison
+        assert_eq!(
+            compare_with_null_ordering(Some(&1), Some(&2), true),
+            Ordering::Less
+        );
+    }
+
+    #[test]
+    fn descending_null_ordering() {
+        // null < non-null in descending
+        assert_eq!(
+            compare_with_null_ordering(None::<&i32>, Some(&1), false),
+            Ordering::Less
+        );
+        assert_eq!(
+            compare_with_null_ordering(Some(&1), None::<&i32>, false),
+            Ordering::Greater
+        );
+        // non-null comparison reversed
+        assert_eq!(
+            compare_with_null_ordering(Some(&1), Some(&2), false),
+            Ordering::Greater
+        );
+    }
+}
diff --git a/quickwit/quickwit-dst/src/invariants/window.rs b/quickwit/quickwit-dst/src/invariants/window.rs
new file mode 100644
index 00000000000..31276c1da6b
--- /dev/null
+++ b/quickwit/quickwit-dst/src/invariants/window.rs
@@ -0,0 +1,87 @@
+// Copyright (C) 2024 Quickwit, Inc.
+//
+// Quickwit is offered under the AGPL v3.0 and as commercial software.
+// For commercial licensing, contact us at hello@quickwit.io.
+//
+// AGPL:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+//! Shared window math for time-windowed compaction.
+//!
+//! These pure functions are the single source of truth for window arithmetic,
+//! used by both stateright models and production code.
+
+/// Compute window_start for a timestamp.
+///
+/// Uses `rem_euclid` for correct handling of negative timestamps (before Unix
+/// epoch). Standard `%` truncates toward zero: `-1 % 900 = -1` (wrong).
+/// `rem_euclid` always returns non-negative: `(-1i64).rem_euclid(900) = 899`.
+///
+/// Mirrors TLA+ `WindowStart(t) == t - (t % WindowDuration)`.
+pub fn window_start_secs(timestamp_secs: i64, duration_secs: i64) -> i64 {
+    timestamp_secs - timestamp_secs.rem_euclid(duration_secs)
+}
+
+/// TW-2: window_duration must evenly divide one hour (3600 seconds).
+///
+/// Returns true if the duration is a positive divisor of 3600. This ensures
+/// window boundaries align across hours and days regardless of starting point.
+pub fn is_valid_window_duration(duration_secs: u32) -> bool {
+    duration_secs > 0 && 3600 % duration_secs == 0
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn window_start_basic() {
+        assert_eq!(window_start_secs(0, 2), 0);
+        assert_eq!(window_start_secs(1, 2), 0);
+        assert_eq!(window_start_secs(2, 2), 2);
+        assert_eq!(window_start_secs(3, 2), 2);
+        assert_eq!(window_start_secs(5, 3), 3);
+    }
+
+    #[test]
+    fn window_start_negative_timestamps() {
+        assert_eq!(window_start_secs(-1, 900), -900);
+        assert_eq!(window_start_secs(-3601, 3600), -7200);
+    }
+
+    #[test]
+    fn window_start_on_boundary() {
+        assert_eq!(window_start_secs(900, 900), 900);
+        assert_eq!(window_start_secs(899, 900), 0);
+    }
+
+    #[test]
+    fn valid_window_durations() {
+        let valid = [1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 15, 16, 18, 20, 24, 25,
+                     30, 36, 40, 45, 48, 50, 60, 72, 75, 80, 90, 100, 120,
+                     144, 150, 180, 200, 225, 240, 300, 360, 400, 450, 600,
+                     720, 900, 1200, 1800, 3600];
+        for dur in valid {
+            assert!(is_valid_window_duration(dur), "expected {} to be valid", dur);
+        }
+    }
+
+    #[test]
+    fn invalid_window_durations() {
+        assert!(!is_valid_window_duration(0));
+        assert!(!is_valid_window_duration(7));
+        assert!(!is_valid_window_duration(11));
+        assert!(!is_valid_window_duration(7200));
+    }
+}
diff --git a/quickwit/quickwit-dst/src/lib.rs b/quickwit/quickwit-dst/src/lib.rs
new file mode 100644
index 00000000000..2fa71cbf47c
--- /dev/null
+++ b/quickwit/quickwit-dst/src/lib.rs
@@ -0,0 +1,47 @@
+// Copyright (C) 2024 Quickwit, Inc.
+//
+// Quickwit is offered under the AGPL v3.0 and as commercial software.
+// For commercial licensing, contact us at hello@quickwit.io.
+//
+// AGPL:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+//! Deterministic simulation testing and shared invariants for Quickhouse-Pomsky.
+//!
+//! # Invariants (always available)
+//!
+//! The [`invariants`] module contains pure-Rust definitions shared across the
+//! entire verification pyramid: TLA+ specs, stateright models, DST tests, and
+//! production `debug_assert!` checks. Zero external dependencies.
+//!
+//! # Models (feature = "model-checking")
+//!
+//! The [`models`] module contains exhaustive model-checking models that mirror
+//! the TLA+ specs in `docs/internals/specs/tla/`. Each model verifies the same
+//! invariants as the corresponding TLA+ spec, but runs as a Rust test via
+//! [stateright](https://docs.rs/stateright). Requires the `model-checking`
+//! feature.
+//!
+//! ## Models
+//!
+//! - `models::sort_schema` — SS-1..SS-5 (ADR-002, `SortSchema.tla`)
+//! - `models::time_windowed_compaction` — TW-1..TW-3, CS-1..CS-3, MC-1..MC-4
+//!   (ADR-003, `TimeWindowedCompaction.tla`)
+//! - `models::parquet_data_model` — DM-1..DM-5 (ADR-001,
+//!   `ParquetDataModel.tla`)
+
+pub mod invariants;
+
+#[cfg(feature = "model-checking")]
+pub mod models;
diff --git a/quickwit/quickwit-dst/src/models/mod.rs b/quickwit/quickwit-dst/src/models/mod.rs
new file mode 100644
index 00000000000..a46f5552d35
--- /dev/null
+++ b/quickwit/quickwit-dst/src/models/mod.rs
@@ -0,0 +1,24 @@
+// Copyright (C) 2024 Quickwit, Inc.
+//
+// Quickwit is offered under the AGPL v3.0 and as commercial software.
+// For commercial licensing, contact us at hello@quickwit.io.
+//
+// AGPL:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+//! Stateright models mirroring the TLA+ specifications.
+
+pub mod parquet_data_model;
+pub mod sort_schema;
+pub mod time_windowed_compaction;
diff --git a/quickwit/quickwit-dst/src/models/parquet_data_model.rs b/quickwit/quickwit-dst/src/models/parquet_data_model.rs
new file mode 100644
index 00000000000..321dbc5dfc5
--- /dev/null
+++ b/quickwit/quickwit-dst/src/models/parquet_data_model.rs
@@ -0,0 +1,416 @@
+// Copyright (C) 2024 Quickwit, Inc.
+//
+// Quickwit is offered under the AGPL v3.0 and as commercial software.
+// For commercial licensing, contact us at hello@quickwit.io.
+//
+// AGPL:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+//! Stateright model for Parquet Data Model invariants (ADR-001).
+//!
+//! Mirrors `docs/internals/specs/tla/ParquetDataModel.tla`.
+//!
+//! # Invariants
+//! - DM-1: Each row is exactly one data point (all required fields populated)
+//! - DM-2: No last-write-wins; duplicate (metric, tags, ts) from separate
+//!   ingests both survive
+//! - DM-3: No interpolation; storage contains only ingested points
+//! - DM-4: timeseries_id is deterministic for a given tag set
+//! - DM-5: timeseries_id persists through compaction without recomputation
+
+use std::collections::BTreeSet;
+
+use stateright::*;
+
+/// Node identifier.
+#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+pub struct Node(pub u8);
+
+/// Metric name.
+#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+pub struct MetricName(pub u8);
+
+/// Tag set (opaque identifier; deterministic hash is identity).
+#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+pub struct TagSet(pub u8);
+
+/// Deterministic hash of a tag set to a timeseries_id.
+/// Mirrors TLA+ `TSIDHash(tags) == CHOOSE n \in 0..100 : TRUE`.
+/// We use the tag set's inner value as the hash (deterministic + injective).
+fn tsid_hash(tags: TagSet) -> u32 {
+    tags.0 as u32
+}
+
+/// A data point.
+#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+pub struct DataPoint {
+    pub metric_name: MetricName,
+    pub tags: TagSet,
+    pub timestamp: i64,
+    pub value: i32,
+    pub request_id: u32,
+    pub timeseries_id: u32,
+}
+
+/// A split in object storage.
+#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+pub struct DataModelSplit {
+    pub split_id: u32,
+    pub rows: BTreeSet<DataPoint>,
+}
+
+/// Model state. Mirrors TLA+ `VARIABLES`.
+#[derive(Clone, Debug, Eq, Hash, PartialEq)]
+pub struct DataModelState {
+    /// Per-node pending batches.
+    pub pending: Vec<(Node, BTreeSet<DataPoint>)>,
+    /// Published splits in object storage.
+    pub splits: BTreeSet<DataModelSplit>,
+    /// All points ever ingested (ghost variable for DM-3).
+    pub all_ingested_points: BTreeSet<DataPoint>,
+    pub next_split_id: u32,
+    pub next_request_id: u32,
+}
+
+impl DataModelState {
+    fn pending_for(&self, node: Node) -> &BTreeSet<DataPoint> {
+        for (n, set) in &self.pending {
+            if *n == node {
+                return set;
+            }
+        }
+        // Should not happen in well-formed model.
+        panic!("node not found in pending");
+    }
+
+    fn pending_for_mut(&mut self, node: Node) -> &mut BTreeSet<DataPoint> {
+        for (n, set) in &mut self.pending {
+            if *n == node {
+                return set;
+            }
+        }
+        panic!("node not found in pending");
+    }
+
+    fn all_stored_rows(&self) -> BTreeSet<DataPoint> {
+        self.splits.iter().flat_map(|s| s.rows.iter().cloned()).collect()
+    }
+
+    fn all_pending_rows(&self) -> BTreeSet<DataPoint> {
+        self.pending.iter().flat_map(|(_, set)| set.iter().cloned()).collect()
+    }
+}
+
+/// Actions.
+#[derive(Clone, Debug, Eq, Hash, PartialEq)]
+pub enum DataModelAction {
+    IngestPoint {
+        node: Node,
+        metric_name: MetricName,
+        tags: TagSet,
+        timestamp: i64,
+    },
+    FlushSplit {
+        node: Node,
+    },
+    CompactSplits {
+        selected_ids: BTreeSet<u32>,
+    },
+}
+
+/// Model configuration. Mirrors TLA+ `CONSTANTS`.
+#[derive(Clone, Debug)]
+pub struct DataModelModel {
+    pub nodes: Vec<Node>,
+    pub metric_names: Vec<MetricName>,
+    pub tag_sets: Vec<TagSet>,
+    pub timestamps: Vec<i64>,
+    pub request_count_max: u32,
+}
+
+impl DataModelModel {
+    /// Small model matching `ParquetDataModel_small.cfg`.
+    pub fn small() -> Self {
+        DataModelModel {
+            nodes: vec![Node(1)],
+            metric_names: vec![MetricName(1)],
+            tag_sets: vec![TagSet(1)],
+            timestamps: vec![1],
+            request_count_max: 3,
+        }
+    }
+}
+
+/// Generate all subsets of size >= 2 from a set of split IDs.
+fn subsets_ge2(ids: &[u32]) -> Vec<BTreeSet<u32>> {
+    let n = ids.len();
+    let mut result = Vec::new();
+    for mask in 0..(1u32 << n) {
+        if mask.count_ones() >= 2 {
+            let mut subset = BTreeSet::new();
+            for (i, &id) in ids.iter().enumerate() {
+                if mask & (1 << i) != 0 {
+                    subset.insert(id);
+                }
+            }
+            result.push(subset);
+        }
+    }
+    result
+}
+
+impl Model for DataModelModel {
+    type State = DataModelState;
+    type Action = DataModelAction;
+
+    fn init_states(&self) -> Vec<Self::State> {
+        let pending: Vec<(Node, BTreeSet<DataPoint>)> =
+            self.nodes.iter().map(|&n| (n, BTreeSet::new())).collect();
+        vec![DataModelState {
+            pending,
+            splits: BTreeSet::new(),
+            all_ingested_points: BTreeSet::new(),
+            next_split_id: 1,
+            next_request_id: 1,
+        }]
+    }
+
+    fn actions(&self, state: &Self::State, actions: &mut Vec<Self::Action>) {
+        // IngestPoint
+        if state.next_request_id < self.request_count_max {
+            for &node in &self.nodes {
+                for &mn in &self.metric_names {
+                    for &tags in &self.tag_sets {
+                        for &ts in &self.timestamps {
+                            actions.push(DataModelAction::IngestPoint {
+                                node,
+                                metric_name: mn,
+                                tags,
+                                timestamp: ts,
+                            });
+                        }
+                    }
+                }
+            }
+        }
+
+        // FlushSplit
+        for &node in &self.nodes {
+            if !state.pending_for(node).is_empty() {
+                actions.push(DataModelAction::FlushSplit { node });
+            }
+        }
+
+        // CompactSplits
+        if state.splits.len() >= 2 {
+            let ids: Vec<u32> = state.splits.iter().map(|s| s.split_id).collect();
+            for subset in subsets_ge2(&ids) {
+                actions.push(DataModelAction::CompactSplits {
+                    selected_ids: subset,
+                });
+            }
+        }
+    }
+
+    fn next_state(
+        &self,
+        state: &Self::State,
+        action: Self::Action,
+    ) -> Option<Self::State> {
+        let mut next = state.clone();
+
+        match action {
+            DataModelAction::IngestPoint {
+                node,
+                metric_name,
+                tags,
+                timestamp,
+            } => {
+                let point = DataPoint {
+                    metric_name,
+                    tags,
+                    timestamp,
+                    value: 1,
+                    request_id: next.next_request_id,
+                    timeseries_id: tsid_hash(tags),
+                };
+                next.pending_for_mut(node).insert(point.clone());
+                next.all_ingested_points.insert(point);
+                next.next_request_id += 1;
+            }
+            DataModelAction::FlushSplit { node } => {
+                let rows = next.pending_for(node).clone();
+                if rows.is_empty() {
+                    return None;
+                }
+                let new_split = DataModelSplit {
+                    split_id: next.next_split_id,
+                    rows,
+                };
+                next.splits.insert(new_split);
+                next.next_split_id += 1;
+                *next.pending_for_mut(node) = BTreeSet::new();
+            }
+            DataModelAction::CompactSplits { selected_ids } => {
+                let selected: Vec<DataModelSplit> = next
+                    .splits
+                    .iter()
+                    .filter(|s| selected_ids.contains(&s.split_id))
+                    .cloned()
+                    .collect();
+
+                if selected.len() < 2 {
+                    return None;
+                }
+
+                let merged_rows: BTreeSet<DataPoint> =
+                    selected.iter().flat_map(|s| s.rows.iter().cloned()).collect();
+                let new_split = DataModelSplit {
+                    split_id: next.next_split_id,
+                    rows: merged_rows,
+                };
+
+                for s in &selected {
+                    next.splits.remove(s);
+                }
+                next.splits.insert(new_split);
+                next.next_split_id += 1;
+            }
+        }
+
+        Some(next)
+    }
+
+    fn properties(&self) -> Vec<Property<Self>> {
+        vec![
+            // DM-1: Each row is exactly one data point.
+            // Every row has all required fields populated.
+            // Mirrors ParquetDataModel.tla lines 174-180
+            Property::always(
+                "DM-1: point per row",
+                |model: &DataModelModel, state: &DataModelState| {
+                    for s in &state.splits {
+                        for row in &s.rows {
+                            if !model.metric_names.contains(&row.metric_name) {
+                                return false;
+                            }
+                            if !model.tag_sets.contains(&row.tags) {
+                                return false;
+                            }
+                            if !model.timestamps.contains(&row.timestamp) {
+                                return false;
+                            }
+                            if row.timeseries_id != tsid_hash(row.tags) {
+                                return false;
+                            }
+                        }
+                    }
+                    true
+                },
+            ),
+            // DM-2: No last-write-wins.
+            // If two points share (metric, tags, ts) with different request_id,
+            // and both have been flushed, both must be in storage.
+            // Mirrors ParquetDataModel.tla lines 198-208
+            Property::always(
+                "DM-2: no LWW",
+                |_model: &DataModelModel, state: &DataModelState| {
+                    let stored = state.all_stored_rows();
+                    let pending = state.all_pending_rows();
+                    for p1 in &state.all_ingested_points {
+                        for p2 in &state.all_ingested_points {
+                            if p1.metric_name == p2.metric_name
+                                && p1.tags == p2.tags
+                                && p1.timestamp == p2.timestamp
+                                && p1.request_id != p2.request_id
+                                && !pending.contains(p1)
+                                && !pending.contains(p2)
+                                && (!stored.contains(p1) || !stored.contains(p2))
+                            {
+                                return false;
+                            }
+                        }
+                    }
+                    true
+                },
+            ),
+            // DM-3: No interpolation.
+            // Storage only contains ingested points.
+            // Mirrors ParquetDataModel.tla lines 214-215
+            Property::always(
+                "DM-3: no interpolation",
+                |_model: &DataModelModel, state: &DataModelState| {
+                    let stored = state.all_stored_rows();
+                    stored.is_subset(&state.all_ingested_points)
+                },
+            ),
+            // DM-4: Deterministic timeseries_id.
+            // Same tags => same timeseries_id.
+            // Mirrors ParquetDataModel.tla lines 221-224
+            Property::always(
+                "DM-4: deterministic TSID",
+                |_model: &DataModelModel, state: &DataModelState| {
+                    let stored = state.all_stored_rows();
+                    let pending = state.all_pending_rows();
+                    let all: BTreeSet<&DataPoint> =
+                        stored.iter().chain(pending.iter()).collect();
+                    for r1 in &all {
+                        for r2 in &all {
+                            if r1.tags == r2.tags && r1.timeseries_id != r2.timeseries_id {
+                                return false;
+                            }
+                        }
+                    }
+                    true
+                },
+            ),
+            // DM-5: timeseries_id persists through compaction.
+            // Every stored row's timeseries_id equals TSIDHash(tags).
+            // Mirrors ParquetDataModel.tla lines 234-236
+            Property::always(
+                "DM-5: TSID persistence",
+                |_model: &DataModelModel, state: &DataModelState| {
+                    for row in state.all_stored_rows() {
+                        if row.timeseries_id != tsid_hash(row.tags) {
+                            return false;
+                        }
+                    }
+                    true
+                },
+            ),
+        ]
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn check_data_model_small() {
+        let model = DataModelModel::small();
+        model
+            .checker()
+            .spawn_bfs()
+            .join()
+            .assert_properties();
+    }
+
+    #[test]
+    fn tsid_hash_deterministic() {
+        let t1 = TagSet(42);
+        assert_eq!(tsid_hash(t1), tsid_hash(t1));
+        assert_eq!(tsid_hash(TagSet(1)), tsid_hash(TagSet(1)));
+    }
+}
diff --git a/quickwit/quickwit-dst/src/models/sort_schema.rs b/quickwit/quickwit-dst/src/models/sort_schema.rs
new file mode 100644
index 00000000000..89746360993
--- /dev/null
+++ b/quickwit/quickwit-dst/src/models/sort_schema.rs
@@ -0,0 +1,668 @@
+// Copyright (C) 2024 Quickwit, Inc.
+//
+// Quickwit is offered under the AGPL v3.0 and as commercial software.
+// For commercial licensing, contact us at hello@quickwit.io.
+//
+// AGPL:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+//! Stateright model for Sort Schema invariants (ADR-002).
+//!
+//! Mirrors `docs/internals/specs/tla/SortSchema.tla`.
+//!
+//! # Invariants
+//! - SS-1: All rows within a split are sorted according to the split's schema
+//! - SS-2: Null values sort correctly per direction (nulls last asc, first desc)
+//! - SS-3: Missing sort columns are treated as NULL
+//! - SS-4: A split's sort schema never changes after write
+//! - SS-5: Three copies of sort schema are identical per split
+
+use std::collections::BTreeMap;
+
+use stateright::*;
+
+/// Column identifier (small domain for model checking).
+#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+pub enum Column {
+    C1,
+}
+
+impl Column {
+    pub const ALL: &[Column] = &[Column::C1];
+}
+
+/// Sort direction.
+#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+pub enum Direction {
+    Asc,
+    Desc,
+}
+
+/// A single sort column specification.
+#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+pub struct SortColumn {
+    pub column: Column,
+    pub direction: Direction,
+}
+
+/// A cell value, modeling TLA+ `ValuesWithNull`.
+/// NULL is represented as `None`.
+#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+pub enum Value {
+    Null,
+    Val(i8),
+}
+
+impl Value {
+    /// Non-null values domain (matches TLA+ `Values == {1, 2, 3}`).
+    pub const NON_NULL: &[Value] = &[Value::Val(1), Value::Val(2), Value::Val(3)];
+
+    pub const ALL: &[Value] = &[Value::Null, Value::Val(1), Value::Val(2), Value::Val(3)];
+
+    fn is_null(self) -> bool {
+        matches!(self, Value::Null)
+    }
+}
+
+/// A row: maps present columns to values.
+/// Columns absent from the map are treated as NULL (SS-3).
+#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+pub struct Row {
+    pub cells: BTreeMap<Column, Value>,
+}
+
+impl Row {
+    fn get_value(&self, col: Column) -> Value {
+        self.cells.get(&col).copied().unwrap_or(Value::Null)
+    }
+}
+
+/// A split in object storage.
+/// Mirrors the TLA+ split record with all three schema copies.
+#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+pub struct Split {
+    pub id: u32,
+    pub rows: Vec<Row>,
+    pub sort_schema: Vec<SortColumn>,
+    pub metadata_sort_schema: Vec<SortColumn>,
+    pub kv_sort_schema: Vec<SortColumn>,
+    pub sorting_columns_schema: Vec<SortColumn>,
+    pub columns_present: Vec<Column>,
+}
+
+/// Sort schema model state. Mirrors TLA+ `VARIABLES`.
+#[derive(Clone, Debug, Eq, Hash, PartialEq)]
+pub struct SortSchemaState {
+    pub metastore_schema: Vec<SortColumn>,
+    pub splits: Vec<Split>,
+    pub next_split_id: u32,
+    pub schema_change_count: u32,
+    pub split_schema_history: BTreeMap<u32, Vec<SortColumn>>,
+}
+
+/// Actions. Mirrors TLA+ `Next`.
+#[derive(Clone, Debug, Eq, Hash, PartialEq)]
+pub enum SortSchemaAction {
+    /// Ingest a batch with specific columns present and specific row data.
+    IngestBatch {
+        columns_present: Vec<Column>,
+        rows: Vec<Row>,
+    },
+    /// Change the metastore schema to a new value.
+    ChangeSchema {
+        new_schema: Vec<SortColumn>,
+    },
+    /// Compact two splits (identified by index in the splits vec).
+    CompactSplits {
+        s1_idx: usize,
+        s2_idx: usize,
+        merged_rows: Vec<Row>,
+    },
+}
+
+/// Model configuration. Mirrors TLA+ `CONSTANTS`.
+#[derive(Clone, Debug)]
+pub struct SortSchemaModel {
+    pub rows_per_split_max: usize,
+    pub splits_max: usize,
+    pub schema_changes_max: u32,
+}
+
+impl SortSchemaModel {
+    /// Small model matching `SortSchema_small.cfg`.
+    pub fn small() -> Self {
+        SortSchemaModel {
+            rows_per_split_max: 2,
+            splits_max: 2,
+            schema_changes_max: 1,
+        }
+    }
+}
+
+/// Compare two values with null ordering (SS-2).
+/// Returns `Ordering`: Less, Equal, Greater.
+/// Ascending: nulls sort AFTER non-null.
+/// Descending: nulls sort BEFORE non-null.
+///
+/// Delegates to the shared [`crate::invariants::sort::compare_with_null_ordering`]
+/// for the null ordering logic, converting model-specific `Value` to `Option`.
+fn compare_values(v1: Value, v2: Value, direction: Direction) -> std::cmp::Ordering {
+    let a = match v1 {
+        Value::Null => None,
+        Value::Val(v) => Some(v),
+    };
+    let b = match v2 {
+        Value::Null => None,
+        Value::Val(v) => Some(v),
+    };
+    let ascending = matches!(direction, Direction::Asc);
+    crate::invariants::sort::compare_with_null_ordering(a.as_ref(), b.as_ref(), ascending)
+}
+
+/// Check if row1 <= row2 according to the schema (lexicographic).
+/// Mirrors TLA+ `RowLEQ`.
+fn row_leq(row1: &Row, row2: &Row, schema: &[SortColumn]) -> bool {
+    for sc in schema {
+        let v1 = row1.get_value(sc.column);
+        let v2 = row2.get_value(sc.column);
+        let cmp = compare_values(v1, v2, sc.direction);
+        match cmp {
+            std::cmp::Ordering::Less => return true,
+            std::cmp::Ordering::Greater => return false,
+            std::cmp::Ordering::Equal => continue,
+        }
+    }
+    true // all columns equal
+}
+
+/// Check if rows are sorted. Mirrors TLA+ `IsSorted`.
+fn is_sorted(rows: &[Row], schema: &[SortColumn]) -> bool {
+    rows.windows(2).all(|w| row_leq(&w[0], &w[1], schema))
+}
+
+/// Generate all possible sort schemas (length 0, 1, or 2).
+/// Mirrors TLA+ `AllSortSchemas`.
+fn all_sort_schemas() -> Vec<Vec<SortColumn>> {
+    let directions = [Direction::Asc, Direction::Desc];
+    let mut schemas = vec![vec![]]; // empty schema
+
+    // Length 1
+    for &col in Column::ALL {
+        for &dir in &directions {
+            schemas.push(vec![SortColumn {
+                column: col,
+                direction: dir,
+            }]);
+        }
+    }
+
+    // Length 2
+    for &c1 in Column::ALL {
+        for &d1 in &directions {
+            for &c2 in Column::ALL {
+                for &d2 in &directions {
+                    schemas.push(vec![
+                        SortColumn {
+                            column: c1,
+                            direction: d1,
+                        },
+                        SortColumn {
+                            column: c2,
+                            direction: d2,
+                        },
+                    ]);
+                }
+            }
+        }
+    }
+
+    schemas
+}
+
+/// Generate all possible rows for a given column set with n rows.
+fn all_row_sequences(columns_present: &[Column], n: usize) -> Vec<Vec<Row>> {
+    if n == 0 {
+        return vec![vec![]];
+    }
+
+    // Each row maps each present column to a value (including null).
+    let single_rows = all_single_rows(columns_present);
+
+    // Generate all sequences of length n.
+    let mut result = vec![vec![]];
+    for _ in 0..n {
+        let mut next = Vec::new();
+        for prefix in &result {
+            for row in &single_rows {
+                let mut extended = prefix.clone();
+                extended.push(row.clone());
+                next.push(extended);
+            }
+        }
+        result = next;
+    }
+    result
+}
+
+fn all_single_rows(columns_present: &[Column]) -> Vec<Row> {
+    if columns_present.is_empty() {
+        return vec![Row {
+            cells: BTreeMap::new(),
+        }];
+    }
+
+    // Generate all combinations of values for each column.
+    let mut rows = vec![BTreeMap::new()];
+    for &col in columns_present {
+        let mut next = Vec::new();
+        for partial in &rows {
+            for &val in Value::ALL {
+                let mut full = partial.clone();
+                full.insert(col, val);
+                next.push(full);
+            }
+        }
+        rows = next;
+    }
+    rows.into_iter().map(|cells| Row { cells }).collect()
+}
+
+/// Generate all subsets of columns.
+fn all_column_subsets() -> Vec<Vec<Column>> {
+    let mut subsets = Vec::new();
+    let cols = Column::ALL;
+    // 2^n subsets
+    for mask in 0..(1u32 << cols.len()) {
+        let mut subset = Vec::new();
+        for (i, &col) in cols.iter().enumerate() {
+            if mask & (1 << i) != 0 {
+                subset.push(col);
+            }
+        }
+        subsets.push(subset);
+    }
+    subsets
+}
+
+/// Check if merged_rows is a valid permutation of the union of s1 and s2 rows,
+/// accounting for column extension (missing columns become NULL).
+fn is_valid_merge(
+    merged_rows: &[Row],
+    s1: &Split,
+    s2: &Split,
+    merged_columns: &[Column],
+) -> bool {
+    let total_rows = s1.rows.len() + s2.rows.len();
+    if merged_rows.len() != total_rows {
+        return false;
+    }
+
+    // Each merged row must come from either s1 or s2 (extended with NULLs).
+    // Build extended versions of input rows.
+    let extend_row = |row: &Row, merged_cols: &[Column]| -> Row {
+        let mut cells = BTreeMap::new();
+        for &col in merged_cols {
+            cells.insert(col, row.get_value(col));
+        }
+        Row { cells }
+    };
+
+    let s1_extended: Vec<Row> = s1.rows.iter().map(|r| extend_row(r, merged_columns)).collect();
+    let s2_extended: Vec<Row> = s2.rows.iter().map(|r| extend_row(r, merged_columns)).collect();
+
+    // Check that merged_rows is a permutation of s1_extended ++ s2_extended.
+    let mut all_input: Vec<Row> = s1_extended;
+    all_input.extend(s2_extended);
+    all_input.sort();
+
+    let mut sorted_merged = merged_rows.to_vec();
+    sorted_merged.sort();
+
+    sorted_merged == all_input
+}
+
+impl Model for SortSchemaModel {
+    type State = SortSchemaState;
+    type Action = SortSchemaAction;
+
+    fn init_states(&self) -> Vec<Self::State> {
+        // TLA+ Init: metastore_schema \in AllSortSchemas, splits = {}, etc.
+        all_sort_schemas()
+            .into_iter()
+            .map(|schema| SortSchemaState {
+                metastore_schema: schema,
+                splits: Vec::new(),
+                next_split_id: 1,
+                schema_change_count: 0,
+                split_schema_history: BTreeMap::new(),
+            })
+            .collect()
+    }
+
+    fn actions(&self, state: &Self::State, actions: &mut Vec<Self::Action>) {
+        // IngestBatch: if splits < SplitsMax
+        if state.splits.len() < self.splits_max {
+            let current_schema = &state.metastore_schema;
+
+            for columns_present in all_column_subsets() {
+                for n in 1..=self.rows_per_split_max {
+                    for rows in all_row_sequences(&columns_present, n) {
+                        // Only add if rows are sorted by current schema.
+                        if is_sorted(&rows, current_schema) {
+                            actions.push(SortSchemaAction::IngestBatch {
+                                columns_present: columns_present.clone(),
+                                rows,
+                            });
+                        }
+                    }
+                }
+            }
+        }
+
+        // ChangeSchema: if schema_change_count < SchemaChangesMax
+        if state.schema_change_count < self.schema_changes_max {
+            for new_schema in all_sort_schemas() {
+                if new_schema != state.metastore_schema {
+                    actions.push(SortSchemaAction::ChangeSchema { new_schema });
+                }
+            }
+        }
+
+        // CompactSplits: merge two splits with same sort_schema
+        for (i, s1) in state.splits.iter().enumerate() {
+            for (j, s2) in state.splits.iter().enumerate() {
+                if i >= j {
+                    continue;
+                }
+                if s1.sort_schema != s2.sort_schema {
+                    continue;
+                }
+
+                let total_rows = s1.rows.len() + s2.rows.len();
+                if total_rows > self.rows_per_split_max {
+                    continue;
+                }
+
+                let merged_schema = &s1.sort_schema;
+                let mut merged_columns: Vec<Column> = s1.columns_present.clone();
+                for &col in &s2.columns_present {
+                    if !merged_columns.contains(&col) {
+                        merged_columns.push(col);
+                    }
+                }
+                merged_columns.sort();
+
+                // Generate all valid merged row sequences.
+                for merged_rows in all_row_sequences(&merged_columns, total_rows) {
+                    if is_sorted(&merged_rows, merged_schema)
+                        && is_valid_merge(&merged_rows, s1, s2, &merged_columns)
+                    {
+                        actions.push(SortSchemaAction::CompactSplits {
+                            s1_idx: i,
+                            s2_idx: j,
+                            merged_rows,
+                        });
+                    }
+                }
+            }
+        }
+    }
+
+    fn next_state(
+        &self,
+        state: &Self::State,
+        action: Self::Action,
+    ) -> Option<Self::State> {
+        let mut next = state.clone();
+
+        match action {
+            SortSchemaAction::IngestBatch {
+                columns_present,
+                rows,
+            } => {
+                let new_id = next.next_split_id;
+                let current_schema = next.metastore_schema.clone();
+                let split = Split {
+                    id: new_id,
+                    rows,
+                    sort_schema: current_schema.clone(),
+                    metadata_sort_schema: current_schema.clone(),
+                    kv_sort_schema: current_schema.clone(),
+                    sorting_columns_schema: current_schema.clone(),
+                    columns_present,
+                };
+                next.splits.push(split);
+                next.splits.sort();
+                next.next_split_id += 1;
+                next.split_schema_history.insert(new_id, current_schema);
+            }
+            SortSchemaAction::ChangeSchema { new_schema } => {
+                next.metastore_schema = new_schema;
+                next.schema_change_count += 1;
+            }
+            SortSchemaAction::CompactSplits {
+                s1_idx,
+                s2_idx,
+                merged_rows,
+            } => {
+                let s1 = &state.splits[s1_idx];
+                let s2 = &state.splits[s2_idx];
+                let merged_schema = s1.sort_schema.clone();
+                let mut merged_columns: Vec<Column> = s1.columns_present.clone();
+                for &col in &s2.columns_present {
+                    if !merged_columns.contains(&col) {
+                        merged_columns.push(col);
+                    }
+                }
+                merged_columns.sort();
+
+                let new_id = next.next_split_id;
+                let new_split = Split {
+                    id: new_id,
+                    rows: merged_rows,
+                    sort_schema: merged_schema.clone(),
+                    metadata_sort_schema: merged_schema.clone(),
+                    kv_sort_schema: merged_schema.clone(),
+                    sorting_columns_schema: merged_schema.clone(),
+                    columns_present: merged_columns,
+                };
+
+                // Remove old splits (higher index first to preserve indices).
+                let (lo, hi) = if s1_idx < s2_idx {
+                    (s1_idx, s2_idx)
+                } else {
+                    (s2_idx, s1_idx)
+                };
+                next.splits.remove(hi);
+                next.splits.remove(lo);
+                next.splits.push(new_split);
+                next.splits.sort();
+                next.next_split_id += 1;
+                next.split_schema_history.insert(new_id, merged_schema);
+            }
+        }
+
+        Some(next)
+    }
+
+    fn properties(&self) -> Vec<Property<Self>> {
+        vec![
+            // SS-1: All rows within a split are sorted according to its schema.
+            // Mirrors SortSchema.tla line 217-219
+            Property::always(
+                "SS-1: rows sorted",
+                |_model: &SortSchemaModel, state: &SortSchemaState| {
+                    state
+                        .splits
+                        .iter()
+                        .all(|s| is_sorted(&s.rows, &s.sort_schema))
+                },
+            ),
+            // SS-2: Null values ordered correctly per direction.
+            // Ascending: null must NOT appear before non-null.
+            // Descending: non-null must NOT appear before null.
+            // Mirrors SortSchema.tla lines 228-247
+            Property::always(
+                "SS-2: null ordering",
+                |_model: &SortSchemaModel, state: &SortSchemaState| {
+                    for s in &state.splits {
+                        for w in s.rows.windows(2) {
+                            let (row_curr, row_next) = (&w[0], &w[1]);
+                            for (k, sc) in s.sort_schema.iter().enumerate() {
+                                let v_curr = row_curr.get_value(sc.column);
+                                let v_next = row_next.get_value(sc.column);
+
+                                // Check only when earlier columns are equal.
+                                let earlier_equal =
+                                    s.sort_schema[..k].iter().all(|prev_sc| {
+                                        row_curr.get_value(prev_sc.column)
+                                            == row_next.get_value(prev_sc.column)
+                                    });
+
+                                if earlier_equal {
+                                    // Ascending: null must not appear before non-null.
+                                    if sc.direction == Direction::Asc
+                                        && v_curr.is_null()
+                                        && !v_next.is_null()
+                                    {
+                                        return false;
+                                    }
+                                    // Descending: non-null must not appear before null.
+                                    if sc.direction == Direction::Desc
+                                        && !v_curr.is_null()
+                                        && v_next.is_null()
+                                    {
+                                        return false;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                    true
+                },
+            ),
+            // SS-3: Missing sort columns treated as NULL.
+            // Mirrors SortSchema.tla lines 253-259
+            Property::always(
+                "SS-3: missing columns null",
+                |_model: &SortSchemaModel, state: &SortSchemaState| {
+                    for s in &state.splits {
+                        for sc in &s.sort_schema {
+                            if !s.columns_present.contains(&sc.column) {
+                                for row in &s.rows {
+                                    if row.get_value(sc.column) != Value::Null {
+                                        return false;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                    true
+                },
+            ),
+            // SS-4: Schema immutable after write.
+            // Mirrors SortSchema.tla lines 263-266
+            Property::always(
+                "SS-4: schema immutable",
+                |_model: &SortSchemaModel, state: &SortSchemaState| {
+                    for s in &state.splits {
+                        if let Some(historical) = state.split_schema_history.get(&s.id)
+                            && *historical != s.sort_schema
+                        {
+                            return false;
+                        }
+                    }
+                    true
+                },
+            ),
+            // SS-5: Three copies of sort schema are identical.
+            // Mirrors SortSchema.tla lines 270-274
+            Property::always(
+                "SS-5: three-copy consistency",
+                |_model: &SortSchemaModel, state: &SortSchemaState| {
+                    state.splits.iter().all(|s| {
+                        s.sort_schema == s.metadata_sort_schema
+                            && s.sort_schema == s.kv_sort_schema
+                            && s.sort_schema == s.sorting_columns_schema
+                    })
+                },
+            ),
+        ]
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn check_sort_schema_small() {
+        let model = SortSchemaModel::small();
+        model
+            .checker()
+            .spawn_bfs()
+            .join()
+            .assert_properties();
+    }
+
+    #[test]
+    fn compare_values_null_ordering() {
+        // Ascending: null > non-null
+        assert_eq!(
+            compare_values(Value::Null, Value::Val(1), Direction::Asc),
+            std::cmp::Ordering::Greater
+        );
+        assert_eq!(
+            compare_values(Value::Val(1), Value::Null, Direction::Asc),
+            std::cmp::Ordering::Less
+        );
+
+        // Descending: null < non-null
+        assert_eq!(
+            compare_values(Value::Null, Value::Val(1), Direction::Desc),
+            std::cmp::Ordering::Less
+        );
+        assert_eq!(
+            compare_values(Value::Val(1), Value::Null, Direction::Desc),
+            std::cmp::Ordering::Greater
+        );
+    }
+
+    #[test]
+    fn is_sorted_basic() {
+        let schema = vec![SortColumn {
+            column: Column::C1,
+            direction: Direction::Asc,
+        }];
+        let rows = vec![
+            Row {
+                cells: [(Column::C1, Value::Val(1))].into(),
+            },
+            Row {
+                cells: [(Column::C1, Value::Val(2))].into(),
+            },
+        ];
+        assert!(is_sorted(&rows, &schema));
+
+        let rows_unsorted = vec![
+            Row {
+                cells: [(Column::C1, Value::Val(2))].into(),
+            },
+            Row {
+                cells: [(Column::C1, Value::Val(1))].into(),
+            },
+        ];
+        assert!(!is_sorted(&rows_unsorted, &schema));
+    }
+}
diff --git a/quickwit/quickwit-dst/src/models/time_windowed_compaction.rs b/quickwit/quickwit-dst/src/models/time_windowed_compaction.rs
new file mode 100644
index 00000000000..0723f93e453
--- /dev/null
+++ b/quickwit/quickwit-dst/src/models/time_windowed_compaction.rs
@@ -0,0 +1,635 @@
+// Copyright (C) 2024 Quickwit, Inc.
+//
+// Quickwit is offered under the AGPL v3.0 and as commercial software.
+// For commercial licensing, contact us at hello@quickwit.io.
+//
+// AGPL:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+//! Stateright model for Time-Windowed Compaction invariants (ADR-003).
+//!
+//! Mirrors `docs/internals/specs/tla/TimeWindowedCompaction.tla`.
+//!
+//! # Invariants
+//! - TW-1: Every split belongs to exactly one time window
+//! - TW-2: window_duration evenly divides one hour
+//! - TW-3: Data is never merged across window boundaries
+//! - CS-1: Only splits sharing all six scope components may be merged
+//! - CS-2: Within a scope, only same window_start splits merge
+//! - CS-3: Splits before compaction_start_time are never compacted
+//! - MC-1: Row multiset preserved through compaction
+//! - MC-2: Row contents unchanged through compaction
+//! - MC-3: Output is sorted according to sort schema
+//! - MC-4: Column set is the union of input column sets
+
+use std::collections::{BTreeMap, BTreeSet};
+
+use stateright::*;
+
+/// Scope identifier (abstract; in TLA+ this is a 6-tuple).
+#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+pub struct Scope(pub u8);
+
+/// A column name.
+#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+pub enum ColumnName {
+    M,
+    V,
+}
+
+impl ColumnName {
+    pub const ALL: &[ColumnName] = &[ColumnName::M, ColumnName::V];
+}
+
+/// A row in a split.
+#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+pub struct CompactionRow {
+    pub point_id: u32,
+    pub timestamp: i64,
+    pub sort_key: i64,
+    pub columns: BTreeSet<ColumnName>,
+    /// Unique value per (point, column) for MC-2 tracking.
+    pub values: BTreeMap<ColumnName, (u32, ColumnName)>,
+}
+
+/// A split in object storage.
+#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+pub struct CompactionSplit {
+    pub id: u32,
+    pub scope: Scope,
+    pub window_start: i64,
+    pub rows: Vec<CompactionRow>,
+    pub columns: BTreeSet<ColumnName>,
+    pub sorted: bool,
+}
+
+/// Compaction log entry for invariant checking.
+#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
+pub struct CompactionLogEntry {
+    pub input_split_ids: BTreeSet<u32>,
+    pub output_split_id: u32,
+    pub input_point_ids: BTreeSet<u32>,
+    pub output_point_ids: BTreeSet<u32>,
+    pub input_scopes: BTreeMap<u32, Scope>,
+    pub input_window_starts: BTreeMap<u32, i64>,
+    pub output_columns: BTreeSet<ColumnName>,
+    pub input_column_union: BTreeSet<ColumnName>,
+}
+
+/// Ingest buffer key: (scope, window_start).
+type BufferKey = (Scope, i64);
+
+/// Model state. Mirrors TLA+ `VARIABLES`.
+#[derive(Clone, Debug, Eq, Hash, PartialEq)]
+pub struct CompactionState {
+    pub current_time: i64,
+    pub object_storage: BTreeSet<CompactionSplit>,
+    pub ingest_buffer: BTreeMap<BufferKey, Vec<CompactionRow>>,
+    pub next_split_id: u32,
+    pub next_point_id: u32,
+    pub points_ingested: u32,
+    pub compactions_performed: u32,
+    pub row_history: BTreeMap<u32, CompactionRow>,
+    pub compaction_log: BTreeSet<CompactionLogEntry>,
+}
+
+/// Actions.
+#[derive(Clone, Debug, Eq, Hash, PartialEq)]
+pub enum CompactionAction {
+    AdvanceTime { new_time: i64 },
+    IngestPoint {
+        timestamp: i64,
+        sort_key: i64,
+        scope: Scope,
+        columns: BTreeSet<ColumnName>,
+    },
+    FlushSplit { key: BufferKey },
+    CompactWindow {
+        scope: Scope,
+        window_start: i64,
+        split_ids: BTreeSet<u32>,
+    },
+}
+
+/// Model configuration. Mirrors TLA+ `CONSTANTS`.
+#[derive(Clone, Debug)]
+pub struct CompactionModel {
+    pub timestamps: Vec<i64>,
+    pub scopes: Vec<Scope>,
+    pub window_duration: i64,
+    pub hour_seconds: i64,
+    pub compaction_start_time: i64,
+    pub late_data_acceptance_window: i64,
+    pub max_time: i64,
+    pub max_points: u32,
+    pub max_compactions: u32,
+    pub sort_keys: Vec<i64>,
+}
+
+impl CompactionModel {
+    /// Small model matching `TimeWindowedCompaction_small.cfg`.
+    pub fn small() -> Self {
+        CompactionModel {
+            timestamps: vec![0, 1],
+            scopes: vec![Scope(1)],
+            window_duration: 2,
+            hour_seconds: 4,
+            compaction_start_time: 0,
+            late_data_acceptance_window: 2,
+            max_time: 1,
+            max_points: 2,
+            max_compactions: 1,
+            sort_keys: vec![1, 2],
+        }
+    }
+}
+
+/// Compute window_start for a timestamp.
+/// Mirrors TLA+ `WindowStart(t) == t - (t % WindowDuration)`.
+///
+/// Delegates to the shared [`crate::invariants::window::window_start_secs`].
+fn window_start(t: i64, window_duration: i64) -> i64 {
+    crate::invariants::window::window_start_secs(t, window_duration)
+}
+
+/// Check if a sequence of rows is sorted by sort_key ascending.
+fn is_sorted_by_key(rows: &[CompactionRow]) -> bool {
+    rows.windows(2).all(|w| w[0].sort_key <= w[1].sort_key)
+}
+
+/// Merge-sort two sorted sequences by sort_key.
+fn merge_sorted(s1: &[CompactionRow], s2: &[CompactionRow]) -> Vec<CompactionRow> {
+    let mut result = Vec::with_capacity(s1.len() + s2.len());
+    let (mut i, mut j) = (0, 0);
+    while i < s1.len() && j < s2.len() {
+        if s1[i].sort_key <= s2[j].sort_key {
+            result.push(s1[i].clone());
+            i += 1;
+        } else {
+            result.push(s2[j].clone());
+            j += 1;
+        }
+    }
+    result.extend_from_slice(&s1[i..]);
+    result.extend_from_slice(&s2[j..]);
+    result
+}
+
+/// Insertion sort by sort_key (for small sequences at flush time).
+fn insertion_sort(rows: &[CompactionRow]) -> Vec<CompactionRow> {
+    let mut sorted = Vec::with_capacity(rows.len());
+    for row in rows {
+        let pos = sorted
+            .iter()
+            .position(|r: &CompactionRow| r.sort_key > row.sort_key)
+            .unwrap_or(sorted.len());
+        sorted.insert(pos, row.clone());
+    }
+    sorted
+}
+
+/// Generate all non-empty subsets of a set of column names.
+fn all_nonempty_column_subsets() -> Vec<BTreeSet<ColumnName>> {
+    let cols = ColumnName::ALL;
+    let mut subsets = Vec::new();
+    for mask in 1..(1u32 << cols.len()) {
+        let mut subset = BTreeSet::new();
+        for (i, &col) in cols.iter().enumerate() {
+            if mask & (1 << i) != 0 {
+                subset.insert(col);
+            }
+        }
+        subsets.push(subset);
+    }
+    subsets
+}
+
+/// Generate all subsets of size >= 2 from a set of split IDs.
+fn subsets_of_size_ge2(ids: &BTreeSet<u32>) -> Vec<BTreeSet<u32>> {
+    let id_vec: Vec<u32> = ids.iter().copied().collect();
+    let n = id_vec.len();
+    let mut result = Vec::new();
+    for mask in 0..(1u32 << n) {
+        if mask.count_ones() >= 2 {
+            let mut subset = BTreeSet::new();
+            for (i, &id) in id_vec.iter().enumerate() {
+                if mask & (1 << i) != 0 {
+                    subset.insert(id);
+                }
+            }
+            result.push(subset);
+        }
+    }
+    result
+}
+
+impl Model for CompactionModel {
+    type State = CompactionState;
+    type Action = CompactionAction;
+
+    fn init_states(&self) -> Vec<Self::State> {
+        vec![CompactionState {
+            current_time: 0,
+            object_storage: BTreeSet::new(),
+            ingest_buffer: BTreeMap::new(),
+            next_split_id: 1,
+            next_point_id: 1,
+            points_ingested: 0,
+            compactions_performed: 0,
+            row_history: BTreeMap::new(),
+            compaction_log: BTreeSet::new(),
+        }]
+    }
+
+    fn actions(&self, state: &Self::State, actions: &mut Vec<Self::Action>) {
+        // AdvanceTime
+        if state.current_time < self.max_time {
+            for &t in &self.timestamps {
+                if t > state.current_time && t <= self.max_time {
+                    actions.push(CompactionAction::AdvanceTime { new_time: t });
+                }
+            }
+        }
+
+        // IngestPoint
+        if state.points_ingested < self.max_points {
+            for &ts in &self.timestamps {
+                if ts > state.current_time {
+                    continue;
+                }
+                if ts < state.current_time - self.late_data_acceptance_window {
+                    continue;
+                }
+                for &sk in &self.sort_keys {
+                    for &scope in &self.scopes {
+                        for cols in all_nonempty_column_subsets() {
+                            actions.push(CompactionAction::IngestPoint {
+                                timestamp: ts,
+                                sort_key: sk,
+                                scope,
+                                columns: cols,
+                            });
+                        }
+                    }
+                }
+            }
+        }
+
+        // FlushSplit
+        for (key, buf) in &state.ingest_buffer {
+            if !buf.is_empty() {
+                actions.push(CompactionAction::FlushSplit { key: *key });
+            }
+        }
+
+        // CompactWindow
+        if state.compactions_performed < self.max_compactions {
+            for &scope in &self.scopes {
+                // Collect valid window starts from current splits.
+                let valid_ws: BTreeSet<i64> = state
+                    .object_storage
+                    .iter()
+                    .filter(|s| s.scope == scope && s.window_start >= self.compaction_start_time)
+                    .map(|s| s.window_start)
+                    .collect();
+
+                for &ws in &valid_ws {
+                    let candidate_ids: BTreeSet<u32> = state
+                        .object_storage
+                        .iter()
+                        .filter(|s| s.scope == scope && s.window_start == ws)
+                        .map(|s| s.id)
+                        .collect();
+
+                    if candidate_ids.len() < 2 {
+                        continue;
+                    }
+
+                    for subset in subsets_of_size_ge2(&candidate_ids) {
+                        actions.push(CompactionAction::CompactWindow {
+                            scope,
+                            window_start: ws,
+                            split_ids: subset,
+                        });
+                    }
+                }
+            }
+        }
+    }
+
+    fn next_state(
+        &self,
+        state: &Self::State,
+        action: Self::Action,
+    ) -> Option<Self::State> {
+        let mut next = state.clone();
+
+        match action {
+            CompactionAction::AdvanceTime { new_time } => {
+                next.current_time = new_time;
+            }
+            CompactionAction::IngestPoint {
+                timestamp,
+                sort_key,
+                scope,
+                columns,
+            } => {
+                let pid = next.next_point_id;
+                let values: BTreeMap<ColumnName, (u32, ColumnName)> =
+                    columns.iter().map(|&c| (c, (pid, c))).collect();
+                let row = CompactionRow {
+                    point_id: pid,
+                    timestamp,
+                    sort_key,
+                    columns: columns.clone(),
+                    values,
+                };
+                let ws = window_start(timestamp, self.window_duration);
+                let key = (scope, ws);
+                next.ingest_buffer
+                    .entry(key)
+                    .or_default()
+                    .push(row.clone());
+                next.next_point_id += 1;
+                next.points_ingested += 1;
+                next.row_history.insert(pid, row);
+            }
+            CompactionAction::FlushSplit { key } => {
+                let rows = next.ingest_buffer.remove(&key).unwrap_or_default();
+                if rows.is_empty() {
+                    return None;
+                }
+                let sorted_rows = insertion_sort(&rows);
+                let all_cols: BTreeSet<ColumnName> =
+                    rows.iter().flat_map(|r| r.columns.iter().copied()).collect();
+                let new_split = CompactionSplit {
+                    id: next.next_split_id,
+                    scope: key.0,
+                    window_start: key.1,
+                    rows: sorted_rows,
+                    columns: all_cols,
+                    sorted: true,
+                };
+                next.object_storage.insert(new_split);
+                next.next_split_id += 1;
+            }
+            CompactionAction::CompactWindow {
+                scope,
+                window_start: ws,
+                split_ids,
+            } => {
+                let merge_splits: Vec<CompactionSplit> = next
+                    .object_storage
+                    .iter()
+                    .filter(|s| split_ids.contains(&s.id))
+                    .cloned()
+                    .collect();
+
+                if merge_splits.len() < 2 {
+                    return None;
+                }
+
+                // Multi-way sorted merge.
+                let mut merged_rows = Vec::new();
+                for s in &merge_splits {
+                    merged_rows = merge_sorted(&merged_rows, &s.rows);
+                }
+
+                let all_cols: BTreeSet<ColumnName> = merge_splits
+                    .iter()
+                    .flat_map(|s| s.columns.iter().copied())
+                    .collect();
+
+                let output_split = CompactionSplit {
+                    id: next.next_split_id,
+                    scope,
+                    window_start: ws,
+                    rows: merged_rows.clone(),
+                    columns: all_cols.clone(),
+                    sorted: true,
+                };
+
+                // Build compaction log entry.
+                let input_ids: BTreeSet<u32> = merge_splits.iter().map(|s| s.id).collect();
+                let input_point_ids: BTreeSet<u32> = merge_splits
+                    .iter()
+                    .flat_map(|s| s.rows.iter().map(|r| r.point_id))
+                    .collect();
+                let output_point_ids: BTreeSet<u32> =
+                    merged_rows.iter().map(|r| r.point_id).collect();
+                let input_scopes: BTreeMap<u32, Scope> =
+                    merge_splits.iter().map(|s| (s.id, s.scope)).collect();
+                let input_ws: BTreeMap<u32, i64> = merge_splits
+                    .iter()
+                    .map(|s| (s.id, s.window_start))
+                    .collect();
+
+                let log_entry = CompactionLogEntry {
+                    input_split_ids: input_ids,
+                    output_split_id: next.next_split_id,
+                    input_point_ids,
+                    output_point_ids,
+                    input_scopes,
+                    input_window_starts: input_ws,
+                    output_columns: all_cols.clone(),
+                    input_column_union: all_cols,
+                };
+
+                // Remove input splits, add output.
+                for s in &merge_splits {
+                    next.object_storage.remove(s);
+                }
+                next.object_storage.insert(output_split);
+                next.next_split_id += 1;
+                next.compactions_performed += 1;
+                next.compaction_log.insert(log_entry);
+            }
+        }
+
+        Some(next)
+    }
+
+    fn properties(&self) -> Vec<Property<Self>> {
+        vec![
+            // TW-1: Every split belongs to exactly one time window.
+            // All rows have the same window_start as the split metadata.
+            // Mirrors TimeWindowedCompaction.tla lines 274-277
+            Property::always("TW-1: one window per split", |model: &CompactionModel, state: &CompactionState| {
+                let wd = model.window_duration;
+                state.object_storage.iter().all(|split| {
+                    split
+                        .rows
+                        .iter()
+                        .all(|row| window_start(row.timestamp, wd) == split.window_start)
+                })
+            }),
+            // TW-2: window_duration evenly divides one hour.
+            // Mirrors TimeWindowedCompaction.tla lines 283-284
+            Property::always("TW-2: duration divides hour", |model: &CompactionModel, _state: &CompactionState| {
+                model.hour_seconds % model.window_duration == 0
+            }),
+            // TW-3: No cross-window merges.
+            // Mirrors TimeWindowedCompaction.tla lines 295-305
+            Property::always("TW-3: no cross-window merge", |_model: &CompactionModel, state: &CompactionState| {
+                state.compaction_log.iter().all(|entry| {
+                    // All input window_starts are identical.
+                    let ws_values: BTreeSet<i64> =
+                        entry.input_window_starts.values().copied().collect();
+                    if ws_values.len() > 1 {
+                        return false;
+                    }
+                    // Output split (if in storage) matches.
+                    for s in &state.object_storage {
+                        if s.id == entry.output_split_id {
+                            for &input_ws in entry.input_window_starts.values() {
+                                if s.window_start != input_ws {
+                                    return false;
+                                }
+                            }
+                        }
+                    }
+                    true
+                })
+            }),
+            // CS-1: Only splits sharing scope may be merged.
+            // Mirrors TimeWindowedCompaction.tla lines 311-314
+            Property::always("CS-1: scope compatibility", |_model: &CompactionModel, state: &CompactionState| {
+                state.compaction_log.iter().all(|entry| {
+                    let scopes: BTreeSet<Scope> =
+                        entry.input_scopes.values().copied().collect();
+                    scopes.len() <= 1
+                })
+            }),
+            // CS-2: Same window_start within scope.
+            // Mirrors TimeWindowedCompaction.tla lines 320-323
+            Property::always("CS-2: same window_start", |_model: &CompactionModel, state: &CompactionState| {
+                state.compaction_log.iter().all(|entry| {
+                    let ws_values: BTreeSet<i64> =
+                        entry.input_window_starts.values().copied().collect();
+                    ws_values.len() <= 1
+                })
+            }),
+            // CS-3: Splits before compaction_start_time never compacted.
+            // Mirrors TimeWindowedCompaction.tla lines 329-332
+            Property::always("CS-3: compaction start time", |model: &CompactionModel, state: &CompactionState| {
+                let cst = model.compaction_start_time;
+                state.compaction_log.iter().all(|entry| {
+                    entry
+                        .input_window_starts
+                        .values()
+                        .all(|&ws| ws >= cst)
+                })
+            }),
+            // MC-1: Row multiset preserved (no add/remove/duplicate).
+            // Mirrors TimeWindowedCompaction.tla lines 339-344
+            Property::always("MC-1: row set preserved", |_model: &CompactionModel, state: &CompactionState| {
+                state.compaction_log.iter().all(|entry| {
+                    entry.input_point_ids == entry.output_point_ids
+                })
+            }),
+            // MC-2: Row contents unchanged through compaction.
+            // Mirrors TimeWindowedCompaction.tla lines 351-360
+            Property::always("MC-2: row contents preserved", |_model: &CompactionModel, state: &CompactionState| {
+                for split in &state.object_storage {
+                    for row in &split.rows {
+                        if let Some(original) = state.row_history.get(&row.point_id) {
+                            if row.timestamp != original.timestamp
+                                || row.sort_key != original.sort_key
+                                || row.columns != original.columns
+                                || row.values != original.values
+                            {
+                                return false;
+                            }
+                        } else {
+                            return false;
+                        }
+                    }
+                }
+                true
+            }),
+            // MC-3: Output is sorted.
+            // Mirrors TimeWindowedCompaction.tla lines 366-368
+            Property::always("MC-3: sort order preserved", |_model: &CompactionModel, state: &CompactionState| {
+                state.object_storage.iter().all(|split| {
+                    if split.sorted {
+                        is_sorted_by_key(&split.rows)
+                    } else {
+                        true
+                    }
+                })
+            }),
+            // MC-4: Column set is the union of inputs.
+            // Mirrors TimeWindowedCompaction.tla lines 376-378
+            Property::always("MC-4: column union", |_model: &CompactionModel, state: &CompactionState| {
+                state.compaction_log.iter().all(|entry| {
+                    entry.output_columns == entry.input_column_union
+                })
+            }),
+        ]
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn check_compaction_small() {
+        let model = CompactionModel::small();
+        model
+            .checker()
+            .spawn_bfs()
+            .join()
+            .assert_properties();
+    }
+
+    #[test]
+    fn window_start_computation() {
+        assert_eq!(window_start(0, 2), 0);
+        assert_eq!(window_start(1, 2), 0);
+        assert_eq!(window_start(2, 2), 2);
+        assert_eq!(window_start(3, 2), 2);
+        assert_eq!(window_start(5, 3), 3);
+    }
+
+    #[test]
+    fn merge_sort_basic() {
+        let r1 = CompactionRow {
+            point_id: 1,
+            timestamp: 0,
+            sort_key: 1,
+            columns: BTreeSet::new(),
+            values: BTreeMap::new(),
+        };
+        let r2 = CompactionRow {
+            point_id: 2,
+            timestamp: 0,
+            sort_key: 3,
+            columns: BTreeSet::new(),
+            values: BTreeMap::new(),
+        };
+        let r3 = CompactionRow {
+            point_id: 3,
+            timestamp: 0,
+            sort_key: 2,
+            columns: BTreeSet::new(),
+            values: BTreeMap::new(),
+        };
+        let merged = merge_sorted(&[r1.clone(), r2.clone()], std::slice::from_ref(&r3));
+        assert_eq!(merged.len(), 3);
+        assert_eq!(merged[0].point_id, 1);
+        assert_eq!(merged[1].point_id, 3);
+        assert_eq!(merged[2].point_id, 2);
+    }
+}
diff --git a/quickwit/quickwit-dst/tests/stateright_models.rs b/quickwit/quickwit-dst/tests/stateright_models.rs
new file mode 100644
index 00000000000..7b94fb38fd8
--- /dev/null
+++ b/quickwit/quickwit-dst/tests/stateright_models.rs
@@ -0,0 +1,77 @@
+// Copyright (C) 2024 Quickwit, Inc.
+//
+// Quickwit is offered under the AGPL v3.0 and as commercial software.
+// For commercial licensing, contact us at hello@quickwit.io.
+//
+// AGPL:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+//! Integration tests for stateright models.
+//!
+//! Each test runs exhaustive BFS model checking against the small TLA+ config
+//! equivalents and verifies all invariants hold.
+//!
+//! Requires `--features model-checking` to compile and run.
+
+#![cfg(feature = "model-checking")]
+
+use quickwit_dst::models::{
+    parquet_data_model::DataModelModel, sort_schema::SortSchemaModel,
+    time_windowed_compaction::CompactionModel,
+};
+use stateright::{Checker, Model};
+
+/// SS-1..SS-5: Sort schema invariants (ADR-002).
+/// Mirrors SortSchema_small.cfg: Columns={c1}, RowsPerSplitMax=2,
+/// SplitsMax=2, SchemaChangesMax=1.
+#[test]
+fn exhaustive_sort_schema() {
+    let model = SortSchemaModel::small();
+    let result = model.checker().spawn_bfs().join();
+    result.assert_properties();
+    println!(
+        "SortSchema: states={}, unique={}",
+        result.state_count(),
+        result.unique_state_count()
+    );
+}
+
+/// TW-1..TW-3, CS-1..CS-3, MC-1..MC-4: Compaction invariants (ADR-003).
+/// Mirrors TimeWindowedCompaction_small.cfg.
+#[test]
+fn exhaustive_compaction() {
+    let model = CompactionModel::small();
+    let result = model.checker().spawn_bfs().join();
+    result.assert_properties();
+    println!(
+        "Compaction: states={}, unique={}",
+        result.state_count(),
+        result.unique_state_count()
+    );
+}
+
+/// DM-1..DM-5: Data model invariants (ADR-001).
+/// Mirrors ParquetDataModel_small.cfg: Nodes={n1}, MetricNames={m1},
+/// TagSets={tags1}, Timestamps={1}, RequestCountMax=3.
+#[test]
+fn exhaustive_data_model() {
+    let model = DataModelModel::small();
+    let result = model.checker().spawn_bfs().join();
+    result.assert_properties();
+    println!(
+        "DataModel: states={}, unique={}",
+        result.state_count(),
+        result.unique_state_count()
+    );
+}
diff --git a/quickwit/quickwit-parquet-engine/Cargo.toml b/quickwit/quickwit-parquet-engine/Cargo.toml
index 39918c0948c..ca1e8d72150 100644
--- a/quickwit/quickwit-parquet-engine/Cargo.toml
+++ b/quickwit/quickwit-parquet-engine/Cargo.toml
@@ -18,6 +18,7 @@ chrono = { workspace = true }
 parquet = { workspace = true }
 prost = { workspace = true }
 quickwit-common = { workspace = true }
+quickwit-dst = { workspace = true }
 quickwit-proto = { workspace = true }
 sea-query = { workspace = true, optional = true }
 serde = { workspace = true }
diff --git a/quickwit/quickwit-parquet-engine/src/sort_fields/window.rs b/quickwit/quickwit-parquet-engine/src/sort_fields/window.rs
index b1ad896e31f..d34a3ec15ea 100644
--- a/quickwit/quickwit-parquet-engine/src/sort_fields/window.rs
+++ b/quickwit/quickwit-parquet-engine/src/sort_fields/window.rs
@@ -77,16 +77,19 @@ pub fn window_start(
     timestamp_secs: i64,
     duration_secs: i64,
 ) -> Result<DateTime<Utc>, SortFieldsError> {
-    debug_assert!(duration_secs > 0, "window duration must be positive");
-    // TW-2 (ADR-003): window duration must evenly divide one hour.
-    // This ensures window boundaries align across hours and days.
-    debug_assert!(
+    use quickwit_dst::check_invariant;
+    use quickwit_dst::invariants::InvariantId;
+
+    check_invariant!(InvariantId::TW2, duration_secs > 0, ": duration_secs must be positive");
+    check_invariant!(
+        InvariantId::TW2,
         3600 % duration_secs == 0,
-        "TW-2 violated: duration_secs={} does not divide 3600",
-        duration_secs
+        ": duration_secs={} does not divide 3600", duration_secs
+    );
+    let start_secs = quickwit_dst::invariants::window::window_start_secs(
+        timestamp_secs,
+        duration_secs,
     );
-    let remainder = timestamp_secs.rem_euclid(duration_secs);
-    let start_secs = timestamp_secs - remainder;
     DateTime::from_timestamp(start_secs, 0).ok_or(SortFieldsError::WindowStartOutOfRange {
         timestamp_secs: start_secs,
     })
diff --git a/quickwit/quickwit-parquet-engine/src/split/metadata.rs b/quickwit/quickwit-parquet-engine/src/split/metadata.rs
index 5bf85ed987b..d280d846d8e 100644
--- a/quickwit/quickwit-parquet-engine/src/split/metadata.rs
+++ b/quickwit/quickwit-parquet-engine/src/split/metadata.rs
@@ -467,21 +467,21 @@ impl MetricsSplitMetadataBuilder {
     pub fn build(self) -> MetricsSplitMetadata {
         // TW-2 (ADR-003): window_duration must evenly divide 3600.
         // Enforced at build time so no invalid metadata propagates to storage.
-        debug_assert!(
+        quickwit_dst::check_invariant!(
+            quickwit_dst::invariants::InvariantId::TW2,
             self.window_duration_secs == 0 || 3600 % self.window_duration_secs == 0,
-            "TW-2 violated: window_duration_secs={} does not divide 3600",
-            self.window_duration_secs
+            ": window_duration_secs={} does not divide 3600", self.window_duration_secs
         );
 
         // TW-1 (ADR-003, partial): window_start and window_duration_secs are paired.
         // If one is set, the other must be too. Pre-Phase-31 splits have both at defaults.
-        debug_assert!(
+        quickwit_dst::check_invariant!(
+            quickwit_dst::invariants::InvariantId::TW1,
             (self.window_start.is_none() && self.window_duration_secs == 0)
                 || (self.window_start.is_some() && self.window_duration_secs > 0),
-            "TW-1 violated: window_start and window_duration_secs must be set together \
+            ": window_start and window_duration_secs must be set together \
              (window_start={:?}, window_duration_secs={})",
-            self.window_start,
-            self.window_duration_secs
+            self.window_start, self.window_duration_secs
         );
 
         // Fuse the two builder fields into a single Range.
diff --git a/quickwit/quickwit-parquet-engine/src/split/postgres.rs b/quickwit/quickwit-parquet-engine/src/split/postgres.rs
index 0d1bbff184a..ca6b1b8e9ca 100644
--- a/quickwit/quickwit-parquet-engine/src/split/postgres.rs
+++ b/quickwit/quickwit-parquet-engine/src/split/postgres.rs
@@ -157,13 +157,33 @@ impl PgMetricsSplit {
         debug_assert_eq!(metadata.split_id.as_str(), self.split_id);
         debug_assert_eq!(metadata.time_range.start_secs, self.time_range_start as u64);
         debug_assert_eq!(metadata.time_range.end_secs, self.time_range_end as u64);
-        debug_assert_eq!(metadata.window_start(), self.window_start);
-        debug_assert_eq!(
-            metadata.window_duration_secs(),
-            self.window_duration_secs.unwrap_or(0) as u32
+
+        // SS-5 (SortSchema.tla): sort_fields must be identical in JSON metadata
+        // and the dedicated SQL column. Inconsistency would cause the compaction
+        // planner to select wrong splits or miss eligible ones.
+        quickwit_dst::check_invariant!(
+            quickwit_dst::invariants::InvariantId::SS5,
+            metadata.sort_fields == self.sort_fields,
+            ": sort_fields mismatch between JSON ('{}') and SQL column ('{}')",
+            metadata.sort_fields, self.sort_fields
+        );
+
+        // SS-5 continued: window_start must match between JSON and SQL column.
+        quickwit_dst::check_invariant!(
+            quickwit_dst::invariants::InvariantId::SS5,
+            metadata.window_start() == self.window_start,
+            ": window_start mismatch between JSON ({:?}) and SQL column ({:?})",
+            metadata.window_start(), self.window_start
+        );
+
+        // SS-4 (SortSchema.tla): sort_fields is immutable after write.
+        // We can't verify immutability at read time (no history available), but
+        // we verify the row_keys_proto round-trips consistently.
+        quickwit_dst::check_invariant!(
+            quickwit_dst::invariants::InvariantId::SS5,
+            metadata.row_keys_proto == self.row_keys,
+            ": row_keys_proto mismatch between JSON and SQL column"
         );
-        debug_assert_eq!(metadata.sort_fields, self.sort_fields);
-        debug_assert_eq!(metadata.num_merge_ops, self.num_merge_ops as u32);
 
         Ok(metadata)
     }
diff --git a/quickwit/quickwit-parquet-engine/src/storage/writer.rs b/quickwit/quickwit-parquet-engine/src/storage/writer.rs
index 3ec633573bc..60b39dd34c5 100644
--- a/quickwit/quickwit-parquet-engine/src/storage/writer.rs
+++ b/quickwit/quickwit-parquet-engine/src/storage/writer.rs
@@ -52,10 +52,12 @@ pub(crate) const PARQUET_META_ROW_KEYS_JSON: &str = "qh.row_keys_json";
 pub(crate) fn build_compaction_key_value_metadata(
     metadata: &MetricsSplitMetadata,
 ) -> Vec<KeyValue> {
-    // TW-2: window_duration must divide 3600.
-    debug_assert!(
+    // TW-2: window_duration must divide 3600 (also checked at build time,
+    // but belt-and-suspenders at the serialization boundary).
+    quickwit_dst::check_invariant!(
+        quickwit_dst::invariants::InvariantId::TW2,
         metadata.window_duration_secs() == 0 || 3600 % metadata.window_duration_secs() == 0,
-        "TW-2 violated at Parquet write: window_duration_secs={} does not divide 3600",
+        " at Parquet write: window_duration_secs={} does not divide 3600",
         metadata.window_duration_secs()
     );
 
@@ -116,18 +118,18 @@ fn verify_ss5_kv_consistency(metadata: &MetricsSplitMetadata, kvs: &[KeyValue])
     };
 
     if !metadata.sort_fields.is_empty() {
-        debug_assert_eq!(
-            find_kv(PARQUET_META_SORT_FIELDS),
-            Some(metadata.sort_fields.as_str()),
-            "SS-5 violated: sort_fields in kv_metadata does not match MetricsSplitMetadata"
+        quickwit_dst::check_invariant!(
+            quickwit_dst::invariants::InvariantId::SS5,
+            find_kv(PARQUET_META_SORT_FIELDS) == Some(metadata.sort_fields.as_str()),
+            ": sort_fields in kv_metadata does not match MetricsSplitMetadata"
         );
     }
 
     if let Some(ws) = metadata.window_start() {
-        debug_assert_eq!(
-            find_kv(PARQUET_META_WINDOW_START),
-            Some(ws.to_string()).as_deref(),
-            "SS-5 violated: window_start in kv_metadata does not match MetricsSplitMetadata"
+        quickwit_dst::check_invariant!(
+            quickwit_dst::invariants::InvariantId::SS5,
+            find_kv(PARQUET_META_WINDOW_START) == Some(ws.to_string()).as_deref(),
+            ": window_start in kv_metadata does not match MetricsSplitMetadata"
         );
     }
 }
@@ -283,11 +285,10 @@ impl ParquetWriter {
                 let verify_indices = lexsort_to_indices(&verify_columns, None)
                     .expect("SS-1 verification sort failed");
                 for i in 0..verify_indices.len() {
-                    debug_assert_eq!(
-                        verify_indices.value(i) as usize,
-                        i,
-                        "SS-1 violated: row {} is out of sort order after sort_batch()",
-                        i
+                    quickwit_dst::check_invariant!(
+                        quickwit_dst::invariants::InvariantId::SS1,
+                        verify_indices.value(i) as usize == i,
+                        ": row {} is out of sort order after sort_batch()", i
                     );
                 }
             }

From 70793f6d68d5a916b80922bbd34e163ed7970d8b Mon Sep 17 00:00:00 2001
From: George Talbot <george.talbot@datadoghq.com>
Date: Thu, 12 Mar 2026 16:36:01 -0400
Subject: [PATCH 25/35] feat(31): check invariants in release builds, add
 pluggable recorder
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The check_invariant! macro now always evaluates the condition — not just
in debug builds. This implements Layer 4 (Production) of the verification
stack: invariant checks run in release, with results forwarded to a
pluggable InvariantRecorder for Datadog metrics emission.

- Debug builds: panic on violation (debug_assert, Layer 3)
- All builds: evaluate condition, call recorder (Layer 4)
- set_invariant_recorder() wires up statsd at process startup
- No recorder registered = no-op (single OnceLock load)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 quickwit/quickwit-dst/src/invariants/check.rs |  36 +++--
 quickwit/quickwit-dst/src/invariants/mod.rs   |   2 +
 .../quickwit-dst/src/invariants/recorder.rs   | 130 ++++++++++++++++++
 3 files changed, 157 insertions(+), 11 deletions(-)
 create mode 100644 quickwit/quickwit-dst/src/invariants/recorder.rs

diff --git a/quickwit/quickwit-dst/src/invariants/check.rs b/quickwit/quickwit-dst/src/invariants/check.rs
index 06c59ee9a3c..6c65377ae63 100644
--- a/quickwit/quickwit-dst/src/invariants/check.rs
+++ b/quickwit/quickwit-dst/src/invariants/check.rs
@@ -17,13 +17,23 @@
 // You should have received a copy of the GNU Affero General Public License
 // along with this program. If not, see <http://www.gnu.org/licenses/>.
 
-//! Invariant checking macro.
+//! Invariant checking macro — Layers 3 + 4 of the verification stack.
 //!
-//! Wraps `debug_assert!` with the invariant ID, providing a single hook point
-//! for future Datadog metrics emission (Layer 4 of the verification stack).
+//! The condition is **always evaluated** (debug and release). Results are:
+//!
+//! - **Debug builds (Layer 3 — Prevention):** panics on violation via
+//!   `debug_assert!`, catching bugs during development and testing.
+//! - **All builds (Layer 4 — Production):** forwards the result to the
+//!   registered [`InvariantRecorder`](super::recorder::InvariantRecorder)
+//!   for Datadog metrics emission. No-op if no recorder is set.
 
-/// Check an invariant condition. In debug builds, panics on violation.
-/// In release builds, currently a no-op (future: emit Datadog metric).
+/// Check an invariant condition in all build profiles.
+///
+/// The condition is always evaluated. In debug builds, a violation panics.
+/// In all builds, the result is forwarded to the registered invariant
+/// recorder for metrics emission (see [`set_invariant_recorder`]).
+///
+/// [`set_invariant_recorder`]: crate::invariants::set_invariant_recorder
 ///
 /// # Examples
 ///
@@ -36,10 +46,14 @@
 /// ```
 #[macro_export]
 macro_rules! check_invariant {
-    ($id:expr, $cond:expr) => {
-        debug_assert!($cond, "{} violated", $id);
-    };
-    ($id:expr, $cond:expr, $fmt:literal $($arg:tt)*) => {
-        debug_assert!($cond, concat!("{} violated", $fmt), $id $($arg)*);
-    };
+    ($id:expr, $cond:expr) => {{
+        let passed = $cond;
+        $crate::invariants::record_invariant_check($id, passed);
+        debug_assert!(passed, "{} violated", $id);
+    }};
+    ($id:expr, $cond:expr, $fmt:literal $($arg:tt)*) => {{
+        let passed = $cond;
+        $crate::invariants::record_invariant_check($id, passed);
+        debug_assert!(passed, concat!("{} violated", $fmt), $id $($arg)*);
+    }};
 }
diff --git a/quickwit/quickwit-dst/src/invariants/mod.rs b/quickwit/quickwit-dst/src/invariants/mod.rs
index b33ed7849c9..b9679683a7e 100644
--- a/quickwit/quickwit-dst/src/invariants/mod.rs
+++ b/quickwit/quickwit-dst/src/invariants/mod.rs
@@ -26,8 +26,10 @@
 //! No external dependencies — only `std`.
 
 mod check;
+pub mod recorder;
 pub mod registry;
 pub mod sort;
 pub mod window;
 
+pub use recorder::{record_invariant_check, set_invariant_recorder};
 pub use registry::InvariantId;
diff --git a/quickwit/quickwit-dst/src/invariants/recorder.rs b/quickwit/quickwit-dst/src/invariants/recorder.rs
new file mode 100644
index 00000000000..f3859e03849
--- /dev/null
+++ b/quickwit/quickwit-dst/src/invariants/recorder.rs
@@ -0,0 +1,130 @@
+// Copyright (C) 2024 Quickwit, Inc.
+//
+// Quickwit is offered under the AGPL v3.0 and as commercial software.
+// For commercial licensing, contact us at hello@quickwit.io.
+//
+// AGPL:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+//! Pluggable invariant recorder — Layer 4 of the verification stack.
+//!
+//! Every call to [`check_invariant!`](crate::check_invariant) evaluates the
+//! condition in **all** build profiles (debug and release). The result is
+//! forwarded to a recorder function that can emit Datadog metrics, log
+//! violations, or take any other action.
+//!
+//! # Wiring up Datadog metrics
+//!
+//! Call [`set_invariant_recorder`] once at process startup:
+//!
+//! ```rust
+//! use quickwit_dst::invariants::{InvariantId, set_invariant_recorder};
+//!
+//! fn my_recorder(id: InvariantId, passed: bool) {
+//!     // statsd.count("pomsky.invariant.checked", 1, &[&format!("name:{}", id)]);
+//!     // if !passed {
+//!     //     statsd.count("pomsky.invariant.violated", 1, &[&format!("name:{}", id)]);
+//!     // }
+//!     if !passed {
+//!         eprintln!("{} violated in production", id);
+//!     }
+//! }
+//!
+//! set_invariant_recorder(my_recorder);
+//! ```
+
+use std::sync::OnceLock;
+
+use super::InvariantId;
+
+/// Signature for an invariant recorder function.
+///
+/// Called on every `check_invariant!` invocation with the invariant ID and
+/// whether the check passed. Implementations must be cheap — this is called
+/// on hot paths.
+pub type InvariantRecorder = fn(InvariantId, bool);
+
+/// Global recorder. When unset, [`record_invariant_check`] is a no-op.
+static RECORDER: OnceLock<InvariantRecorder> = OnceLock::new();
+
+/// Register a global invariant recorder.
+///
+/// Should be called once at process startup. Subsequent calls are ignored
+/// (first writer wins). This is safe to call from any thread.
+pub fn set_invariant_recorder(recorder: InvariantRecorder) {
+    // OnceLock::set returns Err if already initialized — that's fine.
+    let _ = RECORDER.set(recorder);
+}
+
+/// Record an invariant check result.
+///
+/// Called by [`check_invariant!`](crate::check_invariant) on every invocation,
+/// in both debug and release builds. If no recorder has been registered via
+/// [`set_invariant_recorder`], this is a no-op (single atomic load).
+#[inline]
+pub fn record_invariant_check(id: InvariantId, passed: bool) {
+    if let Some(recorder) = RECORDER.get() {
+        recorder(id, passed);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::atomic::{AtomicU32, Ordering};
+
+    use super::*;
+
+    // Note: these tests use a process-global OnceLock, so only the first
+    // test to run can set the recorder. We test the no-recorder path and
+    // the recorder path in a single test to avoid ordering issues.
+
+    #[test]
+    fn record_without_recorder_is_noop() {
+        // Before any recorder is set, this should not panic.
+        // (In the test binary, another test may have set it, so we just
+        // verify it doesn't panic either way.)
+        record_invariant_check(InvariantId::SS1, true);
+        record_invariant_check(InvariantId::SS1, false);
+    }
+
+    #[test]
+    fn recorder_receives_calls() {
+        static CHECKS: AtomicU32 = AtomicU32::new(0);
+        static VIOLATIONS: AtomicU32 = AtomicU32::new(0);
+
+        fn test_recorder(_id: InvariantId, passed: bool) {
+            CHECKS.fetch_add(1, Ordering::Relaxed);
+            if !passed {
+                VIOLATIONS.fetch_add(1, Ordering::Relaxed);
+            }
+        }
+
+        // May fail if another test already set the recorder — that's OK,
+        // the test still verifies the function doesn't panic.
+        let _ = RECORDER.set(test_recorder);
+
+        let before_checks = CHECKS.load(Ordering::Relaxed);
+        let before_violations = VIOLATIONS.load(Ordering::Relaxed);
+
+        record_invariant_check(InvariantId::TW2, true);
+        record_invariant_check(InvariantId::TW2, false);
+
+        // If our recorder was set, we should see the increments.
+        // If another recorder was set first, we can't assert on counts.
+        if RECORDER.get() == Some(&(test_recorder as InvariantRecorder)) {
+            assert_eq!(CHECKS.load(Ordering::Relaxed), before_checks + 2);
+            assert_eq!(VIOLATIONS.load(Ordering::Relaxed), before_violations + 1);
+        }
+    }
+}

From 1e1e328da44e0dbfafd7e535abfcd9230f247d46 Mon Sep 17 00:00:00 2001
From: George Talbot <george.talbot@datadoghq.com>
Date: Thu, 12 Mar 2026 16:50:49 -0400
Subject: [PATCH 26/35] feat(31): wire invariant recorder to DogStatsD metrics

Emit cloudprem.pomsky.invariant.checked and .violated counters with
invariant label via the metrics crate / DogStatsD exporter at process
startup, completing Layer 4 of the verification stack.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 quickwit/Cargo.lock                           | 160 +++++++++++++++++-
 quickwit/Cargo.toml                           |   2 +
 quickwit/quickwit-cli/Cargo.toml              |   4 +
 quickwit/quickwit-cli/src/logger.rs           |  59 +++++++
 quickwit/quickwit-cli/src/main.rs             |   6 +
 .../quickwit-dst/src/invariants/registry.rs   |  60 ++++---
 6 files changed, 263 insertions(+), 28 deletions(-)

diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock
index df969d83d7e..020b281625f 100644
--- a/quickwit/Cargo.lock
+++ b/quickwit/Cargo.lock
@@ -407,6 +407,12 @@ dependencies = [
  "regex-syntax",
 ]
 
+[[package]]
+name = "ascii"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d92bec98840b8f03a5ff5413de5293bfcd8bf96467cf5452609f939ec6f5de16"
+
 [[package]]
 name = "ascii-canvas"
 version = "4.0.0"
@@ -1808,6 +1814,12 @@ dependencies = [
  "zstd",
 ]
 
+[[package]]
+name = "choice"
+version = "0.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a3b71fc821deaf602a933ada5c845d088156d0cdf2ebf43ede390afe93466553"
+
 [[package]]
 name = "chrono"
 version = "0.4.44"
@@ -1832,6 +1844,12 @@ dependencies = [
  "phf",
 ]
 
+[[package]]
+name = "chunked_transfer"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e4de3bc4ea267985becf712dc6d9eed8b04c953b3fcfb339ebc87acd9804901"
+
 [[package]]
 name = "ciborium"
 version = "0.2.2"
@@ -2883,6 +2901,12 @@ dependencies = [
  "cfg-if",
 ]
 
+[[package]]
+name = "endian-type"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d"
+
 [[package]]
 name = "enum-iterator"
 version = "2.3.0"
@@ -4183,6 +4207,12 @@ version = "2.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954"
 
+[[package]]
+name = "id-set"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9633fadf6346456cf8531119ba4838bc6d82ac4ce84d9852126dd2aa34d49264"
+
 [[package]]
 name = "ident_case"
 version = "1.0.1"
@@ -4934,6 +4964,51 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "metrics"
+version = "0.24.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5d5312e9ba3771cfa961b585728215e3d972c950a3eed9252aa093d6301277e8"
+dependencies = [
+ "ahash",
+ "portable-atomic",
+]
+
+[[package]]
+name = "metrics-exporter-dogstatsd"
+version = "0.9.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "961f3712d8a7cfe14caaf74c3af503fe701cee6439ff49a7a3ebd04bf49c0502"
+dependencies = [
+ "bytes",
+ "itoa",
+ "metrics",
+ "metrics-util",
+ "ryu",
+ "thiserror 2.0.18",
+ "tracing",
+]
+
+[[package]]
+name = "metrics-util"
+version = "0.20.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cdfb1365fea27e6dd9dc1dbc19f570198bc86914533ad639dae939635f096be4"
+dependencies = [
+ "aho-corasick",
+ "crossbeam-epoch",
+ "crossbeam-utils",
+ "hashbrown 0.16.1",
+ "indexmap 2.13.0",
+ "metrics",
+ "ordered-float 5.3.0",
+ "quanta",
+ "radix_trie",
+ "rand 0.9.2",
+ "rand_xoshiro",
+ "sketches-ddsketch 0.3.1",
+]
+
 [[package]]
 name = "mime"
 version = "0.3.17"
@@ -5124,6 +5199,15 @@ dependencies = [
  "regex",
 ]
 
+[[package]]
+name = "nibble_vec"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77a5d83df9f36fe23f0c3648c6bbb8b0298bb5f1939c8f2704431371f4b84d43"
+dependencies = [
+ "smallvec",
+]
+
 [[package]]
 name = "nix"
 version = "0.26.4"
@@ -5141,6 +5225,12 @@ version = "0.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "43794a0ace135be66a25d3ae77d41b91615fb68ae937f904090203e81f755b65"
 
+[[package]]
+name = "nohash-hasher"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2bf50223579dc7cdcfb3bfcacf7069ff68243f8c363f62ffa99cf000a6b9c451"
+
 [[package]]
 name = "nom"
 version = "7.1.3"
@@ -6975,6 +7065,8 @@ dependencies = [
  "humantime",
  "indicatif",
  "itertools 0.14.0",
+ "metrics",
+ "metrics-exporter-dogstatsd",
  "numfmt",
  "once_cell",
  "openssl-probe 0.1.6",
@@ -6987,6 +7079,7 @@ dependencies = [
  "quickwit-cluster",
  "quickwit-common",
  "quickwit-config",
+ "quickwit-dst",
  "quickwit-index-management",
  "quickwit-indexing",
  "quickwit-ingest",
@@ -7251,6 +7344,13 @@ dependencies = [
  "utoipa",
 ]
 
+[[package]]
+name = "quickwit-dst"
+version = "0.8.0"
+dependencies = [
+ "stateright",
+]
+
 [[package]]
 name = "quickwit-index-management"
 version = "0.8.0"
@@ -7617,6 +7717,7 @@ dependencies = [
  "proptest",
  "prost 0.14.3",
  "quickwit-common",
+ "quickwit-dst",
  "quickwit-proto",
  "sea-query",
  "serde",
@@ -7988,6 +8089,16 @@ version = "6.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf"
 
+[[package]]
+name = "radix_trie"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c069c179fcdc6a2fe24d8d18305cf085fdbd4f922c041943e203685d6a1c58fd"
+dependencies = [
+ "endian-type",
+ "nibble_vec",
+]
+
 [[package]]
 name = "rand"
 version = "0.7.3"
@@ -8124,6 +8235,15 @@ dependencies = [
  "rand_core 0.9.5",
 ]
 
+[[package]]
+name = "rand_xoshiro"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f703f4665700daf5512dcca5f43afa6af89f09db47fb56be587f80636bda2d41"
+dependencies = [
+ "rand_core 0.9.5",
+]
+
 [[package]]
 name = "raw-cpuid"
 version = "11.6.0"
@@ -9331,6 +9451,12 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "sketches-ddsketch"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c6f73aeb92d671e0cc4dca167e59b2deb6387c375391bc99ee743f326994a2b"
+
 [[package]]
 name = "slab"
 version = "0.4.12"
@@ -9662,6 +9788,26 @@ version = "1.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
 
+[[package]]
+name = "stateright"
+version = "0.30.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebd37c74ff38ca9e5d370efb7af3c49ecab91cb5644affa23dc54a061d0f3a59"
+dependencies = [
+ "ahash",
+ "choice",
+ "crossbeam-utils",
+ "dashmap 5.5.3",
+ "id-set",
+ "log",
+ "nohash-hasher",
+ "parking_lot 0.12.5",
+ "rand 0.8.5",
+ "serde",
+ "serde_json",
+ "tiny_http",
+]
+
 [[package]]
 name = "static_assertions"
 version = "1.1.0"
@@ -9873,7 +10019,7 @@ dependencies = [
  "rustc-hash",
  "serde",
  "serde_json",
- "sketches-ddsketch",
+ "sketches-ddsketch 0.3.0",
  "smallvec",
  "tantivy-bitpacker",
  "tantivy-columnar",
@@ -10177,6 +10323,18 @@ dependencies = [
  "crunchy",
 ]
 
+[[package]]
+name = "tiny_http"
+version = "0.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "389915df6413a2e74fb181895f933386023c71110878cd0825588928e64cdc82"
+dependencies = [
+ "ascii",
+ "chunked_transfer",
+ "httpdate",
+ "log",
+]
+
 [[package]]
 name = "tinystr"
 version = "0.8.2"
diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml
index cfd732a61f7..fc73ec93684 100644
--- a/quickwit/Cargo.toml
+++ b/quickwit/Cargo.toml
@@ -156,6 +156,8 @@ libz-sys = "1.1"
 lru = "0.16"
 matches = "0.1"
 md5 = "0.8"
+metrics = "0.24"
+metrics-exporter-dogstatsd = "0.9"
 mime_guess = "2.0"
 mini-moka = "0.10.3"
 mockall = "0.14"
diff --git a/quickwit/quickwit-cli/Cargo.toml b/quickwit/quickwit-cli/Cargo.toml
index 5d9dc955107..5af056d0820 100644
--- a/quickwit/quickwit-cli/Cargo.toml
+++ b/quickwit/quickwit-cli/Cargo.toml
@@ -56,10 +56,14 @@ tracing = { workspace = true }
 tracing-opentelemetry = { workspace = true }
 tracing-subscriber = { workspace = true }
 
+metrics = { workspace = true }
+metrics-exporter-dogstatsd = { workspace = true }
+
 quickwit-actors = { workspace = true }
 quickwit-cluster = { workspace = true }
 quickwit-common = { workspace = true }
 quickwit-config = { workspace = true }
+quickwit-dst = { workspace = true }
 quickwit-index-management = { workspace = true }
 quickwit-indexing = { workspace = true }
 quickwit-ingest = { workspace = true }
diff --git a/quickwit/quickwit-cli/src/logger.rs b/quickwit/quickwit-cli/src/logger.rs
index e1e60a14f93..fe91769b6cc 100644
--- a/quickwit/quickwit-cli/src/logger.rs
+++ b/quickwit/quickwit-cli/src/logger.rs
@@ -161,6 +161,65 @@ pub fn setup_logging_and_tracing(
     ))
 }
 
+#[cfg(not(any(test, feature = "testsuite")))]
+pub fn setup_dogstatsd_exporter(build_info: &BuildInfo) -> anyhow::Result<()> {
+    // Reading both `CLOUDPREM_*` and `CP_*` env vars for backward compatibility. The former is
+    // deprecated and can be removed after 2026-04-01.
+    let host: String = quickwit_common::get_from_env_opt("CLOUDPREM_DOGSTATSD_SERVER_HOST", false)
+        .unwrap_or_else(|| {
+            quickwit_common::get_from_env(
+                "CP_DOGSTATSD_SERVER_HOST",
+                "127.0.0.1".to_string(),
+                false,
+            )
+        });
+    let port: u16 = quickwit_common::get_from_env_opt("CLOUDPREM_DOGSTATSD_SERVER_PORT", false)
+        .unwrap_or_else(|| quickwit_common::get_from_env("CP_DOGSTATSD_SERVER_PORT", 8125, false));
+    let addr = format!("{host}:{port}");
+
+    let mut global_labels = vec![::metrics::Label::new("version", build_info.version.clone())];
+    let keys = [
+        ("IMAGE_NAME", "image_name"),
+        ("IMAGE_TAG", "image_tag"),
+        ("KUBERNETES_COMPONENT", "kube_component"),
+        ("KUBERNETES_NAMESPACE", "kube_namespace"),
+        ("KUBERNETES_POD_NAME", "kube_pod_name"),
+        ("QW_CLUSTER_ID", "cloudprem_cluster_id"),
+        ("QW_NODE_ID", "cloudprem_node_id"),
+    ];
+    for (env_var_key, label_key) in keys {
+        if let Some(label_val) = quickwit_common::get_from_env_opt::<String>(env_var_key, false) {
+            global_labels.push(::metrics::Label::new(label_key, label_val));
+        }
+    }
+    metrics_exporter_dogstatsd::DogStatsDBuilder::default()
+        .set_global_prefix("cloudprem")
+        .with_global_labels(global_labels)
+        .with_remote_address(addr)
+        .context("failed to parse DogStatsD server address")?
+        .install()
+        .context("failed to register DogStatsD exporter")?;
+    Ok(())
+}
+
+/// Register the invariant recorder that emits DogStatsD counters.
+///
+/// Must be called after [`setup_dogstatsd_exporter`] so the `metrics` crate
+/// has a registered recorder.
+#[cfg(not(any(test, feature = "testsuite")))]
+pub fn setup_invariant_recorder() {
+    quickwit_dst::invariants::set_invariant_recorder(invariant_recorder);
+}
+
+#[cfg(not(any(test, feature = "testsuite")))]
+fn invariant_recorder(id: quickwit_dst::invariants::InvariantId, passed: bool) {
+    let name = id.as_str();
+    metrics::counter!("pomsky.invariant.checked", "invariant" => name).increment(1);
+    if !passed {
+        metrics::counter!("pomsky.invariant.violated", "invariant" => name).increment(1);
+    }
+}
+
 /// We do not rely on the RFC3339 implementation, because it has a nanosecond precision.
 /// See discussion here: https://github.com/time-rs/time/discussions/418
 fn time_formatter() -> UtcTime<Vec<BorrowedFormatItem<'static>>> {
diff --git a/quickwit/quickwit-cli/src/main.rs b/quickwit/quickwit-cli/src/main.rs
index 4a1f9ce036e..38b696d5037 100644
--- a/quickwit/quickwit-cli/src/main.rs
+++ b/quickwit/quickwit-cli/src/main.rs
@@ -101,6 +101,12 @@ async fn main_impl() -> anyhow::Result<()> {
     let (env_filter_reload_fn, tracer_provider_opt) =
         setup_logging_and_tracing(command.default_log_level(), ansi_colors, build_info)?;
 
+    #[cfg(not(any(test, feature = "testsuite")))]
+    quickwit_cli::logger::setup_dogstatsd_exporter(build_info)?;
+
+    #[cfg(not(any(test, feature = "testsuite")))]
+    quickwit_cli::logger::setup_invariant_recorder();
+
     let return_code: i32 = if let Err(command_error) = command.execute(env_filter_reload_fn).await {
         error!(error=%command_error, "command failed");
         eprintln!(
diff --git a/quickwit/quickwit-dst/src/invariants/registry.rs b/quickwit/quickwit-dst/src/invariants/registry.rs
index 9704f0da891..cccc1480f6f 100644
--- a/quickwit/quickwit-dst/src/invariants/registry.rs
+++ b/quickwit/quickwit-dst/src/invariants/registry.rs
@@ -82,6 +82,38 @@ pub enum InvariantId {
 }
 
 impl InvariantId {
+    /// Short identifier string (e.g. `"SS-1"`).
+    ///
+    /// Returns a `&'static str` to avoid allocation on the hot path.
+    pub fn as_str(self) -> &'static str {
+        match self {
+            Self::SS1 => "SS-1",
+            Self::SS2 => "SS-2",
+            Self::SS3 => "SS-3",
+            Self::SS4 => "SS-4",
+            Self::SS5 => "SS-5",
+
+            Self::TW1 => "TW-1",
+            Self::TW2 => "TW-2",
+            Self::TW3 => "TW-3",
+
+            Self::CS1 => "CS-1",
+            Self::CS2 => "CS-2",
+            Self::CS3 => "CS-3",
+
+            Self::MC1 => "MC-1",
+            Self::MC2 => "MC-2",
+            Self::MC3 => "MC-3",
+            Self::MC4 => "MC-4",
+
+            Self::DM1 => "DM-1",
+            Self::DM2 => "DM-2",
+            Self::DM3 => "DM-3",
+            Self::DM4 => "DM-4",
+            Self::DM5 => "DM-5",
+        }
+    }
+
     /// Human-readable description of this invariant.
     pub fn description(self) -> &'static str {
         match self {
@@ -115,33 +147,7 @@ impl InvariantId {
 
 impl fmt::Display for InvariantId {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        let s = match self {
-            Self::SS1 => "SS-1",
-            Self::SS2 => "SS-2",
-            Self::SS3 => "SS-3",
-            Self::SS4 => "SS-4",
-            Self::SS5 => "SS-5",
-
-            Self::TW1 => "TW-1",
-            Self::TW2 => "TW-2",
-            Self::TW3 => "TW-3",
-
-            Self::CS1 => "CS-1",
-            Self::CS2 => "CS-2",
-            Self::CS3 => "CS-3",
-
-            Self::MC1 => "MC-1",
-            Self::MC2 => "MC-2",
-            Self::MC3 => "MC-3",
-            Self::MC4 => "MC-4",
-
-            Self::DM1 => "DM-1",
-            Self::DM2 => "DM-2",
-            Self::DM3 => "DM-3",
-            Self::DM4 => "DM-4",
-            Self::DM5 => "DM-5",
-        };
-        f.write_str(s)
+        f.write_str(self.as_str())
     }
 }
 

From f8ee0cd2a80b552db8b93cfdb47550b52f24a501 Mon Sep 17 00:00:00 2001
From: George Talbot <george.talbot@datadoghq.com>
Date: Wed, 1 Apr 2026 14:59:03 -0400
Subject: [PATCH 27/35] fix: license headers + cfg(not(test)) for quickwit-dst
 and quickwit-cli

---
 quickwit/quickwit-cli/src/logger.rs           |  6 ++---
 quickwit/quickwit-cli/src/main.rs             |  4 +--
 quickwit/quickwit-dst/src/invariants/check.rs | 25 ++++++++-----------
 quickwit/quickwit-dst/src/invariants/mod.rs   | 25 ++++++++-----------
 .../quickwit-dst/src/invariants/recorder.rs   | 25 ++++++++-----------
 .../quickwit-dst/src/invariants/registry.rs   | 25 ++++++++-----------
 quickwit/quickwit-dst/src/invariants/sort.rs  | 25 ++++++++-----------
 .../quickwit-dst/src/invariants/window.rs     | 25 ++++++++-----------
 quickwit/quickwit-dst/src/lib.rs              | 25 ++++++++-----------
 quickwit/quickwit-dst/src/models/mod.rs       | 25 ++++++++-----------
 .../src/models/parquet_data_model.rs          | 25 ++++++++-----------
 .../quickwit-dst/src/models/sort_schema.rs    | 25 ++++++++-----------
 .../src/models/time_windowed_compaction.rs    | 25 ++++++++-----------
 13 files changed, 115 insertions(+), 170 deletions(-)

diff --git a/quickwit/quickwit-cli/src/logger.rs b/quickwit/quickwit-cli/src/logger.rs
index fe91769b6cc..a1d1dda8197 100644
--- a/quickwit/quickwit-cli/src/logger.rs
+++ b/quickwit/quickwit-cli/src/logger.rs
@@ -161,7 +161,7 @@ pub fn setup_logging_and_tracing(
     ))
 }
 
-#[cfg(not(any(test, feature = "testsuite")))]
+#[cfg(not(test))]
 pub fn setup_dogstatsd_exporter(build_info: &BuildInfo) -> anyhow::Result<()> {
     // Reading both `CLOUDPREM_*` and `CP_*` env vars for backward compatibility. The former is
     // deprecated and can be removed after 2026-04-01.
@@ -206,12 +206,12 @@ pub fn setup_dogstatsd_exporter(build_info: &BuildInfo) -> anyhow::Result<()> {
 ///
 /// Must be called after [`setup_dogstatsd_exporter`] so the `metrics` crate
 /// has a registered recorder.
-#[cfg(not(any(test, feature = "testsuite")))]
+#[cfg(not(test))]
 pub fn setup_invariant_recorder() {
     quickwit_dst::invariants::set_invariant_recorder(invariant_recorder);
 }
 
-#[cfg(not(any(test, feature = "testsuite")))]
+#[cfg(not(test))]
 fn invariant_recorder(id: quickwit_dst::invariants::InvariantId, passed: bool) {
     let name = id.as_str();
     metrics::counter!("pomsky.invariant.checked", "invariant" => name).increment(1);
diff --git a/quickwit/quickwit-cli/src/main.rs b/quickwit/quickwit-cli/src/main.rs
index 38b696d5037..a448d543eb9 100644
--- a/quickwit/quickwit-cli/src/main.rs
+++ b/quickwit/quickwit-cli/src/main.rs
@@ -101,10 +101,10 @@ async fn main_impl() -> anyhow::Result<()> {
     let (env_filter_reload_fn, tracer_provider_opt) =
         setup_logging_and_tracing(command.default_log_level(), ansi_colors, build_info)?;
 
-    #[cfg(not(any(test, feature = "testsuite")))]
+    #[cfg(not(test))]
     quickwit_cli::logger::setup_dogstatsd_exporter(build_info)?;
 
-    #[cfg(not(any(test, feature = "testsuite")))]
+    #[cfg(not(test))]
     quickwit_cli::logger::setup_invariant_recorder();
 
     let return_code: i32 = if let Err(command_error) = command.execute(env_filter_reload_fn).await {
diff --git a/quickwit/quickwit-dst/src/invariants/check.rs b/quickwit/quickwit-dst/src/invariants/check.rs
index 6c65377ae63..3db376027b1 100644
--- a/quickwit/quickwit-dst/src/invariants/check.rs
+++ b/quickwit/quickwit-dst/src/invariants/check.rs
@@ -1,21 +1,16 @@
-// Copyright (C) 2024 Quickwit, Inc.
+// Copyright 2021-Present Datadog, Inc.
 //
-// Quickwit is offered under the AGPL v3.0 and as commercial software.
-// For commercial licensing, contact us at hello@quickwit.io.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
 //
-// AGPL:
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU Affero General Public License as
-// published by the Free Software Foundation, either version 3 of the
-// License, or (at your option) any later version.
+//     http://www.apache.org/licenses/LICENSE-2.0
 //
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU Affero General Public License for more details.
-//
-// You should have received a copy of the GNU Affero General Public License
-// along with this program. If not, see <http://www.gnu.org/licenses/>.
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 
 //! Invariant checking macro — Layers 3 + 4 of the verification stack.
 //!
diff --git a/quickwit/quickwit-dst/src/invariants/mod.rs b/quickwit/quickwit-dst/src/invariants/mod.rs
index b9679683a7e..88a731eb730 100644
--- a/quickwit/quickwit-dst/src/invariants/mod.rs
+++ b/quickwit/quickwit-dst/src/invariants/mod.rs
@@ -1,21 +1,16 @@
-// Copyright (C) 2024 Quickwit, Inc.
+// Copyright 2021-Present Datadog, Inc.
 //
-// Quickwit is offered under the AGPL v3.0 and as commercial software.
-// For commercial licensing, contact us at hello@quickwit.io.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
 //
-// AGPL:
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU Affero General Public License as
-// published by the Free Software Foundation, either version 3 of the
-// License, or (at your option) any later version.
+//     http://www.apache.org/licenses/LICENSE-2.0
 //
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU Affero General Public License for more details.
-//
-// You should have received a copy of the GNU Affero General Public License
-// along with this program. If not, see <http://www.gnu.org/licenses/>.
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 
 //! Shared invariant definitions — the single source of truth.
 //!
diff --git a/quickwit/quickwit-dst/src/invariants/recorder.rs b/quickwit/quickwit-dst/src/invariants/recorder.rs
index f3859e03849..09432ad9045 100644
--- a/quickwit/quickwit-dst/src/invariants/recorder.rs
+++ b/quickwit/quickwit-dst/src/invariants/recorder.rs
@@ -1,21 +1,16 @@
-// Copyright (C) 2024 Quickwit, Inc.
+// Copyright 2021-Present Datadog, Inc.
 //
-// Quickwit is offered under the AGPL v3.0 and as commercial software.
-// For commercial licensing, contact us at hello@quickwit.io.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
 //
-// AGPL:
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU Affero General Public License as
-// published by the Free Software Foundation, either version 3 of the
-// License, or (at your option) any later version.
+//     http://www.apache.org/licenses/LICENSE-2.0
 //
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU Affero General Public License for more details.
-//
-// You should have received a copy of the GNU Affero General Public License
-// along with this program. If not, see <http://www.gnu.org/licenses/>.
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 
 //! Pluggable invariant recorder — Layer 4 of the verification stack.
 //!
diff --git a/quickwit/quickwit-dst/src/invariants/registry.rs b/quickwit/quickwit-dst/src/invariants/registry.rs
index cccc1480f6f..f0d9d2bf9d8 100644
--- a/quickwit/quickwit-dst/src/invariants/registry.rs
+++ b/quickwit/quickwit-dst/src/invariants/registry.rs
@@ -1,21 +1,16 @@
-// Copyright (C) 2024 Quickwit, Inc.
+// Copyright 2021-Present Datadog, Inc.
 //
-// Quickwit is offered under the AGPL v3.0 and as commercial software.
-// For commercial licensing, contact us at hello@quickwit.io.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
 //
-// AGPL:
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU Affero General Public License as
-// published by the Free Software Foundation, either version 3 of the
-// License, or (at your option) any later version.
+//     http://www.apache.org/licenses/LICENSE-2.0
 //
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU Affero General Public License for more details.
-//
-// You should have received a copy of the GNU Affero General Public License
-// along with this program. If not, see <http://www.gnu.org/licenses/>.
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 
 //! Invariant ID catalog — single source of truth for all invariant definitions.
 //!
diff --git a/quickwit/quickwit-dst/src/invariants/sort.rs b/quickwit/quickwit-dst/src/invariants/sort.rs
index 68a4ad05cb7..a79be010de1 100644
--- a/quickwit/quickwit-dst/src/invariants/sort.rs
+++ b/quickwit/quickwit-dst/src/invariants/sort.rs
@@ -1,21 +1,16 @@
-// Copyright (C) 2024 Quickwit, Inc.
+// Copyright 2021-Present Datadog, Inc.
 //
-// Quickwit is offered under the AGPL v3.0 and as commercial software.
-// For commercial licensing, contact us at hello@quickwit.io.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
 //
-// AGPL:
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU Affero General Public License as
-// published by the Free Software Foundation, either version 3 of the
-// License, or (at your option) any later version.
+//     http://www.apache.org/licenses/LICENSE-2.0
 //
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU Affero General Public License for more details.
-//
-// You should have received a copy of the GNU Affero General Public License
-// along with this program. If not, see <http://www.gnu.org/licenses/>.
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 
 //! Shared null-aware comparison for SS-2 (null ordering invariant).
 //!
diff --git a/quickwit/quickwit-dst/src/invariants/window.rs b/quickwit/quickwit-dst/src/invariants/window.rs
index 31276c1da6b..8f96ee23ef8 100644
--- a/quickwit/quickwit-dst/src/invariants/window.rs
+++ b/quickwit/quickwit-dst/src/invariants/window.rs
@@ -1,21 +1,16 @@
-// Copyright (C) 2024 Quickwit, Inc.
+// Copyright 2021-Present Datadog, Inc.
 //
-// Quickwit is offered under the AGPL v3.0 and as commercial software.
-// For commercial licensing, contact us at hello@quickwit.io.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
 //
-// AGPL:
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU Affero General Public License as
-// published by the Free Software Foundation, either version 3 of the
-// License, or (at your option) any later version.
+//     http://www.apache.org/licenses/LICENSE-2.0
 //
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU Affero General Public License for more details.
-//
-// You should have received a copy of the GNU Affero General Public License
-// along with this program. If not, see <http://www.gnu.org/licenses/>.
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 
 //! Shared window math for time-windowed compaction.
 //!
diff --git a/quickwit/quickwit-dst/src/lib.rs b/quickwit/quickwit-dst/src/lib.rs
index 2fa71cbf47c..194af267b52 100644
--- a/quickwit/quickwit-dst/src/lib.rs
+++ b/quickwit/quickwit-dst/src/lib.rs
@@ -1,21 +1,16 @@
-// Copyright (C) 2024 Quickwit, Inc.
+// Copyright 2021-Present Datadog, Inc.
 //
-// Quickwit is offered under the AGPL v3.0 and as commercial software.
-// For commercial licensing, contact us at hello@quickwit.io.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
 //
-// AGPL:
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU Affero General Public License as
-// published by the Free Software Foundation, either version 3 of the
-// License, or (at your option) any later version.
+//     http://www.apache.org/licenses/LICENSE-2.0
 //
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU Affero General Public License for more details.
-//
-// You should have received a copy of the GNU Affero General Public License
-// along with this program. If not, see <http://www.gnu.org/licenses/>.
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 
 //! Deterministic simulation testing and shared invariants for Quickhouse-Pomsky.
 //!
diff --git a/quickwit/quickwit-dst/src/models/mod.rs b/quickwit/quickwit-dst/src/models/mod.rs
index a46f5552d35..d2a74fa424a 100644
--- a/quickwit/quickwit-dst/src/models/mod.rs
+++ b/quickwit/quickwit-dst/src/models/mod.rs
@@ -1,21 +1,16 @@
-// Copyright (C) 2024 Quickwit, Inc.
+// Copyright 2021-Present Datadog, Inc.
 //
-// Quickwit is offered under the AGPL v3.0 and as commercial software.
-// For commercial licensing, contact us at hello@quickwit.io.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
 //
-// AGPL:
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU Affero General Public License as
-// published by the Free Software Foundation, either version 3 of the
-// License, or (at your option) any later version.
+//     http://www.apache.org/licenses/LICENSE-2.0
 //
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU Affero General Public License for more details.
-//
-// You should have received a copy of the GNU Affero General Public License
-// along with this program. If not, see <http://www.gnu.org/licenses/>.
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 
 //! Stateright models mirroring the TLA+ specifications.
 
diff --git a/quickwit/quickwit-dst/src/models/parquet_data_model.rs b/quickwit/quickwit-dst/src/models/parquet_data_model.rs
index 321dbc5dfc5..08d9a27dc3c 100644
--- a/quickwit/quickwit-dst/src/models/parquet_data_model.rs
+++ b/quickwit/quickwit-dst/src/models/parquet_data_model.rs
@@ -1,21 +1,16 @@
-// Copyright (C) 2024 Quickwit, Inc.
+// Copyright 2021-Present Datadog, Inc.
 //
-// Quickwit is offered under the AGPL v3.0 and as commercial software.
-// For commercial licensing, contact us at hello@quickwit.io.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
 //
-// AGPL:
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU Affero General Public License as
-// published by the Free Software Foundation, either version 3 of the
-// License, or (at your option) any later version.
+//     http://www.apache.org/licenses/LICENSE-2.0
 //
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU Affero General Public License for more details.
-//
-// You should have received a copy of the GNU Affero General Public License
-// along with this program. If not, see <http://www.gnu.org/licenses/>.
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 
 //! Stateright model for Parquet Data Model invariants (ADR-001).
 //!
diff --git a/quickwit/quickwit-dst/src/models/sort_schema.rs b/quickwit/quickwit-dst/src/models/sort_schema.rs
index 89746360993..d741ceddffa 100644
--- a/quickwit/quickwit-dst/src/models/sort_schema.rs
+++ b/quickwit/quickwit-dst/src/models/sort_schema.rs
@@ -1,21 +1,16 @@
-// Copyright (C) 2024 Quickwit, Inc.
+// Copyright 2021-Present Datadog, Inc.
 //
-// Quickwit is offered under the AGPL v3.0 and as commercial software.
-// For commercial licensing, contact us at hello@quickwit.io.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
 //
-// AGPL:
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU Affero General Public License as
-// published by the Free Software Foundation, either version 3 of the
-// License, or (at your option) any later version.
+//     http://www.apache.org/licenses/LICENSE-2.0
 //
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU Affero General Public License for more details.
-//
-// You should have received a copy of the GNU Affero General Public License
-// along with this program. If not, see <http://www.gnu.org/licenses/>.
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 
 //! Stateright model for Sort Schema invariants (ADR-002).
 //!
diff --git a/quickwit/quickwit-dst/src/models/time_windowed_compaction.rs b/quickwit/quickwit-dst/src/models/time_windowed_compaction.rs
index 0723f93e453..b50d8a695b7 100644
--- a/quickwit/quickwit-dst/src/models/time_windowed_compaction.rs
+++ b/quickwit/quickwit-dst/src/models/time_windowed_compaction.rs
@@ -1,21 +1,16 @@
-// Copyright (C) 2024 Quickwit, Inc.
+// Copyright 2021-Present Datadog, Inc.
 //
-// Quickwit is offered under the AGPL v3.0 and as commercial software.
-// For commercial licensing, contact us at hello@quickwit.io.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
 //
-// AGPL:
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU Affero General Public License as
-// published by the Free Software Foundation, either version 3 of the
-// License, or (at your option) any later version.
+//     http://www.apache.org/licenses/LICENSE-2.0
 //
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU Affero General Public License for more details.
-//
-// You should have received a copy of the GNU Affero General Public License
-// along with this program. If not, see <http://www.gnu.org/licenses/>.
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 
 //! Stateright model for Time-Windowed Compaction invariants (ADR-003).
 //!

From d40fd11bbafdca782a1c872fb11268c0a9c15dc5 Mon Sep 17 00:00:00 2001
From: George Talbot <george.talbot@datadoghq.com>
Date: Mon, 6 Apr 2026 12:46:20 -0400
Subject: [PATCH 28/35] chore: regenerate third-party license file

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 LICENSE-3rdparty.csv | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/LICENSE-3rdparty.csv b/LICENSE-3rdparty.csv
index 1f51678111f..95b43aaded6 100644
--- a/LICENSE-3rdparty.csv
+++ b/LICENSE-3rdparty.csv
@@ -36,6 +36,7 @@ arrow-row,https://github.com/apache/arrow-rs,Apache-2.0,Apache Arrow <dev@arrow.
 arrow-schema,https://github.com/apache/arrow-rs,Apache-2.0,Apache Arrow <dev@arrow.apache.org>
 arrow-select,https://github.com/apache/arrow-rs,Apache-2.0,Apache Arrow <dev@arrow.apache.org>
 arrow-string,https://github.com/apache/arrow-rs,Apache-2.0,Apache Arrow <dev@arrow.apache.org>
+ascii,https://github.com/tomprogrammer/rust-ascii,Apache-2.0 OR MIT,"Thomas Bahn <thomas@thomas-bahn.net>, Torbjørn Birch Moltu <t.b.moltu@lyse.net>, Simon Sapin <simon.sapin@exyr.org>"
 ascii-canvas,https://github.com/lalrpop/ascii-canvas,Apache-2.0 OR MIT,Niko Matsakis <niko@alum.mit.edu>
 assert-json-diff,https://github.com/davidpdrsn/assert-json-diff,MIT,David Pedersen <david.pdrsn@gmail.com>
 async-channel,https://github.com/smol-rs/async-channel,Apache-2.0 OR MIT,Stjepan Glavina <stjepang@gmail.com>
@@ -130,8 +131,10 @@ chacha20,https://github.com/RustCrypto/stream-ciphers,MIT OR Apache-2.0,RustCryp
 chacha20poly1305,https://github.com/RustCrypto/AEADs/tree/master/chacha20poly1305,Apache-2.0 OR MIT,RustCrypto Developers
 charset,https://github.com/hsivonen/charset,Apache-2.0 OR MIT,Henri Sivonen <hsivonen@hsivonen.fi>
 chitchat,https://github.com/quickwit-oss/chitchat,MIT,"Quickwit, Inc. <hello@quickwit.io>"
+choice,https://github.com/jonnadal/choice,MIT,Jonathan Nadal <jon.nadal@gmail.com>
 chrono,https://github.com/chronotope/chrono,MIT OR Apache-2.0,The chrono Authors
 chrono-tz,https://github.com/chronotope/chrono-tz,MIT OR Apache-2.0,The chrono-tz Authors
+chunked_transfer,https://github.com/frewsxcv/rust-chunked-transfer,MIT OR Apache-2.0,Corey Farwell <coreyf@rwell.org>
 ciborium,https://github.com/enarx/ciborium,Apache-2.0,Nathaniel McCallum <npmccallum@profian.com>
 ciborium-io,https://github.com/enarx/ciborium,Apache-2.0,Nathaniel McCallum <npmccallum@profian.com>
 ciborium-ll,https://github.com/enarx/ciborium,Apache-2.0,Nathaniel McCallum <npmccallum@profian.com>
@@ -223,6 +226,7 @@ embedded-io,https://github.com/rust-embedded/embedded-hal,MIT OR Apache-2.0,The
 ena,https://github.com/rust-lang/ena,MIT OR Apache-2.0,Niko Matsakis <niko@alum.mit.edu>
 encode_unicode,https://github.com/tormol/encode_unicode,Apache-2.0 OR MIT,Torbjørn Birch Moltu <t.b.moltu@lyse.net>
 encoding_rs,https://github.com/hsivonen/encoding_rs,(Apache-2.0 OR MIT) AND BSD-3-Clause,Henri Sivonen <hsivonen@hsivonen.fi>
+endian-type,https://github.com/Lolirofle/endian-type,MIT,Lolirofle <lolipopple@hotmail.com>
 enum-iterator,https://github.com/stephaneyfx/enum-iterator,0BSD,Stephane Raux <stephaneyfx@gmail.com>
 enum-iterator-derive,https://github.com/stephaneyfx/enum-iterator,0BSD,Stephane Raux <stephaneyfx@gmail.com>
 env_filter,https://github.com/rust-cli/env_logger,MIT OR Apache-2.0,The env_filter Authors
@@ -326,6 +330,7 @@ icu_properties,https://github.com/unicode-org/icu4x,Unicode-3.0,The ICU4X Projec
 icu_properties_data,https://github.com/unicode-org/icu4x,Unicode-3.0,The ICU4X Project Developers
 icu_provider,https://github.com/unicode-org/icu4x,Unicode-3.0,The ICU4X Project Developers
 id-arena,https://github.com/fitzgen/id-arena,MIT OR Apache-2.0,"Nick Fitzgerald <fitzgen@gmail.com>, Aleksey Kladov <aleksey.kladov@gmail.com>"
+id-set,https://github.com/andrewhickman/id-set,MIT OR Apache-2.0,Andrew Hickman <andrew.hickman1@sky.com>
 ident_case,https://github.com/TedDriggs/ident_case,MIT OR Apache-2.0,Ted Driggs <ted.driggs@outlook.com>
 idna,https://github.com/servo/rust-url,MIT OR Apache-2.0,The rust-url developers
 idna_adapter,https://github.com/hsivonen/idna_adapter,Apache-2.0 OR MIT,The rust-url developers
@@ -395,6 +400,9 @@ md5,https://github.com/stainless-steel/md5,Apache-2.0 OR MIT,"Ivan Ukhov <ivan.u
 measure_time,https://github.com/PSeitz/rust_measure_time,MIT,Pascal Seitz <pascal.seitz@gmail.com>
 memchr,https://github.com/BurntSushi/memchr,Unlicense OR MIT,"Andrew Gallant <jamslam@gmail.com>, bluss"
 memmap2,https://github.com/RazrFalcon/memmap2-rs,MIT OR Apache-2.0,"Dan Burkert <dan@danburkert.com>, Yevhenii Reizner <razrfalcon@gmail.com>, The Contributors"
+metrics,https://github.com/metrics-rs/metrics,MIT,Toby Lawrence <toby@nuclearfurnace.com>
+metrics-exporter-dogstatsd,https://github.com/metrics-rs/metrics,MIT,Toby Lawrence <toby@nuclearfurnace.com>
+metrics-util,https://github.com/metrics-rs/metrics,MIT,Toby Lawrence <toby@nuclearfurnace.com>
 mime,https://github.com/hyperium/mime,MIT OR Apache-2.0,Sean McArthur <sean@seanmonstar.com>
 mime_guess,https://github.com/abonander/mime_guess,MIT,Austin Bonander <austin.bonander@gmail.com>
 mini-internal,https://github.com/dtolnay/miniserde,MIT OR Apache-2.0,David Tolnay <dtolnay@gmail.com>
@@ -413,8 +421,10 @@ murmurhash32,https://github.com/quickwit-inc/murmurhash32,MIT,Paul Masurel <paul
 native-tls,https://github.com/rust-native-tls/rust-native-tls,MIT OR Apache-2.0,Steven Fackler <sfackler@gmail.com>
 new_debug_unreachable,https://github.com/mbrubeck/rust-debug-unreachable,MIT,"Matt Brubeck <mbrubeck@limpet.net>, Jonathan Reem <jonathan.reem@gmail.com>"
 new_string_template,https://github.com/hasezoey/new_string_template,MIT,hasezoey <hasezoey@gmail.com>
+nibble_vec,https://github.com/michaelsproul/rust_nibble_vec,MIT,Michael Sproul <micsproul@gmail.com>
 nix,https://github.com/nix-rust/nix,MIT,The nix-rust Project Developers
 no-std-net,https://github.com/dunmatt/no-std-net,MIT,M@ Dunlap <mattdunlap@gmail.com>
+nohash-hasher,https://github.com/paritytech/nohash-hasher,Apache-2.0 OR MIT,Parity Technologies <admin@parity.io>
 nom,https://github.com/Geal/nom,MIT,contact@geoffroycouprie.com
 nom,https://github.com/rust-bakery/nom,MIT,contact@geoffroycouprie.com
 nom-language,https://github.com/rust-bakery/nom,MIT,contact@geoffroycouprie.com
@@ -566,12 +576,14 @@ quinn-udp,https://github.com/quinn-rs/quinn,MIT OR Apache-2.0,The quinn-udp Auth
 quote,https://github.com/dtolnay/quote,MIT OR Apache-2.0,David Tolnay <dtolnay@gmail.com>
 quoted_printable,https://github.com/staktrace/quoted-printable,0BSD,Kartikaya Gupta <kats@seldon.staktrace.com>
 r-efi,https://github.com/r-efi/r-efi,MIT OR Apache-2.0 OR LGPL-2.1-or-later,The r-efi Authors
+radix_trie,https://github.com/michaelsproul/rust_radix_trie,MIT,Michael Sproul <micsproul@gmail.com>
 rand,https://github.com/rust-random/rand,MIT OR Apache-2.0,"The Rand Project Developers, The Rust Project Developers"
 rand_chacha,https://github.com/rust-random/rand,MIT OR Apache-2.0,"The Rand Project Developers, The Rust Project Developers, The CryptoCorrosion Contributors"
 rand_core,https://github.com/rust-random/rand,MIT OR Apache-2.0,"The Rand Project Developers, The Rust Project Developers"
 rand_core,https://github.com/rust-random/rand_core,MIT OR Apache-2.0,The Rand Project Developers
 rand_hc,https://github.com/rust-random/rand,MIT OR Apache-2.0,The Rand Project Developers
 rand_xorshift,https://github.com/rust-random/rngs,MIT OR Apache-2.0,"The Rand Project Developers, The Rust Project Developers"
+rand_xoshiro,https://github.com/rust-random/rngs,MIT OR Apache-2.0,The Rand Project Developers
 raw-cpuid,https://github.com/gz/rust-cpuid,MIT,Gerd Zellweger <mail@gerdzellweger.com>
 rayon,https://github.com/rayon-rs/rayon,MIT OR Apache-2.0,The rayon Authors
 rayon-core,https://github.com/rayon-rs/rayon,MIT OR Apache-2.0,The rayon-core Authors
@@ -683,6 +695,7 @@ sqlx-mysql,https://github.com/launchbadge/sqlx,MIT OR Apache-2.0,"Ryan Leckey <l
 sqlx-postgres,https://github.com/launchbadge/sqlx,MIT OR Apache-2.0,"Ryan Leckey <leckey.ryan@gmail.com>, Austin Bonander <austin.bonander@gmail.com>, Chloe Ross <orangesnowfox@gmail.com>, Daniel Akhterov <akhterovd@gmail.com>"
 sqlx-sqlite,https://github.com/launchbadge/sqlx,MIT OR Apache-2.0,"Ryan Leckey <leckey.ryan@gmail.com>, Austin Bonander <austin.bonander@gmail.com>, Chloe Ross <orangesnowfox@gmail.com>, Daniel Akhterov <akhterovd@gmail.com>"
 stable_deref_trait,https://github.com/storyyeller/stable_deref_trait,MIT OR Apache-2.0,Robert Grosse <n210241048576@gmail.com>
+stateright,https://github.com/stateright/stateright,MIT,Jonathan Nadal <jon.nadal@gmail.com>
 static_assertions,https://github.com/nvzqz/static-assertions-rs,MIT OR Apache-2.0,Nikolai Vazquez
 str_stack,https://github.com/Stebalien/str_stack,MIT OR Apache-2.0,Steven Allen <steven@stebalien.com>
 string_cache,https://github.com/servo/string-cache,MIT OR Apache-2.0,The Servo Project Developers
@@ -727,6 +740,7 @@ time-core,https://github.com/time-rs/time,MIT OR Apache-2.0,"Jacob Pratt <open-s
 time-fmt,https://github.com/MiSawa/time-fmt,MIT OR Apache-2.0,mi_sawa <mi.sawa.1216+git@gmail.com>
 time-macros,https://github.com/time-rs/time,MIT OR Apache-2.0,"Jacob Pratt <open-source@jhpratt.dev>, Time contributors"
 tiny-keccak,https://github.com/debris/tiny-keccak,CC0-1.0,debris <marek.kotewicz@gmail.com>
+tiny_http,https://github.com/tiny-http/tiny-http,MIT OR Apache-2.0,"pierre.krieger1708@gmail.com, Corey Farwell <coreyf@rwell.org>"
 tinystr,https://github.com/unicode-org/icu4x,Unicode-3.0,The ICU4X Project Developers
 tinytemplate,https://github.com/bheisler/TinyTemplate,Apache-2.0 OR MIT,Brook Heisler <brookheisler@gmail.com>
 tinyvec,https://github.com/Lokathor/tinyvec,Zlib OR Apache-2.0 OR MIT,Lokathor <zefria@gmail.com>

From 0e4be8863e7412ddea92d49d5d6ee0e5828a0b14 Mon Sep 17 00:00:00 2001
From: George Talbot <george.talbot@datadoghq.com>
Date: Mon, 6 Apr 2026 13:07:40 -0400
Subject: [PATCH 29/35] style: fix rustfmt nightly formatting for quickwit-dst
 and quickwit-parquet-engine

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 quickwit/quickwit-dst/src/invariants/check.rs |  10 +-
 .../quickwit-dst/src/invariants/registry.rs   |  33 ++-
 .../quickwit-dst/src/invariants/window.rs     |  15 +-
 quickwit/quickwit-dst/src/lib.rs              |   7 +-
 .../src/models/parquet_data_model.rs          |  34 ++-
 .../quickwit-dst/src/models/sort_schema.rs    |  44 ++--
 .../src/models/time_windowed_compaction.rs    | 233 ++++++++++--------
 .../quickwit-dst/tests/stateright_models.rs   |   7 +-
 .../src/sort_fields/window.rs                 |  15 +-
 .../src/split/metadata.rs                     |  10 +-
 .../src/split/postgres.rs                     |   6 +-
 .../src/storage/writer.rs                     |   3 +-
 12 files changed, 226 insertions(+), 191 deletions(-)

diff --git a/quickwit/quickwit-dst/src/invariants/check.rs b/quickwit/quickwit-dst/src/invariants/check.rs
index 3db376027b1..aecafa7d02a 100644
--- a/quickwit/quickwit-dst/src/invariants/check.rs
+++ b/quickwit/quickwit-dst/src/invariants/check.rs
@@ -16,11 +16,11 @@
 //!
 //! The condition is **always evaluated** (debug and release). Results are:
 //!
-//! - **Debug builds (Layer 3 — Prevention):** panics on violation via
-//!   `debug_assert!`, catching bugs during development and testing.
-//! - **All builds (Layer 4 — Production):** forwards the result to the
-//!   registered [`InvariantRecorder`](super::recorder::InvariantRecorder)
-//!   for Datadog metrics emission. No-op if no recorder is set.
+//! - **Debug builds (Layer 3 — Prevention):** panics on violation via `debug_assert!`, catching
+//!   bugs during development and testing.
+//! - **All builds (Layer 4 — Production):** forwards the result to the registered
+//!   [`InvariantRecorder`](super::recorder::InvariantRecorder) for Datadog metrics emission. No-op
+//!   if no recorder is set.
 
 /// Check an invariant condition in all build profiles.
 ///
diff --git a/quickwit/quickwit-dst/src/invariants/registry.rs b/quickwit/quickwit-dst/src/invariants/registry.rs
index f0d9d2bf9d8..577d084d4a6 100644
--- a/quickwit/quickwit-dst/src/invariants/registry.rs
+++ b/quickwit/quickwit-dst/src/invariants/registry.rs
@@ -162,20 +162,29 @@ mod tests {
     #[test]
     fn descriptions_non_empty() {
         let all = [
-            InvariantId::SS1, InvariantId::SS2, InvariantId::SS3,
-            InvariantId::SS4, InvariantId::SS5,
-            InvariantId::TW1, InvariantId::TW2, InvariantId::TW3,
-            InvariantId::CS1, InvariantId::CS2, InvariantId::CS3,
-            InvariantId::MC1, InvariantId::MC2, InvariantId::MC3, InvariantId::MC4,
-            InvariantId::DM1, InvariantId::DM2, InvariantId::DM3,
-            InvariantId::DM4, InvariantId::DM5,
+            InvariantId::SS1,
+            InvariantId::SS2,
+            InvariantId::SS3,
+            InvariantId::SS4,
+            InvariantId::SS5,
+            InvariantId::TW1,
+            InvariantId::TW2,
+            InvariantId::TW3,
+            InvariantId::CS1,
+            InvariantId::CS2,
+            InvariantId::CS3,
+            InvariantId::MC1,
+            InvariantId::MC2,
+            InvariantId::MC3,
+            InvariantId::MC4,
+            InvariantId::DM1,
+            InvariantId::DM2,
+            InvariantId::DM3,
+            InvariantId::DM4,
+            InvariantId::DM5,
         ];
         for id in all {
-            assert!(
-                !id.description().is_empty(),
-                "{} has empty description",
-                id
-            );
+            assert!(!id.description().is_empty(), "{} has empty description", id);
         }
     }
 }
diff --git a/quickwit/quickwit-dst/src/invariants/window.rs b/quickwit/quickwit-dst/src/invariants/window.rs
index 8f96ee23ef8..f3ab8504116 100644
--- a/quickwit/quickwit-dst/src/invariants/window.rs
+++ b/quickwit/quickwit-dst/src/invariants/window.rs
@@ -63,12 +63,17 @@ mod tests {
 
     #[test]
     fn valid_window_durations() {
-        let valid = [1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 15, 16, 18, 20, 24, 25,
-                     30, 36, 40, 45, 48, 50, 60, 72, 75, 80, 90, 100, 120,
-                     144, 150, 180, 200, 225, 240, 300, 360, 400, 450, 600,
-                     720, 900, 1200, 1800, 3600];
+        let valid = [
+            1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 15, 16, 18, 20, 24, 25, 30, 36, 40, 45, 48, 50, 60, 72,
+            75, 80, 90, 100, 120, 144, 150, 180, 200, 225, 240, 300, 360, 400, 450, 600, 720, 900,
+            1200, 1800, 3600,
+        ];
         for dur in valid {
-            assert!(is_valid_window_duration(dur), "expected {} to be valid", dur);
+            assert!(
+                is_valid_window_duration(dur),
+                "expected {} to be valid",
+                dur
+            );
         }
     }
 
diff --git a/quickwit/quickwit-dst/src/lib.rs b/quickwit/quickwit-dst/src/lib.rs
index 194af267b52..fa8b6e0d00a 100644
--- a/quickwit/quickwit-dst/src/lib.rs
+++ b/quickwit/quickwit-dst/src/lib.rs
@@ -31,10 +31,9 @@
 //! ## Models
 //!
 //! - `models::sort_schema` — SS-1..SS-5 (ADR-002, `SortSchema.tla`)
-//! - `models::time_windowed_compaction` — TW-1..TW-3, CS-1..CS-3, MC-1..MC-4
-//!   (ADR-003, `TimeWindowedCompaction.tla`)
-//! - `models::parquet_data_model` — DM-1..DM-5 (ADR-001,
-//!   `ParquetDataModel.tla`)
+//! - `models::time_windowed_compaction` — TW-1..TW-3, CS-1..CS-3, MC-1..MC-4 (ADR-003,
+//!   `TimeWindowedCompaction.tla`)
+//! - `models::parquet_data_model` — DM-1..DM-5 (ADR-001, `ParquetDataModel.tla`)
 
 pub mod invariants;
 
diff --git a/quickwit/quickwit-dst/src/models/parquet_data_model.rs b/quickwit/quickwit-dst/src/models/parquet_data_model.rs
index 08d9a27dc3c..a567b0a4fea 100644
--- a/quickwit/quickwit-dst/src/models/parquet_data_model.rs
+++ b/quickwit/quickwit-dst/src/models/parquet_data_model.rs
@@ -18,8 +18,7 @@
 //!
 //! # Invariants
 //! - DM-1: Each row is exactly one data point (all required fields populated)
-//! - DM-2: No last-write-wins; duplicate (metric, tags, ts) from separate
-//!   ingests both survive
+//! - DM-2: No last-write-wins; duplicate (metric, tags, ts) from separate ingests both survive
 //! - DM-3: No interpolation; storage contains only ingested points
 //! - DM-4: timeseries_id is deterministic for a given tag set
 //! - DM-5: timeseries_id persists through compaction without recomputation
@@ -99,11 +98,17 @@ impl DataModelState {
     }
 
     fn all_stored_rows(&self) -> BTreeSet<DataPoint> {
-        self.splits.iter().flat_map(|s| s.rows.iter().cloned()).collect()
+        self.splits
+            .iter()
+            .flat_map(|s| s.rows.iter().cloned())
+            .collect()
     }
 
     fn all_pending_rows(&self) -> BTreeSet<DataPoint> {
-        self.pending.iter().flat_map(|(_, set)| set.iter().cloned()).collect()
+        self.pending
+            .iter()
+            .flat_map(|(_, set)| set.iter().cloned())
+            .collect()
     }
 }
 
@@ -218,11 +223,7 @@ impl Model for DataModelModel {
         }
     }
 
-    fn next_state(
-        &self,
-        state: &Self::State,
-        action: Self::Action,
-    ) -> Option<Self::State> {
+    fn next_state(&self, state: &Self::State, action: Self::Action) -> Option<Self::State> {
         let mut next = state.clone();
 
         match action {
@@ -269,8 +270,10 @@ impl Model for DataModelModel {
                     return None;
                 }
 
-                let merged_rows: BTreeSet<DataPoint> =
-                    selected.iter().flat_map(|s| s.rows.iter().cloned()).collect();
+                let merged_rows: BTreeSet<DataPoint> = selected
+                    .iter()
+                    .flat_map(|s| s.rows.iter().cloned())
+                    .collect();
                 let new_split = DataModelSplit {
                     split_id: next.next_split_id,
                     rows: merged_rows,
@@ -358,8 +361,7 @@ impl Model for DataModelModel {
                 |_model: &DataModelModel, state: &DataModelState| {
                     let stored = state.all_stored_rows();
                     let pending = state.all_pending_rows();
-                    let all: BTreeSet<&DataPoint> =
-                        stored.iter().chain(pending.iter()).collect();
+                    let all: BTreeSet<&DataPoint> = stored.iter().chain(pending.iter()).collect();
                     for r1 in &all {
                         for r2 in &all {
                             if r1.tags == r2.tags && r1.timeseries_id != r2.timeseries_id {
@@ -395,11 +397,7 @@ mod tests {
     #[test]
     fn check_data_model_small() {
         let model = DataModelModel::small();
-        model
-            .checker()
-            .spawn_bfs()
-            .join()
-            .assert_properties();
+        model.checker().spawn_bfs().join().assert_properties();
     }
 
     #[test]
diff --git a/quickwit/quickwit-dst/src/models/sort_schema.rs b/quickwit/quickwit-dst/src/models/sort_schema.rs
index d741ceddffa..f62d0d01c50 100644
--- a/quickwit/quickwit-dst/src/models/sort_schema.rs
+++ b/quickwit/quickwit-dst/src/models/sort_schema.rs
@@ -115,9 +115,7 @@ pub enum SortSchemaAction {
         rows: Vec<Row>,
     },
     /// Change the metastore schema to a new value.
-    ChangeSchema {
-        new_schema: Vec<SortColumn>,
-    },
+    ChangeSchema { new_schema: Vec<SortColumn> },
     /// Compact two splits (identified by index in the splits vec).
     CompactSplits {
         s1_idx: usize,
@@ -292,12 +290,7 @@ fn all_column_subsets() -> Vec<Vec<Column>> {
 
 /// Check if merged_rows is a valid permutation of the union of s1 and s2 rows,
 /// accounting for column extension (missing columns become NULL).
-fn is_valid_merge(
-    merged_rows: &[Row],
-    s1: &Split,
-    s2: &Split,
-    merged_columns: &[Column],
-) -> bool {
+fn is_valid_merge(merged_rows: &[Row], s1: &Split, s2: &Split, merged_columns: &[Column]) -> bool {
     let total_rows = s1.rows.len() + s2.rows.len();
     if merged_rows.len() != total_rows {
         return false;
@@ -313,8 +306,16 @@ fn is_valid_merge(
         Row { cells }
     };
 
-    let s1_extended: Vec<Row> = s1.rows.iter().map(|r| extend_row(r, merged_columns)).collect();
-    let s2_extended: Vec<Row> = s2.rows.iter().map(|r| extend_row(r, merged_columns)).collect();
+    let s1_extended: Vec<Row> = s1
+        .rows
+        .iter()
+        .map(|r| extend_row(r, merged_columns))
+        .collect();
+    let s2_extended: Vec<Row> = s2
+        .rows
+        .iter()
+        .map(|r| extend_row(r, merged_columns))
+        .collect();
 
     // Check that merged_rows is a permutation of s1_extended ++ s2_extended.
     let mut all_input: Vec<Row> = s1_extended;
@@ -414,11 +415,7 @@ impl Model for SortSchemaModel {
         }
     }
 
-    fn next_state(
-        &self,
-        state: &Self::State,
-        action: Self::Action,
-    ) -> Option<Self::State> {
+    fn next_state(&self, state: &Self::State, action: Self::Action) -> Option<Self::State> {
         let mut next = state.clone();
 
         match action {
@@ -519,11 +516,10 @@ impl Model for SortSchemaModel {
                                 let v_next = row_next.get_value(sc.column);
 
                                 // Check only when earlier columns are equal.
-                                let earlier_equal =
-                                    s.sort_schema[..k].iter().all(|prev_sc| {
-                                        row_curr.get_value(prev_sc.column)
-                                            == row_next.get_value(prev_sc.column)
-                                    });
+                                let earlier_equal = s.sort_schema[..k].iter().all(|prev_sc| {
+                                    row_curr.get_value(prev_sc.column)
+                                        == row_next.get_value(prev_sc.column)
+                                });
 
                                 if earlier_equal {
                                     // Ascending: null must not appear before non-null.
@@ -604,11 +600,7 @@ mod tests {
     #[test]
     fn check_sort_schema_small() {
         let model = SortSchemaModel::small();
-        model
-            .checker()
-            .spawn_bfs()
-            .join()
-            .assert_properties();
+        model.checker().spawn_bfs().join().assert_properties();
     }
 
     #[test]
diff --git a/quickwit/quickwit-dst/src/models/time_windowed_compaction.rs b/quickwit/quickwit-dst/src/models/time_windowed_compaction.rs
index b50d8a695b7..7c323b6dcd0 100644
--- a/quickwit/quickwit-dst/src/models/time_windowed_compaction.rs
+++ b/quickwit/quickwit-dst/src/models/time_windowed_compaction.rs
@@ -102,14 +102,18 @@ pub struct CompactionState {
 /// Actions.
 #[derive(Clone, Debug, Eq, Hash, PartialEq)]
 pub enum CompactionAction {
-    AdvanceTime { new_time: i64 },
+    AdvanceTime {
+        new_time: i64,
+    },
     IngestPoint {
         timestamp: i64,
         sort_key: i64,
         scope: Scope,
         columns: BTreeSet<ColumnName>,
     },
-    FlushSplit { key: BufferKey },
+    FlushSplit {
+        key: BufferKey,
+    },
     CompactWindow {
         scope: Scope,
         window_start: i64,
@@ -323,11 +327,7 @@ impl Model for CompactionModel {
         }
     }
 
-    fn next_state(
-        &self,
-        state: &Self::State,
-        action: Self::Action,
-    ) -> Option<Self::State> {
+    fn next_state(&self, state: &Self::State, action: Self::Action) -> Option<Self::State> {
         let mut next = state.clone();
 
         match action {
@@ -352,10 +352,7 @@ impl Model for CompactionModel {
                 };
                 let ws = window_start(timestamp, self.window_duration);
                 let key = (scope, ws);
-                next.ingest_buffer
-                    .entry(key)
-                    .or_default()
-                    .push(row.clone());
+                next.ingest_buffer.entry(key).or_default().push(row.clone());
                 next.next_point_id += 1;
                 next.points_ingested += 1;
                 next.row_history.insert(pid, row);
@@ -366,8 +363,10 @@ impl Model for CompactionModel {
                     return None;
                 }
                 let sorted_rows = insertion_sort(&rows);
-                let all_cols: BTreeSet<ColumnName> =
-                    rows.iter().flat_map(|r| r.columns.iter().copied()).collect();
+                let all_cols: BTreeSet<ColumnName> = rows
+                    .iter()
+                    .flat_map(|r| r.columns.iter().copied())
+                    .collect();
                 let new_split = CompactionSplit {
                     id: next.next_split_id,
                     scope: key.0,
@@ -460,117 +459,147 @@ impl Model for CompactionModel {
             // TW-1: Every split belongs to exactly one time window.
             // All rows have the same window_start as the split metadata.
             // Mirrors TimeWindowedCompaction.tla lines 274-277
-            Property::always("TW-1: one window per split", |model: &CompactionModel, state: &CompactionState| {
-                let wd = model.window_duration;
-                state.object_storage.iter().all(|split| {
-                    split
-                        .rows
-                        .iter()
-                        .all(|row| window_start(row.timestamp, wd) == split.window_start)
-                })
-            }),
+            Property::always(
+                "TW-1: one window per split",
+                |model: &CompactionModel, state: &CompactionState| {
+                    let wd = model.window_duration;
+                    state.object_storage.iter().all(|split| {
+                        split
+                            .rows
+                            .iter()
+                            .all(|row| window_start(row.timestamp, wd) == split.window_start)
+                    })
+                },
+            ),
             // TW-2: window_duration evenly divides one hour.
             // Mirrors TimeWindowedCompaction.tla lines 283-284
-            Property::always("TW-2: duration divides hour", |model: &CompactionModel, _state: &CompactionState| {
-                model.hour_seconds % model.window_duration == 0
-            }),
+            Property::always(
+                "TW-2: duration divides hour",
+                |model: &CompactionModel, _state: &CompactionState| {
+                    model.hour_seconds % model.window_duration == 0
+                },
+            ),
             // TW-3: No cross-window merges.
             // Mirrors TimeWindowedCompaction.tla lines 295-305
-            Property::always("TW-3: no cross-window merge", |_model: &CompactionModel, state: &CompactionState| {
-                state.compaction_log.iter().all(|entry| {
-                    // All input window_starts are identical.
-                    let ws_values: BTreeSet<i64> =
-                        entry.input_window_starts.values().copied().collect();
-                    if ws_values.len() > 1 {
-                        return false;
-                    }
-                    // Output split (if in storage) matches.
-                    for s in &state.object_storage {
-                        if s.id == entry.output_split_id {
-                            for &input_ws in entry.input_window_starts.values() {
-                                if s.window_start != input_ws {
-                                    return false;
+            Property::always(
+                "TW-3: no cross-window merge",
+                |_model: &CompactionModel, state: &CompactionState| {
+                    state.compaction_log.iter().all(|entry| {
+                        // All input window_starts are identical.
+                        let ws_values: BTreeSet<i64> =
+                            entry.input_window_starts.values().copied().collect();
+                        if ws_values.len() > 1 {
+                            return false;
+                        }
+                        // Output split (if in storage) matches.
+                        for s in &state.object_storage {
+                            if s.id == entry.output_split_id {
+                                for &input_ws in entry.input_window_starts.values() {
+                                    if s.window_start != input_ws {
+                                        return false;
+                                    }
                                 }
                             }
                         }
-                    }
-                    true
-                })
-            }),
+                        true
+                    })
+                },
+            ),
             // CS-1: Only splits sharing scope may be merged.
             // Mirrors TimeWindowedCompaction.tla lines 311-314
-            Property::always("CS-1: scope compatibility", |_model: &CompactionModel, state: &CompactionState| {
-                state.compaction_log.iter().all(|entry| {
-                    let scopes: BTreeSet<Scope> =
-                        entry.input_scopes.values().copied().collect();
-                    scopes.len() <= 1
-                })
-            }),
+            Property::always(
+                "CS-1: scope compatibility",
+                |_model: &CompactionModel, state: &CompactionState| {
+                    state.compaction_log.iter().all(|entry| {
+                        let scopes: BTreeSet<Scope> =
+                            entry.input_scopes.values().copied().collect();
+                        scopes.len() <= 1
+                    })
+                },
+            ),
             // CS-2: Same window_start within scope.
             // Mirrors TimeWindowedCompaction.tla lines 320-323
-            Property::always("CS-2: same window_start", |_model: &CompactionModel, state: &CompactionState| {
-                state.compaction_log.iter().all(|entry| {
-                    let ws_values: BTreeSet<i64> =
-                        entry.input_window_starts.values().copied().collect();
-                    ws_values.len() <= 1
-                })
-            }),
+            Property::always(
+                "CS-2: same window_start",
+                |_model: &CompactionModel, state: &CompactionState| {
+                    state.compaction_log.iter().all(|entry| {
+                        let ws_values: BTreeSet<i64> =
+                            entry.input_window_starts.values().copied().collect();
+                        ws_values.len() <= 1
+                    })
+                },
+            ),
             // CS-3: Splits before compaction_start_time never compacted.
             // Mirrors TimeWindowedCompaction.tla lines 329-332
-            Property::always("CS-3: compaction start time", |model: &CompactionModel, state: &CompactionState| {
-                let cst = model.compaction_start_time;
-                state.compaction_log.iter().all(|entry| {
-                    entry
-                        .input_window_starts
-                        .values()
-                        .all(|&ws| ws >= cst)
-                })
-            }),
+            Property::always(
+                "CS-3: compaction start time",
+                |model: &CompactionModel, state: &CompactionState| {
+                    let cst = model.compaction_start_time;
+                    state
+                        .compaction_log
+                        .iter()
+                        .all(|entry| entry.input_window_starts.values().all(|&ws| ws >= cst))
+                },
+            ),
             // MC-1: Row multiset preserved (no add/remove/duplicate).
             // Mirrors TimeWindowedCompaction.tla lines 339-344
-            Property::always("MC-1: row set preserved", |_model: &CompactionModel, state: &CompactionState| {
-                state.compaction_log.iter().all(|entry| {
-                    entry.input_point_ids == entry.output_point_ids
-                })
-            }),
+            Property::always(
+                "MC-1: row set preserved",
+                |_model: &CompactionModel, state: &CompactionState| {
+                    state
+                        .compaction_log
+                        .iter()
+                        .all(|entry| entry.input_point_ids == entry.output_point_ids)
+                },
+            ),
             // MC-2: Row contents unchanged through compaction.
             // Mirrors TimeWindowedCompaction.tla lines 351-360
-            Property::always("MC-2: row contents preserved", |_model: &CompactionModel, state: &CompactionState| {
-                for split in &state.object_storage {
-                    for row in &split.rows {
-                        if let Some(original) = state.row_history.get(&row.point_id) {
-                            if row.timestamp != original.timestamp
-                                || row.sort_key != original.sort_key
-                                || row.columns != original.columns
-                                || row.values != original.values
-                            {
+            Property::always(
+                "MC-2: row contents preserved",
+                |_model: &CompactionModel, state: &CompactionState| {
+                    for split in &state.object_storage {
+                        for row in &split.rows {
+                            if let Some(original) = state.row_history.get(&row.point_id) {
+                                if row.timestamp != original.timestamp
+                                    || row.sort_key != original.sort_key
+                                    || row.columns != original.columns
+                                    || row.values != original.values
+                                {
+                                    return false;
+                                }
+                            } else {
                                 return false;
                             }
-                        } else {
-                            return false;
                         }
                     }
-                }
-                true
-            }),
+                    true
+                },
+            ),
             // MC-3: Output is sorted.
             // Mirrors TimeWindowedCompaction.tla lines 366-368
-            Property::always("MC-3: sort order preserved", |_model: &CompactionModel, state: &CompactionState| {
-                state.object_storage.iter().all(|split| {
-                    if split.sorted {
-                        is_sorted_by_key(&split.rows)
-                    } else {
-                        true
-                    }
-                })
-            }),
+            Property::always(
+                "MC-3: sort order preserved",
+                |_model: &CompactionModel, state: &CompactionState| {
+                    state.object_storage.iter().all(|split| {
+                        if split.sorted {
+                            is_sorted_by_key(&split.rows)
+                        } else {
+                            true
+                        }
+                    })
+                },
+            ),
             // MC-4: Column set is the union of inputs.
             // Mirrors TimeWindowedCompaction.tla lines 376-378
-            Property::always("MC-4: column union", |_model: &CompactionModel, state: &CompactionState| {
-                state.compaction_log.iter().all(|entry| {
-                    entry.output_columns == entry.input_column_union
-                })
-            }),
+            Property::always(
+                "MC-4: column union",
+                |_model: &CompactionModel, state: &CompactionState| {
+                    state
+                        .compaction_log
+                        .iter()
+                        .all(|entry| entry.output_columns == entry.input_column_union)
+                },
+            ),
         ]
     }
 }
@@ -582,11 +611,7 @@ mod tests {
     #[test]
     fn check_compaction_small() {
         let model = CompactionModel::small();
-        model
-            .checker()
-            .spawn_bfs()
-            .join()
-            .assert_properties();
+        model.checker().spawn_bfs().join().assert_properties();
     }
 
     #[test]
diff --git a/quickwit/quickwit-dst/tests/stateright_models.rs b/quickwit/quickwit-dst/tests/stateright_models.rs
index 7b94fb38fd8..aad0fb71366 100644
--- a/quickwit/quickwit-dst/tests/stateright_models.rs
+++ b/quickwit/quickwit-dst/tests/stateright_models.rs
@@ -26,10 +26,9 @@
 
 #![cfg(feature = "model-checking")]
 
-use quickwit_dst::models::{
-    parquet_data_model::DataModelModel, sort_schema::SortSchemaModel,
-    time_windowed_compaction::CompactionModel,
-};
+use quickwit_dst::models::parquet_data_model::DataModelModel;
+use quickwit_dst::models::sort_schema::SortSchemaModel;
+use quickwit_dst::models::time_windowed_compaction::CompactionModel;
 use stateright::{Checker, Model};
 
 /// SS-1..SS-5: Sort schema invariants (ADR-002).
diff --git a/quickwit/quickwit-parquet-engine/src/sort_fields/window.rs b/quickwit/quickwit-parquet-engine/src/sort_fields/window.rs
index d34a3ec15ea..948ba03f72d 100644
--- a/quickwit/quickwit-parquet-engine/src/sort_fields/window.rs
+++ b/quickwit/quickwit-parquet-engine/src/sort_fields/window.rs
@@ -80,16 +80,19 @@ pub fn window_start(
     use quickwit_dst::check_invariant;
     use quickwit_dst::invariants::InvariantId;
 
-    check_invariant!(InvariantId::TW2, duration_secs > 0, ": duration_secs must be positive");
     check_invariant!(
         InvariantId::TW2,
-        3600 % duration_secs == 0,
-        ": duration_secs={} does not divide 3600", duration_secs
+        duration_secs > 0,
+        ": duration_secs must be positive"
     );
-    let start_secs = quickwit_dst::invariants::window::window_start_secs(
-        timestamp_secs,
-        duration_secs,
+    check_invariant!(
+        InvariantId::TW2,
+        3600 % duration_secs == 0,
+        ": duration_secs={} does not divide 3600",
+        duration_secs
     );
+    let start_secs =
+        quickwit_dst::invariants::window::window_start_secs(timestamp_secs, duration_secs);
     DateTime::from_timestamp(start_secs, 0).ok_or(SortFieldsError::WindowStartOutOfRange {
         timestamp_secs: start_secs,
     })
diff --git a/quickwit/quickwit-parquet-engine/src/split/metadata.rs b/quickwit/quickwit-parquet-engine/src/split/metadata.rs
index d280d846d8e..5677fd5e813 100644
--- a/quickwit/quickwit-parquet-engine/src/split/metadata.rs
+++ b/quickwit/quickwit-parquet-engine/src/split/metadata.rs
@@ -470,7 +470,8 @@ impl MetricsSplitMetadataBuilder {
         quickwit_dst::check_invariant!(
             quickwit_dst::invariants::InvariantId::TW2,
             self.window_duration_secs == 0 || 3600 % self.window_duration_secs == 0,
-            ": window_duration_secs={} does not divide 3600", self.window_duration_secs
+            ": window_duration_secs={} does not divide 3600",
+            self.window_duration_secs
         );
 
         // TW-1 (ADR-003, partial): window_start and window_duration_secs are paired.
@@ -479,9 +480,10 @@ impl MetricsSplitMetadataBuilder {
             quickwit_dst::invariants::InvariantId::TW1,
             (self.window_start.is_none() && self.window_duration_secs == 0)
                 || (self.window_start.is_some() && self.window_duration_secs > 0),
-            ": window_start and window_duration_secs must be set together \
-             (window_start={:?}, window_duration_secs={})",
-            self.window_start, self.window_duration_secs
+            ": window_start and window_duration_secs must be set together (window_start={:?}, \
+             window_duration_secs={})",
+            self.window_start,
+            self.window_duration_secs
         );
 
         // Fuse the two builder fields into a single Range.
diff --git a/quickwit/quickwit-parquet-engine/src/split/postgres.rs b/quickwit/quickwit-parquet-engine/src/split/postgres.rs
index ca6b1b8e9ca..ab4dbb6c89f 100644
--- a/quickwit/quickwit-parquet-engine/src/split/postgres.rs
+++ b/quickwit/quickwit-parquet-engine/src/split/postgres.rs
@@ -165,7 +165,8 @@ impl PgMetricsSplit {
             quickwit_dst::invariants::InvariantId::SS5,
             metadata.sort_fields == self.sort_fields,
             ": sort_fields mismatch between JSON ('{}') and SQL column ('{}')",
-            metadata.sort_fields, self.sort_fields
+            metadata.sort_fields,
+            self.sort_fields
         );
 
         // SS-5 continued: window_start must match between JSON and SQL column.
@@ -173,7 +174,8 @@ impl PgMetricsSplit {
             quickwit_dst::invariants::InvariantId::SS5,
             metadata.window_start() == self.window_start,
             ": window_start mismatch between JSON ({:?}) and SQL column ({:?})",
-            metadata.window_start(), self.window_start
+            metadata.window_start(),
+            self.window_start
         );
 
         // SS-4 (SortSchema.tla): sort_fields is immutable after write.
diff --git a/quickwit/quickwit-parquet-engine/src/storage/writer.rs b/quickwit/quickwit-parquet-engine/src/storage/writer.rs
index 60b39dd34c5..2fcf45b9699 100644
--- a/quickwit/quickwit-parquet-engine/src/storage/writer.rs
+++ b/quickwit/quickwit-parquet-engine/src/storage/writer.rs
@@ -288,7 +288,8 @@ impl ParquetWriter {
                     quickwit_dst::check_invariant!(
                         quickwit_dst::invariants::InvariantId::SS1,
                         verify_indices.value(i) as usize == i,
-                        ": row {} is out of sort order after sort_batch()", i
+                        ": row {} is out of sort order after sort_batch()",
+                        i
                     );
                 }
             }

From 759c2caa5321ab2c4dea699dc7001dc656e72511 Mon Sep 17 00:00:00 2001
From: George Talbot <george.talbot@datadoghq.com>
Date: Mon, 6 Apr 2026 21:11:33 -0400
Subject: [PATCH 30/35] Update
 quickwit/quickwit-parquet-engine/src/table_config.rs

Co-authored-by: Matthew Kim <matthew.kim@datadoghq.com>
---
 quickwit/quickwit-parquet-engine/src/table_config.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/quickwit/quickwit-parquet-engine/src/table_config.rs b/quickwit/quickwit-parquet-engine/src/table_config.rs
index 0573b8177db..8cf7834fa6a 100644
--- a/quickwit/quickwit-parquet-engine/src/table_config.rs
+++ b/quickwit/quickwit-parquet-engine/src/table_config.rs
@@ -47,7 +47,7 @@ impl ProductType {
     /// when the Parquet pipeline is extended to those signal types.
     pub fn default_sort_fields(self) -> &'static str {
         match self {
-            Self::Metrics => "metric_name|tag_service|tag_env|tag_datacenter|tag_region|tag_host|timeseries_id|timestamp_secs/V2",
+            Self::Metrics => "metric_name|service|env|datacenter|region|host|timeseries_id|timestamp_secs/V2",
             // Placeholder: column names TBD when logs Parquet schema is defined.
             Self::Logs => "service_name|level|host|timestamp_secs/V2",
             // Placeholder: column names TBD when traces Parquet schema is defined.

From 6454f1d2f623391e6e86e8c4644e51c458977a30 Mon Sep 17 00:00:00 2001
From: George Talbot <george.talbot@datadoghq.com>
Date: Mon, 6 Apr 2026 21:11:44 -0400
Subject: [PATCH 31/35] Update
 quickwit/quickwit-parquet-engine/src/table_config.rs

Co-authored-by: Matthew Kim <matthew.kim@datadoghq.com>
---
 quickwit/quickwit-parquet-engine/src/table_config.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/quickwit/quickwit-parquet-engine/src/table_config.rs b/quickwit/quickwit-parquet-engine/src/table_config.rs
index 8cf7834fa6a..002d99246bc 100644
--- a/quickwit/quickwit-parquet-engine/src/table_config.rs
+++ b/quickwit/quickwit-parquet-engine/src/table_config.rs
@@ -119,7 +119,7 @@ mod tests {
         assert_eq!(schema.column.len(), 8);
         // Proto names are bare (suffixes stripped by parser).
         assert_eq!(schema.column[0].name, "metric_name");
-        assert_eq!(schema.column[1].name, "tag_service");
+        assert_eq!(schema.column[1].name, "service");
         assert_eq!(schema.column[6].name, "timeseries_id");
         assert_eq!(schema.column[7].name, "timestamp_secs");
     }

From 4481bef516556f1d26b0106b7935bc3de8621c6b Mon Sep 17 00:00:00 2001
From: George Talbot <george.talbot@datadoghq.com>
Date: Wed, 8 Apr 2026 07:38:13 -0400
Subject: [PATCH 32/35] style: rustfmt long match arm in default_sort_fields

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 quickwit/quickwit-parquet-engine/src/table_config.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/quickwit/quickwit-parquet-engine/src/table_config.rs b/quickwit/quickwit-parquet-engine/src/table_config.rs
index 002d99246bc..c360423cba2 100644
--- a/quickwit/quickwit-parquet-engine/src/table_config.rs
+++ b/quickwit/quickwit-parquet-engine/src/table_config.rs
@@ -47,7 +47,9 @@ impl ProductType {
     /// when the Parquet pipeline is extended to those signal types.
     pub fn default_sort_fields(self) -> &'static str {
         match self {
-            Self::Metrics => "metric_name|service|env|datacenter|region|host|timeseries_id|timestamp_secs/V2",
+            Self::Metrics => {
+                "metric_name|service|env|datacenter|region|host|timeseries_id|timestamp_secs/V2"
+            }
             // Placeholder: column names TBD when logs Parquet schema is defined.
             Self::Logs => "service_name|level|host|timestamp_secs/V2",
             // Placeholder: column names TBD when traces Parquet schema is defined.

From 93e1cc71e8a97db1195ae61592e65fc911e7a6ba Mon Sep 17 00:00:00 2001
From: George Talbot <george.talbot@datadoghq.com>
Date: Wed, 8 Apr 2026 08:01:29 -0400
Subject: [PATCH 33/35] fix: make parquet_file field backward-compatible in
 MetricsSplitMetadata

Pre-existing splits were serialized before the parquet_file field was
added, so their JSON doesn't contain it. Adding #[serde(default)]
makes deserialization fall back to empty string for old splits.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 quickwit/quickwit-parquet-engine/src/split/metadata.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/quickwit/quickwit-parquet-engine/src/split/metadata.rs b/quickwit/quickwit-parquet-engine/src/split/metadata.rs
index 5bf85ed987b..57ab6d83eee 100644
--- a/quickwit/quickwit-parquet-engine/src/split/metadata.rs
+++ b/quickwit/quickwit-parquet-engine/src/split/metadata.rs
@@ -198,6 +198,8 @@ struct MetricsSplitMetadataSerde {
     low_cardinality_tags: HashMap<String, HashSet<String>>,
     high_cardinality_tag_keys: HashSet<String>,
     created_at: SystemTime,
+
+    #[serde(default)]
     parquet_file: String,
 
     #[serde(default, skip_serializing_if = "Option::is_none")]

From f7c89bf94865b906ee10a97e8a0bfd24fecb8017 Mon Sep 17 00:00:00 2001
From: George Talbot <george.talbot@datadoghq.com>
Date: Wed, 8 Apr 2026 08:02:02 -0400
Subject: [PATCH 34/35] fix: handle empty-column batches in accumulator flush
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the commit timeout fires and the accumulator contains only
zero-column batches, union_fields is empty and concat_batches fails
with "must either specify a row count or at least one column".
Now flush_internal treats empty union_fields the same as empty
pending_batches — resets state and returns None.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 quickwit/quickwit-parquet-engine/src/index/accumulator.rs | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/quickwit/quickwit-parquet-engine/src/index/accumulator.rs b/quickwit/quickwit-parquet-engine/src/index/accumulator.rs
index f0bf22e68b8..0267f635b26 100644
--- a/quickwit/quickwit-parquet-engine/src/index/accumulator.rs
+++ b/quickwit/quickwit-parquet-engine/src/index/accumulator.rs
@@ -151,7 +151,13 @@ impl ParquetBatchAccumulator {
 
     /// Internal flush implementation.
     fn flush_internal(&mut self) -> Result<Option<RecordBatch>, IndexingError> {
-        if self.pending_batches.is_empty() {
+        if self.pending_batches.is_empty() || self.union_fields.is_empty() {
+            // Nothing to flush: either no batches at all, or only empty
+            // (zero-column) batches were accumulated.
+            self.pending_batches.clear();
+            self.union_fields.clear();
+            self.pending_rows = 0;
+            self.pending_bytes = 0;
             return Ok(None);
         }
 

From 501f014e1031fb9b24f0e17104d804ba69a84d44 Mon Sep 17 00:00:00 2001
From: George Talbot <george.talbot@datadoghq.com>
Date: Wed, 8 Apr 2026 14:49:50 -0400
Subject: [PATCH 35/35] style: rustfmt check_invariant macro argument

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 quickwit/quickwit-parquet-engine/src/split/postgres.rs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/quickwit/quickwit-parquet-engine/src/split/postgres.rs b/quickwit/quickwit-parquet-engine/src/split/postgres.rs
index e0dc6209b6b..1a69072620a 100644
--- a/quickwit/quickwit-parquet-engine/src/split/postgres.rs
+++ b/quickwit/quickwit-parquet-engine/src/split/postgres.rs
@@ -181,8 +181,7 @@ impl PgMetricsSplit {
         // SS-5 continued: window_duration_secs must match.
         quickwit_dst::check_invariant!(
             quickwit_dst::invariants::InvariantId::SS5,
-            metadata.window_duration_secs()
-                == self.window_duration_secs.unwrap_or(0) as u32,
+            metadata.window_duration_secs() == self.window_duration_secs.unwrap_or(0) as u32,
             ": window_duration_secs mismatch between JSON ({}) and SQL column ({:?})",
             metadata.window_duration_secs(),
             self.window_duration_secs