From f6bc124ace7e188569583cdb0b257646f41eeb2f Mon Sep 17 00:00:00 2001 From: AndreaBozzo Date: Sat, 6 Dec 2025 16:01:35 +0100 Subject: [PATCH 1/3] fix: Serialize `split_offsets` as null when empty Change `split_offsets` field in `DataFile` from `Vec` to `Option>` to properly serialize empty values as `null` instead of `[]`, aligning with the Iceberg specification. Closes #1897 --- bindings/python/src/data_file.rs | 4 ++-- .../src/expr/visitors/expression_evaluator.rs | 4 ++-- .../visitors/inclusive_metrics_evaluator.rs | 12 +++++----- .../expr/visitors/strict_metrics_evaluator.rs | 8 +++---- crates/iceberg/src/spec/manifest/_serde.rs | 6 ++--- crates/iceberg/src/spec/manifest/data_file.rs | 10 ++++---- crates/iceberg/src/spec/manifest/mod.rs | 24 +++++++++---------- crates/iceberg/src/spec/manifest/writer.rs | 6 ++--- crates/iceberg/src/spec/snapshot_summary.rs | 10 ++++---- .../base_writer/equality_delete_writer.rs | 16 ++++++------- .../src/writer/file_writer/parquet_writer.rs | 4 ++-- 11 files changed, 53 insertions(+), 51 deletions(-) diff --git a/bindings/python/src/data_file.rs b/bindings/python/src/data_file.rs index 900d6c6014..0de495d168 100644 --- a/bindings/python/src/data_file.rs +++ b/bindings/python/src/data_file.rs @@ -143,8 +143,8 @@ impl PyDataFile { } #[getter] - fn split_offsets(&self) -> &[i64] { - self.inner.split_offsets() + fn split_offsets(&self) -> Option> { + self.inner.split_offsets().map(|s| s.to_vec()) } #[getter] diff --git a/crates/iceberg/src/expr/visitors/expression_evaluator.rs b/crates/iceberg/src/expr/visitors/expression_evaluator.rs index 3675ce355f..570c409502 100644 --- a/crates/iceberg/src/expr/visitors/expression_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/expression_evaluator.rs @@ -346,7 +346,7 @@ mod tests { lower_bounds: HashMap::new(), upper_bounds: HashMap::new(), key_metadata: None, - split_offsets: vec![], + split_offsets: None, equality_ids: None, sort_order_id: None, partition_spec_id: 0, @@ -374,7 +374,7 @@ mod tests { lower_bounds: HashMap::new(), upper_bounds: HashMap::new(), key_metadata: None, - split_offsets: vec![], + split_offsets: None, equality_ids: None, sort_order_id: None, partition_spec_id: 0, diff --git a/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs b/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs index 2b65cf12aa..06c92ab3e8 100644 --- a/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs @@ -1995,7 +1995,7 @@ mod test { lower_bounds: Default::default(), upper_bounds: Default::default(), key_metadata: None, - split_offsets: vec![], + split_offsets: None, equality_ids: None, sort_order_id: None, partition_spec_id: 0, @@ -2021,7 +2021,7 @@ mod test { lower_bounds: Default::default(), upper_bounds: Default::default(), key_metadata: None, - split_offsets: vec![], + split_offsets: None, equality_ids: None, sort_order_id: None, partition_spec_id: 0, @@ -2083,7 +2083,7 @@ mod test { column_sizes: Default::default(), key_metadata: None, - split_offsets: vec![], + split_offsets: None, equality_ids: None, sort_order_id: None, partition_spec_id: 0, @@ -2114,7 +2114,7 @@ mod test { column_sizes: Default::default(), key_metadata: None, - split_offsets: vec![], + split_offsets: None, equality_ids: None, sort_order_id: None, partition_spec_id: 0, @@ -2146,7 +2146,7 @@ mod test { column_sizes: Default::default(), key_metadata: None, - split_offsets: vec![], + split_offsets: None, equality_ids: None, sort_order_id: None, partition_spec_id: 0, @@ -2178,7 +2178,7 @@ mod test { column_sizes: Default::default(), key_metadata: None, - split_offsets: vec![], + split_offsets: None, equality_ids: None, sort_order_id: None, partition_spec_id: 0, diff --git a/crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs b/crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs index e9bed775ef..301d6d916f 100644 --- a/crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs @@ -578,7 +578,7 @@ mod test { ]), column_sizes: Default::default(), key_metadata: None, - split_offsets: vec![], + split_offsets: None, equality_ids: None, sort_order_id: None, partition_spec_id: 0, @@ -604,7 +604,7 @@ mod test { lower_bounds: Default::default(), upper_bounds: Default::default(), key_metadata: None, - split_offsets: vec![], + split_offsets: None, equality_ids: None, sort_order_id: None, partition_spec_id: 0, @@ -630,7 +630,7 @@ mod test { upper_bounds: HashMap::from([(1, Datum::int(42))]), column_sizes: Default::default(), key_metadata: None, - split_offsets: vec![], + split_offsets: None, equality_ids: None, sort_order_id: None, partition_spec_id: 0, @@ -657,7 +657,7 @@ mod test { upper_bounds: HashMap::from([(3, Datum::string("dC"))]), column_sizes: Default::default(), key_metadata: None, - split_offsets: vec![], + split_offsets: None, equality_ids: None, sort_order_id: None, partition_spec_id: 0, diff --git a/crates/iceberg/src/spec/manifest/_serde.rs b/crates/iceberg/src/spec/manifest/_serde.rs index 7738af46d4..07306be2b9 100644 --- a/crates/iceberg/src/spec/manifest/_serde.rs +++ b/crates/iceberg/src/spec/manifest/_serde.rs @@ -153,7 +153,7 @@ impl DataFileSerde { lower_bounds: Some(to_bytes_entry(value.lower_bounds)?), upper_bounds: Some(to_bytes_entry(value.upper_bounds)?), key_metadata: value.key_metadata.map(serde_bytes::ByteBuf::from), - split_offsets: Some(value.split_offsets), + split_offsets: value.split_offsets, equality_ids: value.equality_ids, sort_order_id: value.sort_order_id, first_row_id: value.first_row_id, @@ -222,7 +222,7 @@ impl DataFileSerde { .transpose()? .unwrap_or_default(), key_metadata: self.key_metadata.map(|v| v.to_vec()), - split_offsets: self.split_offsets.unwrap_or_default(), + split_offsets: self.split_offsets, equality_ids: self.equality_ids, sort_order_id: self.sort_order_id, partition_spec_id, @@ -380,7 +380,7 @@ mod tests { lower_bounds: HashMap::from([(1,Datum::int(1)),(2,Datum::string("a")),(3,Datum::string("AC/DC"))]), upper_bounds: HashMap::from([(1,Datum::int(1)),(2,Datum::string("a")),(3,Datum::string("AC/DC"))]), key_metadata: None, - split_offsets: vec![4], + split_offsets: Some(vec![4]), equality_ids: None, sort_order_id: Some(0), partition_spec_id: 0, diff --git a/crates/iceberg/src/spec/manifest/data_file.rs b/crates/iceberg/src/spec/manifest/data_file.rs index a9c041f540..77bd046f8a 100644 --- a/crates/iceberg/src/spec/manifest/data_file.rs +++ b/crates/iceberg/src/spec/manifest/data_file.rs @@ -127,9 +127,10 @@ pub struct DataFile { /// element field id: 133 /// /// Split offsets for the data file. For example, all row group offsets - /// in a Parquet file. Must be sorted ascending + /// in a Parquet file. Must be sorted ascending. Optional field that + /// should be serialized as null when not present. #[builder(default)] - pub(crate) split_offsets: Vec, + pub(crate) split_offsets: Option>, /// field id: 135 /// element field id: 136 /// @@ -247,8 +248,9 @@ impl DataFile { } /// Get the split offsets of the data file. /// For example, all row group offsets in a Parquet file. - pub fn split_offsets(&self) -> &[i64] { - &self.split_offsets + /// Returns `None` if no split offsets are present. + pub fn split_offsets(&self) -> Option<&[i64]> { + self.split_offsets.as_deref() } /// Get the equality ids of the data file. /// Field ids used to determine row equality in equality delete files. diff --git a/crates/iceberg/src/spec/manifest/mod.rs b/crates/iceberg/src/spec/manifest/mod.rs index 51219bfdb7..b126396e3c 100644 --- a/crates/iceberg/src/spec/manifest/mod.rs +++ b/crates/iceberg/src/spec/manifest/mod.rs @@ -257,7 +257,7 @@ mod tests { snapshot_id: None, sequence_number: None, file_sequence_number: None, - data_file: DataFile {content:DataContentType::Data,file_path:"s3a://icebergdata/demo/s1/t1/data/00000-0-ba56fbfa-f2ff-40c9-bb27-565ad6dc2be8-00000.parquet".to_string(),file_format:DataFileFormat::Parquet,partition:Struct::empty(),record_count:1,file_size_in_bytes:5442,column_sizes:HashMap::from([(0,73),(6,34),(2,73),(7,61),(3,61),(5,62),(9,79),(10,73),(1,61),(4,73),(8,73)]),value_counts:HashMap::from([(4,1),(5,1),(2,1),(0,1),(3,1),(6,1),(8,1),(1,1),(10,1),(7,1),(9,1)]),null_value_counts:HashMap::from([(1,0),(6,0),(2,0),(8,0),(0,0),(3,0),(5,0),(9,0),(7,0),(4,0),(10,0)]),nan_value_counts:HashMap::new(),lower_bounds:HashMap::new(),upper_bounds:HashMap::new(),key_metadata:None,split_offsets:vec![4],equality_ids:Some(Vec::new()),sort_order_id:None, partition_spec_id: 0,first_row_id: None,referenced_data_file: None,content_offset: None,content_size_in_bytes: None } + data_file: DataFile {content:DataContentType::Data,file_path:"s3a://icebergdata/demo/s1/t1/data/00000-0-ba56fbfa-f2ff-40c9-bb27-565ad6dc2be8-00000.parquet".to_string(),file_format:DataFileFormat::Parquet,partition:Struct::empty(),record_count:1,file_size_in_bytes:5442,column_sizes:HashMap::from([(0,73),(6,34),(2,73),(7,61),(3,61),(5,62),(9,79),(10,73),(1,61),(4,73),(8,73)]),value_counts:HashMap::from([(4,1),(5,1),(2,1),(0,1),(3,1),(6,1),(8,1),(1,1),(10,1),(7,1),(9,1)]),null_value_counts:HashMap::from([(1,0),(6,0),(2,0),(8,0),(0,0),(3,0),(5,0),(9,0),(7,0),(4,0),(10,0)]),nan_value_counts:HashMap::new(),lower_bounds:HashMap::new(),upper_bounds:HashMap::new(),key_metadata:None,split_offsets:Some(vec![4]),equality_ids:Some(Vec::new()),sort_order_id:None, partition_spec_id: 0,first_row_id: None,referenced_data_file: None,content_offset: None,content_size_in_bytes: None } } ]; @@ -435,7 +435,7 @@ mod tests { lower_bounds: HashMap::new(), upper_bounds: HashMap::new(), key_metadata: None, - split_offsets: vec![4], + split_offsets: Some(vec![4]), equality_ids: Some(Vec::new()), sort_order_id: None, partition_spec_id: 0, @@ -532,7 +532,7 @@ mod tests { lower_bounds: HashMap::from([(1,Datum::int(1)),(2,Datum::string("a")),(3,Datum::string("AC/DC"))]), upper_bounds: HashMap::from([(1,Datum::int(1)),(2,Datum::string("a")),(3,Datum::string("AC/DC"))]), key_metadata: None, - split_offsets: vec![4], + split_offsets: Some(vec![4]), equality_ids: None, sort_order_id: Some(0), partition_spec_id: 0, @@ -640,7 +640,7 @@ mod tests { (3, Datum::string("x")) ]), key_metadata: None, - split_offsets: vec![4], + split_offsets: Some(vec![4]), equality_ids: None, sort_order_id: Some(0), partition_spec_id: 0, @@ -749,7 +749,7 @@ mod tests { (3, Datum::string("x")) ]), key_metadata: None, - split_offsets: vec![4], + split_offsets: Some(vec![4]), equality_ids: None, sort_order_id: None, partition_spec_id: 0, @@ -840,7 +840,7 @@ mod tests { (2, Datum::int(2)), ]), key_metadata: None, - split_offsets: vec![4], + split_offsets: Some(vec![4]), equality_ids: None, sort_order_id: None, partition_spec_id: 0, @@ -922,7 +922,7 @@ mod tests { lower_bounds: HashMap::new(), upper_bounds: HashMap::new(), key_metadata: None, - split_offsets: vec![4], + split_offsets: Some(vec![4]), equality_ids: None, sort_order_id: None, partition_spec_id: 0, @@ -957,7 +957,7 @@ mod tests { lower_bounds: HashMap::new(), upper_bounds: HashMap::new(), key_metadata: None, - split_offsets: vec![4], + split_offsets: Some(vec![4]), equality_ids: None, sort_order_id: None, partition_spec_id: 0, @@ -992,7 +992,7 @@ mod tests { lower_bounds: HashMap::new(), upper_bounds: HashMap::new(), key_metadata: None, - split_offsets: vec![4], + split_offsets: Some(vec![4]), equality_ids: None, sort_order_id: None, partition_spec_id: 0, @@ -1027,7 +1027,7 @@ mod tests { lower_bounds: HashMap::new(), upper_bounds: HashMap::new(), key_metadata: None, - split_offsets: vec![4], + split_offsets: Some(vec![4]), equality_ids: None, sort_order_id: None, partition_spec_id: 0, @@ -1182,7 +1182,7 @@ mod tests { "lower_bounds": [], "upper_bounds": [], "key_metadata": null, - "split_offsets": [], + "split_offsets": null, "equality_ids": null, "sort_order_id": null, "first_row_id": null, @@ -1213,7 +1213,7 @@ mod tests { "lower_bounds": [], "upper_bounds": [], "key_metadata": null, - "split_offsets": [], + "split_offsets": null, "equality_ids": null, "sort_order_id": null, "first_row_id": null, diff --git a/crates/iceberg/src/spec/manifest/writer.rs b/crates/iceberg/src/spec/manifest/writer.rs index ebb0590bcf..567239921f 100644 --- a/crates/iceberg/src/spec/manifest/writer.rs +++ b/crates/iceberg/src/spec/manifest/writer.rs @@ -608,7 +608,7 @@ mod tests { lower_bounds: HashMap::new(), upper_bounds: HashMap::new(), key_metadata: Some(Vec::new()), - split_offsets: vec![4], + split_offsets: Some(vec![4]), equality_ids: None, sort_order_id: None, partition_spec_id: 0, @@ -637,7 +637,7 @@ mod tests { lower_bounds: HashMap::new(), upper_bounds: HashMap::new(), key_metadata: Some(Vec::new()), - split_offsets: vec![4], + split_offsets: Some(vec![4]), equality_ids: None, sort_order_id: None, partition_spec_id: 0, @@ -666,7 +666,7 @@ mod tests { lower_bounds: HashMap::new(), upper_bounds: HashMap::new(), key_metadata: Some(Vec::new()), - split_offsets: vec![4], + split_offsets: Some(vec![4]), equality_ids: None, sort_order_id: None, partition_spec_id: 0, diff --git a/crates/iceberg/src/spec/snapshot_summary.rs b/crates/iceberg/src/spec/snapshot_summary.rs index 4cd3715e06..c67ee37d3e 100644 --- a/crates/iceberg/src/spec/snapshot_summary.rs +++ b/crates/iceberg/src/spec/snapshot_summary.rs @@ -767,7 +767,7 @@ mod tests { (3, Datum::string("x")), ]), key_metadata: None, - split_offsets: vec![4], + split_offsets: Some(vec![4]), equality_ids: None, sort_order_id: Some(0), partition_spec_id: 0, @@ -799,7 +799,7 @@ mod tests { (3, Datum::string("x")), ]), key_metadata: None, - split_offsets: vec![4], + split_offsets: Some(vec![4]), equality_ids: None, sort_order_id: Some(0), partition_spec_id: 0, @@ -910,7 +910,7 @@ mod tests { lower_bounds: HashMap::new(), upper_bounds: HashMap::new(), key_metadata: None, - split_offsets: vec![], + split_offsets: None, equality_ids: None, sort_order_id: None, partition_spec_id: 0, @@ -938,7 +938,7 @@ mod tests { lower_bounds: HashMap::new(), upper_bounds: HashMap::new(), key_metadata: None, - split_offsets: vec![], + split_offsets: None, equality_ids: None, sort_order_id: None, partition_spec_id: 0, @@ -993,7 +993,7 @@ mod tests { lower_bounds: HashMap::new(), upper_bounds: HashMap::new(), key_metadata: None, - split_offsets: vec![], + split_offsets: None, equality_ids: None, sort_order_id: None, partition_spec_id: 0, diff --git a/crates/iceberg/src/writer/base_writer/equality_delete_writer.rs b/crates/iceberg/src/writer/base_writer/equality_delete_writer.rs index cd0b19148d..dd8487f9cc 100644 --- a/crates/iceberg/src/writer/base_writer/equality_delete_writer.rs +++ b/crates/iceberg/src/writer/base_writer/equality_delete_writer.rs @@ -293,15 +293,15 @@ mod test { assert_eq!(*data_file.null_value_counts.get(id).unwrap(), expect); } - assert_eq!(data_file.split_offsets.len(), metadata.num_row_groups()); - data_file + let split_offsets = data_file .split_offsets - .iter() - .enumerate() - .for_each(|(i, &v)| { - let expect = metadata.row_groups()[i].file_offset().unwrap(); - assert_eq!(v, expect); - }); + .as_ref() + .expect("split_offsets should be set"); + assert_eq!(split_offsets.len(), metadata.num_row_groups()); + split_offsets.iter().enumerate().for_each(|(i, &v)| { + let expect = metadata.row_groups()[i].file_offset().unwrap(); + assert_eq!(v, expect); + }); } #[tokio::test] diff --git a/crates/iceberg/src/writer/file_writer/parquet_writer.rs b/crates/iceberg/src/writer/file_writer/parquet_writer.rs index 5cf031a9fb..ed14e94ddf 100644 --- a/crates/iceberg/src/writer/file_writer/parquet_writer.rs +++ b/crates/iceberg/src/writer/file_writer/parquet_writer.rs @@ -438,13 +438,13 @@ impl ParquetWriter { // - We can ignore implementing distinct_counts due to this: https://lists.apache.org/thread/j52tsojv0x4bopxyzsp7m7bqt23n5fnd .lower_bounds(lower_bounds) .upper_bounds(upper_bounds) - .split_offsets( + .split_offsets(Some( metadata .row_groups() .iter() .filter_map(|group| group.file_offset()) .collect(), - ); + )); Ok(builder) } From b8ef5c5e581a855ef1e9ee135c9f81006d45d343 Mon Sep 17 00:00:00 2001 From: AndreaBozzo Date: Tue, 9 Dec 2025 12:02:27 +0100 Subject: [PATCH 2/3] fix: Change return type of `split_offsets` to Option<&[i64]> as suggested by maintainer --- bindings/python/src/data_file.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bindings/python/src/data_file.rs b/bindings/python/src/data_file.rs index 0de495d168..ab2edd7e7c 100644 --- a/bindings/python/src/data_file.rs +++ b/bindings/python/src/data_file.rs @@ -143,7 +143,7 @@ impl PyDataFile { } #[getter] - fn split_offsets(&self) -> Option> { + fn split_offsets(&self) -> Option<&[i64]> { self.inner.split_offsets().map(|s| s.to_vec()) } From 8ba3d58dcc26c4aa0e4051cd708301e92d6efbd2 Mon Sep 17 00:00:00 2001 From: AndreaBozzo Date: Tue, 9 Dec 2025 12:13:45 +0100 Subject: [PATCH 3/3] fix: Update `split_offsets` method to return Option<&[i64]> directly --- bindings/python/src/data_file.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bindings/python/src/data_file.rs b/bindings/python/src/data_file.rs index ab2edd7e7c..b0e42e7d73 100644 --- a/bindings/python/src/data_file.rs +++ b/bindings/python/src/data_file.rs @@ -144,7 +144,7 @@ impl PyDataFile { #[getter] fn split_offsets(&self) -> Option<&[i64]> { - self.inner.split_offsets().map(|s| s.to_vec()) + self.inner.split_offsets() } #[getter]