Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion bindings/python/src/data_file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ impl PyDataFile {
}

#[getter]
fn split_offsets(&self) -> &[i64] {
fn split_offsets(&self) -> Option<&[i64]> {
self.inner.split_offsets()
}

Expand Down
4 changes: 2 additions & 2 deletions crates/iceberg/src/expr/visitors/expression_evaluator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,7 @@ mod tests {
lower_bounds: HashMap::new(),
upper_bounds: HashMap::new(),
key_metadata: None,
split_offsets: vec![],
split_offsets: None,
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
Expand Down Expand Up @@ -374,7 +374,7 @@ mod tests {
lower_bounds: HashMap::new(),
upper_bounds: HashMap::new(),
key_metadata: None,
split_offsets: vec![],
split_offsets: None,
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
Expand Down
12 changes: 6 additions & 6 deletions crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1995,7 +1995,7 @@ mod test {
lower_bounds: Default::default(),
upper_bounds: Default::default(),
key_metadata: None,
split_offsets: vec![],
split_offsets: None,
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
Expand All @@ -2021,7 +2021,7 @@ mod test {
lower_bounds: Default::default(),
upper_bounds: Default::default(),
key_metadata: None,
split_offsets: vec![],
split_offsets: None,
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
Expand Down Expand Up @@ -2083,7 +2083,7 @@ mod test {

column_sizes: Default::default(),
key_metadata: None,
split_offsets: vec![],
split_offsets: None,
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
Expand Down Expand Up @@ -2114,7 +2114,7 @@ mod test {

column_sizes: Default::default(),
key_metadata: None,
split_offsets: vec![],
split_offsets: None,
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
Expand Down Expand Up @@ -2146,7 +2146,7 @@ mod test {

column_sizes: Default::default(),
key_metadata: None,
split_offsets: vec![],
split_offsets: None,
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
Expand Down Expand Up @@ -2178,7 +2178,7 @@ mod test {

column_sizes: Default::default(),
key_metadata: None,
split_offsets: vec![],
split_offsets: None,
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
Expand Down
8 changes: 4 additions & 4 deletions crates/iceberg/src/expr/visitors/strict_metrics_evaluator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -578,7 +578,7 @@ mod test {
]),
column_sizes: Default::default(),
key_metadata: None,
split_offsets: vec![],
split_offsets: None,
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
Expand All @@ -604,7 +604,7 @@ mod test {
lower_bounds: Default::default(),
upper_bounds: Default::default(),
key_metadata: None,
split_offsets: vec![],
split_offsets: None,
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
Expand All @@ -630,7 +630,7 @@ mod test {
upper_bounds: HashMap::from([(1, Datum::int(42))]),
column_sizes: Default::default(),
key_metadata: None,
split_offsets: vec![],
split_offsets: None,
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
Expand All @@ -657,7 +657,7 @@ mod test {
upper_bounds: HashMap::from([(3, Datum::string("dC"))]),
column_sizes: Default::default(),
key_metadata: None,
split_offsets: vec![],
split_offsets: None,
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
Expand Down
6 changes: 3 additions & 3 deletions crates/iceberg/src/spec/manifest/_serde.rs
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ impl DataFileSerde {
lower_bounds: Some(to_bytes_entry(value.lower_bounds)?),
upper_bounds: Some(to_bytes_entry(value.upper_bounds)?),
key_metadata: value.key_metadata.map(serde_bytes::ByteBuf::from),
split_offsets: Some(value.split_offsets),
split_offsets: value.split_offsets,
equality_ids: value.equality_ids,
sort_order_id: value.sort_order_id,
first_row_id: value.first_row_id,
Expand Down Expand Up @@ -222,7 +222,7 @@ impl DataFileSerde {
.transpose()?
.unwrap_or_default(),
key_metadata: self.key_metadata.map(|v| v.to_vec()),
split_offsets: self.split_offsets.unwrap_or_default(),
split_offsets: self.split_offsets,
equality_ids: self.equality_ids,
sort_order_id: self.sort_order_id,
partition_spec_id,
Expand Down Expand Up @@ -380,7 +380,7 @@ mod tests {
lower_bounds: HashMap::from([(1,Datum::int(1)),(2,Datum::string("a")),(3,Datum::string("AC/DC"))]),
upper_bounds: HashMap::from([(1,Datum::int(1)),(2,Datum::string("a")),(3,Datum::string("AC/DC"))]),
key_metadata: None,
split_offsets: vec![4],
split_offsets: Some(vec![4]),
equality_ids: None,
sort_order_id: Some(0),
partition_spec_id: 0,
Expand Down
10 changes: 6 additions & 4 deletions crates/iceberg/src/spec/manifest/data_file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -127,9 +127,10 @@ pub struct DataFile {
/// element field id: 133
///
/// Split offsets for the data file. For example, all row group offsets
/// in a Parquet file. Must be sorted ascending
/// in a Parquet file. Must be sorted ascending. Optional field that
/// should be serialized as null when not present.
#[builder(default)]
pub(crate) split_offsets: Vec<i64>,
pub(crate) split_offsets: Option<Vec<i64>>,
/// field id: 135
/// element field id: 136
///
Expand Down Expand Up @@ -247,8 +248,9 @@ impl DataFile {
}
/// Get the split offsets of the data file.
/// For example, all row group offsets in a Parquet file.
pub fn split_offsets(&self) -> &[i64] {
&self.split_offsets
/// Returns `None` if no split offsets are present.
pub fn split_offsets(&self) -> Option<&[i64]> {
self.split_offsets.as_deref()
}
/// Get the equality ids of the data file.
/// Field ids used to determine row equality in equality delete files.
Expand Down
24 changes: 12 additions & 12 deletions crates/iceberg/src/spec/manifest/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ mod tests {
snapshot_id: None,
sequence_number: None,
file_sequence_number: None,
data_file: DataFile {content:DataContentType::Data,file_path:"s3a://icebergdata/demo/s1/t1/data/00000-0-ba56fbfa-f2ff-40c9-bb27-565ad6dc2be8-00000.parquet".to_string(),file_format:DataFileFormat::Parquet,partition:Struct::empty(),record_count:1,file_size_in_bytes:5442,column_sizes:HashMap::from([(0,73),(6,34),(2,73),(7,61),(3,61),(5,62),(9,79),(10,73),(1,61),(4,73),(8,73)]),value_counts:HashMap::from([(4,1),(5,1),(2,1),(0,1),(3,1),(6,1),(8,1),(1,1),(10,1),(7,1),(9,1)]),null_value_counts:HashMap::from([(1,0),(6,0),(2,0),(8,0),(0,0),(3,0),(5,0),(9,0),(7,0),(4,0),(10,0)]),nan_value_counts:HashMap::new(),lower_bounds:HashMap::new(),upper_bounds:HashMap::new(),key_metadata:None,split_offsets:vec![4],equality_ids:Some(Vec::new()),sort_order_id:None, partition_spec_id: 0,first_row_id: None,referenced_data_file: None,content_offset: None,content_size_in_bytes: None }
data_file: DataFile {content:DataContentType::Data,file_path:"s3a://icebergdata/demo/s1/t1/data/00000-0-ba56fbfa-f2ff-40c9-bb27-565ad6dc2be8-00000.parquet".to_string(),file_format:DataFileFormat::Parquet,partition:Struct::empty(),record_count:1,file_size_in_bytes:5442,column_sizes:HashMap::from([(0,73),(6,34),(2,73),(7,61),(3,61),(5,62),(9,79),(10,73),(1,61),(4,73),(8,73)]),value_counts:HashMap::from([(4,1),(5,1),(2,1),(0,1),(3,1),(6,1),(8,1),(1,1),(10,1),(7,1),(9,1)]),null_value_counts:HashMap::from([(1,0),(6,0),(2,0),(8,0),(0,0),(3,0),(5,0),(9,0),(7,0),(4,0),(10,0)]),nan_value_counts:HashMap::new(),lower_bounds:HashMap::new(),upper_bounds:HashMap::new(),key_metadata:None,split_offsets:Some(vec![4]),equality_ids:Some(Vec::new()),sort_order_id:None, partition_spec_id: 0,first_row_id: None,referenced_data_file: None,content_offset: None,content_size_in_bytes: None }
}
];

Expand Down Expand Up @@ -435,7 +435,7 @@ mod tests {
lower_bounds: HashMap::new(),
upper_bounds: HashMap::new(),
key_metadata: None,
split_offsets: vec![4],
split_offsets: Some(vec![4]),
equality_ids: Some(Vec::new()),
sort_order_id: None,
partition_spec_id: 0,
Expand Down Expand Up @@ -532,7 +532,7 @@ mod tests {
lower_bounds: HashMap::from([(1,Datum::int(1)),(2,Datum::string("a")),(3,Datum::string("AC/DC"))]),
upper_bounds: HashMap::from([(1,Datum::int(1)),(2,Datum::string("a")),(3,Datum::string("AC/DC"))]),
key_metadata: None,
split_offsets: vec![4],
split_offsets: Some(vec![4]),
equality_ids: None,
sort_order_id: Some(0),
partition_spec_id: 0,
Expand Down Expand Up @@ -640,7 +640,7 @@ mod tests {
(3, Datum::string("x"))
]),
key_metadata: None,
split_offsets: vec![4],
split_offsets: Some(vec![4]),
equality_ids: None,
sort_order_id: Some(0),
partition_spec_id: 0,
Expand Down Expand Up @@ -749,7 +749,7 @@ mod tests {
(3, Datum::string("x"))
]),
key_metadata: None,
split_offsets: vec![4],
split_offsets: Some(vec![4]),
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
Expand Down Expand Up @@ -840,7 +840,7 @@ mod tests {
(2, Datum::int(2)),
]),
key_metadata: None,
split_offsets: vec![4],
split_offsets: Some(vec![4]),
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
Expand Down Expand Up @@ -922,7 +922,7 @@ mod tests {
lower_bounds: HashMap::new(),
upper_bounds: HashMap::new(),
key_metadata: None,
split_offsets: vec![4],
split_offsets: Some(vec![4]),
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
Expand Down Expand Up @@ -957,7 +957,7 @@ mod tests {
lower_bounds: HashMap::new(),
upper_bounds: HashMap::new(),
key_metadata: None,
split_offsets: vec![4],
split_offsets: Some(vec![4]),
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
Expand Down Expand Up @@ -992,7 +992,7 @@ mod tests {
lower_bounds: HashMap::new(),
upper_bounds: HashMap::new(),
key_metadata: None,
split_offsets: vec![4],
split_offsets: Some(vec![4]),
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
Expand Down Expand Up @@ -1027,7 +1027,7 @@ mod tests {
lower_bounds: HashMap::new(),
upper_bounds: HashMap::new(),
key_metadata: None,
split_offsets: vec![4],
split_offsets: Some(vec![4]),
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
Expand Down Expand Up @@ -1182,7 +1182,7 @@ mod tests {
"lower_bounds": [],
"upper_bounds": [],
"key_metadata": null,
"split_offsets": [],
"split_offsets": null,
"equality_ids": null,
"sort_order_id": null,
"first_row_id": null,
Expand Down Expand Up @@ -1213,7 +1213,7 @@ mod tests {
"lower_bounds": [],
"upper_bounds": [],
"key_metadata": null,
"split_offsets": [],
"split_offsets": null,
"equality_ids": null,
"sort_order_id": null,
"first_row_id": null,
Expand Down
6 changes: 3 additions & 3 deletions crates/iceberg/src/spec/manifest/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -608,7 +608,7 @@ mod tests {
lower_bounds: HashMap::new(),
upper_bounds: HashMap::new(),
key_metadata: Some(Vec::new()),
split_offsets: vec![4],
split_offsets: Some(vec![4]),
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
Expand Down Expand Up @@ -637,7 +637,7 @@ mod tests {
lower_bounds: HashMap::new(),
upper_bounds: HashMap::new(),
key_metadata: Some(Vec::new()),
split_offsets: vec![4],
split_offsets: Some(vec![4]),
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
Expand Down Expand Up @@ -666,7 +666,7 @@ mod tests {
lower_bounds: HashMap::new(),
upper_bounds: HashMap::new(),
key_metadata: Some(Vec::new()),
split_offsets: vec![4],
split_offsets: Some(vec![4]),
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
Expand Down
10 changes: 5 additions & 5 deletions crates/iceberg/src/spec/snapshot_summary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -767,7 +767,7 @@ mod tests {
(3, Datum::string("x")),
]),
key_metadata: None,
split_offsets: vec![4],
split_offsets: Some(vec![4]),
equality_ids: None,
sort_order_id: Some(0),
partition_spec_id: 0,
Expand Down Expand Up @@ -799,7 +799,7 @@ mod tests {
(3, Datum::string("x")),
]),
key_metadata: None,
split_offsets: vec![4],
split_offsets: Some(vec![4]),
equality_ids: None,
sort_order_id: Some(0),
partition_spec_id: 0,
Expand Down Expand Up @@ -910,7 +910,7 @@ mod tests {
lower_bounds: HashMap::new(),
upper_bounds: HashMap::new(),
key_metadata: None,
split_offsets: vec![],
split_offsets: None,
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
Expand Down Expand Up @@ -938,7 +938,7 @@ mod tests {
lower_bounds: HashMap::new(),
upper_bounds: HashMap::new(),
key_metadata: None,
split_offsets: vec![],
split_offsets: None,
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
Expand Down Expand Up @@ -993,7 +993,7 @@ mod tests {
lower_bounds: HashMap::new(),
upper_bounds: HashMap::new(),
key_metadata: None,
split_offsets: vec![],
split_offsets: None,
equality_ids: None,
sort_order_id: None,
partition_spec_id: 0,
Expand Down
16 changes: 8 additions & 8 deletions crates/iceberg/src/writer/base_writer/equality_delete_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -293,15 +293,15 @@ mod test {
assert_eq!(*data_file.null_value_counts.get(id).unwrap(), expect);
}

assert_eq!(data_file.split_offsets.len(), metadata.num_row_groups());
data_file
let split_offsets = data_file
.split_offsets
.iter()
.enumerate()
.for_each(|(i, &v)| {
let expect = metadata.row_groups()[i].file_offset().unwrap();
assert_eq!(v, expect);
});
.as_ref()
.expect("split_offsets should be set");
assert_eq!(split_offsets.len(), metadata.num_row_groups());
split_offsets.iter().enumerate().for_each(|(i, &v)| {
let expect = metadata.row_groups()[i].file_offset().unwrap();
assert_eq!(v, expect);
});
}

#[tokio::test]
Expand Down
4 changes: 2 additions & 2 deletions crates/iceberg/src/writer/file_writer/parquet_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -438,13 +438,13 @@ impl ParquetWriter {
// - We can ignore implementing distinct_counts due to this: https://lists.apache.org/thread/j52tsojv0x4bopxyzsp7m7bqt23n5fnd
.lower_bounds(lower_bounds)
.upper_bounds(upper_bounds)
.split_offsets(
.split_offsets(Some(
metadata
.row_groups()
.iter()
.filter_map(|group| group.file_offset())
.collect(),
);
));

Ok(builder)
}
Expand Down
Loading