diff --git a/python/python/tests/test_file.py b/python/python/tests/test_file.py index 44af6a8e2d5..5ac5886e2e6 100644 --- a/python/python/tests/test_file.py +++ b/python/python/tests/test_file.py @@ -757,3 +757,23 @@ def test_session_contains(tmp_path): assert session.contains("subdir/nested.lance") assert not session.contains("subdir/nonexistent.lance") + + +def test_struct_null_regression(): + import lance + + # Create struct array where 2nd element is null + tag_array = pa.array(["valid", "null_struct", "valid", "valid"]) + struct_array = pa.StructArray.from_arrays( + [tag_array], + fields=[pa.field("tag", pa.string(), nullable=True)], + mask=pa.array([True, False, True, True]), # False = null struct element + ) + + # Create list containing these structs + offsets = pa.array([0, 4], type=pa.int32()) + list_array = pa.ListArray.from_arrays(offsets, struct_array) + batch = pa.record_batch([pa.array([0]), list_array], names=["id", "value"]) + + ds = lance.write_dataset(batch, "memory://", data_storage_version="2.2") + ds.to_table() diff --git a/rust/lance-encoding/src/encodings/logical/struct.rs b/rust/lance-encoding/src/encodings/logical/struct.rs index 3eb9a6bd250..ab2212e71e4 100644 --- a/rust/lance-encoding/src/encodings/logical/struct.rs +++ b/rust/lance-encoding/src/encodings/logical/struct.rs @@ -778,6 +778,40 @@ mod tests { check_basic_random(field).await; } + #[test_log::test(tokio::test)] + async fn test_list_of_struct_with_null_struct_element() { + // Regression: a list containing structs where most struct elements are null + // causes a length mismatch during decoding with V2_2 encoding. + use arrow_array::StringArray; + + let tag_array = StringArray::from(vec![ + Some("valid"), + Some("null_struct"), + Some("valid"), + Some("valid"), + ]); + let struct_fields = Fields::from(vec![Field::new("tag", DataType::Utf8, true)]); + // 3 out of 4 struct elements are null + let struct_validity = NullBuffer::from(vec![false, true, false, false]); + let struct_array = StructArray::new( + struct_fields.clone(), + vec![Arc::new(tag_array)], + Some(struct_validity), + ); + + let offsets = OffsetBuffer::new(ScalarBuffer::::from(vec![0, 4])); + let list_field = Field::new("item", DataType::Struct(struct_fields), true); + let list_array = + ListArray::new(Arc::new(list_field), offsets, Arc::new(struct_array), None); + + check_round_trip_encoding_of_data( + vec![Arc::new(list_array)], + &TestCases::default().with_min_file_version(LanceFileVersion::V2_2), + HashMap::new(), + ) + .await; + } + #[test_log::test(tokio::test)] async fn test_ragged_scheduling() { // This test covers scheduling when batches straddle page boundaries diff --git a/rust/lance-encoding/src/repdef.rs b/rust/lance-encoding/src/repdef.rs index 1cbe5592f1e..99b1bfb82bc 100644 --- a/rust/lance-encoding/src/repdef.rs +++ b/rust/lance-encoding/src/repdef.rs @@ -1278,7 +1278,7 @@ impl RepDefUnraveler { // This is the highest def level that is still visible. Once we hit a list then // we stop looking because any null / empty list (or list masked by a higher level // null) will not be visible - let mut max_level = null_level.max(empty_level); + let mut max_level = null_level.max(empty_level).max(valid_level); // Anything higher than this (but less than max_level) is a null struct masking our // list. We will materialize this is a null list. let upper_null = max_level; @@ -2692,6 +2692,40 @@ mod tests { assert_eq!(val, Some(validity(&[true, false, true, true]))); } + #[test] + fn test_repdef_null_struct_valid_list() { + // This regresses a bug + + let rep = vec![1, 0, 0, 0]; + let def = vec![2, 0, 2, 2]; + // AllValidList> + let def_meaning = vec![ + DefinitionInterpretation::NullableItem, + DefinitionInterpretation::NullableItem, + DefinitionInterpretation::AllValidList, + ]; + let num_items = 4; + + let mut unraveler = CompositeRepDefUnraveler::new(vec![RepDefUnraveler::new( + Some(rep), + Some(def), + def_meaning.into(), + num_items, + )]); + + assert_eq!( + unraveler.unravel_validity(4), + Some(validity(&[false, true, false, false])) + ); + assert_eq!( + unraveler.unravel_validity(4), + Some(validity(&[false, true, false, false])) + ); + let (off, val) = unraveler.unravel_offsets::().unwrap(); + assert_eq!(off.inner(), offsets_32(&[0, 4]).inner()); + assert_eq!(val, None); + } + #[test] fn test_repdef_no_rep() { let mut builder = RepDefBuilder::default();