Describe the bug
When calling `except` on two `DataFrame` objects, the call fails if the schema contains a Struct column, but succeeds with simpler schemas.
For example, this works:
// Passing case: the schema has a single nullable Int32 column named "value".
let schema = Arc::new(Schema::new(vec![Field::new(
"value",
DataType::Int32,
true),
]));
// "Before" batch: value = [1, 2, 3].
let batch = RecordBatch::try_new(
Arc::clone(&schema),
vec![
Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)])),
])
.unwrap();
// "After" batch: same schema, but the second value is 12 instead of 2.
let updated_batch = RecordBatch::try_new(
Arc::clone(&schema),
vec![
Arc::new(Int32Array::from(vec![Some(1), Some(12), Some(3)])),
],
)
.unwrap();
// Print both batches for inspection; any printing error is deliberately ignored.
let _ = datafusion::arrow::util::pretty::print_batches(&[batch.clone()]);
let _ = datafusion::arrow::util::pretty::print_batches(&[updated_batch.clone()]);
// Wrap each batch in a DataFrame via a fresh SessionContext.
let ctx = SessionContext::new();
let before = ctx.read_batch(batch).expect("Failed to make DataFrame");
let after = ctx.read_batch(updated_batch).expect("Failed to make DataFrame");
// Rows of `before` that are not in `after`; only the row with value = 2 differs.
let diff = before.except(after).expect("Failed to except").collect().await.expect("Failed to diff");
// NOTE(review): `collect()` presumably returns a Vec<RecordBatch>, so this checks the
// number of batches rather than the number of rows — confirm this is the intended check.
assert_eq!(diff.len(), 1);
To Reproduce
// Failing case: fields of the nested struct column, three nullable Int32 fields.
let nested_schema = Arc::new(Schema::new(vec![
Field::new("id", DataType::Int32, true),
Field::new("lat", DataType::Int32, true),
Field::new("long", DataType::Int32, true),
]));
// Top-level schema: a nullable Int32 "value" column plus a nullable Struct column
// "nested" built from the fields above.
let schema = Arc::new(Schema::new(vec![Field::new(
"value",
DataType::Int32,
true),
Field::new("nested",
DataType::Struct(nested_schema.fields.clone()),
true)
]));
// "Before" batch: value = [1, 2, 3]; each struct child array is [1, 2, 3].
let batch = RecordBatch::try_new(
Arc::clone(&schema),
vec![
Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)])),
Arc::new(StructArray::from(vec![
(
Arc::new(Field::new("id", DataType::Int32, true)),
Arc::new(Int32Array::from(vec![1,2,3])) as ArrayRef
),
(
Arc::new(Field::new("lat", DataType::Int32, true)),
Arc::new(Int32Array::from(vec![1,2,3])) as ArrayRef
),
(
Arc::new(Field::new("long", DataType::Int32, true)),
Arc::new(Int32Array::from(vec![1,2,3])) as ArrayRef
),
]))
])
.unwrap();
// "After" batch: the struct column is identical to the "before" batch; only the
// second entry of "value" differs (12 instead of 2).
let updated_batch = RecordBatch::try_new(
Arc::clone(&schema),
vec![
Arc::new(Int32Array::from(vec![Some(1), Some(12), Some(3)])),
Arc::new(StructArray::from(vec![
(
Arc::new(Field::new("id", DataType::Int32, true)),
Arc::new(Int32Array::from(vec![1,2,3])) as ArrayRef
),
(
Arc::new(Field::new("lat", DataType::Int32, true)),
Arc::new(Int32Array::from(vec![1,2,3])) as ArrayRef
),
(
Arc::new(Field::new("long", DataType::Int32, true)),
Arc::new(Int32Array::from(vec![1,2,3])) as ArrayRef
),
]))
],
)
.unwrap();
// Print both batches for inspection; any printing error is deliberately ignored.
let _ = datafusion::arrow::util::pretty::print_batches(&[batch.clone()]);
let _ = datafusion::arrow::util::pretty::print_batches(&[updated_batch.clone()]);
// Wrap each batch in a DataFrame via a fresh SessionContext.
let ctx = SessionContext::new();
let before = ctx.read_batch(batch).expect("Failed to make DataFrame");
let after = ctx.read_batch(updated_batch).expect("Failed to make DataFrame");
// The reported panic carries the "Failed to diff" message, i.e. the error surfaces
// from `collect().await` rather than from `except` itself. The error is an
// ArrowError: "Invalid comparison operation: Struct(..) IS NOT DISTINCT FROM Struct(..)".
let diff = before.except(after).expect("Failed to except").collect().await.expect("Failed to diff");
// NOTE(review): `collect()` presumably returns a Vec<RecordBatch>, so this checks the
// number of batches rather than the number of rows — confirm this is the intended check.
assert_eq!(diff.len(), 1);
Expected behavior
I would expect the assertions above to pass; instead, the following output is produced:
running 2 tests
test tests::test_simple ... ok
test tests::test_with_struct ... FAILED
failures:
---- tests::test_with_struct stdout ----
+-------+--------------------------+
| value | nested |
+-------+--------------------------+
| 1 | {id: 1, lat: 1, long: 1} |
| 2 | {id: 2, lat: 2, long: 2} |
| 3 | {id: 3, lat: 3, long: 3} |
+-------+--------------------------+
+-------+--------------------------+
| value | nested |
+-------+--------------------------+
| 1 | {id: 1, lat: 1, long: 1} |
| 12 | {id: 2, lat: 2, long: 2} |
| 3 | {id: 3, lat: 3, long: 3} |
+-------+--------------------------+
thread 'tests::test_with_struct' panicked at except-df-bug/src/lib.rs:74:84:
Failed to diff: ArrowError(InvalidArgumentError("Invalid comparison operation: Struct([Field { name: \"id\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: \"lat\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: \"long\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]) IS NOT DISTINCT FROM Struct([Field { name: \"id\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: \"lat\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: \"long\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }])"), None)
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
failures:
tests::test_with_struct
test result: FAILED. 1 passed; 1 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.01s
Additional context
I also tested this with DataFusion 37 and 38, and both versions produce the same result.
Describe the bug
When calling `except` on two `DataFrame` objects, the call fails if the schema contains a Struct column, but succeeds with simpler schemas.
For example, this works:
To Reproduce
Expected behavior
I would expect the assertions above to pass; instead, the following output is produced:
Additional context
I also tested this with DataFusion 37 and 38, and both versions produce the same result.