Skip to content

DataFrame.except() does not work with structs in schema #10749

@rtyler

Description

@rtyler

Describe the bug

When calling `except` on two DataFrame objects, the call fails if the schema contains Struct columns, but succeeds with simpler schemas.

For example, this works:

        let schema = Arc::new(Schema::new(vec![Field::new(
            "value",
            DataType::Int32,
            true),
        ]));
        let batch = RecordBatch::try_new(
            Arc::clone(&schema),
            vec![
                Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)])),
                ])
        .unwrap();

        let updated_batch = RecordBatch::try_new(
            Arc::clone(&schema),
            vec![
                Arc::new(Int32Array::from(vec![Some(1), Some(12), Some(3)])),
            ],
        )
        .unwrap();
        let _ = datafusion::arrow::util::pretty::print_batches(&[batch.clone()]);
        let _ = datafusion::arrow::util::pretty::print_batches(&[updated_batch.clone()]);

        let ctx = SessionContext::new();
        let before = ctx.read_batch(batch).expect("Failed to make DataFrame");
        let after = ctx.read_batch(updated_batch).expect("Failed to make DataFrame");

        let diff = before.except(after).expect("Failed to except").collect().await.expect("Failed to diff");
        assert_eq!(diff.len(), 1);

To Reproduce

        let nested_schema = Arc::new(Schema::new(vec![
            Field::new("id", DataType::Int32, true),
            Field::new("lat", DataType::Int32, true),
            Field::new("long", DataType::Int32, true),
        ]));
        let schema = Arc::new(Schema::new(vec![Field::new(
            "value",
            DataType::Int32,
            true),
            Field::new("nested",
                DataType::Struct(nested_schema.fields.clone()),
                true)
        ]));
        let batch = RecordBatch::try_new(
            Arc::clone(&schema),
            vec![
                Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)])),
                Arc::new(StructArray::from(vec![
                    (
                        Arc::new(Field::new("id", DataType::Int32, true)),
                        Arc::new(Int32Array::from(vec![1,2,3])) as ArrayRef
                    ),
                    (
                        Arc::new(Field::new("lat", DataType::Int32, true)),
                        Arc::new(Int32Array::from(vec![1,2,3])) as ArrayRef
                    ),
                    (
                        Arc::new(Field::new("long", DataType::Int32, true)),
                        Arc::new(Int32Array::from(vec![1,2,3])) as ArrayRef
                    ),
                    ]))
                ])
        .unwrap();

        let updated_batch = RecordBatch::try_new(
            Arc::clone(&schema),
            vec![
                Arc::new(Int32Array::from(vec![Some(1), Some(12), Some(3)])),
                Arc::new(StructArray::from(vec![
                    (
                        Arc::new(Field::new("id", DataType::Int32, true)),
                        Arc::new(Int32Array::from(vec![1,2,3])) as ArrayRef
                    ),
                    (
                        Arc::new(Field::new("lat", DataType::Int32, true)),
                        Arc::new(Int32Array::from(vec![1,2,3])) as ArrayRef
                    ),
                    (
                        Arc::new(Field::new("long", DataType::Int32, true)),
                        Arc::new(Int32Array::from(vec![1,2,3])) as ArrayRef
                    ),
                    ]))
            ],
        )
        .unwrap();
        let _ = datafusion::arrow::util::pretty::print_batches(&[batch.clone()]);
        let _ = datafusion::arrow::util::pretty::print_batches(&[updated_batch.clone()]);

        let ctx = SessionContext::new();
        let before = ctx.read_batch(batch).expect("Failed to make DataFrame");
        let after = ctx.read_batch(updated_batch).expect("Failed to make DataFrame");

        let diff = before.except(after).expect("Failed to except").collect().await.expect("Failed to diff");
        assert_eq!(diff.len(), 1);

Expected behavior

I would expect the assertions above to pass; instead, the following output is produced:

running 2 tests
test tests::test_simple ... ok
test tests::test_with_struct ... FAILED

failures:

---- tests::test_with_struct stdout ----
+-------+--------------------------+
| value | nested                   |
+-------+--------------------------+
| 1     | {id: 1, lat: 1, long: 1} |
| 2     | {id: 2, lat: 2, long: 2} |
| 3     | {id: 3, lat: 3, long: 3} |
+-------+--------------------------+
+-------+--------------------------+
| value | nested                   |
+-------+--------------------------+
| 1     | {id: 1, lat: 1, long: 1} |
| 12    | {id: 2, lat: 2, long: 2} |
| 3     | {id: 3, lat: 3, long: 3} |
+-------+--------------------------+
thread 'tests::test_with_struct' panicked at except-df-bug/src/lib.rs:74:84:
Failed to diff: ArrowError(InvalidArgumentError("Invalid comparison operation: Struct([Field { name: \"id\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: \"lat\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: \"long\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]) IS NOT DISTINCT FROM Struct([Field { name: \"id\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: \"lat\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: \"long\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }])"), None)
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace


failures:
    tests::test_with_struct

test result: FAILED. 1 passed; 1 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.01s

Additional context

I should also note that I tested this with DataFusion 37 and 38, with the same results.

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug: Something isn't working

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions