From a2b0c4f6592accd522991d4bdc882577dd1b974a Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 5 Oct 2020 11:12:52 -0400 Subject: [PATCH 1/4] ARROW-9147: [C++][Dataset] Support projection from null->any type --- cpp/src/arrow/dataset/dataset_test.cc | 14 ++++++++++---- cpp/src/arrow/dataset/projector.cc | 14 ++++++++++++-- cpp/src/arrow/testing/generator.h | 2 ++ python/pyarrow/tests/test_dataset.py | 17 +++++++++++++++++ 4 files changed, 41 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/dataset/dataset_test.cc b/cpp/src/arrow/dataset/dataset_test.cc index 7a378cd9210..d5c1cc621c4 100644 --- a/cpp/src/arrow/dataset/dataset_test.cc +++ b/cpp/src/arrow/dataset/dataset_test.cc @@ -211,6 +211,7 @@ TEST(TestProjector, CheckProjectable) { auto i8_req = field("i8", int8(), false); auto u16_req = field("u16", uint16(), false); auto str_req = field("str", utf8(), false); + auto str_nil = field("str", null()); // trivial Assert({}).ProjectableTo({}); @@ -235,6 +236,8 @@ TEST(TestProjector, CheckProjectable) { Assert({i8}).NotProjectableTo({i8_req}, "not nullable but is not required in origin schema"); Assert({i8_req}).ProjectableTo({i8}); + Assert({str_nil}).ProjectableTo({str}); + Assert({str_nil}).NotProjectableTo({str_req}); // change field type Assert({i8}).NotProjectableTo({field("i8", utf8())}, @@ -257,15 +260,18 @@ TEST(TestProjector, MismatchedType) { TEST(TestProjector, AugmentWithNull) { constexpr int64_t kBatchSize = 1024; - auto from_schema = schema({field("f64", float64()), field("b", boolean())}); + auto from_schema = + schema({field("f64", float64()), field("b", boolean()), field("str", null())}); auto batch = ConstantArrayGenerator::Zeroes(kBatchSize, from_schema); - auto to_schema = schema({field("i32", int32()), field("f64", float64())}); + auto to_schema = + schema({field("i32", int32()), field("f64", float64()), field("str", utf8())}); RecordBatchProjector projector(to_schema); ASSERT_OK_AND_ASSIGN(auto null_i32, MakeArrayOfNull(int32(), batch->num_rows())); - auto expected_batch = - RecordBatch::Make(to_schema, batch->num_rows(), {null_i32, batch->column(0)}); + ASSERT_OK_AND_ASSIGN(auto null_str, MakeArrayOfNull(null(), batch->num_rows())); + auto expected_batch = RecordBatch::Make(to_schema, batch->num_rows(), + {null_i32, batch->column(0), null_str}); ASSERT_OK_AND_ASSIGN(auto reconciled_batch, projector.Project(*batch)); AssertBatchesEqual(*expected_batch, *reconciled_batch); diff --git a/cpp/src/arrow/dataset/projector.cc b/cpp/src/arrow/dataset/projector.cc index 9ce90ad0ed3..bda775ee658 100644 --- a/cpp/src/arrow/dataset/projector.cc +++ b/cpp/src/arrow/dataset/projector.cc @@ -46,6 +46,15 @@ Status CheckProjectable(const Schema& from, const Schema& to) { from); } + if (from_field->type()->id() == Type::NA) { + // promotion from null to any type is supported + if (to_field->nullable()) continue; + + return Status::TypeError("field ", to_field->ToString(), + " is not nullable and but has type ", NullType(), + " in origin schema ", from); + } + if (!from_field->type()->Equals(to_field->type())) { return Status::TypeError("fields had matching names but differing types. From: ", from_field->ToString(), " To: ", to_field->ToString()); @@ -98,7 +107,7 @@ Result> RecordBatchProjector::Project( RETURN_NOT_OK(ResizeMissingColumns(batch.num_rows(), pool)); } - std::vector> columns(to_->num_fields()); + ArrayVector columns(to_->num_fields()); for (int i = 0; i < to_->num_fields(); ++i) { if (column_indices_[i] != kNoMatch) { @@ -120,7 +129,8 @@ Status RecordBatchProjector::SetInputSchema(std::shared_ptr from, ARROW_ASSIGN_OR_RAISE(auto match, FieldRef(to_->field(i)->name()).FindOneOrNone(*from_)); - if (match.indices().empty()) { + if (match.indices().empty() || + from_->field(match.indices()[0])->type()->id() == Type::NA) { // Mark column i as missing by setting missing_columns_[i] // to a non-null placeholder. ARROW_ASSIGN_OR_RAISE(missing_columns_[i], diff --git a/cpp/src/arrow/testing/generator.h b/cpp/src/arrow/testing/generator.h index 9b5425b5e24..9188dca5709 100644 --- a/cpp/src/arrow/testing/generator.h +++ b/cpp/src/arrow/testing/generator.h @@ -166,6 +166,8 @@ class ARROW_TESTING_EXPORT ConstantArrayGenerator { static std::shared_ptr Zeroes(int64_t size, const std::shared_ptr& type) { switch (type->id()) { + case Type::NA: + return std::make_shared(size); case Type::BOOL: return Boolean(size); case Type::UINT8: diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index cab6f700c34..e313ac58a65 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -2124,6 +2124,23 @@ def test_dataset_project_only_partition_columns(tempdir): assert all_cols.column('part').equals(part_only.column('part')) +@pytest.mark.parquet +@pytest.mark.pandas +def test_dataset_project_null_column(tempdir): + import pandas as pd + df = pd.DataFrame({"col": np.array([None, None, None], dtype='object')}) + + f = tempdir / "test_dataset_project_null_column.parquet" + df.to_parquet(f, engine="pyarrow") + + import pyarrow as pa + import pyarrow.dataset as ds + dataset = ds.dataset(f, format="parquet", + schema=pa.schema([("col", pa.int64())])) + expected = pa.table({'col': pa.array([None, None, None], pa.int64())}) + assert dataset.to_table().equals(expected) + + def _check_dataset_roundtrip(dataset, base_dir, expected_files, base_dir_path=None, partitioning=None): base_dir_path = base_dir_path or base_dir From 84a545439fdf84e4206da3be452a10a98fe3ff2d Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 5 Oct 2020 16:22:44 -0400 Subject: [PATCH 2/4] correct test --- cpp/src/arrow/dataset/dataset_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/dataset/dataset_test.cc b/cpp/src/arrow/dataset/dataset_test.cc index d5c1cc621c4..cbf1082a2da 100644 --- a/cpp/src/arrow/dataset/dataset_test.cc +++ b/cpp/src/arrow/dataset/dataset_test.cc @@ -269,7 +269,7 @@ TEST(TestProjector, AugmentWithNull) { RecordBatchProjector projector(to_schema); ASSERT_OK_AND_ASSIGN(auto null_i32, MakeArrayOfNull(int32(), batch->num_rows())); - ASSERT_OK_AND_ASSIGN(auto null_str, MakeArrayOfNull(null(), batch->num_rows())); + ASSERT_OK_AND_ASSIGN(auto null_str, MakeArrayOfNull(utf8(), batch->num_rows())); auto expected_batch = RecordBatch::Make(to_schema, batch->num_rows(), {null_i32, batch->column(0), null_str}); From fb4ea2cb103f714880188121e5acae090c1084dd Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 5 Oct 2020 16:23:24 -0400 Subject: [PATCH 3/4] typo --- cpp/src/arrow/dataset/projector.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/dataset/projector.cc b/cpp/src/arrow/dataset/projector.cc index bda775ee658..2ba679ce6e7 100644 --- a/cpp/src/arrow/dataset/projector.cc +++ b/cpp/src/arrow/dataset/projector.cc @@ -51,7 +51,7 @@ Status CheckProjectable(const Schema& from, const Schema& to) { if (to_field->nullable()) continue; return Status::TypeError("field ", to_field->ToString(), - " is not nullable and but has type ", NullType(), + " is not nullable but has type ", NullType(), " in origin schema ", from); } From 3e780f96b9c2259a9d90669f8d8de926cacf8073 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 6 Oct 2020 11:55:52 -0400 Subject: [PATCH 4/4] Update python/pyarrow/tests/test_dataset.py --- python/pyarrow/tests/test_dataset.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index e313ac58a65..d8f638c56d1 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -2133,8 +2133,6 @@ def test_dataset_project_null_column(tempdir): f = tempdir / "test_dataset_project_null_column.parquet" df.to_parquet(f, engine="pyarrow") - import pyarrow as pa - import pyarrow.dataset as ds dataset = ds.dataset(f, format="parquet", schema=pa.schema([("col", pa.int64())])) expected = pa.table({'col': pa.array([None, None, None], pa.int64())})