apache · bkietz · Oct 5, 2020 · Oct 5, 2020 · Oct 5, 2020 · Oct 6, 2020
diff --git a/cpp/src/arrow/dataset/dataset_test.cc b/cpp/src/arrow/dataset/dataset_test.cc
@@ -211,6 +211,7 @@ TEST(TestProjector, CheckProjectable) {
   auto i8_req = field("i8", int8(), false);
   auto u16_req = field("u16", uint16(), false);
   auto str_req = field("str", utf8(), false);
+  auto str_nil = field("str", null());
 
   // trivial
   Assert({}).ProjectableTo({});
@@ -235,6 +236,8 @@ TEST(TestProjector, CheckProjectable) {
   Assert({i8}).NotProjectableTo({i8_req},
                                 "not nullable but is not required in origin schema");
   Assert({i8_req}).ProjectableTo({i8});
+  Assert({str_nil}).ProjectableTo({str});
+  Assert({str_nil}).NotProjectableTo({str_req});
 
   // change field type
   Assert({i8}).NotProjectableTo({field("i8", utf8())},
@@ -257,15 +260,18 @@ TEST(TestProjector, MismatchedType) {
 TEST(TestProjector, AugmentWithNull) {
   constexpr int64_t kBatchSize = 1024;
 
-  auto from_schema = schema({field("f64", float64()), field("b", boolean())});
+  auto from_schema =
+      schema({field("f64", float64()), field("b", boolean()), field("str", null())});
   auto batch = ConstantArrayGenerator::Zeroes(kBatchSize, from_schema);
-  auto to_schema = schema({field("i32", int32()), field("f64", float64())});
+  auto to_schema =
+      schema({field("i32", int32()), field("f64", float64()), field("str", utf8())});
 
   RecordBatchProjector projector(to_schema);
 
   ASSERT_OK_AND_ASSIGN(auto null_i32, MakeArrayOfNull(int32(), batch->num_rows()));
-  auto expected_batch =
-      RecordBatch::Make(to_schema, batch->num_rows(), {null_i32, batch->column(0)});
+  ASSERT_OK_AND_ASSIGN(auto null_str, MakeArrayOfNull(utf8(), batch->num_rows()));
+  auto expected_batch = RecordBatch::Make(to_schema, batch->num_rows(),
+                                          {null_i32, batch->column(0), null_str});
 
   ASSERT_OK_AND_ASSIGN(auto reconciled_batch, projector.Project(*batch));
   AssertBatchesEqual(*expected_batch, *reconciled_batch);

diff --git a/cpp/src/arrow/dataset/projector.cc b/cpp/src/arrow/dataset/projector.cc
@@ -46,6 +46,15 @@ Status CheckProjectable(const Schema& from, const Schema& to) {
                                from);
     }
 
+    if (from_field->type()->id() == Type::NA) {
+      // promotion from null to any type is supported
+      if (to_field->nullable()) continue;
+
+      return Status::TypeError("field ", to_field->ToString(),
+                               " is not nullable but has type ", NullType(),
+                               " in origin schema ", from);
+    }
+
     if (!from_field->type()->Equals(to_field->type())) {
       return Status::TypeError("fields had matching names but differing types. From: ",
                                from_field->ToString(), " To: ", to_field->ToString());
@@ -98,7 +107,7 @@ Result<std::shared_ptr<RecordBatch>> RecordBatchProjector::Project(
     RETURN_NOT_OK(ResizeMissingColumns(batch.num_rows(), pool));
   }
 
-  std::vector<std::shared_ptr<Array>> columns(to_->num_fields());
+  ArrayVector columns(to_->num_fields());
 
   for (int i = 0; i < to_->num_fields(); ++i) {
     if (column_indices_[i] != kNoMatch) {
@@ -120,7 +129,8 @@ Status RecordBatchProjector::SetInputSchema(std::shared_ptr<Schema> from,
     ARROW_ASSIGN_OR_RAISE(auto match,
                           FieldRef(to_->field(i)->name()).FindOneOrNone(*from_));
 
-    if (match.indices().empty()) {
+    if (match.indices().empty() ||
+        from_->field(match.indices()[0])->type()->id() == Type::NA) {
       // Mark column i as missing by setting missing_columns_[i]
       // to a non-null placeholder.
       ARROW_ASSIGN_OR_RAISE(missing_columns_[i],

diff --git a/cpp/src/arrow/testing/generator.h b/cpp/src/arrow/testing/generator.h
@@ -166,6 +166,8 @@ class ARROW_TESTING_EXPORT ConstantArrayGenerator {
   static std::shared_ptr<arrow::Array> Zeroes(int64_t size,
                                               const std::shared_ptr<DataType>& type) {
     switch (type->id()) {
+      case Type::NA:
+        return std::make_shared<NullArray>(size);
       case Type::BOOL:
         return Boolean(size);
       case Type::UINT8:

diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
@@ -2124,6 +2124,21 @@ def test_dataset_project_only_partition_columns(tempdir):
     assert all_cols.column('part').equals(part_only.column('part'))
 
 
+@pytest.mark.parquet
+@pytest.mark.pandas
+def test_dataset_project_null_column(tempdir):
+    import pandas as pd
+    df = pd.DataFrame({"col": np.array([None, None, None], dtype='object')})
+
+    f = tempdir / "test_dataset_project_null_column.parquet"
+    df.to_parquet(f, engine="pyarrow")
+
+    dataset = ds.dataset(f, format="parquet",
+                         schema=pa.schema([("col", pa.int64())]))
+    expected = pa.table({'col': pa.array([None, None, None], pa.int64())})
+    assert dataset.to_table().equals(expected)
+
+
 def _check_dataset_roundtrip(dataset, base_dir, expected_files,
                              base_dir_path=None, partitioning=None):
     base_dir_path = base_dir_path or base_dir