diff --git a/cpp/src/arrow/dataset/scanner.cc b/cpp/src/arrow/dataset/scanner.cc index e74db5b5c5a..043843307cf 100644 --- a/cpp/src/arrow/dataset/scanner.cc +++ b/cpp/src/arrow/dataset/scanner.cc @@ -751,7 +751,19 @@ Result ProjectionDescr::FromNames(std::vector name const Schema& dataset_schema) { std::vector exprs(names.size()); for (size_t i = 0; i < exprs.size(); ++i) { - exprs[i] = compute::field_ref(names[i]); + // If name isn't in schema, try finding it by dotted path. + if (dataset_schema.GetFieldByName(names[i]) == nullptr) { + auto name = names[i]; + if (name.rfind(".", 0) != 0) { + name = "." + name; + } + ARROW_ASSIGN_OR_RAISE(auto field_ref, FieldRef::FromDotPath(name)); + // safe as we know there is at least 1 dot. + names[i] = name.substr(name.rfind(".") + 1); + exprs[i] = compute::field_ref(field_ref); + } else { + exprs[i] = compute::field_ref(names[i]); + } } auto fields = dataset_schema.fields(); for (const auto& aug_field : kAugmentedFields) { diff --git a/cpp/src/arrow/dataset/scanner_test.cc b/cpp/src/arrow/dataset/scanner_test.cc index 8a10037b548..f2d345e8339 100644 --- a/cpp/src/arrow/dataset/scanner_test.cc +++ b/cpp/src/arrow/dataset/scanner_test.cc @@ -1074,6 +1074,24 @@ TEST_P(TestScanner, ProjectedScanNested) { AssertScanBatchesUnorderedEqualRepetitionsOf(MakeScanner(batch_in), batch_out); } +TEST_P(TestScanner, ProjectedScanNestedFromNames) { + SetSchema({ + field("struct", struct_({field("i32", int32()), field("f64", float64())})), + field("nested", struct_({field("left", int32()), + field("right", struct_({field("i32", int32()), + field("f64", float64())}))})), + }); + ASSERT_OK_AND_ASSIGN(auto descr, + ProjectionDescr::FromNames({".struct.i32", "nested.right.f64"}, + *options_->dataset_schema)) + SetProjection(options_.get(), std::move(descr)); + auto batch_in = ConstantArrayGenerator::Zeroes(GetParam().items_per_batch, schema_); + auto batch_out = ConstantArrayGenerator::Zeroes( + GetParam().items_per_batch, + schema({field("i32", int32()), field("f64", float64())})); + AssertScanBatchesUnorderedEqualRepetitionsOf(MakeScanner(batch_in), batch_out); +} + TEST_P(TestScanner, MaterializeMissingColumn) { SetSchema({field("i32", int32()), field("f64", float64())}); auto batch_missing_f64 = ConstantArrayGenerator::Zeroes( diff --git a/cpp/src/arrow/dataset/test_util.h b/cpp/src/arrow/dataset/test_util.h index 17065bfd7d2..02464f0c38d 100644 --- a/cpp/src/arrow/dataset/test_util.h +++ b/cpp/src/arrow/dataset/test_util.h @@ -157,7 +157,7 @@ class DatasetFixtureMixin : public ::testing::Test { std::shared_ptr lhs; ASSERT_OK(expected->ReadNext(&lhs)); EXPECT_NE(lhs, nullptr); - AssertBatchesEqual(*lhs, batch); + AssertBatchesEqual(*lhs, batch, true); } /// \brief Ensure that record batches found in reader are equals to the diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 5e2135fde42..b53f3dcac99 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -4787,3 +4787,31 @@ def test_write_dataset_with_scanner_use_projected_schema(tempdir): ds.write_dataset( scanner, tempdir, partitioning=["original_column"], format="ipc" ) + + +@pytest.mark.parametrize("format", ("ipc", "parquet")) +def test_read_table_nested_columns(tempdir, format): + if format == "parquet": + pytest.importorskip("pyarrow.parquet") + + table = pa.table({"user_id": ["abc123", "qrs456"], + "a.dotted.field": [1, 2], + "interaction": [ + {"type": None, "element": "button", + "values": [1, 2], "structs":[{"foo": "bar"}, None]}, + {"type": "scroll", "element": "window", + "values": [None, 3, 4], "structs":[{"fizz": "buzz"}]} + ]}) + ds.write_dataset(table, tempdir / "table", format=format) + ds1 = ds.dataset(tempdir / "table", format=format) + + # Dot path to read subsets of nested data + table = ds1.to_table( + columns=["user_id", "interaction.type", "interaction.values", + "interaction.structs", "a.dotted.field"]) + assert table.to_pylist() == [ + {'user_id': 'abc123', 'type': None, 'values': [1, 2], + 'structs': [{'fizz': None, 'foo': 'bar'}, None], 'a.dotted.field': 1}, + {'user_id': 'qrs456', 'type': 'scroll', 'values': [None, 3, 4], + 'structs': [{'fizz': 'buzz', 'foo': None}], 'a.dotted.field': 2} + ]