From 3563f7c77d3104aa67f322ca1a52c824efa4f343 Mon Sep 17 00:00:00 2001 From: Miles Granger Date: Mon, 3 Oct 2022 08:21:35 +0200 Subject: [PATCH 01/16] Add failing test [skip ci] --- python/pyarrow/tests/parquet/test_dataset.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/python/pyarrow/tests/parquet/test_dataset.py b/python/pyarrow/tests/parquet/test_dataset.py index 6fdc7435418..c1df9e0f6fa 100644 --- a/python/pyarrow/tests/parquet/test_dataset.py +++ b/python/pyarrow/tests/parquet/test_dataset.py @@ -1877,3 +1877,12 @@ def test_write_to_dataset_conflicting_keywords(tempdir): use_legacy_dataset=False, metadata_collector=[], file_visitor=lambda x: x) + + +def test_read_table_nested_columns(tempdir): + table = pa.table({"user_id": ["abc123", "qrs456"], + "interaction": [{"type": "click", "element": "button"}, + {"type": "scroll", "element": "window"}]}) + pq.write_table(table, tempdir / 'example.parquet') + pq.read_table(tempdir / "example.parquet", + columns=["user_id", "interaction.type"]) From ff252b0df1c985d684988a072b7945c52d0f629f Mon Sep 17 00:00:00 2001 From: Miles Granger Date: Wed, 5 Oct 2022 09:26:51 +0200 Subject: [PATCH 02/16] Support simple dot path in columns --- cpp/src/arrow/dataset/scanner.cc | 7 ++++++- python/pyarrow/tests/parquet/test_dataset.py | 9 --------- python/pyarrow/tests/test_dataset.py | 10 ++++++++++ 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/dataset/scanner.cc b/cpp/src/arrow/dataset/scanner.cc index e74db5b5c5a..3b4029323cd 100644 --- a/cpp/src/arrow/dataset/scanner.cc +++ b/cpp/src/arrow/dataset/scanner.cc @@ -751,7 +751,12 @@ Result ProjectionDescr::FromNames(std::vector name const Schema& dataset_schema) { std::vector exprs(names.size()); for (size_t i = 0; i < exprs.size(); ++i) { - exprs[i] = compute::field_ref(names[i]); + if (names[i].rfind(".", 0) == 0) { + ARROW_ASSIGN_OR_RAISE(auto field_ref, FieldRef::FromDotPath(names[i])); + exprs[i] = compute::field_ref(field_ref); + } else { + exprs[i] = compute::field_ref(names[i]); + } } auto fields = dataset_schema.fields(); for (const auto& aug_field : kAugmentedFields) { diff --git a/python/pyarrow/tests/parquet/test_dataset.py b/python/pyarrow/tests/parquet/test_dataset.py index c1df9e0f6fa..6fdc7435418 100644 --- a/python/pyarrow/tests/parquet/test_dataset.py +++ b/python/pyarrow/tests/parquet/test_dataset.py @@ -1877,12 +1877,3 @@ def test_write_to_dataset_conflicting_keywords(tempdir): use_legacy_dataset=False, metadata_collector=[], file_visitor=lambda x: x) - - -def test_read_table_nested_columns(tempdir): - table = pa.table({"user_id": ["abc123", "qrs456"], - "interaction": [{"type": "click", "element": "button"}, - {"type": "scroll", "element": "window"}]}) - pq.write_table(table, tempdir / 'example.parquet') - pq.read_table(tempdir / "example.parquet", - columns=["user_id", "interaction.type"]) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 5e2135fde42..f002f68d549 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -4787,3 +4787,13 @@ def test_write_dataset_with_scanner_use_projected_schema(tempdir): ds.write_dataset( scanner, tempdir, partitioning=["original_column"], format="ipc" ) + +def test_read_table_nested_columns(tempdir): + table = pa.table({"user_id": ["abc123", "qrs456"], + "interaction": [{"type": "click", "element": "button"}, + {"type": "scroll", "element": "window"}]}) + pq.write_table(table, tempdir / 'example.parquet') + table = pq.read_table(tempdir / "example.parquet", + columns=["user_id", ".interaction.type"]) + breakpoint() + From e3367dda9ff25e4a0b7b52e4462ffa604446b21d Mon Sep 17 00:00:00 2001 From: Miles Granger Date: Wed, 5 Oct 2022 11:42:15 +0200 Subject: [PATCH 03/16] Support dot path selecting into ListArray --- .../arrow/compute/kernels/scalar_nested.cc | 19 ++++++++++++++++++- python/pyarrow/tests/test_dataset.py | 12 +++++++----- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index 0b6118812a4..060b7df592a 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -18,12 +18,15 @@ // Vector kernels involving nested types #include "arrow/array/array_base.h" +#include "arrow/array/array_nested.h" #include "arrow/array/builder_nested.h" +#include "arrow/array/data.h" #include "arrow/compute/api_scalar.h" #include "arrow/compute/kernels/common.h" #include "arrow/result.h" #include "arrow/util/bit_block_counter.h" #include "arrow/util/bitmap_generate.h" +#include "arrow/util/logging.h" namespace arrow { namespace compute { @@ -242,6 +245,20 @@ struct StructFieldFunctor { union_array.GetFlattenedField(index, ctx->memory_pool())); break; } + case Type::LIST: { + const auto& list_array = checked_cast(*current); + ARROW_LOG(INFO) << list_array.ToString(); + ARROW_ASSIGN_OR_RAISE( + Datum indices, + CallFunction("add", {list_array.offsets()->Slice( + 0, list_array.offsets()->length() - 1), + MakeScalar(index)})); + ARROW_ASSIGN_OR_RAISE(Datum result, CallFunction("take", {list_array.values(), + std::move(indices)})); + current = result.make_array(); + ARROW_LOG(INFO) << current->ToString(); + break; + } default: // Should have been checked in ResolveStructFieldType return Status::TypeError("struct_field: cannot reference child field of type ", @@ -265,7 +282,7 @@ struct StructFieldFunctor { static bool ValidParentType(const DataType& type) { return type.id() == Type::STRUCT || type.id() == Type::DENSE_UNION || - type.id() == Type::SPARSE_UNION; + type.id() == Type::SPARSE_UNION || type.id() == Type::LIST; } }; diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index f002f68d549..e0e48b4f3d9 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -4790,10 +4790,12 @@ def test_write_dataset_with_scanner_use_projected_schema(tempdir): def test_read_table_nested_columns(tempdir): table = pa.table({"user_id": ["abc123", "qrs456"], - "interaction": [{"type": "click", "element": "button"}, - {"type": "scroll", "element": "window"}]}) - pq.write_table(table, tempdir / 'example.parquet') - table = pq.read_table(tempdir / "example.parquet", - columns=["user_id", ".interaction.type"]) + "interaction": [ + {"type": "click", "element": "button", "values": [1, 2], "structs":[{"foo": "bar"}]}, + {"type": "scroll", "element": "window", "values": [3, 4], "structs":[{"fizz": "buzz"}]} + ]}) + ds.write_dataset(table, tempdir / "table", format="ipc") + table = ds.dataset(tempdir / "table", format="ipc")\ + .to_table(columns=["user_id", ".interaction.type", ".interaction.values[0]", ".interaction.structs[0].foo"]) breakpoint() From bbea2534d8cc1ef3c08556130a06942b4d716a19 Mon Sep 17 00:00:00 2001 From: Miles Granger Date: Wed, 5 Oct 2022 14:57:23 +0200 Subject: [PATCH 04/16] (Ugly) prototype for specifying struct output [skip ci] --- python/pyarrow/_dataset.pyx | 35 +++++++++++++++++++++++++++- python/pyarrow/tests/test_dataset.py | 33 ++++++++++++++++++++------ 2 files changed, 60 insertions(+), 8 deletions(-) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 57029b8da5c..1074db7d1b8 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -329,7 +329,40 @@ cdef class Dataset(_Weakrefable): ------- table : Table """ - return self.scanner(**kwargs).to_table() + import pyarrow.compute as pc + + original_columns = kwargs.get('columns') + if original_columns is not None: + if (isinstance(original_columns, list) + and + any(isinstance(c, dict) for c in original_columns)): + + columns_flat = list() + for col in original_columns: + if isinstance(col, dict): + mapping = list(col.values())[0] + for c in mapping.values(): + columns_flat.append(c) + else: + columns_flat.append(col) + kwargs['columns'] = columns_flat + + table = self.scanner(**kwargs).to_table() + + if original_columns != kwargs.get("columns"): + for col in original_columns: + if isinstance(col, dict): + mapping = list(col.values())[0] + current_names = list(mapping.values()) + new_names = list(mapping.keys()) + columns = table.select(current_names) + table = table.drop(current_names) + table = table.append_column( + list(col.keys())[0], + pc.make_struct(*columns, field_names=new_names)) + return table + + def take(self, object indices, **kwargs): """ diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index e0e48b4f3d9..29e39f84cc5 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -4787,15 +4787,34 @@ def test_write_dataset_with_scanner_use_projected_schema(tempdir): ds.write_dataset( scanner, tempdir, partitioning=["original_column"], format="ipc" ) - + + def test_read_table_nested_columns(tempdir): table = pa.table({"user_id": ["abc123", "qrs456"], "interaction": [ - {"type": "click", "element": "button", "values": [1, 2], "structs":[{"foo": "bar"}]}, - {"type": "scroll", "element": "window", "values": [3, 4], "structs":[{"fizz": "buzz"}]} + {"type": "click", "element": "button", "values": [ + 1, 2], "structs":[{"foo": "bar"}]}, + {"type": "scroll", "element": "window", "values": [ + 3, 4], "structs":[{"fizz": "buzz"}]} ]}) ds.write_dataset(table, tempdir / "table", format="ipc") - table = ds.dataset(tempdir / "table", format="ipc")\ - .to_table(columns=["user_id", ".interaction.type", ".interaction.values[0]", ".interaction.structs[0].foo"]) - breakpoint() - + ds1 = ds.dataset(tempdir / "table", format="ipc") + + # Dot path to read subsets of nested data + table = ds1.to_table(columns=["user_id", ".interaction.type", + ".interaction.values[0]", ".interaction.structs[0].foo"]) + assert table.to_pylist() == [ + {'user_id': 'abc123', '.interaction.type': 'click', '.interaction.values[0]': 1, '.interaction.structs[0].foo': 'bar'}, + {'user_id': 'qrs456', '.interaction.type': 'scroll', '.interaction.values[0]': 3, '.interaction.structs[0].foo': None} + ] + # Dot path with directive for constructing a struct from the subset data + table = ds1.to_table(columns=[ + "user_id", + {"interaction": {"type": ".interaction.type", + "value": ".interaction.values[0]", + "foo": ".interaction.structs[0].foo"} + }]) + assert table.to_pylist() == [ + {'user_id': 'abc123', 'interaction': {'type': 'click', 'value': 1, 'foo': 'bar'}}, + {'user_id': 'qrs456', 'interaction': {'type': 'scroll', 'value': 3, 'foo': None}} + ] From f7143771b453d4bc8e0b46118cf9681c1771e125 Mon Sep 17 00:00:00 2001 From: Miles Granger Date: Thu, 6 Oct 2022 10:54:18 +0200 Subject: [PATCH 05/16] Use list_element kernel [skip ci] --- cpp/src/arrow/compute/kernels/scalar_nested.cc | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index 060b7df592a..9b7c386ee51 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -247,16 +247,9 @@ struct StructFieldFunctor { } case Type::LIST: { const auto& list_array = checked_cast(*current); - ARROW_LOG(INFO) << list_array.ToString(); ARROW_ASSIGN_OR_RAISE( - Datum indices, - CallFunction("add", {list_array.offsets()->Slice( - 0, list_array.offsets()->length() - 1), - MakeScalar(index)})); - ARROW_ASSIGN_OR_RAISE(Datum result, CallFunction("take", {list_array.values(), - std::move(indices)})); + Datum result, CallFunction("list_element", {list_array, {Datum(index)}})); current = result.make_array(); - ARROW_LOG(INFO) << current->ToString(); break; } default: From ecf9fc8289433e8e86a324a031155a851b177a60 Mon Sep 17 00:00:00 2001 From: Miles Granger Date: Fri, 7 Oct 2022 09:10:50 +0200 Subject: [PATCH 06/16] Parameterize over ipc and parquet --- cpp/submodules/parquet-testing | 2 +- python/pyarrow/tests/test_dataset.py | 45 +++++++++++++--------------- 2 files changed, 21 insertions(+), 26 deletions(-) diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index e13af117de7..19fcd4d5e8a 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit e13af117de7c4f0a4d9908ae3827b3ab119868f3 +Subproject commit 19fcd4d5e8a6bc66a8ba7c37b05eb3e698e73c2b diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 29e39f84cc5..ccff4bc9770 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -4789,32 +4789,27 @@ def test_write_dataset_with_scanner_use_projected_schema(tempdir): ) -def test_read_table_nested_columns(tempdir): +@pytest.mark.parametrize("format", ("ipc", "parquet")) +def test_read_table_nested_columns(tempdir, format): + if format == "parquet": + pytest.importorskip("pyarrow.parquet") + table = pa.table({"user_id": ["abc123", "qrs456"], "interaction": [ - {"type": "click", "element": "button", "values": [ - 1, 2], "structs":[{"foo": "bar"}]}, - {"type": "scroll", "element": "window", "values": [ - 3, 4], "structs":[{"fizz": "buzz"}]} - ]}) - ds.write_dataset(table, tempdir / "table", format="ipc") - ds1 = ds.dataset(tempdir / "table", format="ipc") - + {"type": "click", "element": "button", "values": [ + 1, 2], "structs":[{"foo": "bar"}]}, + {"type": "scroll", "element": "window", "values": [ + 3, 4], "structs":[{"fizz": "buzz"}]} + ]}) + ds.write_dataset(table, tempdir / "table", format=format) + ds1 = ds.dataset(tempdir / "table", format=format) + # Dot path to read subsets of nested data - table = ds1.to_table(columns=["user_id", ".interaction.type", - ".interaction.values[0]", ".interaction.structs[0].foo"]) + table = ds1.to_table( + columns=["user_id", ".interaction.type", ".interaction.values", + ".interaction.structs"]) assert table.to_pylist() == [ - {'user_id': 'abc123', '.interaction.type': 'click', '.interaction.values[0]': 1, '.interaction.structs[0].foo': 'bar'}, - {'user_id': 'qrs456', '.interaction.type': 'scroll', '.interaction.values[0]': 3, '.interaction.structs[0].foo': None} - ] - # Dot path with directive for constructing a struct from the subset data - table = ds1.to_table(columns=[ - "user_id", - {"interaction": {"type": ".interaction.type", - "value": ".interaction.values[0]", - "foo": ".interaction.structs[0].foo"} - }]) - assert table.to_pylist() == [ - {'user_id': 'abc123', 'interaction': {'type': 'click', 'value': 1, 'foo': 'bar'}}, - {'user_id': 'qrs456', 'interaction': {'type': 'scroll', 'value': 3, 'foo': None}} - ] + {'user_id': 'abc123', '.interaction.type': 'click', '.interaction.values': [1, 2], + '.interaction.structs': [{'fizz': None, 'foo': 'bar'}]}, + {'user_id': 'qrs456', '.interaction.type': 'scroll', '.interaction.values': [3, 4], + '.interaction.structs': [{'fizz': 'buzz', 'foo': None}]}] From b7ac62c5e99f7138f765431e027b2376f00fb3d3 Mon Sep 17 00:00:00 2001 From: Miles Granger Date: Fri, 7 Oct 2022 10:35:33 +0200 Subject: [PATCH 07/16] Reset scalar_nested.cc - remove Type::LIST --- cpp/src/arrow/compute/kernels/scalar_nested.cc | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index 9b7c386ee51..0b6118812a4 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -18,15 +18,12 @@ // Vector kernels involving nested types #include "arrow/array/array_base.h" -#include "arrow/array/array_nested.h" #include "arrow/array/builder_nested.h" -#include "arrow/array/data.h" #include "arrow/compute/api_scalar.h" #include "arrow/compute/kernels/common.h" #include "arrow/result.h" #include "arrow/util/bit_block_counter.h" #include "arrow/util/bitmap_generate.h" -#include "arrow/util/logging.h" namespace arrow { namespace compute { @@ -245,13 +242,6 @@ struct StructFieldFunctor { union_array.GetFlattenedField(index, ctx->memory_pool())); break; } - case Type::LIST: { - const auto& list_array = checked_cast(*current); - ARROW_ASSIGN_OR_RAISE( - Datum result, CallFunction("list_element", {list_array, {Datum(index)}})); - current = result.make_array(); - break; - } default: // Should have been checked in ResolveStructFieldType return Status::TypeError("struct_field: cannot reference child field of type ", @@ -275,7 +265,7 @@ struct StructFieldFunctor { static bool ValidParentType(const DataType& type) { return type.id() == Type::STRUCT || type.id() == Type::DENSE_UNION || - type.id() == Type::SPARSE_UNION || type.id() == Type::LIST; + type.id() == Type::SPARSE_UNION; } }; From 9f924b5f45f6f7548222bdd05c98cc414d1ee25a Mon Sep 17 00:00:00 2001 From: Miles Granger Date: Fri, 7 Oct 2022 10:37:49 +0200 Subject: [PATCH 08/16] [skip ci] Reset _dataset.pyx - remove make_struct logic prototype --- python/pyarrow/_dataset.pyx | 35 +---------------------------------- 1 file changed, 1 insertion(+), 34 deletions(-) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 1074db7d1b8..57029b8da5c 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -329,40 +329,7 @@ cdef class Dataset(_Weakrefable): ------- table : Table """ - import pyarrow.compute as pc - - original_columns = kwargs.get('columns') - if original_columns is not None: - if (isinstance(original_columns, list) - and - any(isinstance(c, dict) for c in original_columns)): - - columns_flat = list() - for col in original_columns: - if isinstance(col, dict): - mapping = list(col.values())[0] - for c in mapping.values(): - columns_flat.append(c) - else: - columns_flat.append(col) - kwargs['columns'] = columns_flat - - table = self.scanner(**kwargs).to_table() - - if original_columns != kwargs.get("columns"): - for col in original_columns: - if isinstance(col, dict): - mapping = list(col.values())[0] - current_names = list(mapping.values()) - new_names = list(mapping.keys()) - columns = table.select(current_names) - table = table.drop(current_names) - table = table.append_column( - list(col.keys())[0], - pc.make_struct(*columns, field_names=new_names)) - return table - - + return self.scanner(**kwargs).to_table() def take(self, object indices, **kwargs): """ From 3602fcc7e144d9c326c6d02db469cf4a2a9d535f Mon Sep 17 00:00:00 2001 From: Miles Granger Date: Fri, 7 Oct 2022 10:41:21 +0200 Subject: [PATCH 09/16] [skip ci] submodule update --- cpp/submodules/parquet-testing | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index 19fcd4d5e8a..e13af117de7 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit 19fcd4d5e8a6bc66a8ba7c37b05eb3e698e73c2b +Subproject commit e13af117de7c4f0a4d9908ae3827b3ab119868f3 From 34673a8b6c96e0f33bb348be2b56130720d887ac Mon Sep 17 00:00:00 2001 From: Miles Granger Date: Fri, 7 Oct 2022 12:07:23 +0200 Subject: [PATCH 10/16] [skip ci] optional dot prefix, check schema before going to dotted path --- cpp/src/arrow/dataset/scanner.cc | 9 +++++++-- python/pyarrow/tests/test_dataset.py | 21 +++++++++++---------- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/dataset/scanner.cc b/cpp/src/arrow/dataset/scanner.cc index 3b4029323cd..1a8be96c633 100644 --- a/cpp/src/arrow/dataset/scanner.cc +++ b/cpp/src/arrow/dataset/scanner.cc @@ -751,8 +751,13 @@ Result ProjectionDescr::FromNames(std::vector name const Schema& dataset_schema) { std::vector exprs(names.size()); for (size_t i = 0; i < exprs.size(); ++i) { - if (names[i].rfind(".", 0) == 0) { - ARROW_ASSIGN_OR_RAISE(auto field_ref, FieldRef::FromDotPath(names[i])); + // If name isn't in schema, try finding it by dotted path. + if (dataset_schema.GetFieldByName(names[i]) == nullptr) { + auto name = names[i]; + if (name.rfind(".", 0) != 0) { + name = "." + name; + } + ARROW_ASSIGN_OR_RAISE(auto field_ref, FieldRef::FromDotPath(name)); exprs[i] = compute::field_ref(field_ref); } else { exprs[i] = compute::field_ref(names[i]); diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index ccff4bc9770..9a4303ce291 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -4795,21 +4795,22 @@ def test_read_table_nested_columns(tempdir, format): pytest.importorskip("pyarrow.parquet") table = pa.table({"user_id": ["abc123", "qrs456"], + "a.dotted.field": [1, 2], "interaction": [ - {"type": "click", "element": "button", "values": [ - 1, 2], "structs":[{"foo": "bar"}]}, - {"type": "scroll", "element": "window", "values": [ - 3, 4], "structs":[{"fizz": "buzz"}]} + {"type": "click", "element": "button", + "values": [1, 2], "structs":[{"foo": "bar"}]}, + {"type": "scroll", "element": "window", + "values": [3, 4], "structs":[{"fizz": "buzz"}]} ]}) ds.write_dataset(table, tempdir / "table", format=format) ds1 = ds.dataset(tempdir / "table", format=format) # Dot path to read subsets of nested data table = ds1.to_table( - columns=["user_id", ".interaction.type", ".interaction.values", - ".interaction.structs"]) + columns=["user_id", "interaction.type", "interaction.values", + "interaction.structs", "a.dotted.field"]) assert table.to_pylist() == [ - {'user_id': 'abc123', '.interaction.type': 'click', '.interaction.values': [1, 2], - '.interaction.structs': [{'fizz': None, 'foo': 'bar'}]}, - {'user_id': 'qrs456', '.interaction.type': 'scroll', '.interaction.values': [3, 4], - '.interaction.structs': [{'fizz': 'buzz', 'foo': None}]}] + {'user_id': 'abc123', 'interaction.type': 'click', 'interaction.values': [1, 2], + 'interaction.structs': [{'fizz': None, 'foo': 'bar'}], 'a.dotted.field': 1}, + {'user_id': 'qrs456', 'interaction.type': 'scroll', 'interaction.values': [3, 4], + 'interaction.structs': [{'fizz': 'buzz', 'foo': None}], 'a.dotted.field': 2}] From 1b4914f4493ae5658a9f02186f07d3f308445ae2 Mon Sep 17 00:00:00 2001 From: Miles Granger Date: Mon, 10 Oct 2022 11:54:43 +0200 Subject: [PATCH 11/16] [skip ci] Take last delimited dot path as column name --- cpp/src/arrow/dataset/scanner.cc | 2 ++ python/pyarrow/tests/test_dataset.py | 9 +++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/dataset/scanner.cc b/cpp/src/arrow/dataset/scanner.cc index 1a8be96c633..043843307cf 100644 --- a/cpp/src/arrow/dataset/scanner.cc +++ b/cpp/src/arrow/dataset/scanner.cc @@ -758,6 +758,8 @@ Result ProjectionDescr::FromNames(std::vector name name = "." + name; } ARROW_ASSIGN_OR_RAISE(auto field_ref, FieldRef::FromDotPath(name)); + // safe as we know there is at least 1 dot. + names[i] = name.substr(name.rfind(".") + 1); exprs[i] = compute::field_ref(field_ref); } else { exprs[i] = compute::field_ref(names[i]); diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 9a4303ce291..1b1e049e5c9 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -4810,7 +4810,8 @@ def test_read_table_nested_columns(tempdir, format): columns=["user_id", "interaction.type", "interaction.values", "interaction.structs", "a.dotted.field"]) assert table.to_pylist() == [ - {'user_id': 'abc123', 'interaction.type': 'click', 'interaction.values': [1, 2], - 'interaction.structs': [{'fizz': None, 'foo': 'bar'}], 'a.dotted.field': 1}, - {'user_id': 'qrs456', 'interaction.type': 'scroll', 'interaction.values': [3, 4], - 'interaction.structs': [{'fizz': 'buzz', 'foo': None}], 'a.dotted.field': 2}] + {'user_id': 'abc123', 'type': 'click', 'values': [1, 2], + 'structs': [{'fizz': None, 'foo': 'bar'}], 'a.dotted.field': 1}, + {'user_id': 'qrs456', 'type': 'scroll', 'values': [3, 4], + 'structs': [{'fizz': 'buzz', 'foo': None}], 'a.dotted.field': 2} + ] From 038b87f3ba4cd83c0ff5804279b6987cd7070fb3 Mon Sep 17 00:00:00 2001 From: Miles Granger Date: Fri, 14 Oct 2022 05:12:11 +0200 Subject: [PATCH 12/16] Add C++ test for ProjectedScanNestedFromNames w/ dot path --- cpp/src/arrow/dataset/scanner_test.cc | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/cpp/src/arrow/dataset/scanner_test.cc b/cpp/src/arrow/dataset/scanner_test.cc index 8a10037b548..b968bd23983 100644 --- a/cpp/src/arrow/dataset/scanner_test.cc +++ b/cpp/src/arrow/dataset/scanner_test.cc @@ -1074,6 +1074,24 @@ TEST_P(TestScanner, ProjectedScanNested) { AssertScanBatchesUnorderedEqualRepetitionsOf(MakeScanner(batch_in), batch_out); } +TEST_P(TestScanner, ProjectedScanNestedFromNames) { + SetSchema({ + field("struct", struct_({field("i32", int32()), field("f64", float64())})), + field("nested", struct_({field("left", int32()), + field("right", struct_({field("i32", int32()), + field("f64", float64())}))})), + }); + ASSERT_OK_AND_ASSIGN(auto descr, + ProjectionDescr::FromNames({"struct.i32", "nested.right.f64"}, + *options_->dataset_schema)) + SetProjection(options_.get(), std::move(descr)); + auto batch_in = ConstantArrayGenerator::Zeroes(GetParam().items_per_batch, schema_); + auto batch_out = ConstantArrayGenerator::Zeroes( + GetParam().items_per_batch, + schema({field("struct.i32", int32()), field("nested.right.f64", float64())})); + AssertScanBatchesUnorderedEqualRepetitionsOf(MakeScanner(batch_in), batch_out); +} + TEST_P(TestScanner, MaterializeMissingColumn) { SetSchema({field("i32", int32()), field("f64", float64())}); auto batch_missing_f64 = ConstantArrayGenerator::Zeroes( From 7f93599be817c62502d8f7380e179081cf3373e0 Mon Sep 17 00:00:00 2001 From: Miles Granger Date: Mon, 17 Oct 2022 20:26:12 +0200 Subject: [PATCH 13/16] Check leading dot in dotted path --- cpp/src/arrow/dataset/scanner_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/dataset/scanner_test.cc b/cpp/src/arrow/dataset/scanner_test.cc index b968bd23983..d6cacf33f9f 100644 --- a/cpp/src/arrow/dataset/scanner_test.cc +++ b/cpp/src/arrow/dataset/scanner_test.cc @@ -1082,13 +1082,13 @@ TEST_P(TestScanner, ProjectedScanNestedFromNames) { field("f64", float64())}))})), }); ASSERT_OK_AND_ASSIGN(auto descr, - ProjectionDescr::FromNames({"struct.i32", "nested.right.f64"}, + ProjectionDescr::FromNames({".struct.i32", "nested.right.f64"}, *options_->dataset_schema)) SetProjection(options_.get(), std::move(descr)); auto batch_in = ConstantArrayGenerator::Zeroes(GetParam().items_per_batch, schema_); auto batch_out = ConstantArrayGenerator::Zeroes( GetParam().items_per_batch, - schema({field("struct.i32", int32()), field("nested.right.f64", float64())})); + schema({field(".struct.i32", int32()), field("nested.right.f64", float64())})); AssertScanBatchesUnorderedEqualRepetitionsOf(MakeScanner(batch_in), batch_out); } From 83cfdb46cdb97abab7c8342faadd99f50c97d46f Mon Sep 17 00:00:00 2001 From: Miles Granger Date: Mon, 17 Oct 2022 20:34:51 +0200 Subject: [PATCH 14/16] Add some nulls in test case --- python/pyarrow/tests/test_dataset.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 1b1e049e5c9..b53f3dcac99 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -4797,10 +4797,10 @@ def test_read_table_nested_columns(tempdir, format): table = pa.table({"user_id": ["abc123", "qrs456"], "a.dotted.field": [1, 2], "interaction": [ - {"type": "click", "element": "button", - "values": [1, 2], "structs":[{"foo": "bar"}]}, + {"type": None, "element": "button", + "values": [1, 2], "structs":[{"foo": "bar"}, None]}, {"type": "scroll", "element": "window", - "values": [3, 4], "structs":[{"fizz": "buzz"}]} + "values": [None, 3, 4], "structs":[{"fizz": "buzz"}]} ]}) ds.write_dataset(table, tempdir / "table", format=format) ds1 = ds.dataset(tempdir / "table", format=format) @@ -4810,8 +4810,8 @@ def test_read_table_nested_columns(tempdir, format): columns=["user_id", "interaction.type", "interaction.values", "interaction.structs", "a.dotted.field"]) assert table.to_pylist() == [ - {'user_id': 'abc123', 'type': 'click', 'values': [1, 2], - 'structs': [{'fizz': None, 'foo': 'bar'}], 'a.dotted.field': 1}, - {'user_id': 'qrs456', 'type': 'scroll', 'values': [3, 4], + {'user_id': 'abc123', 'type': None, 'values': [1, 2], + 'structs': [{'fizz': None, 'foo': 'bar'}, None], 'a.dotted.field': 1}, + {'user_id': 'qrs456', 'type': 'scroll', 'values': [None, 3, 4], 'structs': [{'fizz': 'buzz', 'foo': None}], 'a.dotted.field': 2} ] From 664107fc5933ad38cb1835a4ad053cfd0e30f598 Mon Sep 17 00:00:00 2001 From: Miles Granger Date: Tue, 18 Oct 2022 15:30:19 +0200 Subject: [PATCH 15/16] Fixup: update batch out field names --- cpp/src/arrow/dataset/scanner_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/dataset/scanner_test.cc b/cpp/src/arrow/dataset/scanner_test.cc index d6cacf33f9f..f2d345e8339 100644 --- a/cpp/src/arrow/dataset/scanner_test.cc +++ b/cpp/src/arrow/dataset/scanner_test.cc @@ -1088,7 +1088,7 @@ TEST_P(TestScanner, ProjectedScanNestedFromNames) { auto batch_in = ConstantArrayGenerator::Zeroes(GetParam().items_per_batch, schema_); auto batch_out = ConstantArrayGenerator::Zeroes( GetParam().items_per_batch, - schema({field(".struct.i32", int32()), field("nested.right.f64", float64())})); + schema({field("i32", int32()), field("f64", float64())})); AssertScanBatchesUnorderedEqualRepetitionsOf(MakeScanner(batch_in), batch_out); } From ae5b01d2ce7c67fba5fc17028e6230fb550d2535 Mon Sep 17 00:00:00 2001 From: Miles Granger Date: Tue, 18 Oct 2022 16:11:45 +0200 Subject: [PATCH 16/16] AssertBatchesEqual, check metadata --- cpp/src/arrow/dataset/test_util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/dataset/test_util.h b/cpp/src/arrow/dataset/test_util.h index 17065bfd7d2..02464f0c38d 100644 --- a/cpp/src/arrow/dataset/test_util.h +++ b/cpp/src/arrow/dataset/test_util.h @@ -157,7 +157,7 @@ class DatasetFixtureMixin : public ::testing::Test { std::shared_ptr lhs; ASSERT_OK(expected->ReadNext(&lhs)); EXPECT_NE(lhs, nullptr); - AssertBatchesEqual(*lhs, batch); + AssertBatchesEqual(*lhs, batch, true); } /// \brief Ensure that record batches found in reader are equals to the