From a0c2817617be38e75ea6c825797467f757d7c455 Mon Sep 17 00:00:00 2001 From: benibus Date: Mon, 17 Apr 2023 13:53:16 -0400 Subject: [PATCH 01/21] Implement FieldPath::GetFlattened methods --- cpp/src/arrow/type.cc | 81 +++++++++++++++++++++++++++++++++++++++---- cpp/src/arrow/type.h | 12 +++++++ 2 files changed, 86 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 606b231f6f7..c2155fb2b2a 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -1080,7 +1080,7 @@ class ChunkedColumn { const std::shared_ptr& type() const { return type_; } - ChunkedColumnVector Flatten() const; + ChunkedColumnVector FlattenZeroCopy() const; Result> ToChunkedArray() const { if (num_chunks() == 0) { @@ -1131,7 +1131,7 @@ class ChunkedArrayData : public ChunkedColumn { // Return a vector of ChunkedColumns - one for each struct field. // Unlike ChunkedArray::Flatten, this is zero-copy and doesn't merge parent/child // validity bitmaps. -ChunkedColumnVector ChunkedColumn::Flatten() const { +ChunkedColumnVector ChunkedColumn::FlattenZeroCopy() const { DCHECK_EQ(type()->id(), Type::STRUCT); ChunkedColumnVector columns(type()->num_fields()); @@ -1152,7 +1152,11 @@ ChunkedColumnVector ChunkedColumn::Flatten() const { } struct FieldPathGetImpl { + static const DataType& GetType(const Array& array) { return *array.type(); } static const DataType& GetType(const ArrayData& data) { return *data.type; } + static const DataType& GetType(const ChunkedArray& chunked_array) { + return *chunked_array.type(); + } static const DataType& GetType(const ChunkedColumn& column) { return *column.type(); } static void Summarize(const FieldVector& fields, std::stringstream* ss) { @@ -1172,6 +1176,10 @@ struct FieldPathGetImpl { *ss << "}"; } + static Status NonStructError() { + return Status::NotImplemented("Get child data of non-struct array"); + } + template static Status IndexError(const FieldPath* path, int out_of_range_depth, const std::vector& children) { @@ -1211,7 +1219,7 @@ struct FieldPathGetImpl { const T* out; while (true) { if (children == nullptr) { - return Status::NotImplemented("Get child data of non-struct array"); + return NonStructError(); } auto index = (*path)[depth]; @@ -1262,7 +1270,7 @@ struct FieldPathGetImpl { if (parent->type()->id() != Type::STRUCT) { return nullptr; } - children = parent->Flatten(); + children = parent->FlattenZeroCopy(); return &children; })); @@ -1280,6 +1288,31 @@ struct FieldPathGetImpl { return &data->child_data; }); } + + static Status Flatten(const Array& array, ArrayVector* out) { + return checked_cast(array).Flatten().Value(out); + } + static Status Flatten(const ChunkedArray& chunked_array, ChunkedArrayVector* out) { + return chunked_array.Flatten().Value(out); + } + + template + static Result GetFlattened(const FieldPath* path, + const std::vector& toplevel_children) { + std::vector children; + Status error; + auto result = FieldPathGetImpl::Get( + path, &toplevel_children, + [&children, &error](const T& parent) -> const std::vector* { + if (parent->type()->id() != Type::STRUCT) { + return nullptr; + } + error = Flatten(*parent, &children); + return error.ok() ? &children : nullptr; + }); + ARROW_RETURN_NOT_OK(error); + return result; + } }; Result> FieldPath::Get(const Schema& schema) const { @@ -1330,7 +1363,7 @@ Result> FieldPath::Get(const Array& array) const { Result> FieldPath::Get(const ArrayData& data) const { if (data.type->id() != Type::STRUCT) { - return Status::NotImplemented("Get child data of non-struct array"); + return FieldPathGetImpl::NonStructError(); } return FieldPathGetImpl::Get(this, data.child_data); } @@ -1338,12 +1371,46 @@ Result> FieldPath::Get(const ArrayData& data) const { Result> FieldPath::Get( const ChunkedArray& chunked_array) const { if (chunked_array.type()->id() != Type::STRUCT) { - return Status::NotImplemented("Get child data of non-struct chunked array"); + return FieldPathGetImpl::NonStructError(); } - auto columns = ChunkedArrayRef(chunked_array).Flatten(); + auto columns = ChunkedArrayRef(chunked_array).FlattenZeroCopy(); return FieldPathGetImpl::Get(this, columns); } +Result> FieldPath::GetFlattened(const Array& array) const { + if (array.type_id() != Type::STRUCT) { + return FieldPathGetImpl::NonStructError(); + } + auto&& struct_array = checked_cast(array); + if (struct_array.null_count() == 0) { + return FieldPathGetImpl::GetFlattened(this, struct_array.fields()); + } + ARROW_ASSIGN_OR_RAISE(auto children, struct_array.Flatten()); + return FieldPathGetImpl::GetFlattened(this, children); +} + +Result> FieldPath::GetFlattened(const ArrayData& data) const { + ARROW_ASSIGN_OR_RAISE(auto array, GetFlattened(*MakeArray(data.Copy()))); + return array->data(); +} + +Result> FieldPath::GetFlattened( + const ChunkedArray& chunked_array) const { + if (chunked_array.type()->id() != Type::STRUCT) { + return FieldPathGetImpl::NonStructError(); + } + ARROW_ASSIGN_OR_RAISE(auto children, chunked_array.Flatten()); + return FieldPathGetImpl::GetFlattened(this, children); +} + +Result> FieldPath::GetFlattened(const RecordBatch& batch) const { + return FieldPathGetImpl::GetFlattened(this, batch.columns()); +} + +Result> FieldPath::GetFlattened(const Table& table) const { + return FieldPathGetImpl::GetFlattened(this, table.columns()); +} + FieldRef::FieldRef(FieldPath indices) : impl_(std::move(indices)) {} void FieldRef::Flatten(std::vector children) { diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index a3e78eeb722..46537010c89 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1699,6 +1699,18 @@ class ARROW_EXPORT FieldPath { /// \brief Retrieve the referenced child from a ChunkedArray Result> Get(const ChunkedArray& chunked_array) const; + /// \brief Retrieve the referenced child/column from an Array, ArrayData, ChunkedArray, + /// RecordBatch, or Table + /// + /// Unlike `FieldPath::Get`, these variants are not zero-copy and the retrieved child's + /// null bitmap is ANDed with its parent's + Result> GetFlattened(const Array& array) const; + Result> GetFlattened(const ArrayData& data) const; + Result> GetFlattened( + const ChunkedArray& chunked_array) const; + Result> GetFlattened(const RecordBatch& batch) const; + Result> GetFlattened(const Table& table) const; + private: std::vector indices_; }; From 4396f6ce805c2ac162f3afae54f2c9c3eba8688a Mon Sep 17 00:00:00 2001 From: benibus Date: Mon, 17 Apr 2023 13:58:08 -0400 Subject: [PATCH 02/21] Fix offset handling in ChunkedColumn::FlattenZeroCopy --- cpp/src/arrow/type.cc | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index c2155fb2b2a..0a594dd4e3b 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -1139,10 +1139,12 @@ ChunkedColumnVector ChunkedColumn::FlattenZeroCopy() const { const auto& child_type = type()->field(column_idx)->type(); ArrayDataVector chunks(num_chunks()); for (int chunk_idx = 0; chunk_idx < num_chunks(); ++chunk_idx) { - const auto& child_data = chunk(chunk_idx)->child_data; - DCHECK_EQ(columns.size(), child_data.size()); - DCHECK(child_type->Equals(child_data[column_idx]->type)); - chunks[chunk_idx] = child_data[column_idx]; + const auto& parent = chunk(chunk_idx); + const auto& children = parent->child_data; + DCHECK_EQ(columns.size(), children.size()); + auto child = children[column_idx]->Slice(parent->offset, parent->length); + DCHECK(child_type->Equals(child->type)); + chunks[chunk_idx] = child; } columns[column_idx] = std::make_shared(child_type, std::move(chunks)); From 665c3dcc79b37a4c39aca7f82b1f4b638294bc8d Mon Sep 17 00:00:00 2001 From: benibus Date: Mon, 17 Apr 2023 14:04:13 -0400 Subject: [PATCH 03/21] Revamp tests for FieldPath --- cpp/src/arrow/type_test.cc | 502 +++++++++++++++++++++++++------------ 1 file changed, 335 insertions(+), 167 deletions(-) diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index 4fb9598f936..dab2b4fe17a 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -37,6 +37,7 @@ #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" #include "arrow/util/key_value_metadata.h" +#include "arrow/util/logging.h" namespace arrow { @@ -364,152 +365,354 @@ TEST(TestField, TestMerge) { } } -TEST(TestFieldPath, Basics) { - auto f0 = field("alpha", int32()); - auto f1 = field("beta", int32()); - auto f2 = field("alpha", int32()); - auto f3 = field("beta", int32()); - Schema s({f0, f1, f2, f3}); +struct FieldPathTestCase { + struct OutputValues { + explicit OutputValues(std::vector indices = {}) + : path(FieldPath(std::move(indices))) {} - // retrieving a field with single-element FieldPath is equivalent to Schema::field - for (int index = 0; index < s.num_fields(); ++index) { - ASSERT_OK_AND_EQ(s.field(index), FieldPath({index}).Get(s)); + template + auto&& Get() const; + + FieldPath path; + std::shared_ptr field; + std::shared_ptr array; + std::shared_ptr chunked_array; + }; + + static constexpr int kNumColumns = 2; + static constexpr int kNumRows = 100; + static constexpr int kRandomSeed = 0xbeef; + + // Input for the FieldPath::Get functions in multiple forms + std::shared_ptr schema; + std::shared_ptr type; + std::shared_ptr array; + std::shared_ptr record_batch; + std::shared_ptr chunked_array; + std::shared_ptr table; + + template + auto&& GetInput() const; + + // Number of chunks for each column in the input Table + const std::array num_column_chunks = {15, 20}; + // Number of chunks in the input ChunkedArray + const int num_chunks = 15; + + // Expected outputs for each child; + OutputValues v0{{0}}, v1{{1}}; + OutputValues v1_0{{1, 0}}, v1_1{{1, 1}}; + OutputValues v1_1_0{{1, 1, 0}}, v1_1_1{{1, 1, 1}}; + // Expected outputs for nested children with null flattening applied + OutputValues v1_0_flat{{1, 0}}, v1_1_flat{{1, 1}}; + OutputValues v1_1_0_flat{{1, 1, 0}}, v1_1_1_flat{{1, 1, 1}}; + + static Result Make() { + // Generate test input based on a single schema. First by creating a StructArray, + // then deriving the other input types (ChunkedArray, RecordBatch, Table, etc) from + // it. We also compute the expected outputs for each child individually (for each + // output type). + FieldPathTestCase out; + random::RandomArrayGenerator gen(kRandomSeed); + + // Define child fields and input schema + out.v1_1_1.field = field("b", boolean()); + out.v1_1_0.field = field("f", float32()); + out.v1_1.field = field("s1", struct_({out.v1_1_0.field, out.v1_1_1.field})); + out.v1_0.field = field("i", int32()); + out.v1.field = field("s0", struct_({out.v1_0.field, out.v1_1.field})); + out.v0.field = field("u", utf8()); + out.schema = arrow::schema({out.v0.field, out.v1.field}); + out.type = struct_(out.schema->fields()); + + // Create null bitmaps for the struct fields independent of its childrens' + // bitmaps. For FieldPath::GetFlattened, parent/child bitmaps should be combined + // - for FieldPath::Get, higher-level nulls are ignored. + auto bitmap1_1 = gen.NullBitmap(kNumRows, 0.15); + auto bitmap1 = gen.NullBitmap(kNumRows, 0.30); + + // Generate raw leaf arrays + out.v1_1_1.array = gen.ArrayOf(out.v1_1_1.field->type(), kNumRows); + out.v1_1_0.array = gen.ArrayOf(out.v1_1_0.field->type(), kNumRows); + out.v1_0.array = gen.ArrayOf(out.v1_0.field->type(), kNumRows); + out.v0.array = gen.ArrayOf(out.v0.field->type(), kNumRows); + // Make struct fields from leaf arrays (we use the custom bitmaps here) + ARROW_ASSIGN_OR_RAISE( + out.v1_1.array, + StructArray::Make({out.v1_1_0.array, out.v1_1_1.array}, + {out.v1_1_0.field, out.v1_1_1.field}, bitmap1_1)); + ARROW_ASSIGN_OR_RAISE(out.v1.array, + StructArray::Make({out.v1_0.array, out.v1_1.array}, + {out.v1_0.field, out.v1_1.field}, bitmap1)); + + // Not used to create the test input, but pre-compute flattened versions of nested + // arrays for comparisons in the GetFlattened tests. + ARROW_ASSIGN_OR_RAISE( + out.v1_0_flat.array, + checked_pointer_cast(out.v1.array)->GetFlattenedField(0)); + ARROW_ASSIGN_OR_RAISE( + out.v1_1_flat.array, + checked_pointer_cast(out.v1.array)->GetFlattenedField(1)); + ARROW_ASSIGN_OR_RAISE( + out.v1_1_0_flat.array, + checked_pointer_cast(out.v1_1_flat.array)->GetFlattenedField(0)); + ARROW_ASSIGN_OR_RAISE( + out.v1_1_1_flat.array, + checked_pointer_cast(out.v1_1_flat.array)->GetFlattenedField(1)); + // Sanity check + ARROW_CHECK(!out.v1_0_flat.array->Equals(out.v1_0.array)); + ARROW_CHECK(!out.v1_1_flat.array->Equals(out.v1_1.array)); + ARROW_CHECK(!out.v1_1_0_flat.array->Equals(out.v1_1_0.array)); + ARROW_CHECK(!out.v1_1_1_flat.array->Equals(out.v1_1_1.array)); + + // Finalize the input Array + ARROW_ASSIGN_OR_RAISE(out.array, StructArray::Make({out.v0.array, out.v1.array}, + {out.v0.field, out.v1.field})); + ARROW_RETURN_NOT_OK(out.array->ValidateFull()); + // Finalize the input RecordBatch + ARROW_ASSIGN_OR_RAISE(out.record_batch, RecordBatch::FromStructArray(out.array)); + ARROW_RETURN_NOT_OK(out.record_batch->ValidateFull()); + // Finalize the input ChunkedArray + out.chunked_array = SliceToChunkedArray(*out.array, out.num_chunks); + ARROW_RETURN_NOT_OK(out.chunked_array->ValidateFull()); + + // For each expected child array, create a chunked equivalent (we use a different + // chunk layout for each top-level column to make the Table test more interesting) + for (OutputValues* v : + {&out.v0, &out.v1, &out.v1_0, &out.v1_1, &out.v1_1_0, &out.v1_1_1, + &out.v1_0_flat, &out.v1_1_flat, &out.v1_1_0_flat, &out.v1_1_1_flat}) { + v->chunked_array = + SliceToChunkedArray(*v->array, out.num_column_chunks[v->path[0]]); + } + // Finalize the input Table + out.table = + Table::Make(out.schema, {out.v0.chunked_array, out.v1.chunked_array}, kNumRows); + ARROW_RETURN_NOT_OK(out.table->ValidateFull()); + + return std::move(out); } - EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, - testing::HasSubstr("empty indices cannot be traversed"), - FieldPath().Get(s)); - EXPECT_RAISES_WITH_MESSAGE_THAT(IndexError, testing::HasSubstr("index out of range"), - FieldPath({s.num_fields() * 2}).Get(s)); -} - -TEST(TestFieldPath, GetForTable) { - using testing::HasSubstr; - - constexpr int kNumRows = 4; - auto f0 = field("a", int32()); - auto f1 = field("b", int32()); - auto f2 = field("c", struct_({f1})); - auto f3 = field("d", struct_({f0, f2})); - auto table_schema = schema({f0, f1, f2, f3}); - - // Each column has a different chunking - ChunkedArrayVector columns(4); - columns[0] = ChunkedArrayFromJSON(f0->type(), {"[0,1,2,3]"}); - columns[1] = ChunkedArrayFromJSON(f1->type(), {"[3,2,1]", "[0]"}); - columns[2] = - ChunkedArrayFromJSON(f2->type(), {R"([{"b":3},{"b":2}])", R"([{"b":1},{"b":0}])"}); - columns[3] = ChunkedArrayFromJSON( - f3->type(), {R"([{"a":0,"c":{"b":3}},{"a":1,"c":{"b":2}}])", - R"([{"a":2,"c":{"b":1}}])", R"([{"a":3,"c":{"b":0}}])"}); - auto table = Table::Make(table_schema, columns, kNumRows); - ASSERT_OK(table->ValidateFull()); - ASSERT_OK_AND_ASSIGN(auto v0, FieldPath({0}).Get(*table)); - ASSERT_OK_AND_ASSIGN(auto v1, FieldPath({1}).Get(*table)); - ASSERT_OK_AND_ASSIGN(auto v2, FieldPath({2}).Get(*table)); - ASSERT_OK_AND_ASSIGN(auto v2_0, FieldPath({2, 0}).Get(*table)); - ASSERT_OK_AND_ASSIGN(auto v3, FieldPath({3}).Get(*table)); - ASSERT_OK_AND_ASSIGN(auto v3_0, FieldPath({3, 0}).Get(*table)); - ASSERT_OK_AND_ASSIGN(auto v3_1, FieldPath({3, 1}).Get(*table)); - ASSERT_OK_AND_ASSIGN(auto v3_1_0, FieldPath({3, 1, 0}).Get(*table)); - - EXPECT_EQ(v0->num_chunks(), columns[0]->num_chunks()); - EXPECT_EQ(v1->num_chunks(), columns[1]->num_chunks()); - EXPECT_EQ(v2->num_chunks(), columns[2]->num_chunks()); - EXPECT_EQ(v2_0->num_chunks(), columns[2]->num_chunks()); - EXPECT_EQ(v3->num_chunks(), columns[3]->num_chunks()); - EXPECT_EQ(v3_0->num_chunks(), columns[3]->num_chunks()); - EXPECT_EQ(v3_1->num_chunks(), columns[3]->num_chunks()); - EXPECT_EQ(v3_1_0->num_chunks(), columns[3]->num_chunks()); - - EXPECT_TRUE(columns[0]->Equals(v0)); - EXPECT_TRUE(columns[0]->Equals(v3_0)); - - EXPECT_TRUE(columns[1]->Equals(v1)); - EXPECT_TRUE(columns[1]->Equals(v2_0)); - EXPECT_TRUE(columns[1]->Equals(v3_1_0)); - - EXPECT_TRUE(columns[2]->Equals(v2)); - EXPECT_TRUE(columns[2]->Equals(v3_1)); - - EXPECT_TRUE(columns[3]->Equals(v3)); - - for (const auto& path : - {FieldPath({4, 1, 0}), FieldPath({3, 2, 0}), FieldPath{3, 1, 1}}) { - EXPECT_RAISES_WITH_MESSAGE_THAT(IndexError, HasSubstr("index out of range"), - path.Get(*table)); + static std::shared_ptr SliceToChunkedArray(const Array& array, + int num_chunks) { + ARROW_CHECK(num_chunks > 0 && array.length() >= num_chunks); + ArrayVector chunks; + chunks.reserve(num_chunks); + for (int64_t inc = array.length() / num_chunks, beg = 0, + end = inc + array.length() % num_chunks; + end <= array.length(); beg = end, end += inc) { + chunks.push_back(array.SliceSafe(beg, end - beg).ValueOrDie()); + } + ARROW_CHECK_EQ(static_cast(chunks.size()), num_chunks); + return ChunkedArray::Make(std::move(chunks)).ValueOrDie(); + } +}; + +template <> +auto&& FieldPathTestCase::GetInput() const { + return this->schema; +} +template <> +auto&& FieldPathTestCase::GetInput() const { + return this->type; +} +template <> +auto&& FieldPathTestCase::GetInput() const { + return this->array; +} +template <> +auto&& FieldPathTestCase::GetInput() const { + return this->array->data(); +} +template <> +auto&& FieldPathTestCase::GetInput() const { + return this->chunked_array; +} +template <> +auto&& FieldPathTestCase::GetInput() const { + return this->record_batch; +} +template <> +auto&& FieldPathTestCase::GetInput
() const { + return this->table; +} + +template <> +auto&& FieldPathTestCase::OutputValues::Get() const { + return this->field; +} +template <> +auto&& FieldPathTestCase::OutputValues::Get() const { + return this->array; +} +template <> +auto&& FieldPathTestCase::OutputValues::Get() const { + return this->array->data(); +} +template <> +auto&& FieldPathTestCase::OutputValues::Get() const { + return this->chunked_array; +} + +class FieldPathTestFixture : public ::testing::Test { + protected: + template + using OutputType = typename decltype(std::declval() + .Get(std::declval()) + .ValueOrDie())::element_type; + + template + void AssertOutputsEqual(const std::shared_ptr& expected, + const std::shared_ptr& actual) const { + AssertFieldEqual(*expected, *actual); } - EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, HasSubstr("empty indices cannot be traversed"), - FieldPath().Get(*table)); -} - -TEST(TestFieldPath, GetForChunkedArray) { - using testing::HasSubstr; - - auto f0 = field("a", int32()); - auto f1 = field("b", int32()); - auto f2 = field("c", struct_({f1})); - auto f3 = field("d", struct_({f0, f2})); - auto type = struct_({f0, f1, f3}); - - auto column0 = ChunkedArrayFromJSON(f0->type(), {"[0,1,2,3]"}); - auto column1 = ChunkedArrayFromJSON(f1->type(), {"[3,2,1,0]"}); - auto column2_1 = - ChunkedArrayFromJSON(f2->type(), {R"([{"b":3},{"b":2},{"b":1},{"b":0}])"}); - auto chunked_array = ChunkedArrayFromJSON( - type, - { - R"([{"a":0,"b":3,"d":{"a":0,"c":{"b":3}}}])", - R"([{"a":1,"b":2,"d":{"a":1,"c":{"b":2}}},{"a":2,"b":1,"d":{"a":2,"c":{"b":1}}}])", - R"([{"a":3,"b":0,"d":{"a":3,"c":{"b":0}}}])", - }); - ASSERT_OK(chunked_array->ValidateFull()); - - ASSERT_OK_AND_ASSIGN(auto v0, FieldPath({0}).Get(*chunked_array)); - ASSERT_OK_AND_ASSIGN(auto v1, FieldPath({1}).Get(*chunked_array)); - ASSERT_OK_AND_ASSIGN(auto v2_0, FieldPath({2, 0}).Get(*chunked_array)); - ASSERT_OK_AND_ASSIGN(auto v2_1, FieldPath({2, 1}).Get(*chunked_array)); - ASSERT_OK_AND_ASSIGN(auto v2_1_0, FieldPath({2, 1, 0}).Get(*chunked_array)); - - for (const auto& v : {v0, v1, v2_0, v2_1, v2_1_0}) { - EXPECT_EQ(v->num_chunks(), chunked_array->num_chunks()); + template + void AssertOutputsEqual(const std::shared_ptr& expected, + const std::shared_ptr& actual) const { + AssertArraysEqual(*expected, *actual); + } + template + void AssertOutputsEqual(const std::shared_ptr& expected, + const std::shared_ptr& actual) const { + if constexpr (std::is_same_v) { + EXPECT_EQ(case_->chunked_array->num_chunks(), actual->num_chunks()); + } else { + EXPECT_EQ(expected->num_chunks(), actual->num_chunks()); + } + AssertChunkedEquivalent(*expected, *actual); } - EXPECT_TRUE(column0->Equals(v0)); - EXPECT_TRUE(column0->Equals(v2_0)); + static const FieldPathTestCase* GenerateTestCase() { + static const auto maybe_test_case = FieldPathTestCase::Make(); + return &maybe_test_case.ValueOrDie(); + } - EXPECT_TRUE(column1->Equals(v1)); - EXPECT_TRUE(column1->Equals(v2_1_0)); - EXPECT_FALSE(column1->Equals(v2_1)); + static inline const FieldPathTestCase* case_ = GenerateTestCase(); +}; - EXPECT_TRUE(column2_1->Equals(v2_1)); +class TestFieldPath : public FieldPathTestFixture { + protected: + template > + using GetFn = std::function>(const FieldPath&, const I&)>; - EXPECT_RAISES_WITH_MESSAGE_THAT(NotImplemented, - HasSubstr("Get child data of non-struct chunked array"), - FieldPath({0}).Get(*column0)); -} + template + static auto DoGet(const FieldPath& path, const I& input) { + return path.Get(input); + } + template + static auto DoGetFlattened(const FieldPath& path, const I& input) { + return path.GetFlattened(input); + } -TEST(TestFieldPath, GetForChunkedArrayWithNulls) { - auto int_field = field("i", int32()); - auto int_chunked_array = - ChunkedArrayFromJSON(int_field->type(), {"[0,1]", "[2,null]", "[3,4]"}); + template + void TestGetWithInvalidIndex(GetFn get_fn) const { + const auto& input = case_->GetInput(); + for (const auto& path : + {FieldPath({2, 1, 0}), FieldPath({1, 2, 0}), FieldPath{1, 1, 2}}) { + EXPECT_RAISES_WITH_MESSAGE_THAT( + IndexError, ::testing::HasSubstr("index out of range"), get_fn(path, *input)); + } + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("empty indices cannot be traversed"), + get_fn(FieldPath(), *input)); + } - ASSERT_OK_AND_ASSIGN(auto null_bitmap, AllocateEmptyBitmap(2)); - ArrayVector struct_chunks; - for (const auto& int_chunk : int_chunked_array->chunks()) { - ASSERT_OK_AND_ASSIGN(auto chunk, - StructArray::Make({int_chunk}, {int_field}, null_bitmap, 2)); - struct_chunks.push_back(chunk); + template + void TestGetWithNonStructArray(GetFn get_fn) const { + EXPECT_RAISES_WITH_MESSAGE_THAT( + NotImplemented, ::testing::HasSubstr("Get child data of non-struct array"), + get_fn(FieldPath({1, 1, 0}), *case_->v1_1_0.Get())); } - ASSERT_OK_AND_ASSIGN(auto struct_chunked_array, ChunkedArray::Make(struct_chunks)); - ASSERT_OK(struct_chunked_array->ValidateFull()); - // The top-level null bitmap shouldn't affect the validity of the returned child field. - ASSERT_OK_AND_ASSIGN(auto int_child, FieldPath({0}).Get(*struct_chunked_array)); - ASSERT_TRUE(int_chunked_array->Equals(int_child)); + template > + void TestGet() const { + const auto& input = case_->GetInput(); + ASSERT_OK_AND_ASSIGN(auto v0, FieldPath({0}).Get(*input)); + ASSERT_OK_AND_ASSIGN(auto v1, FieldPath({1}).Get(*input)); + ASSERT_OK_AND_ASSIGN(auto v1_0, FieldPath({1, 0}).Get(*input)); + ASSERT_OK_AND_ASSIGN(auto v1_1, FieldPath({1, 1}).Get(*input)); + ASSERT_OK_AND_ASSIGN(auto v1_1_0, FieldPath({1, 1, 0}).Get(*input)); + ASSERT_OK_AND_ASSIGN(auto v1_1_1, FieldPath({1, 1, 1}).Get(*input)); + + AssertOutputsEqual(case_->v0.Get(), v0); + AssertOutputsEqual(case_->v1.Get(), v1); + AssertOutputsEqual(case_->v1_0.Get(), v1_0); + AssertOutputsEqual(case_->v1_1.Get(), v1_1); + AssertOutputsEqual(case_->v1_1_0.Get(), v1_1_0); + AssertOutputsEqual(case_->v1_1_1.Get(), v1_1_1); + } + + template > + void TestGetFlattened() const { + const auto& input = case_->GetInput(); + ASSERT_OK_AND_ASSIGN(auto v0, FieldPath({0}).GetFlattened(*input)); + ASSERT_OK_AND_ASSIGN(auto v1, FieldPath({1}).GetFlattened(*input)); + ASSERT_OK_AND_ASSIGN(auto v1_0, FieldPath({1, 0}).GetFlattened(*input)); + ASSERT_OK_AND_ASSIGN(auto v1_1, FieldPath({1, 1}).GetFlattened(*input)); + ASSERT_OK_AND_ASSIGN(auto v1_1_0, FieldPath({1, 1, 0}).GetFlattened(*input)); + ASSERT_OK_AND_ASSIGN(auto v1_1_1, FieldPath({1, 1, 1}).GetFlattened(*input)); + + AssertOutputsEqual(case_->v0.Get(), v0); + AssertOutputsEqual(case_->v1.Get(), v1); + AssertOutputsEqual(case_->v1_0_flat.Get(), v1_0); + AssertOutputsEqual(case_->v1_1_flat.Get(), v1_1); + AssertOutputsEqual(case_->v1_1_0_flat.Get(), v1_1_0); + AssertOutputsEqual(case_->v1_1_1_flat.Get(), v1_1_1); + } +}; + +TEST_F(TestFieldPath, GetWithInvalidIndex) { + TestGetWithInvalidIndex(DoGet); + TestGetWithInvalidIndex(DoGet); + TestGetWithInvalidIndex(DoGet); + TestGetWithInvalidIndex(DoGet); + TestGetWithInvalidIndex(DoGet); + TestGetWithInvalidIndex(DoGet); + TestGetWithInvalidIndex
(DoGet
); + + TestGetWithInvalidIndex(DoGetFlattened); + TestGetWithInvalidIndex(DoGetFlattened); + TestGetWithInvalidIndex(DoGetFlattened); + TestGetWithInvalidIndex(DoGetFlattened); + TestGetWithInvalidIndex
(DoGetFlattened
); } -TEST(TestFieldPath, GetForEmptyChunked) { +TEST_F(TestFieldPath, GetWithNonStructArray) { + TestGetWithNonStructArray(DoGet); + TestGetWithNonStructArray(DoGet); + TestGetWithNonStructArray(DoGet); + + TestGetWithNonStructArray(DoGetFlattened); + TestGetWithNonStructArray(DoGetFlattened); + TestGetWithNonStructArray(DoGetFlattened); +} + +TEST_F(TestFieldPath, GetFromSchema) { TestGet(); } +TEST_F(TestFieldPath, GetFromDataType) { TestGet(); } + +TEST_F(TestFieldPath, GetFromArray) { TestGet(); } +TEST_F(TestFieldPath, GetFromChunkedArray) { TestGet(); } +TEST_F(TestFieldPath, GetFromRecordBatch) { TestGet(); } +TEST_F(TestFieldPath, GetFromTable) { TestGet
(); } + +TEST_F(TestFieldPath, GetFlattenedFromArray) { TestGetFlattened(); } +TEST_F(TestFieldPath, GetFlattenedFromChunkedArray) { TestGetFlattened(); } +TEST_F(TestFieldPath, GetFlattenedFromRecordBatch) { TestGetFlattened(); } +TEST_F(TestFieldPath, GetFlattenedFromTable) { TestGetFlattened
(); } + +TEST_F(TestFieldPath, Basics) { + auto f0 = field("alpha", int32()); + auto f1 = field("beta", int32()); + auto f2 = field("alpha", int32()); + auto f3 = field("beta", int32()); + Schema s({f0, f1, f2, f3}); + + // retrieving a field with single-element FieldPath is equivalent to Schema::field + for (int index = 0; index < s.num_fields(); ++index) { + ASSERT_OK_AND_EQ(s.field(index), FieldPath({index}).Get(s)); + } +} + +TEST_F(TestFieldPath, GetFromEmptyChunked) { FieldVector fields = { field("i", int32()), field("s", struct_({field("b", boolean()), field("f", float32())}))}; @@ -538,41 +741,6 @@ TEST(TestFieldPath, GetForEmptyChunked) { ASSERT_EQ(child->length(), 0); } -TEST(TestFieldPath, GetForRecordBatch) { - using testing::HasSubstr; - - constexpr int kNumRows = 100; - auto f0 = field("alpha", int32()); - auto f1 = field("beta", int32()); - auto f2 = field("alpha", int32()); - auto f3 = field("beta", int32()); - auto schema = arrow::schema({f0, f1, f2, f3}); - - arrow::random::RandomArrayGenerator gen_{42}; - auto a0 = gen_.ArrayOf(int32(), kNumRows); - auto a1 = gen_.ArrayOf(int32(), kNumRows); - auto a2 = gen_.ArrayOf(int32(), kNumRows); - auto a3 = gen_.ArrayOf(int32(), kNumRows); - auto array_vector = ArrayVector({a0, a1, a2, a3}); - - auto record_batch_ptr = arrow::RecordBatch::Make(schema, kNumRows, array_vector); - ASSERT_OK(record_batch_ptr->ValidateFull()); - - // retrieving an array FieldPath is equivalent to RecordBatch::column - auto num_columns = record_batch_ptr->num_columns(); - auto record_batch_schema = record_batch_ptr->schema(); - for (int index = 0; index < num_columns; ++index) { - ASSERT_OK_AND_EQ(record_batch_schema->field(index), FieldPath({index}).Get(*schema)); - ASSERT_OK_AND_ASSIGN(auto field_path_column, - FieldPath({index}).Get(*record_batch_ptr)); - EXPECT_TRUE(field_path_column->Equals(record_batch_ptr->column(index))); - } - EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, HasSubstr("empty indices cannot be traversed"), - FieldPath().Get(*record_batch_ptr)); - EXPECT_RAISES_WITH_MESSAGE_THAT(IndexError, HasSubstr("index out of range"), - FieldPath({num_columns * 2}).Get(*record_batch_ptr)); -} - TEST(TestFieldRef, Basics) { auto f0 = field("alpha", int32()); auto f1 = field("beta", int32()); @@ -693,7 +861,7 @@ TEST(TestFieldRef, DotPathRoundTrip) { check_roundtrip(FieldRef("foo", 1, FieldRef("bar", 2, 3), FieldRef())); } -TEST(TestFieldPath, Nested) { +TEST_F(TestFieldPath, Nested) { auto f0 = field("alpha", int32()); auto f1_0 = field("alpha", int32()); auto f1 = field("beta", struct_({f1_0})); From a1ba0df6408c64a0956604e5e4d3fc75848ff799 Mon Sep 17 00:00:00 2001 From: benibus Date: Tue, 18 Apr 2023 12:21:44 -0400 Subject: [PATCH 04/21] Fix test segfault on java-jni-manylinux-2014 --- cpp/src/arrow/type_test.cc | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index dab2b4fe17a..2bde78fd9e4 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -407,6 +407,11 @@ struct FieldPathTestCase { OutputValues v1_0_flat{{1, 0}}, v1_1_flat{{1, 1}}; OutputValues v1_1_0_flat{{1, 1, 0}}, v1_1_1_flat{{1, 1, 1}}; + static const FieldPathTestCase* Instance() { + static const auto maybe_instance = Make(); + return &maybe_instance.ValueOrDie(); + } + static Result Make() { // Generate test input based on a single schema. First by creating a StructArray, // then deriving the other input types (ChunkedArray, RecordBatch, Table, etc) from @@ -554,6 +559,9 @@ auto&& FieldPathTestCase::OutputValues::Get() const { } class FieldPathTestFixture : public ::testing::Test { + public: + FieldPathTestFixture() : case_(FieldPathTestCase::Instance()) {} + protected: template using OutputType = typename decltype(std::declval() @@ -581,12 +589,7 @@ class FieldPathTestFixture : public ::testing::Test { AssertChunkedEquivalent(*expected, *actual); } - static const FieldPathTestCase* GenerateTestCase() { - static const auto maybe_test_case = FieldPathTestCase::Make(); - return &maybe_test_case.ValueOrDie(); - } - - static inline const FieldPathTestCase* case_ = GenerateTestCase(); + const FieldPathTestCase* case_; }; class TestFieldPath : public FieldPathTestFixture { From db2c0d3aa9b749c970c24486b93b538c25d659bb Mon Sep 17 00:00:00 2001 From: benibus Date: Tue, 18 Apr 2023 15:38:37 -0400 Subject: [PATCH 05/21] Implement flattened methods for FieldRef --- cpp/src/arrow/type.h | 70 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 58 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 46537010c89..16f0e56d5a2 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1715,6 +1715,25 @@ class ARROW_EXPORT FieldPath { std::vector indices_; }; +namespace internal { + +template +using FieldPathGetType = + decltype(std::declval().Get(std::declval()).ValueOrDie()); + +template +std::enable_if_t>> GetChild( + const T& root, const FieldPath& path) { + return path.Get(root); +} +template +std::enable_if_t>> GetChild(const T& root, + const FieldPath& path) { + return path.GetFlattened(root); +} + +} // namespace internal + /// \class FieldRef /// \brief Descriptor of a (potentially nested) field within a schema. /// @@ -1887,40 +1906,67 @@ class ARROW_EXPORT FieldRef : public util::EqualityComparable { } template - using GetType = decltype(std::declval().Get(std::declval()).ValueOrDie()); + using GetType = internal::FieldPathGetType; /// \brief Get all children matching this FieldRef. template std::vector> GetAll(const T& root) const { - std::vector> out; - for (const auto& match : FindAll(root)) { - out.push_back(match.Get(root).ValueOrDie()); - } - return out; + return GetAll(root); + } + template + std::vector> GetAllFlattened(const T& root) const { + return GetAll(root); } /// \brief Get the single child matching this FieldRef. /// Emit an error if none or multiple match. template Result> GetOne(const T& root) const { - ARROW_ASSIGN_OR_RAISE(auto match, FindOne(root)); - return match.Get(root).ValueOrDie(); + return GetOne(root); + } + template + Result> GetOneFlattened(const T& root) const { + return GetOne(root); } /// \brief Get the single child matching this FieldRef. /// Return nullptr if none match, emit an error if multiple match. template + Result> GetOneOrNone(const T& root) const { + return GetOneOrNone(root); + } + template + Result> GetOneOrNoneFlattened(const T& root) const { + return GetOneOrNone(root); + } + + private: + void Flatten(std::vector children); + + template + Result> GetOne(const T& root) const { + ARROW_ASSIGN_OR_RAISE(auto match, FindOne(root)); + return internal::GetChild(root, match).ValueOrDie(); + } + + template + std::vector> GetAll(const T& root) const { + std::vector> out; + for (const auto& match : FindAll(root)) { + out.push_back(internal::GetChild(root, match).ValueOrDie()); + } + return out; + } + + template Result> GetOneOrNone(const T& root) const { ARROW_ASSIGN_OR_RAISE(auto match, FindOneOrNone(root)); if (match.empty()) { return static_cast>(NULLPTR); } - return match.Get(root).ValueOrDie(); + return internal::GetChild(root, match).ValueOrDie(); } - private: - void Flatten(std::vector children); - std::variant> impl_; }; From 971ce4e3f1fe9461ff5042ef1e9af1eb45cf9228 Mon Sep 17 00:00:00 2001 From: benibus Date: Tue, 18 Apr 2023 15:41:07 -0400 Subject: [PATCH 06/21] Use internal helpers in FieldPath tests --- cpp/src/arrow/type_test.cc | 73 ++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 43 deletions(-) diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index 2bde78fd9e4..f3d5d9db732 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -564,9 +564,7 @@ class FieldPathTestFixture : public ::testing::Test { protected: template - using OutputType = typename decltype(std::declval() - .Get(std::declval()) - .ValueOrDie())::element_type; + using OutputType = typename internal::FieldPathGetType::element_type; template void AssertOutputsEqual(const std::shared_ptr& expected, @@ -594,36 +592,25 @@ class FieldPathTestFixture : public ::testing::Test { class TestFieldPath : public FieldPathTestFixture { protected: - template > - using GetFn = std::function>(const FieldPath&, const I&)>; - - template - static auto DoGet(const FieldPath& path, const I& input) { - return path.Get(input); - } - template - static auto DoGetFlattened(const FieldPath& path, const I& input) { - return path.GetFlattened(input); - } - - template - void TestGetWithInvalidIndex(GetFn get_fn) const { + template + void TestGetWithInvalidIndex() const { const auto& input = case_->GetInput(); for (const auto& path : {FieldPath({2, 1, 0}), FieldPath({1, 2, 0}), FieldPath{1, 1, 2}}) { - EXPECT_RAISES_WITH_MESSAGE_THAT( - IndexError, ::testing::HasSubstr("index out of range"), get_fn(path, *input)); + EXPECT_RAISES_WITH_MESSAGE_THAT(IndexError, + ::testing::HasSubstr("index out of range"), + internal::GetChild(*input, path)); } EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, ::testing::HasSubstr("empty indices cannot be traversed"), - get_fn(FieldPath(), *input)); + internal::GetChild(*input, FieldPath())); } - template - void TestGetWithNonStructArray(GetFn get_fn) const { + template + void TestGetWithNonStructArray() const { EXPECT_RAISES_WITH_MESSAGE_THAT( NotImplemented, ::testing::HasSubstr("Get child data of non-struct array"), - get_fn(FieldPath({1, 1, 0}), *case_->v1_1_0.Get())); + internal::GetChild(*case_->v1_1_0.Get(), FieldPath({1, 1, 0}))); } template > @@ -664,29 +651,29 @@ class TestFieldPath : public FieldPathTestFixture { }; TEST_F(TestFieldPath, GetWithInvalidIndex) { - TestGetWithInvalidIndex(DoGet); - TestGetWithInvalidIndex(DoGet); - TestGetWithInvalidIndex(DoGet); - TestGetWithInvalidIndex(DoGet); - TestGetWithInvalidIndex(DoGet); - TestGetWithInvalidIndex(DoGet); - TestGetWithInvalidIndex
(DoGet
); - - TestGetWithInvalidIndex(DoGetFlattened); - TestGetWithInvalidIndex(DoGetFlattened); - TestGetWithInvalidIndex(DoGetFlattened); - TestGetWithInvalidIndex(DoGetFlattened); - TestGetWithInvalidIndex
(DoGetFlattened
); + TestGetWithInvalidIndex(); + TestGetWithInvalidIndex(); + TestGetWithInvalidIndex(); + TestGetWithInvalidIndex(); + TestGetWithInvalidIndex(); + TestGetWithInvalidIndex(); + TestGetWithInvalidIndex
(); + // With flattening + TestGetWithInvalidIndex(); + TestGetWithInvalidIndex(); + TestGetWithInvalidIndex(); + TestGetWithInvalidIndex(); + TestGetWithInvalidIndex(); } TEST_F(TestFieldPath, GetWithNonStructArray) { - TestGetWithNonStructArray(DoGet); - TestGetWithNonStructArray(DoGet); - TestGetWithNonStructArray(DoGet); - - TestGetWithNonStructArray(DoGetFlattened); - TestGetWithNonStructArray(DoGetFlattened); - TestGetWithNonStructArray(DoGetFlattened); + TestGetWithNonStructArray(); + TestGetWithNonStructArray(); + TestGetWithNonStructArray(); + // With flattening + TestGetWithNonStructArray(); + TestGetWithNonStructArray(); + TestGetWithNonStructArray(); } TEST_F(TestFieldPath, GetFromSchema) { TestGet(); } From 8b6aea4b248062f78b7a0c9c4487e80207d30dea Mon Sep 17 00:00:00 2001 From: benibus Date: Tue, 18 Apr 2023 16:04:49 -0400 Subject: [PATCH 07/21] Remove redundant FieldPath test --- cpp/src/arrow/type_test.cc | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index f3d5d9db732..e01ad9ca065 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -851,25 +851,6 @@ TEST(TestFieldRef, DotPathRoundTrip) { check_roundtrip(FieldRef("foo", 1, FieldRef("bar", 2, 3), FieldRef())); } -TEST_F(TestFieldPath, Nested) { - auto f0 = field("alpha", int32()); - auto f1_0 = field("alpha", int32()); - auto f1 = field("beta", struct_({f1_0})); - auto f2_0 = field("alpha", int32()); - auto f2_1_0 = field("alpha", int32()); - auto f2_1_1 = field("alpha", int32()); - auto f2_1 = field("gamma", struct_({f2_1_0, f2_1_1})); - auto f2 = field("beta", struct_({f2_0, f2_1})); - Schema s({f0, f1, f2}); - - // retrieving fields with nested indices - EXPECT_EQ(FieldPath({0}).Get(s), f0); - EXPECT_EQ(FieldPath({1, 0}).Get(s), f1_0); - EXPECT_EQ(FieldPath({2, 0}).Get(s), f2_0); - EXPECT_EQ(FieldPath({2, 1, 0}).Get(s), f2_1_0); - EXPECT_EQ(FieldPath({2, 1, 1}).Get(s), f2_1_1); -} - TEST(TestFieldRef, Nested) { auto f0 = field("alpha", int32()); auto f1_0 = field("alpha", int32()); From 78ca0bad891785f8ef95699ae71b8771366aab32 Mon Sep 17 00:00:00 2001 From: benibus Date: Mon, 1 May 2023 14:25:13 -0400 Subject: [PATCH 08/21] Implement NestedSelector for FieldPathGetImpl --- cpp/src/arrow/type.cc | 394 +++++++++++++++++------------------------- 1 file changed, 155 insertions(+), 239 deletions(-) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 0a594dd4e3b..9948837d6df 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -1047,7 +1047,7 @@ std::string DictionaryType::ToString() const { std::string NullType::ToString() const { return name(); } // ---------------------------------------------------------------------- -// FieldRef +// FieldPath size_t FieldPath::hash() const { return internal::ComputeStringHash<0>(indices().data(), indices().size() * sizeof(int)); @@ -1066,125 +1066,146 @@ std::string FieldPath::ToString() const { return repr; } -class ChunkedColumn; -using ChunkedColumnVector = std::vector>; +static Status NonStructError() { + return Status::NotImplemented("Get child data of non-struct array"); +} -class ChunkedColumn { +// Utility class for retrieving a child field/column from a top-level Field, Array, or +// ChunkedArray. The "root" value can either be a single parent or a vector of its +// children. +template +class NestedSelector { public: - virtual ~ChunkedColumn() = default; - - explicit ChunkedColumn(const std::shared_ptr& type = nullptr) : type_(type) {} - - virtual int num_chunks() const = 0; - virtual const std::shared_ptr& chunk(int i) const = 0; - - const std::shared_ptr& type() const { return type_; } + using ArrowType = T; + + explicit NestedSelector(const std::vector>& children) + : parent_or_children_(&children) {} + explicit NestedSelector(const T& parent) : parent_or_children_(&parent) {} + explicit NestedSelector(std::shared_ptr parent) + : owned_parent_(std::move(parent)), parent_or_children_(owned_parent_.get()) {} + + // If the index is out of bounds, this returns an invalid selector rather than an + // error. + Result GetChild(int i) const { + std::shared_ptr child; + if (auto parent = get_parent()) { + ARROW_ASSIGN_OR_RAISE(child, GetChild(*parent, i)); + } else if (auto children = get_children()) { + if (ARROW_PREDICT_TRUE(i >= 0 && static_cast(i) < children->size())) { + child = (*children)[i]; + } + } + return NestedSelector(std::move(child)); + } - ChunkedColumnVector FlattenZeroCopy() const; + Result> Finish() const { + DCHECK(get_parent() && owned_parent_); + return owned_parent_; + } - Result> ToChunkedArray() const { - if (num_chunks() == 0) { - return ChunkedArray::MakeEmpty(type()); + template + std::enable_if_t> Serialize(OStream* os) const { + const FieldVector* fields = get_children(); + if (!fields && get_parent()) { + fields = &get_parent()->type()->fields(); } - ArrayVector chunks(num_chunks()); - for (int i = 0; i < num_chunks(); ++i) { - chunks[i] = MakeArray(chunk(i)); + *os << "fields: { "; + if (fields) { + for (const auto& field : *fields) { + *os << field->ToString() << ", "; + } } - return ChunkedArray::Make(std::move(chunks), type()); + *os << "}"; } - private: - const std::shared_ptr& type_; -}; - -// References a chunk vector owned by another ChunkedArray. -// This can be used to avoid transforming a top-level ChunkedArray's ArrayVector into an -// ArrayDataVector if flattening isn't needed. -class ChunkedArrayRef : public ChunkedColumn { - public: - explicit ChunkedArrayRef(const ChunkedArray& chunked_array) - : ChunkedColumn(chunked_array.type()), chunks_(chunked_array.chunks()) {} - - int num_chunks() const override { return static_cast(chunks_.size()); } - const std::shared_ptr& chunk(int i) const override { - return chunks_[i]->data(); + template + std::enable_if_t> Serialize(OStream* os) const { + *os << "column types: { "; + if (auto children = get_children()) { + for (const auto& child : *children) { + *os << *child->type() << ", "; + } + } else if (auto parent = get_parent()) { + for (const auto& field : parent->type()->fields()) { + *os << *field->type() << ", "; + } + } + *os << "}"; } - private: - const ArrayVector& chunks_; -}; - -// Owns a chunked ArrayDataVector (created after flattening its parent). -class ChunkedArrayData : public ChunkedColumn { - public: - explicit ChunkedArrayData(const std::shared_ptr& type, - ArrayDataVector chunks = {}) - : ChunkedColumn(type), chunks_(std::move(chunks)) {} - - int num_chunks() const override { return static_cast(chunks_.size()); } - const std::shared_ptr& chunk(int i) const override { return chunks_[i]; } + bool is_valid() const { return get_parent() || get_children(); } + operator bool() const { return is_valid(); } private: - ArrayDataVector chunks_; -}; + // Accessors for the variant + auto get_parent() const { return get_value(); } + auto get_children() const { + return get_value>*>(); + } + template + U get_value() const { + auto ptr = std::get_if(&parent_or_children_); + return ptr ? *ptr : nullptr; + } -// Return a vector of ChunkedColumns - one for each struct field. -// Unlike ChunkedArray::Flatten, this is zero-copy and doesn't merge parent/child -// validity bitmaps. -ChunkedColumnVector ChunkedColumn::FlattenZeroCopy() const { - DCHECK_EQ(type()->id(), Type::STRUCT); - - ChunkedColumnVector columns(type()->num_fields()); - for (int column_idx = 0; column_idx < type()->num_fields(); ++column_idx) { - const auto& child_type = type()->field(column_idx)->type(); - ArrayDataVector chunks(num_chunks()); - for (int chunk_idx = 0; chunk_idx < num_chunks(); ++chunk_idx) { - const auto& parent = chunk(chunk_idx); - const auto& children = parent->child_data; - DCHECK_EQ(columns.size(), children.size()); - auto child = children[column_idx]->Slice(parent->offset, parent->length); - DCHECK(child_type->Equals(child->type)); - chunks[chunk_idx] = child; + static Result> GetChild(const Field& field, int i) { + if (ARROW_PREDICT_FALSE(i < 0 || i >= field.type()->num_fields())) { + return nullptr; } - columns[column_idx] = - std::make_shared(child_type, std::move(chunks)); + return field.type()->field(i); } - return columns; -} + static Result> GetChild(const Array& array, int i) { + if (ARROW_PREDICT_FALSE(array.type_id() != Type::STRUCT)) { + return NonStructError(); + } + if (ARROW_PREDICT_FALSE(i < 0 || i >= array.num_fields())) { + return nullptr; + } -struct FieldPathGetImpl { - static const DataType& GetType(const Array& array) { return *array.type(); } - static const DataType& GetType(const ArrayData& data) { return *data.type; } - static const DataType& GetType(const ChunkedArray& chunked_array) { - return *chunked_array.type(); + const auto& struct_array = checked_cast(array); + if constexpr (IsFlattening) { + return struct_array.GetFlattenedField(i); + } else { + return struct_array.field(i); + } } - static const DataType& GetType(const ChunkedColumn& column) { return *column.type(); } - static void Summarize(const FieldVector& fields, std::stringstream* ss) { - *ss << "{ "; - for (const auto& field : fields) { - *ss << field->ToString() << ", "; + static Result> GetChild(const ChunkedArray& chunked_array, + int i) { + const auto& type = *chunked_array.type(); + if (ARROW_PREDICT_FALSE(type.id() != Type::STRUCT)) { + return NonStructError(); + } + if (ARROW_PREDICT_FALSE(i < 0 || i >= type.num_fields())) { + return nullptr; } - *ss << "}"; - } - template - static void Summarize(const std::vector& columns, std::stringstream* ss) { - *ss << "{ "; - for (const auto& column : columns) { - *ss << GetType(*column) << ", "; + ArrayVector chunks; + chunks.reserve(chunked_array.num_chunks()); + for (const auto& parent_chunk : chunked_array.chunks()) { + ARROW_ASSIGN_OR_RAISE(auto chunk, GetChild(*parent_chunk, i)); + if (!chunk) return nullptr; + chunks.push_back(std::move(chunk)); } - *ss << "}"; - } - static Status NonStructError() { - return Status::NotImplemented("Get child data of non-struct array"); + return ChunkedArray::Make(std::move(chunks), type.field(i)->type()); } - template + std::shared_ptr owned_parent_; + std::variant>*> parent_or_children_; +}; + +using FieldSelector = NestedSelector; +template +using ZeroCopySelector = NestedSelector; +template +using FlatteningSelector = NestedSelector; + +struct FieldPathGetImpl { + template static Status IndexError(const FieldPath* path, int out_of_range_depth, - const std::vector& children) { + const Selector& selector) { std::stringstream ss; ss << "index out of range. "; @@ -1200,137 +1221,53 @@ struct FieldPathGetImpl { } ss << "] "; - if (std::is_same>::value) { - ss << "fields were: "; - } else { - ss << "columns had types: "; - } - Summarize(children, &ss); + selector.Serialize(&ss); return Status::IndexError(ss.str()); } - template - static Result Get(const FieldPath* path, const std::vector* children, - GetChildren&& get_children, int* out_of_range_depth) { - if (path->indices().empty()) { + template + static Result> Get(const FieldPath* path, Selector selector, + int* out_of_range_depth = nullptr) { + if (path->empty()) { return Status::Invalid("empty indices cannot be traversed"); } int depth = 0; - const T* out; - while (true) { - if (children == nullptr) { - return NonStructError(); - } - - auto index = (*path)[depth]; - if (index < 0 || static_cast(index) >= children->size()) { - *out_of_range_depth = depth; - return nullptr; - } - - out = &children->at(index); - if (static_cast(++depth) == path->indices().size()) { - break; + for (auto index : *path) { + ARROW_ASSIGN_OR_RAISE(auto next_selector, selector.GetChild(index)); + + // Handle failed bounds check + if (!next_selector) { + if (out_of_range_depth) { + *out_of_range_depth = depth; + return nullptr; + } + return IndexError(path, depth, selector); } - children = get_children(*out); - } - - return *out; - } - template - static Result Get(const FieldPath* path, const std::vector* children, - GetChildren&& get_children) { - int out_of_range_depth = -1; - ARROW_ASSIGN_OR_RAISE(auto child, - Get(path, children, std::forward(get_children), - &out_of_range_depth)); - if (child != nullptr) { - return std::move(child); + selector = std::move(next_selector); + ++depth; } - return IndexError(path, out_of_range_depth, *children); - } - - static Result> Get(const FieldPath* path, - const FieldVector& fields) { - return FieldPathGetImpl::Get(path, &fields, [](const std::shared_ptr& field) { - return &field->type()->fields(); - }); - } - - static Result> Get( - const FieldPath* path, const ChunkedColumnVector& toplevel_children) { - ChunkedColumnVector children; - - ARROW_ASSIGN_OR_RAISE( - auto child, - FieldPathGetImpl::Get(path, &toplevel_children, - [&children](const std::shared_ptr& parent) - -> const ChunkedColumnVector* { - if (parent->type()->id() != Type::STRUCT) { - return nullptr; - } - children = parent->FlattenZeroCopy(); - return &children; - })); - - return child->ToChunkedArray(); - } - - static Result> Get(const FieldPath* path, - const ArrayDataVector& child_data) { - return FieldPathGetImpl::Get( - path, &child_data, - [](const std::shared_ptr& data) -> const ArrayDataVector* { - if (data->type->id() != Type::STRUCT) { - return nullptr; - } - return &data->child_data; - }); - } - static Status Flatten(const Array& array, ArrayVector* out) { - return checked_cast(array).Flatten().Value(out); - } - static Status Flatten(const ChunkedArray& chunked_array, ChunkedArrayVector* out) { - return chunked_array.Flatten().Value(out); - } - - template - static Result GetFlattened(const FieldPath* path, - const std::vector& toplevel_children) { - std::vector children; - Status error; - auto result = FieldPathGetImpl::Get( - path, &toplevel_children, - [&children, &error](const T& parent) -> const std::vector* { - if (parent->type()->id() != Type::STRUCT) { - return nullptr; - } - error = Flatten(*parent, &children); - return error.ok() ? &children : nullptr; - }); - ARROW_RETURN_NOT_OK(error); - return result; + return selector.Finish(); } }; Result> FieldPath::Get(const Schema& schema) const { - return FieldPathGetImpl::Get(this, schema.fields()); + return Get(schema.fields()); } Result> FieldPath::Get(const Field& field) const { - return FieldPathGetImpl::Get(this, field.type()->fields()); + return Get(field.type()->fields()); } Result> FieldPath::Get(const DataType& type) const { - return FieldPathGetImpl::Get(this, type.fields()); + return Get(type.fields()); } Result> FieldPath::Get(const FieldVector& fields) const { - return FieldPathGetImpl::Get(this, fields); + return FieldPathGetImpl::Get(this, FieldSelector(fields)); } Result> FieldPath::GetAll(const Schema& schm, @@ -1345,50 +1282,32 @@ Result> FieldPath::GetAll(const Schema& schm, } Result> FieldPath::Get(const RecordBatch& batch) const { - ARROW_ASSIGN_OR_RAISE(auto data, FieldPathGetImpl::Get(this, batch.column_data())); - return MakeArray(std::move(data)); + return FieldPathGetImpl::Get(this, ZeroCopySelector(batch.columns())); } Result> FieldPath::Get(const Table& table) const { - ChunkedColumnVector columns(table.num_columns()); - std::transform(table.columns().cbegin(), table.columns().cend(), columns.begin(), - [](const std::shared_ptr& chunked_array) { - return std::make_shared(*chunked_array); - }); - return FieldPathGetImpl::Get(this, columns); + return FieldPathGetImpl::Get(this, ZeroCopySelector(table.columns())); } Result> FieldPath::Get(const Array& array) const { - ARROW_ASSIGN_OR_RAISE(auto data, Get(*array.data())); - return MakeArray(std::move(data)); + return FieldPathGetImpl::Get(this, ZeroCopySelector(array)); } Result> FieldPath::Get(const ArrayData& data) const { - if (data.type->id() != Type::STRUCT) { - return FieldPathGetImpl::NonStructError(); - } - return FieldPathGetImpl::Get(this, data.child_data); + // We indirect from ArrayData to Array rather than vice-versa because, when selecting a + // nested column, the StructArray::field method does the work of adjusting the data's + // offset/length if necessary. + ARROW_ASSIGN_OR_RAISE(auto array, Get(*MakeArray(data.Copy()))); + return array->data(); } Result> FieldPath::Get( const ChunkedArray& chunked_array) const { - if (chunked_array.type()->id() != Type::STRUCT) { - return FieldPathGetImpl::NonStructError(); - } - auto columns = ChunkedArrayRef(chunked_array).FlattenZeroCopy(); - return FieldPathGetImpl::Get(this, columns); + return FieldPathGetImpl::Get(this, ZeroCopySelector(chunked_array)); } Result> FieldPath::GetFlattened(const Array& array) const { - if (array.type_id() != Type::STRUCT) { - return FieldPathGetImpl::NonStructError(); - } - auto&& struct_array = checked_cast(array); - if (struct_array.null_count() == 0) { - return FieldPathGetImpl::GetFlattened(this, struct_array.fields()); - } - ARROW_ASSIGN_OR_RAISE(auto children, struct_array.Flatten()); - return FieldPathGetImpl::GetFlattened(this, children); + return FieldPathGetImpl::Get(this, FlatteningSelector(array)); } Result> FieldPath::GetFlattened(const ArrayData& data) const { @@ -1398,21 +1317,20 @@ Result> FieldPath::GetFlattened(const ArrayData& data Result> FieldPath::GetFlattened( const ChunkedArray& chunked_array) const { - if (chunked_array.type()->id() != Type::STRUCT) { - return FieldPathGetImpl::NonStructError(); - } - ARROW_ASSIGN_OR_RAISE(auto children, chunked_array.Flatten()); - return FieldPathGetImpl::GetFlattened(this, children); + return FieldPathGetImpl::Get(this, FlatteningSelector(chunked_array)); } Result> FieldPath::GetFlattened(const RecordBatch& batch) const { - return FieldPathGetImpl::GetFlattened(this, batch.columns()); + return FieldPathGetImpl::Get(this, FlatteningSelector(batch.columns())); } Result> FieldPath::GetFlattened(const Table& table) const { - return FieldPathGetImpl::GetFlattened(this, table.columns()); + return FieldPathGetImpl::Get(this, FlatteningSelector(table.columns())); } +// ---------------------------------------------------------------------- +// FieldRef + FieldRef::FieldRef(FieldPath indices) : impl_(std::move(indices)) {} void FieldRef::Flatten(std::vector children) { @@ -1642,10 +1560,8 @@ std::vector FieldRef::FindAll(const FieldVector& fields) const { std::vector operator()(const FieldPath& path) { // skip long IndexError construction if path is out of range int out_of_range_depth; - auto maybe_field = FieldPathGetImpl::Get( - &path, &fields_, - [](const std::shared_ptr& field) { return &field->type()->fields(); }, - &out_of_range_depth); + auto maybe_field = + FieldPathGetImpl::Get(&path, FieldSelector(fields_), &out_of_range_depth); DCHECK_OK(maybe_field.status()); From 918b0837411458def166b613eb1b360f916b2ea4 Mon Sep 17 00:00:00 2001 From: benibus Date: Mon, 1 May 2023 17:02:10 -0400 Subject: [PATCH 09/21] Support passing MemoryPool to GetFlattenedField --- cpp/src/arrow/type.cc | 48 ++++++++++++++++++++++++++--------------- cpp/src/arrow/type.h | 50 ++++++++++++++++++++++++------------------- 2 files changed, 59 insertions(+), 39 deletions(-) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 9948837d6df..cec06595823 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -1083,19 +1083,25 @@ class NestedSelector { explicit NestedSelector(const T& parent) : parent_or_children_(&parent) {} explicit NestedSelector(std::shared_ptr parent) : owned_parent_(std::move(parent)), parent_or_children_(owned_parent_.get()) {} + template + NestedSelector(Arg&& arg, MemoryPool* pool) : NestedSelector(std::forward(arg)) { + if (pool) { + pool_ = pool; + } + } // If the index is out of bounds, this returns an invalid selector rather than an // error. Result GetChild(int i) const { std::shared_ptr child; if (auto parent = get_parent()) { - ARROW_ASSIGN_OR_RAISE(child, GetChild(*parent, i)); + ARROW_ASSIGN_OR_RAISE(child, GetChild(*parent, i, pool_)); } else if (auto children = get_children()) { if (ARROW_PREDICT_TRUE(i >= 0 && static_cast(i) < children->size())) { child = (*children)[i]; } } - return NestedSelector(std::move(child)); + return NestedSelector(std::move(child), pool_); } Result> Finish() const { @@ -1148,14 +1154,15 @@ class NestedSelector { return ptr ? *ptr : nullptr; } - static Result> GetChild(const Field& field, int i) { + static Result> GetChild(const Field& field, int i, MemoryPool*) { if (ARROW_PREDICT_FALSE(i < 0 || i >= field.type()->num_fields())) { return nullptr; } return field.type()->field(i); } - static Result> GetChild(const Array& array, int i) { + static Result> GetChild(const Array& array, int i, + MemoryPool* pool) { if (ARROW_PREDICT_FALSE(array.type_id() != Type::STRUCT)) { return NonStructError(); } @@ -1165,14 +1172,14 @@ class NestedSelector { const auto& struct_array = checked_cast(array); if constexpr (IsFlattening) { - return struct_array.GetFlattenedField(i); + return struct_array.GetFlattenedField(i, pool); } else { return struct_array.field(i); } } static Result> GetChild(const ChunkedArray& chunked_array, - int i) { + int i, MemoryPool* pool) { const auto& type = *chunked_array.type(); if (ARROW_PREDICT_FALSE(type.id() != Type::STRUCT)) { return NonStructError(); @@ -1184,7 +1191,7 @@ class NestedSelector { ArrayVector chunks; chunks.reserve(chunked_array.num_chunks()); for (const auto& parent_chunk : chunked_array.chunks()) { - ARROW_ASSIGN_OR_RAISE(auto chunk, GetChild(*parent_chunk, i)); + ARROW_ASSIGN_OR_RAISE(auto chunk, GetChild(*parent_chunk, i, pool)); if (!chunk) return nullptr; chunks.push_back(std::move(chunk)); } @@ -1194,6 +1201,7 @@ class NestedSelector { std::shared_ptr owned_parent_; std::variant>*> parent_or_children_; + MemoryPool* pool_ = default_memory_pool(); }; using FieldSelector = NestedSelector; @@ -1306,26 +1314,32 @@ Result> FieldPath::Get( return FieldPathGetImpl::Get(this, ZeroCopySelector(chunked_array)); } -Result> FieldPath::GetFlattened(const Array& array) const { - return FieldPathGetImpl::Get(this, FlatteningSelector(array)); +Result> FieldPath::GetFlattened(const Array& array, + MemoryPool* pool) const { + return FieldPathGetImpl::Get(this, FlatteningSelector(array, pool)); } -Result> FieldPath::GetFlattened(const ArrayData& data) const { - ARROW_ASSIGN_OR_RAISE(auto array, GetFlattened(*MakeArray(data.Copy()))); +Result> FieldPath::GetFlattened(const ArrayData& data, + MemoryPool* pool) const { + ARROW_ASSIGN_OR_RAISE(auto array, GetFlattened(*MakeArray(data.Copy()), pool)); return array->data(); } Result> FieldPath::GetFlattened( - const ChunkedArray& chunked_array) const { - return FieldPathGetImpl::Get(this, FlatteningSelector(chunked_array)); + const ChunkedArray& chunked_array, MemoryPool* pool) const { + return FieldPathGetImpl::Get(this, + FlatteningSelector(chunked_array, pool)); } -Result> FieldPath::GetFlattened(const RecordBatch& batch) const { - return FieldPathGetImpl::Get(this, FlatteningSelector(batch.columns())); +Result> FieldPath::GetFlattened(const RecordBatch& batch, + MemoryPool* pool) const { + return FieldPathGetImpl::Get(this, FlatteningSelector(batch.columns(), pool)); } -Result> FieldPath::GetFlattened(const Table& table) const { - return FieldPathGetImpl::Get(this, FlatteningSelector(table.columns())); +Result> FieldPath::GetFlattened(const Table& table, + MemoryPool* pool) const { + return FieldPathGetImpl::Get(this, + FlatteningSelector(table.columns(), pool)); } // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 16f0e56d5a2..3575625b10e 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1704,12 +1704,16 @@ class ARROW_EXPORT FieldPath { /// /// Unlike `FieldPath::Get`, these variants are not zero-copy and the retrieved child's /// null bitmap is ANDed with its parent's - Result> GetFlattened(const Array& array) const; - Result> GetFlattened(const ArrayData& data) const; - Result> GetFlattened( - const ChunkedArray& chunked_array) const; - Result> GetFlattened(const RecordBatch& batch) const; - Result> GetFlattened(const Table& table) const; + Result> GetFlattened(const Array& array, + MemoryPool* pool = NULLPTR) const; + Result> GetFlattened(const ArrayData& data, + MemoryPool* pool = NULLPTR) const; + Result> GetFlattened(const ChunkedArray& chunked_array, + MemoryPool* pool = NULLPTR) const; + Result> GetFlattened(const RecordBatch& batch, + MemoryPool* pool = NULLPTR) const; + Result> GetFlattened(const Table& table, + MemoryPool* pool = NULLPTR) const; private: std::vector indices_; @@ -1723,13 +1727,13 @@ using FieldPathGetType = template std::enable_if_t>> GetChild( - const T& root, const FieldPath& path) { + const T& root, const FieldPath& path, MemoryPool* = NULLPTR) { return path.Get(root); } template -std::enable_if_t>> GetChild(const T& root, - const FieldPath& path) { - return path.GetFlattened(root); +std::enable_if_t>> GetChild( + const T& root, const FieldPath& path, MemoryPool* pool = NULLPTR) { + return path.GetFlattened(root, pool); } } // namespace internal @@ -1914,8 +1918,9 @@ class ARROW_EXPORT FieldRef : public util::EqualityComparable { return GetAll(root); } template - std::vector> GetAllFlattened(const T& root) const { - return GetAll(root); + std::vector> GetAllFlattened(const T& root, + MemoryPool* pool = NULLPTR) const { + return GetAll(root, pool); } /// \brief Get the single child matching this FieldRef. @@ -1925,8 +1930,8 @@ class ARROW_EXPORT FieldRef : public util::EqualityComparable { return GetOne(root); } template - Result> GetOneFlattened(const T& root) const { - return GetOne(root); + Result> GetOneFlattened(const T& root, MemoryPool* pool = NULLPTR) const { + return GetOne(root, pool); } /// \brief Get the single child matching this FieldRef. @@ -1936,35 +1941,36 @@ class ARROW_EXPORT FieldRef : public util::EqualityComparable { return GetOneOrNone(root); } template - Result> GetOneOrNoneFlattened(const T& root) const { - return GetOneOrNone(root); + Result> GetOneOrNoneFlattened(const T& root, + MemoryPool* pool = NULLPTR) const { + return GetOneOrNone(root, pool); } private: void Flatten(std::vector children); template - Result> GetOne(const T& root) const { + Result> GetOne(const T& root, MemoryPool* pool = NULLPTR) const { ARROW_ASSIGN_OR_RAISE(auto match, FindOne(root)); - return internal::GetChild(root, match).ValueOrDie(); + return internal::GetChild(root, match, pool).ValueOrDie(); } template - std::vector> GetAll(const T& root) const { + std::vector> GetAll(const T& root, MemoryPool* pool = NULLPTR) const { std::vector> out; for (const auto& match : FindAll(root)) { - out.push_back(internal::GetChild(root, match).ValueOrDie()); + out.push_back(internal::GetChild(root, match, pool).ValueOrDie()); } return out; } template - Result> GetOneOrNone(const T& root) const { + Result> GetOneOrNone(const T& root, MemoryPool* pool = NULLPTR) const { ARROW_ASSIGN_OR_RAISE(auto match, FindOneOrNone(root)); if (match.empty()) { return static_cast>(NULLPTR); } - return internal::GetChild(root, match).ValueOrDie(); + return internal::GetChild(root, match, pool).ValueOrDie(); } std::variant> impl_; From 0168b70d21cc41debf56414111926989febe71f8 Mon Sep 17 00:00:00 2001 From: benibus Date: Mon, 1 May 2023 17:11:57 -0400 Subject: [PATCH 10/21] Minor naming fix --- cpp/src/arrow/type.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index cec06595823..dfc61a96895 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -1110,7 +1110,7 @@ class NestedSelector { } template - std::enable_if_t> Serialize(OStream* os) const { + std::enable_if_t> Summarize(OStream* os) const { const FieldVector* fields = get_children(); if (!fields && get_parent()) { fields = &get_parent()->type()->fields(); @@ -1125,7 +1125,7 @@ class NestedSelector { } template - std::enable_if_t> Serialize(OStream* os) const { + std::enable_if_t> Summarize(OStream* os) const { *os << "column types: { "; if (auto children = get_children()) { for (const auto& child : *children) { @@ -1229,7 +1229,7 @@ struct FieldPathGetImpl { } ss << "] "; - selector.Serialize(&ss); + selector.Summarize(&ss); return Status::IndexError(ss.str()); } From 44bf7681df283ef9a4da4f7d55856f30b471e087 Mon Sep 17 00:00:00 2001 From: Ben Harkins <60872452+benibus@users.noreply.github.com> Date: Mon, 15 May 2023 13:04:35 -0400 Subject: [PATCH 11/21] Apply comment suggestion Co-authored-by: Antoine Pitrou --- cpp/src/arrow/type.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 3575625b10e..2d956392be7 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1703,7 +1703,7 @@ class ARROW_EXPORT FieldPath { /// RecordBatch, or Table /// /// Unlike `FieldPath::Get`, these variants are not zero-copy and the retrieved child's - /// null bitmap is ANDed with its parent's + /// null bitmap is ANDed with its ancestors' Result> GetFlattened(const Array& array, MemoryPool* pool = NULLPTR) const; Result> GetFlattened(const ArrayData& data, From 015c2fd6bb55bf7c19f1dc10a49035433f01ef7c Mon Sep 17 00:00:00 2001 From: benibus Date: Mon, 15 May 2023 16:35:01 -0400 Subject: [PATCH 12/21] Remove FieldRef::Get indirections --- cpp/src/arrow/type.h | 73 +++++++++++++------------------------- cpp/src/arrow/type_test.cc | 17 ++++++--- 2 files changed, 37 insertions(+), 53 deletions(-) diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 2d956392be7..7d262729f3e 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1719,25 +1719,6 @@ class ARROW_EXPORT FieldPath { std::vector indices_; }; -namespace internal { - -template -using FieldPathGetType = - decltype(std::declval().Get(std::declval()).ValueOrDie()); - -template -std::enable_if_t>> GetChild( - const T& root, const FieldPath& path, MemoryPool* = NULLPTR) { - return path.Get(root); -} -template -std::enable_if_t>> GetChild( - const T& root, const FieldPath& path, MemoryPool* pool = NULLPTR) { - return path.GetFlattened(root, pool); -} - -} // namespace internal - /// \class FieldRef /// \brief Descriptor of a (potentially nested) field within a schema. /// @@ -1910,69 +1891,63 @@ class ARROW_EXPORT FieldRef : public util::EqualityComparable { } template - using GetType = internal::FieldPathGetType; + using GetType = decltype(std::declval().Get(std::declval()).ValueOrDie()); /// \brief Get all children matching this FieldRef. template std::vector> GetAll(const T& root) const { - return GetAll(root); + std::vector> out; + for (const auto& match : FindAll(root)) { + out.push_back(match.Get(root).ValueOrDie()); + } + return out; } template std::vector> GetAllFlattened(const T& root, MemoryPool* pool = NULLPTR) const { - return GetAll(root, pool); + std::vector> out; + for (const auto& match : FindAll(root)) { + out.push_back(match.GetFlattened(root, pool).ValueOrDie()); + } + return out; } /// \brief Get the single child matching this FieldRef. /// Emit an error if none or multiple match. template Result> GetOne(const T& root) const { - return GetOne(root); + ARROW_ASSIGN_OR_RAISE(auto match, FindOne(root)); + return match.Get(root).ValueOrDie(); } template Result> GetOneFlattened(const T& root, MemoryPool* pool = NULLPTR) const { - return GetOne(root, pool); + ARROW_ASSIGN_OR_RAISE(auto match, FindOne(root)); + return match.GetFlattened(root, pool).ValueOrDie(); } /// \brief Get the single child matching this FieldRef. /// Return nullptr if none match, emit an error if multiple match. template Result> GetOneOrNone(const T& root) const { - return GetOneOrNone(root); + ARROW_ASSIGN_OR_RAISE(auto match, FindOneOrNone(root)); + if (match.empty()) { + return static_cast>(NULLPTR); + } + return match.Get(root).ValueOrDie(); } template Result> GetOneOrNoneFlattened(const T& root, MemoryPool* pool = NULLPTR) const { - return GetOneOrNone(root, pool); - } - - private: - void Flatten(std::vector children); - - template - Result> GetOne(const T& root, MemoryPool* pool = NULLPTR) const { - ARROW_ASSIGN_OR_RAISE(auto match, FindOne(root)); - return internal::GetChild(root, match, pool).ValueOrDie(); - } - - template - std::vector> GetAll(const T& root, MemoryPool* pool = NULLPTR) const { - std::vector> out; - for (const auto& match : FindAll(root)) { - out.push_back(internal::GetChild(root, match, pool).ValueOrDie()); - } - return out; - } - - template - Result> GetOneOrNone(const T& root, MemoryPool* pool = NULLPTR) const { ARROW_ASSIGN_OR_RAISE(auto match, FindOneOrNone(root)); if (match.empty()) { return static_cast>(NULLPTR); } - return internal::GetChild(root, match, pool).ValueOrDie(); + return match.GetFlattened(root, pool).ValueOrDie(); } + private: + void Flatten(std::vector children); + std::variant> impl_; }; diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index e01ad9ca065..45f262a5342 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -564,7 +564,16 @@ class FieldPathTestFixture : public ::testing::Test { protected: template - using OutputType = typename internal::FieldPathGetType::element_type; + using OutputType = typename FieldRef::GetType::element_type; + + template + static auto DoGet(const T& root, const FieldPath& path, MemoryPool* pool = nullptr) { + if constexpr (Flattened) { + return path.GetFlattened(root, pool); + } else { + return path.Get(root); + } + } template void AssertOutputsEqual(const std::shared_ptr& expected, @@ -599,18 +608,18 @@ class TestFieldPath : public FieldPathTestFixture { {FieldPath({2, 1, 0}), FieldPath({1, 2, 0}), FieldPath{1, 1, 2}}) { EXPECT_RAISES_WITH_MESSAGE_THAT(IndexError, ::testing::HasSubstr("index out of range"), - internal::GetChild(*input, path)); + DoGet(*input, path)); } EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, ::testing::HasSubstr("empty indices cannot be traversed"), - internal::GetChild(*input, FieldPath())); + DoGet(*input, FieldPath())); } template void TestGetWithNonStructArray() const { EXPECT_RAISES_WITH_MESSAGE_THAT( NotImplemented, ::testing::HasSubstr("Get child data of non-struct array"), - internal::GetChild(*case_->v1_1_0.Get(), FieldPath({1, 1, 0}))); + DoGet(*case_->v1_1_0.Get(), FieldPath({1, 1, 0}))); } template > From 08ce6bd0cbcaa2732c424c3073e309c38a6cdb57 Mon Sep 17 00:00:00 2001 From: benibus Date: Mon, 15 May 2023 16:55:29 -0400 Subject: [PATCH 13/21] Use `if constexpr` in FieldPathTestCase --- cpp/src/arrow/type_test.cc | 76 ++++++++++++++------------------------ 1 file changed, 28 insertions(+), 48 deletions(-) diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index 45f262a5342..314dfe3f249 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -371,7 +371,17 @@ struct FieldPathTestCase { : path(FieldPath(std::move(indices))) {} template - auto&& Get() const; + const auto& Get() const { + if constexpr (std::is_same_v) { + return field; + } else if constexpr (std::is_same_v) { + return array; + } else if constexpr (std::is_same_v) { + return array->data(); + } else if constexpr (std::is_same_v) { + return chunked_array; + } + } FieldPath path; std::shared_ptr field; @@ -392,7 +402,23 @@ struct FieldPathTestCase { std::shared_ptr
table; template - auto&& GetInput() const; + const auto& GetInput() const { + if constexpr (std::is_same_v) { + return schema; + } else if constexpr (std::is_same_v) { + return type; + } else if constexpr (std::is_same_v) { + return array; + } else if constexpr (std::is_same_v) { + return array->data(); + } else if constexpr (std::is_same_v) { + return record_batch; + } else if constexpr (std::is_same_v) { + return chunked_array; + } else if constexpr (std::is_same_v) { + return table; + } + } // Number of chunks for each column in the input Table const std::array num_column_chunks = {15, 20}; @@ -512,52 +538,6 @@ struct FieldPathTestCase { } }; -template <> -auto&& FieldPathTestCase::GetInput() const { - return this->schema; -} -template <> -auto&& FieldPathTestCase::GetInput() const { - return this->type; -} -template <> -auto&& FieldPathTestCase::GetInput() const { - return this->array; -} -template <> -auto&& FieldPathTestCase::GetInput() const { - return this->array->data(); -} -template <> -auto&& FieldPathTestCase::GetInput() const { - return this->chunked_array; -} -template <> -auto&& FieldPathTestCase::GetInput() const { - return this->record_batch; -} -template <> -auto&& FieldPathTestCase::GetInput
() const { - return this->table; -} - -template <> -auto&& FieldPathTestCase::OutputValues::Get() const { - return this->field; -} -template <> -auto&& FieldPathTestCase::OutputValues::Get() const { - return this->array; -} -template <> -auto&& FieldPathTestCase::OutputValues::Get() const { - return this->array->data(); -} -template <> -auto&& FieldPathTestCase::OutputValues::Get() const { - return this->chunked_array; -} - class FieldPathTestFixture : public ::testing::Test { public: FieldPathTestFixture() : case_(FieldPathTestCase::Instance()) {} From c9be785227523486fc876adb08767c194e3bde51 Mon Sep 17 00:00:00 2001 From: benibus Date: Tue, 16 May 2023 11:43:29 -0400 Subject: [PATCH 14/21] Explain some weirdness in AssertOutputsEqual --- cpp/src/arrow/type_test.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index 314dfe3f249..47b6da1e969 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -568,6 +568,11 @@ class FieldPathTestFixture : public ::testing::Test { template void AssertOutputsEqual(const std::shared_ptr& expected, const std::shared_ptr& actual) const { + // We only do this dance due to the way the test inputs/outputs are generated. + // Basically, the "expected" output ChunkedArrays don't have an equal num_chunks since + // they're reused to create the input Table (which has a distinct chunking per + // column). However, if the input was the ChunkedArray, the returned outputs should + // always have the same num_chunks as the input. if constexpr (std::is_same_v) { EXPECT_EQ(case_->chunked_array->num_chunks(), actual->num_chunks()); } else { From 98d43eee41ba0729549e705c3f25d6dcd9c730f9 Mon Sep 17 00:00:00 2001 From: benibus Date: Tue, 16 May 2023 12:32:58 -0400 Subject: [PATCH 15/21] Test/fix IndexError message --- cpp/src/arrow/type.cc | 7 +++---- cpp/src/arrow/type_test.cc | 22 ++++++++++++++++++++++ 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index dfc61a96895..68dc2aabe96 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -1220,12 +1220,11 @@ struct FieldPathGetImpl { ss << "indices=[ "; int depth = 0; for (int i : path->indices()) { - if (depth != out_of_range_depth) { + if (depth++ != out_of_range_depth) { ss << i << " "; - continue; + } else { + ss << ">" << i << "< "; } - ss << ">" << i << "< "; - ++depth; } ss << "] "; diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index 47b6da1e969..e3fcb5b2f21 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -600,6 +600,18 @@ class TestFieldPath : public FieldPathTestFixture { DoGet(*input, FieldPath())); } + template > + void TestIndexErrorMessage() const { + auto result = FieldPath({1, 1, 2}).Get(*case_->GetInput()); + std::string substr = "index out of range. indices=[ 1 1 >2< ] "; + if constexpr (std::is_same_v) { + substr += "fields: { f: float, b: bool, }"; + } else { + substr += "column types: { float, bool, }"; + } + EXPECT_RAISES_WITH_MESSAGE_THAT(IndexError, ::testing::HasSubstr(substr), result); + } + template void TestGetWithNonStructArray() const { EXPECT_RAISES_WITH_MESSAGE_THAT( @@ -660,6 +672,16 @@ TEST_F(TestFieldPath, GetWithInvalidIndex) { TestGetWithInvalidIndex(); } +TEST_F(TestFieldPath, IndexErrorMessage) { + TestIndexErrorMessage(); + TestIndexErrorMessage(); + TestIndexErrorMessage(); + TestIndexErrorMessage(); + TestIndexErrorMessage(); + TestIndexErrorMessage(); + TestIndexErrorMessage
(); +} + TEST_F(TestFieldPath, GetWithNonStructArray) { TestGetWithNonStructArray(); TestGetWithNonStructArray(); From fb1d694acba7c52695cee0ee73d46c64c9a90473 Mon Sep 17 00:00:00 2001 From: benibus Date: Tue, 16 May 2023 17:25:01 -0400 Subject: [PATCH 16/21] Add tests for FieldRef methods --- cpp/src/arrow/type_test.cc | 118 ++++++++++++++++++++++++++++++++----- 1 file changed, 104 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index e3fcb5b2f21..b2f973e4ab0 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -447,12 +447,15 @@ struct FieldPathTestCase { random::RandomArrayGenerator gen(kRandomSeed); // Define child fields and input schema - out.v1_1_1.field = field("b", boolean()); - out.v1_1_0.field = field("f", float32()); - out.v1_1.field = field("s1", struct_({out.v1_1_0.field, out.v1_1_1.field})); - out.v1_0.field = field("i", int32()); - out.v1.field = field("s0", struct_({out.v1_0.field, out.v1_1.field})); - out.v0.field = field("u", utf8()); + + // Intentionally duplicated names for the FieldRef tests + out.v1_1_1.field = field("a", boolean()); + out.v1_1_0.field = field("a", float32()); + + out.v1_1.field = field("b", struct_({out.v1_1_0.field, out.v1_1_1.field})); + out.v1_0.field = field("a", int32()); + out.v1.field = field("b", struct_({out.v1_0.field, out.v1_1.field})); + out.v0.field = field("a", utf8()); out.schema = arrow::schema({out.v0.field, out.v1.field}); out.type = struct_(out.schema->fields()); @@ -605,7 +608,7 @@ class TestFieldPath : public FieldPathTestFixture { auto result = FieldPath({1, 1, 2}).Get(*case_->GetInput()); std::string substr = "index out of range. indices=[ 1 1 >2< ] "; if constexpr (std::is_same_v) { - substr += "fields: { f: float, b: bool, }"; + substr += "fields: { a: float, a: bool, }"; } else { substr += "column types: { float, bool, }"; } @@ -747,7 +750,81 @@ TEST_F(TestFieldPath, GetFromEmptyChunked) { ASSERT_EQ(child->length(), 0); } -TEST(TestFieldRef, Basics) { +class TestFieldRef : public FieldPathTestFixture { + protected: + template + static auto DoGetOne(const T& root, const FieldRef& ref, MemoryPool* pool = nullptr) { + if constexpr (Flattened) { + return ref.GetOneFlattened(root, pool); + } else { + return ref.GetOne(root); + } + } + template + static auto DoGetOneOrNone(const T& root, const FieldRef& ref, + MemoryPool* pool = nullptr) { + if constexpr (Flattened) { + return ref.GetOneOrNoneFlattened(root, pool); + } else { + return ref.GetOneOrNone(root); + } + } + template + static auto DoGetAll(const T& root, const FieldRef& ref, MemoryPool* pool = nullptr) { + if constexpr (Flattened) { + return ref.GetAllFlattened(root, pool); + } else { + return ref.GetAll(root); + } + } + + template + void TestGet() const { + using O = OutputType; + const auto& input = case_->GetInput(); + ASSERT_OK_AND_ASSIGN(auto v0, DoGetOne(*input, FieldRef("a"))); + ASSERT_OK_AND_ASSIGN(auto v1, DoGetOne(*input, FieldRef("b"))); + ASSERT_OK_AND_ASSIGN(auto v1_0, DoGetOne(*input, FieldRef("b", "a"))); + ASSERT_OK_AND_ASSIGN(auto v1_1, DoGetOne(*input, FieldRef("b", "b"))); + ASSERT_OK_AND_ASSIGN(auto v1_1_0, DoGetOne(*input, FieldRef("b", "b", 0))); + ASSERT_OK_AND_ASSIGN(auto v1_1_1, DoGetOne(*input, FieldRef("b", "b", 1))); + + AssertOutputsEqual(case_->v0.Get(), v0); + AssertOutputsEqual(case_->v1.Get(), v1); + if constexpr (Flattened) { + AssertOutputsEqual(case_->v1_0_flat.Get(), v1_0); + AssertOutputsEqual(case_->v1_1_flat.Get(), v1_1); + AssertOutputsEqual(case_->v1_1_0_flat.Get(), v1_1_0); + AssertOutputsEqual(case_->v1_1_1_flat.Get(), v1_1_1); + } else { + AssertOutputsEqual(case_->v1_0.Get(), v1_0); + AssertOutputsEqual(case_->v1_1.Get(), v1_1); + AssertOutputsEqual(case_->v1_1_0.Get(), v1_1_0); + AssertOutputsEqual(case_->v1_1_1.Get(), v1_1_1); + } + + // Cases where multiple matches are found + auto multiple_matches = DoGetAll(*input, FieldRef("b", "b", "a")); + EXPECT_EQ(multiple_matches.size(), 2); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("Multiple matches for "), + (DoGetOne(*input, FieldRef("b", "b", "a")))); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("Multiple matches for "), + (DoGetOneOrNone(*input, FieldRef("b", "b", "a")))); + + // Cases where no match is found + auto no_matches = DoGetAll(*input, FieldRef("b", "b", "b")); + EXPECT_EQ(no_matches.size(), 0); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("No match for "), + (DoGetOne(*input, FieldRef("b", "b", "b")))); + ASSERT_OK_AND_EQ(nullptr, + (DoGetOneOrNone(*input, FieldRef("b", "b", "b")))); + } +}; + +TEST_F(TestFieldRef, Basics) { auto f0 = field("alpha", int32()); auto f1 = field("beta", int32()); auto f2 = field("alpha", int32()); @@ -766,7 +843,7 @@ TEST(TestFieldRef, Basics) { EXPECT_THAT(FieldRef("beta").FindAll(s), ElementsAre(FieldPath{1}, FieldPath{3})); } -TEST(TestFieldRef, FindAllForTable) { +TEST_F(TestFieldRef, FindAllForTable) { constexpr int kNumRows = 100; auto f0 = field("alpha", int32()); auto f1 = field("beta", int32()); @@ -798,7 +875,7 @@ TEST(TestFieldRef, FindAllForTable) { ElementsAre(FieldPath{1}, FieldPath{3})); } -TEST(TestFieldRef, FindAllForRecordBatch) { +TEST_F(TestFieldRef, FindAllForRecordBatch) { constexpr int kNumRows = 100; auto f0 = field("alpha", int32()); auto f1 = field("beta", int32()); @@ -831,7 +908,7 @@ TEST(TestFieldRef, FindAllForRecordBatch) { ElementsAre(FieldPath{1}, FieldPath{3})); } -TEST(TestFieldRef, FromDotPath) { +TEST_F(TestFieldRef, FromDotPath) { ASSERT_OK_AND_EQ(FieldRef("alpha"), FieldRef::FromDotPath(R"(.alpha)")); ASSERT_OK_AND_EQ(FieldRef("", ""), FieldRef::FromDotPath(R"(..)")); @@ -854,7 +931,7 @@ TEST(TestFieldRef, FromDotPath) { ASSERT_RAISES(Invalid, FieldRef::FromDotPath(R"([1stuf])")); } -TEST(TestFieldRef, DotPathRoundTrip) { +TEST_F(TestFieldRef, DotPathRoundTrip) { auto check_roundtrip = [](const FieldRef& ref) { auto dot_path = ref.ToDotPath(); ASSERT_OK_AND_EQ(ref, FieldRef::FromDotPath(dot_path)); @@ -867,7 +944,7 @@ TEST(TestFieldRef, DotPathRoundTrip) { check_roundtrip(FieldRef("foo", 1, FieldRef("bar", 2, 3), FieldRef())); } -TEST(TestFieldRef, Nested) { +TEST_F(TestFieldRef, Nested) { auto f0 = field("alpha", int32()); auto f1_0 = field("alpha", int32()); auto f1 = field("beta", struct_({f1_0})); @@ -884,7 +961,7 @@ TEST(TestFieldRef, Nested) { ElementsAre(FieldPath{2, 1, 0}, FieldPath{2, 1, 1})); } -TEST(TestFieldRef, Flatten) { +TEST_F(TestFieldRef, Flatten) { FieldRef ref; auto assert_name = [](const FieldRef& ref, const std::string& expected) { @@ -920,6 +997,19 @@ TEST(TestFieldRef, Flatten) { {FieldRef("foo"), FieldRef("bar"), FieldRef(1, 2), FieldRef(3)}); } +TEST_F(TestFieldRef, GetFromSchema) { TestGet(); } +TEST_F(TestFieldRef, GetFromDataType) { TestGet(); } + +TEST_F(TestFieldRef, GetFromArray) { TestGet(); } +TEST_F(TestFieldRef, GetFromChunkedArray) { TestGet(); } +TEST_F(TestFieldRef, GetFromRecordBatch) { TestGet(); } +TEST_F(TestFieldRef, GetFromTable) { TestGet
(); } + +TEST_F(TestFieldRef, GetFlattenedFromArray) { TestGet(); } +TEST_F(TestFieldRef, GetFlattenedFromChunkedArray) { TestGet(); } +TEST_F(TestFieldRef, GetFlattenedFromRecordBatch) { TestGet(); } +TEST_F(TestFieldRef, GetFlattenedFromTable) { TestGet(); } + using TestSchema = ::testing::Test; TEST_F(TestSchema, Basics) { From 8ca8399904cc71c3c654a6e77bc4a7c3f82e6502 Mon Sep 17 00:00:00 2001 From: benibus Date: Tue, 16 May 2023 17:40:47 -0400 Subject: [PATCH 17/21] Test cleanup --- cpp/src/arrow/type_test.cc | 135 +++++++++++++++++-------------------- 1 file changed, 63 insertions(+), 72 deletions(-) diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index b2f973e4ab0..fb74d3ffcdb 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -371,7 +371,7 @@ struct FieldPathTestCase { : path(FieldPath(std::move(indices))) {} template - const auto& Get() const { + const auto& OutputAs() const { if constexpr (std::is_same_v) { return field; } else if constexpr (std::is_same_v) { @@ -402,7 +402,7 @@ struct FieldPathTestCase { std::shared_ptr
table; template - const auto& GetInput() const { + const auto& InputAs() const { if constexpr (std::is_same_v) { return schema; } else if constexpr (std::is_same_v) { @@ -549,15 +549,6 @@ class FieldPathTestFixture : public ::testing::Test { template using OutputType = typename FieldRef::GetType::element_type; - template - static auto DoGet(const T& root, const FieldPath& path, MemoryPool* pool = nullptr) { - if constexpr (Flattened) { - return path.GetFlattened(root, pool); - } else { - return path.Get(root); - } - } - template void AssertOutputsEqual(const std::shared_ptr& expected, const std::shared_ptr& actual) const { @@ -589,9 +580,18 @@ class FieldPathTestFixture : public ::testing::Test { class TestFieldPath : public FieldPathTestFixture { protected: + template + static auto DoGet(const T& root, const FieldPath& path, MemoryPool* pool = nullptr) { + if constexpr (Flattened) { + return path.GetFlattened(root, pool); + } else { + return path.Get(root); + } + } + template void TestGetWithInvalidIndex() const { - const auto& input = case_->GetInput(); + const auto& input = case_->InputAs(); for (const auto& path : {FieldPath({2, 1, 0}), FieldPath({1, 2, 0}), FieldPath{1, 1, 2}}) { EXPECT_RAISES_WITH_MESSAGE_THAT(IndexError, @@ -603,9 +603,10 @@ class TestFieldPath : public FieldPathTestFixture { DoGet(*input, FieldPath())); } - template > + template void TestIndexErrorMessage() const { - auto result = FieldPath({1, 1, 2}).Get(*case_->GetInput()); + using O = OutputType; + auto result = DoGet(*case_->InputAs(), FieldPath({1, 1, 2})); std::string substr = "index out of range. indices=[ 1 1 >2< ] "; if constexpr (std::is_same_v) { substr += "fields: { a: float, a: bool, }"; @@ -619,43 +620,33 @@ class TestFieldPath : public FieldPathTestFixture { void TestGetWithNonStructArray() const { EXPECT_RAISES_WITH_MESSAGE_THAT( NotImplemented, ::testing::HasSubstr("Get child data of non-struct array"), - DoGet(*case_->v1_1_0.Get(), FieldPath({1, 1, 0}))); + DoGet(*case_->v1_1_0.OutputAs(), FieldPath({1, 1, 0}))); } - template > + template void TestGet() const { - const auto& input = case_->GetInput(); - ASSERT_OK_AND_ASSIGN(auto v0, FieldPath({0}).Get(*input)); - ASSERT_OK_AND_ASSIGN(auto v1, FieldPath({1}).Get(*input)); - ASSERT_OK_AND_ASSIGN(auto v1_0, FieldPath({1, 0}).Get(*input)); - ASSERT_OK_AND_ASSIGN(auto v1_1, FieldPath({1, 1}).Get(*input)); - ASSERT_OK_AND_ASSIGN(auto v1_1_0, FieldPath({1, 1, 0}).Get(*input)); - ASSERT_OK_AND_ASSIGN(auto v1_1_1, FieldPath({1, 1, 1}).Get(*input)); - - AssertOutputsEqual(case_->v0.Get(), v0); - AssertOutputsEqual(case_->v1.Get(), v1); - AssertOutputsEqual(case_->v1_0.Get(), v1_0); - AssertOutputsEqual(case_->v1_1.Get(), v1_1); - AssertOutputsEqual(case_->v1_1_0.Get(), v1_1_0); - AssertOutputsEqual(case_->v1_1_1.Get(), v1_1_1); - } - - template > - void TestGetFlattened() const { - const auto& input = case_->GetInput(); - ASSERT_OK_AND_ASSIGN(auto v0, FieldPath({0}).GetFlattened(*input)); - ASSERT_OK_AND_ASSIGN(auto v1, FieldPath({1}).GetFlattened(*input)); - ASSERT_OK_AND_ASSIGN(auto v1_0, FieldPath({1, 0}).GetFlattened(*input)); - ASSERT_OK_AND_ASSIGN(auto v1_1, FieldPath({1, 1}).GetFlattened(*input)); - ASSERT_OK_AND_ASSIGN(auto v1_1_0, FieldPath({1, 1, 0}).GetFlattened(*input)); - ASSERT_OK_AND_ASSIGN(auto v1_1_1, FieldPath({1, 1, 1}).GetFlattened(*input)); - - AssertOutputsEqual(case_->v0.Get(), v0); - AssertOutputsEqual(case_->v1.Get(), v1); - AssertOutputsEqual(case_->v1_0_flat.Get(), v1_0); - AssertOutputsEqual(case_->v1_1_flat.Get(), v1_1); - AssertOutputsEqual(case_->v1_1_0_flat.Get(), v1_1_0); - AssertOutputsEqual(case_->v1_1_1_flat.Get(), v1_1_1); + using O = OutputType; + const auto& input = case_->InputAs(); + ASSERT_OK_AND_ASSIGN(auto v0, DoGet(*input, FieldPath({0}))); + ASSERT_OK_AND_ASSIGN(auto v1, DoGet(*input, FieldPath({1}))); + ASSERT_OK_AND_ASSIGN(auto v1_0, DoGet(*input, FieldPath({1, 0}))); + ASSERT_OK_AND_ASSIGN(auto v1_1, DoGet(*input, FieldPath({1, 1}))); + ASSERT_OK_AND_ASSIGN(auto v1_1_0, DoGet(*input, FieldPath({1, 1, 0}))); + ASSERT_OK_AND_ASSIGN(auto v1_1_1, DoGet(*input, FieldPath({1, 1, 1}))); + + AssertOutputsEqual(case_->v0.OutputAs(), v0); + AssertOutputsEqual(case_->v1.OutputAs(), v1); + if constexpr (Flattened) { + AssertOutputsEqual(case_->v1_0_flat.OutputAs(), v1_0); + AssertOutputsEqual(case_->v1_1_flat.OutputAs(), v1_1); + AssertOutputsEqual(case_->v1_1_0_flat.OutputAs(), v1_1_0); + AssertOutputsEqual(case_->v1_1_1_flat.OutputAs(), v1_1_1); + } else { + AssertOutputsEqual(case_->v1_0.OutputAs(), v1_0); + AssertOutputsEqual(case_->v1_1.OutputAs(), v1_1); + AssertOutputsEqual(case_->v1_1_0.OutputAs(), v1_1_0); + AssertOutputsEqual(case_->v1_1_1.OutputAs(), v1_1_1); + } } }; @@ -695,19 +686,6 @@ TEST_F(TestFieldPath, GetWithNonStructArray) { TestGetWithNonStructArray(); } -TEST_F(TestFieldPath, GetFromSchema) { TestGet(); } -TEST_F(TestFieldPath, GetFromDataType) { TestGet(); } - -TEST_F(TestFieldPath, GetFromArray) { TestGet(); } -TEST_F(TestFieldPath, GetFromChunkedArray) { TestGet(); } -TEST_F(TestFieldPath, GetFromRecordBatch) { TestGet(); } -TEST_F(TestFieldPath, GetFromTable) { TestGet
(); } - -TEST_F(TestFieldPath, GetFlattenedFromArray) { TestGetFlattened(); } -TEST_F(TestFieldPath, GetFlattenedFromChunkedArray) { TestGetFlattened(); } -TEST_F(TestFieldPath, GetFlattenedFromRecordBatch) { TestGetFlattened(); } -TEST_F(TestFieldPath, GetFlattenedFromTable) { TestGetFlattened
(); } - TEST_F(TestFieldPath, Basics) { auto f0 = field("alpha", int32()); auto f1 = field("beta", int32()); @@ -750,6 +728,19 @@ TEST_F(TestFieldPath, GetFromEmptyChunked) { ASSERT_EQ(child->length(), 0); } +TEST_F(TestFieldPath, GetFromSchema) { TestGet(); } +TEST_F(TestFieldPath, GetFromDataType) { TestGet(); } + +TEST_F(TestFieldPath, GetFromArray) { TestGet(); } +TEST_F(TestFieldPath, GetFromChunkedArray) { TestGet(); } +TEST_F(TestFieldPath, GetFromRecordBatch) { TestGet(); } +TEST_F(TestFieldPath, GetFromTable) { TestGet
(); } + +TEST_F(TestFieldPath, GetFlattenedFromArray) { TestGet(); } +TEST_F(TestFieldPath, GetFlattenedFromChunkedArray) { TestGet(); } +TEST_F(TestFieldPath, GetFlattenedFromRecordBatch) { TestGet(); } +TEST_F(TestFieldPath, GetFlattenedFromTable) { TestGet(); } + class TestFieldRef : public FieldPathTestFixture { protected: template @@ -781,7 +772,7 @@ class TestFieldRef : public FieldPathTestFixture { template void TestGet() const { using O = OutputType; - const auto& input = case_->GetInput(); + const auto& input = case_->InputAs(); ASSERT_OK_AND_ASSIGN(auto v0, DoGetOne(*input, FieldRef("a"))); ASSERT_OK_AND_ASSIGN(auto v1, DoGetOne(*input, FieldRef("b"))); ASSERT_OK_AND_ASSIGN(auto v1_0, DoGetOne(*input, FieldRef("b", "a"))); @@ -789,18 +780,18 @@ class TestFieldRef : public FieldPathTestFixture { ASSERT_OK_AND_ASSIGN(auto v1_1_0, DoGetOne(*input, FieldRef("b", "b", 0))); ASSERT_OK_AND_ASSIGN(auto v1_1_1, DoGetOne(*input, FieldRef("b", "b", 1))); - AssertOutputsEqual(case_->v0.Get(), v0); - AssertOutputsEqual(case_->v1.Get(), v1); + AssertOutputsEqual(case_->v0.OutputAs(), v0); + AssertOutputsEqual(case_->v1.OutputAs(), v1); if constexpr (Flattened) { - AssertOutputsEqual(case_->v1_0_flat.Get(), v1_0); - AssertOutputsEqual(case_->v1_1_flat.Get(), v1_1); - AssertOutputsEqual(case_->v1_1_0_flat.Get(), v1_1_0); - AssertOutputsEqual(case_->v1_1_1_flat.Get(), v1_1_1); + AssertOutputsEqual(case_->v1_0_flat.OutputAs(), v1_0); + AssertOutputsEqual(case_->v1_1_flat.OutputAs(), v1_1); + AssertOutputsEqual(case_->v1_1_0_flat.OutputAs(), v1_1_0); + AssertOutputsEqual(case_->v1_1_1_flat.OutputAs(), v1_1_1); } else { - AssertOutputsEqual(case_->v1_0.Get(), v1_0); - AssertOutputsEqual(case_->v1_1.Get(), v1_1); - AssertOutputsEqual(case_->v1_1_0.Get(), v1_1_0); - AssertOutputsEqual(case_->v1_1_1.Get(), v1_1_1); + AssertOutputsEqual(case_->v1_0.OutputAs(), v1_0); + AssertOutputsEqual(case_->v1_1.OutputAs(), v1_1); + AssertOutputsEqual(case_->v1_1_0.OutputAs(), v1_1_0); + AssertOutputsEqual(case_->v1_1_1.OutputAs(), v1_1_1); } // Cases where multiple matches are found From 759523c6975195d28a0705c090661e795c7fb751 Mon Sep 17 00:00:00 2001 From: benibus Date: Wed, 17 May 2023 11:15:18 -0400 Subject: [PATCH 18/21] Return FieldPath::GetFlattened errors in FieldRef --- cpp/src/arrow/type.h | 11 ++++++----- cpp/src/arrow/type_test.cc | 8 +++++--- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 7d262729f3e..9bca82b1600 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1903,11 +1903,12 @@ class ARROW_EXPORT FieldRef : public util::EqualityComparable { return out; } template - std::vector> GetAllFlattened(const T& root, - MemoryPool* pool = NULLPTR) const { + Result>> GetAllFlattened(const T& root, + MemoryPool* pool = NULLPTR) const { std::vector> out; for (const auto& match : FindAll(root)) { - out.push_back(match.GetFlattened(root, pool).ValueOrDie()); + ARROW_ASSIGN_OR_RAISE(auto child, match.GetFlattened(root, pool)); + out.push_back(std::move(child)); } return out; } @@ -1922,7 +1923,7 @@ class ARROW_EXPORT FieldRef : public util::EqualityComparable { template Result> GetOneFlattened(const T& root, MemoryPool* pool = NULLPTR) const { ARROW_ASSIGN_OR_RAISE(auto match, FindOne(root)); - return match.GetFlattened(root, pool).ValueOrDie(); + return match.GetFlattened(root, pool); } /// \brief Get the single child matching this FieldRef. @@ -1942,7 +1943,7 @@ class ARROW_EXPORT FieldRef : public util::EqualityComparable { if (match.empty()) { return static_cast>(NULLPTR); } - return match.GetFlattened(root, pool).ValueOrDie(); + return match.GetFlattened(root, pool); } private: diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index fb74d3ffcdb..467ce27c175 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -765,7 +765,7 @@ class TestFieldRef : public FieldPathTestFixture { if constexpr (Flattened) { return ref.GetAllFlattened(root, pool); } else { - return ref.GetAll(root); + return ToResult(ref.GetAll(root)); } } @@ -795,7 +795,8 @@ class TestFieldRef : public FieldPathTestFixture { } // Cases where multiple matches are found - auto multiple_matches = DoGetAll(*input, FieldRef("b", "b", "a")); + EXPECT_OK_AND_ASSIGN(auto multiple_matches, + DoGetAll(*input, FieldRef("b", "b", "a"))); EXPECT_EQ(multiple_matches.size(), 2); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, ::testing::HasSubstr("Multiple matches for "), @@ -805,7 +806,8 @@ class TestFieldRef : public FieldPathTestFixture { (DoGetOneOrNone(*input, FieldRef("b", "b", "a")))); // Cases where no match is found - auto no_matches = DoGetAll(*input, FieldRef("b", "b", "b")); + EXPECT_OK_AND_ASSIGN(auto no_matches, + DoGetAll(*input, FieldRef("b", "b", "b"))); EXPECT_EQ(no_matches.size(), 0); EXPECT_RAISES_WITH_MESSAGE_THAT( Invalid, ::testing::HasSubstr("No match for "), From e09516e25cfd9d61e30ed9be5ad354de17c636c4 Mon Sep 17 00:00:00 2001 From: benibus Date: Wed, 17 May 2023 13:21:26 -0400 Subject: [PATCH 19/21] Additional validation checks --- cpp/src/arrow/type_test.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index 467ce27c175..cd52b263978 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -557,11 +557,13 @@ class FieldPathTestFixture : public ::testing::Test { template void AssertOutputsEqual(const std::shared_ptr& expected, const std::shared_ptr& actual) const { + ASSERT_OK(actual->ValidateFull()); AssertArraysEqual(*expected, *actual); } template void AssertOutputsEqual(const std::shared_ptr& expected, const std::shared_ptr& actual) const { + ASSERT_OK(actual->ValidateFull()); // We only do this dance due to the way the test inputs/outputs are generated. // Basically, the "expected" output ChunkedArrays don't have an equal num_chunks since // they're reused to create the input Table (which has a distinct chunking per From 163ee85ee2c4a656a8afad803f985e17a278673e Mon Sep 17 00:00:00 2001 From: benibus Date: Wed, 17 May 2023 13:55:08 -0400 Subject: [PATCH 20/21] Relocate FieldPath/Ref tests to new module --- cpp/src/arrow/CMakeLists.txt | 2 +- cpp/src/arrow/field_ref_test.cc | 691 ++++++++++++++++++++++++++++++++ cpp/src/arrow/type_test.cc | 643 ----------------------------- 3 files changed, 692 insertions(+), 644 deletions(-) create mode 100644 cpp/src/arrow/field_ref_test.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index b6eb50099bf..4e6826bc61f 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -796,7 +796,7 @@ set_source_files_properties(public_api_test.cc PROPERTIES SKIP_PRECOMPILE_HEADER SKIP_UNITY_BUILD_INCLUSION ON) add_arrow_test(scalar_test) -add_arrow_test(type_test) +add_arrow_test(type_test SOURCES field_ref_test.cc type_test.cc) add_arrow_test(table_test SOURCES diff --git a/cpp/src/arrow/field_ref_test.cc b/cpp/src/arrow/field_ref_test.cc new file mode 100644 index 00000000000..a3d17469967 --- /dev/null +++ b/cpp/src/arrow/field_ref_test.cc @@ -0,0 +1,691 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include + +#include + +#include "arrow/array.h" +#include "arrow/memory_pool.h" +#include "arrow/table.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" +#include "arrow/testing/util.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/logging.h" + +namespace arrow { + +using testing::ElementsAre; + +using internal::checked_cast; +using internal::checked_pointer_cast; + +struct FieldPathTestCase { + struct OutputValues { + explicit OutputValues(std::vector indices = {}) + : path(FieldPath(std::move(indices))) {} + + template + const auto& OutputAs() const { + if constexpr (std::is_same_v) { + return field; + } else if constexpr (std::is_same_v) { + return array; + } else if constexpr (std::is_same_v) { + return array->data(); + } else if constexpr (std::is_same_v) { + return chunked_array; + } + } + + FieldPath path; + std::shared_ptr field; + std::shared_ptr array; + std::shared_ptr chunked_array; + }; + + static constexpr int kNumColumns = 2; + static constexpr int kNumRows = 100; + static constexpr int kRandomSeed = 0xbeef; + + // Input for the FieldPath::Get functions in multiple forms + std::shared_ptr schema; + std::shared_ptr type; + std::shared_ptr array; + std::shared_ptr record_batch; + std::shared_ptr chunked_array; + std::shared_ptr
table; + + template + const auto& InputAs() const { + if constexpr (std::is_same_v) { + return schema; + } else if constexpr (std::is_same_v) { + return type; + } else if constexpr (std::is_same_v) { + return array; + } else if constexpr (std::is_same_v) { + return array->data(); + } else if constexpr (std::is_same_v) { + return record_batch; + } else if constexpr (std::is_same_v) { + return chunked_array; + } else if constexpr (std::is_same_v) { + return table; + } + } + + // Number of chunks for each column in the input Table + const std::array num_column_chunks = {15, 20}; + // Number of chunks in the input ChunkedArray + const int num_chunks = 15; + + // Expected outputs for each child; + OutputValues v0{{0}}, v1{{1}}; + OutputValues v1_0{{1, 0}}, v1_1{{1, 1}}; + OutputValues v1_1_0{{1, 1, 0}}, v1_1_1{{1, 1, 1}}; + // Expected outputs for nested children with null flattening applied + OutputValues v1_0_flat{{1, 0}}, v1_1_flat{{1, 1}}; + OutputValues v1_1_0_flat{{1, 1, 0}}, v1_1_1_flat{{1, 1, 1}}; + + static const FieldPathTestCase* Instance() { + static const auto maybe_instance = Make(); + return &maybe_instance.ValueOrDie(); + } + + static Result Make() { + // Generate test input based on a single schema. First by creating a StructArray, + // then deriving the other input types (ChunkedArray, RecordBatch, Table, etc) from + // it. We also compute the expected outputs for each child individually (for each + // output type). + FieldPathTestCase out; + random::RandomArrayGenerator gen(kRandomSeed); + + // Define child fields and input schema + + // Intentionally duplicated names for the FieldRef tests + out.v1_1_1.field = field("a", boolean()); + out.v1_1_0.field = field("a", float32()); + + out.v1_1.field = field("b", struct_({out.v1_1_0.field, out.v1_1_1.field})); + out.v1_0.field = field("a", int32()); + out.v1.field = field("b", struct_({out.v1_0.field, out.v1_1.field})); + out.v0.field = field("a", utf8()); + out.schema = arrow::schema({out.v0.field, out.v1.field}); + out.type = struct_(out.schema->fields()); + + // Create null bitmaps for the struct fields independent of its childrens' + // bitmaps. For FieldPath::GetFlattened, parent/child bitmaps should be combined + // - for FieldPath::Get, higher-level nulls are ignored. + auto bitmap1_1 = gen.NullBitmap(kNumRows, 0.15); + auto bitmap1 = gen.NullBitmap(kNumRows, 0.30); + + // Generate raw leaf arrays + out.v1_1_1.array = gen.ArrayOf(out.v1_1_1.field->type(), kNumRows); + out.v1_1_0.array = gen.ArrayOf(out.v1_1_0.field->type(), kNumRows); + out.v1_0.array = gen.ArrayOf(out.v1_0.field->type(), kNumRows); + out.v0.array = gen.ArrayOf(out.v0.field->type(), kNumRows); + // Make struct fields from leaf arrays (we use the custom bitmaps here) + ARROW_ASSIGN_OR_RAISE( + out.v1_1.array, + StructArray::Make({out.v1_1_0.array, out.v1_1_1.array}, + {out.v1_1_0.field, out.v1_1_1.field}, bitmap1_1)); + ARROW_ASSIGN_OR_RAISE(out.v1.array, + StructArray::Make({out.v1_0.array, out.v1_1.array}, + {out.v1_0.field, out.v1_1.field}, bitmap1)); + + // Not used to create the test input, but pre-compute flattened versions of nested + // arrays for comparisons in the GetFlattened tests. + ARROW_ASSIGN_OR_RAISE( + out.v1_0_flat.array, + checked_pointer_cast(out.v1.array)->GetFlattenedField(0)); + ARROW_ASSIGN_OR_RAISE( + out.v1_1_flat.array, + checked_pointer_cast(out.v1.array)->GetFlattenedField(1)); + ARROW_ASSIGN_OR_RAISE( + out.v1_1_0_flat.array, + checked_pointer_cast(out.v1_1_flat.array)->GetFlattenedField(0)); + ARROW_ASSIGN_OR_RAISE( + out.v1_1_1_flat.array, + checked_pointer_cast(out.v1_1_flat.array)->GetFlattenedField(1)); + // Sanity check + ARROW_CHECK(!out.v1_0_flat.array->Equals(out.v1_0.array)); + ARROW_CHECK(!out.v1_1_flat.array->Equals(out.v1_1.array)); + ARROW_CHECK(!out.v1_1_0_flat.array->Equals(out.v1_1_0.array)); + ARROW_CHECK(!out.v1_1_1_flat.array->Equals(out.v1_1_1.array)); + + // Finalize the input Array + ARROW_ASSIGN_OR_RAISE(out.array, StructArray::Make({out.v0.array, out.v1.array}, + {out.v0.field, out.v1.field})); + ARROW_RETURN_NOT_OK(out.array->ValidateFull()); + // Finalize the input RecordBatch + ARROW_ASSIGN_OR_RAISE(out.record_batch, RecordBatch::FromStructArray(out.array)); + ARROW_RETURN_NOT_OK(out.record_batch->ValidateFull()); + // Finalize the input ChunkedArray + out.chunked_array = SliceToChunkedArray(*out.array, out.num_chunks); + ARROW_RETURN_NOT_OK(out.chunked_array->ValidateFull()); + + // For each expected child array, create a chunked equivalent (we use a different + // chunk layout for each top-level column to make the Table test more interesting) + for (OutputValues* v : + {&out.v0, &out.v1, &out.v1_0, &out.v1_1, &out.v1_1_0, &out.v1_1_1, + &out.v1_0_flat, &out.v1_1_flat, &out.v1_1_0_flat, &out.v1_1_1_flat}) { + v->chunked_array = + SliceToChunkedArray(*v->array, out.num_column_chunks[v->path[0]]); + } + // Finalize the input Table + out.table = + Table::Make(out.schema, {out.v0.chunked_array, out.v1.chunked_array}, kNumRows); + ARROW_RETURN_NOT_OK(out.table->ValidateFull()); + + return std::move(out); + } + + private: + static std::shared_ptr SliceToChunkedArray(const Array& array, + int num_chunks) { + ARROW_CHECK(num_chunks > 0 && array.length() >= num_chunks); + ArrayVector chunks; + chunks.reserve(num_chunks); + for (int64_t inc = array.length() / num_chunks, beg = 0, + end = inc + array.length() % num_chunks; + end <= array.length(); beg = end, end += inc) { + chunks.push_back(array.SliceSafe(beg, end - beg).ValueOrDie()); + } + ARROW_CHECK_EQ(static_cast(chunks.size()), num_chunks); + return ChunkedArray::Make(std::move(chunks)).ValueOrDie(); + } +}; + +class FieldPathTestFixture : public ::testing::Test { + public: + FieldPathTestFixture() : case_(FieldPathTestCase::Instance()) {} + + protected: + template + using OutputType = typename FieldRef::GetType::element_type; + + template + void AssertOutputsEqual(const std::shared_ptr& expected, + const std::shared_ptr& actual) const { + AssertFieldEqual(*expected, *actual); + } + template + void AssertOutputsEqual(const std::shared_ptr& expected, + const std::shared_ptr& actual) const { + ASSERT_OK(actual->ValidateFull()); + AssertArraysEqual(*expected, *actual); + } + template + void AssertOutputsEqual(const std::shared_ptr& expected, + const std::shared_ptr& actual) const { + ASSERT_OK(actual->ValidateFull()); + // We only do this dance due to the way the test inputs/outputs are generated. + // Basically, the "expected" output ChunkedArrays don't have an equal num_chunks since + // they're reused to create the input Table (which has a distinct chunking per + // column). However, if the input was the ChunkedArray, the returned outputs should + // always have the same num_chunks as the input. + if constexpr (std::is_same_v) { + EXPECT_EQ(case_->chunked_array->num_chunks(), actual->num_chunks()); + } else { + EXPECT_EQ(expected->num_chunks(), actual->num_chunks()); + } + AssertChunkedEquivalent(*expected, *actual); + } + + const FieldPathTestCase* case_; +}; + +class TestFieldPath : public FieldPathTestFixture { + protected: + template + static auto DoGet(const T& root, const FieldPath& path, MemoryPool* pool = nullptr) { + if constexpr (Flattened) { + return path.GetFlattened(root, pool); + } else { + return path.Get(root); + } + } + + template + void TestGetWithInvalidIndex() const { + const auto& input = case_->InputAs(); + for (const auto& path : + {FieldPath({2, 1, 0}), FieldPath({1, 2, 0}), FieldPath{1, 1, 2}}) { + EXPECT_RAISES_WITH_MESSAGE_THAT(IndexError, + ::testing::HasSubstr("index out of range"), + DoGet(*input, path)); + } + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("empty indices cannot be traversed"), + DoGet(*input, FieldPath())); + } + + template + void TestIndexErrorMessage() const { + using O = OutputType; + auto result = DoGet(*case_->InputAs(), FieldPath({1, 1, 2})); + std::string substr = "index out of range. indices=[ 1 1 >2< ] "; + if constexpr (std::is_same_v) { + substr += "fields: { a: float, a: bool, }"; + } else { + substr += "column types: { float, bool, }"; + } + EXPECT_RAISES_WITH_MESSAGE_THAT(IndexError, ::testing::HasSubstr(substr), result); + } + + template + void TestGetWithNonStructArray() const { + EXPECT_RAISES_WITH_MESSAGE_THAT( + NotImplemented, ::testing::HasSubstr("Get child data of non-struct array"), + DoGet(*case_->v1_1_0.OutputAs(), FieldPath({1, 1, 0}))); + } + + template + void TestGet() const { + using O = OutputType; + const auto& input = case_->InputAs(); + ASSERT_OK_AND_ASSIGN(auto v0, DoGet(*input, FieldPath({0}))); + ASSERT_OK_AND_ASSIGN(auto v1, DoGet(*input, FieldPath({1}))); + ASSERT_OK_AND_ASSIGN(auto v1_0, DoGet(*input, FieldPath({1, 0}))); + ASSERT_OK_AND_ASSIGN(auto v1_1, DoGet(*input, FieldPath({1, 1}))); + ASSERT_OK_AND_ASSIGN(auto v1_1_0, DoGet(*input, FieldPath({1, 1, 0}))); + ASSERT_OK_AND_ASSIGN(auto v1_1_1, DoGet(*input, FieldPath({1, 1, 1}))); + + AssertOutputsEqual(case_->v0.OutputAs(), v0); + AssertOutputsEqual(case_->v1.OutputAs(), v1); + if constexpr (Flattened) { + AssertOutputsEqual(case_->v1_0_flat.OutputAs(), v1_0); + AssertOutputsEqual(case_->v1_1_flat.OutputAs(), v1_1); + AssertOutputsEqual(case_->v1_1_0_flat.OutputAs(), v1_1_0); + AssertOutputsEqual(case_->v1_1_1_flat.OutputAs(), v1_1_1); + } else { + AssertOutputsEqual(case_->v1_0.OutputAs(), v1_0); + AssertOutputsEqual(case_->v1_1.OutputAs(), v1_1); + AssertOutputsEqual(case_->v1_1_0.OutputAs(), v1_1_0); + AssertOutputsEqual(case_->v1_1_1.OutputAs(), v1_1_1); + } + } +}; + +class TestFieldRef : public FieldPathTestFixture { + protected: + template + static auto DoGetOne(const T& root, const FieldRef& ref, MemoryPool* pool = nullptr) { + if constexpr (Flattened) { + return ref.GetOneFlattened(root, pool); + } else { + return ref.GetOne(root); + } + } + template + static auto DoGetOneOrNone(const T& root, const FieldRef& ref, + MemoryPool* pool = nullptr) { + if constexpr (Flattened) { + return ref.GetOneOrNoneFlattened(root, pool); + } else { + return ref.GetOneOrNone(root); + } + } + template + static auto DoGetAll(const T& root, const FieldRef& ref, MemoryPool* pool = nullptr) { + if constexpr (Flattened) { + return ref.GetAllFlattened(root, pool); + } else { + return ToResult(ref.GetAll(root)); + } + } + + template + void TestGet() const { + using O = OutputType; + const auto& input = case_->InputAs(); + ASSERT_OK_AND_ASSIGN(auto v0, DoGetOne(*input, FieldRef("a"))); + ASSERT_OK_AND_ASSIGN(auto v1, DoGetOne(*input, FieldRef("b"))); + ASSERT_OK_AND_ASSIGN(auto v1_0, DoGetOne(*input, FieldRef("b", "a"))); + ASSERT_OK_AND_ASSIGN(auto v1_1, DoGetOne(*input, FieldRef("b", "b"))); + ASSERT_OK_AND_ASSIGN(auto v1_1_0, DoGetOne(*input, FieldRef("b", "b", 0))); + ASSERT_OK_AND_ASSIGN(auto v1_1_1, DoGetOne(*input, FieldRef("b", "b", 1))); + + AssertOutputsEqual(case_->v0.OutputAs(), v0); + AssertOutputsEqual(case_->v1.OutputAs(), v1); + if constexpr (Flattened) { + AssertOutputsEqual(case_->v1_0_flat.OutputAs(), v1_0); + AssertOutputsEqual(case_->v1_1_flat.OutputAs(), v1_1); + AssertOutputsEqual(case_->v1_1_0_flat.OutputAs(), v1_1_0); + AssertOutputsEqual(case_->v1_1_1_flat.OutputAs(), v1_1_1); + } else { + AssertOutputsEqual(case_->v1_0.OutputAs(), v1_0); + AssertOutputsEqual(case_->v1_1.OutputAs(), v1_1); + AssertOutputsEqual(case_->v1_1_0.OutputAs(), v1_1_0); + AssertOutputsEqual(case_->v1_1_1.OutputAs(), v1_1_1); + } + + // Cases where multiple matches are found + EXPECT_OK_AND_ASSIGN(auto multiple_matches, + DoGetAll(*input, FieldRef("b", "b", "a"))); + EXPECT_EQ(multiple_matches.size(), 2); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("Multiple matches for "), + (DoGetOne(*input, FieldRef("b", "b", "a")))); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("Multiple matches for "), + (DoGetOneOrNone(*input, FieldRef("b", "b", "a")))); + + // Cases where no match is found + EXPECT_OK_AND_ASSIGN(auto no_matches, + DoGetAll(*input, FieldRef("b", "b", "b"))); + EXPECT_EQ(no_matches.size(), 0); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::HasSubstr("No match for "), + (DoGetOne(*input, FieldRef("b", "b", "b")))); + ASSERT_OK_AND_EQ(nullptr, + (DoGetOneOrNone(*input, FieldRef("b", "b", "b")))); + } +}; + +// ---------------------------------------------------------------------- +// FieldPath + +TEST_F(TestFieldPath, Basics) { + auto f0 = field("alpha", int32()); + auto f1 = field("beta", int32()); + auto f2 = field("alpha", int32()); + auto f3 = field("beta", int32()); + Schema s({f0, f1, f2, f3}); + + // retrieving a field with single-element FieldPath is equivalent to Schema::field + for (int index = 0; index < s.num_fields(); ++index) { + ASSERT_OK_AND_EQ(s.field(index), FieldPath({index}).Get(s)); + } +} + +TEST_F(TestFieldPath, GetFromEmptyChunked) { + FieldVector fields = { + field("i", int32()), + field("s", struct_({field("b", boolean()), field("f", float32())}))}; + std::shared_ptr child; + + // Empty ChunkedArray with no chunks + ChunkedArray chunked_array({}, struct_(fields)); + ASSERT_OK(chunked_array.ValidateFull()); + ASSERT_EQ(chunked_array.num_chunks(), 0); + ASSERT_OK_AND_ASSIGN(child, FieldPath({1, 1}).Get(chunked_array)); + AssertTypeEqual(float32(), child->type()); + ASSERT_EQ(child->length(), 0); + + // Empty Table with no column chunks + ChunkedArrayVector table_columns; + for (const auto& f : fields) { + table_columns.push_back(std::make_shared(ArrayVector{}, f->type())); + } + auto table = Table::Make(schema(fields), table_columns, 0); + ASSERT_OK(table->ValidateFull()); + for (const auto& column : table->columns()) { + ASSERT_EQ(column->num_chunks(), 0); + } + ASSERT_OK_AND_ASSIGN(child, FieldPath({1, 1}).Get(*table)); + AssertTypeEqual(float32(), child->type()); + ASSERT_EQ(child->length(), 0); +} + +TEST_F(TestFieldPath, GetWithInvalidIndex) { + TestGetWithInvalidIndex(); + TestGetWithInvalidIndex(); + TestGetWithInvalidIndex(); + TestGetWithInvalidIndex(); + TestGetWithInvalidIndex(); + TestGetWithInvalidIndex(); + TestGetWithInvalidIndex
(); + // With flattening + TestGetWithInvalidIndex(); + TestGetWithInvalidIndex(); + TestGetWithInvalidIndex(); + TestGetWithInvalidIndex(); + TestGetWithInvalidIndex(); +} + +TEST_F(TestFieldPath, IndexErrorMessage) { + TestIndexErrorMessage(); + TestIndexErrorMessage(); + TestIndexErrorMessage(); + TestIndexErrorMessage(); + TestIndexErrorMessage(); + TestIndexErrorMessage(); + TestIndexErrorMessage
(); +} + +TEST_F(TestFieldPath, GetWithNonStructArray) { + TestGetWithNonStructArray(); + TestGetWithNonStructArray(); + TestGetWithNonStructArray(); + // With flattening + TestGetWithNonStructArray(); + TestGetWithNonStructArray(); + TestGetWithNonStructArray(); +} + +TEST_F(TestFieldPath, GetFromSchema) { TestGet(); } +TEST_F(TestFieldPath, GetFromDataType) { TestGet(); } + +TEST_F(TestFieldPath, GetFromArray) { TestGet(); } +TEST_F(TestFieldPath, GetFromChunkedArray) { TestGet(); } +TEST_F(TestFieldPath, GetFromRecordBatch) { TestGet(); } +TEST_F(TestFieldPath, GetFromTable) { TestGet
(); } + +TEST_F(TestFieldPath, GetFlattenedFromArray) { TestGet(); } +TEST_F(TestFieldPath, GetFlattenedFromChunkedArray) { TestGet(); } +TEST_F(TestFieldPath, GetFlattenedFromRecordBatch) { TestGet(); } +TEST_F(TestFieldPath, GetFlattenedFromTable) { TestGet(); } + +// ---------------------------------------------------------------------- +// FieldRef + +TEST_F(TestFieldRef, Basics) { + auto f0 = field("alpha", int32()); + auto f1 = field("beta", int32()); + auto f2 = field("alpha", int32()); + auto f3 = field("beta", int32()); + Schema s({f0, f1, f2, f3}); + + // lookup by index returns Indices{index} + for (int index = 0; index < s.num_fields(); ++index) { + EXPECT_THAT(FieldRef(index).FindAll(s), ElementsAre(FieldPath{index})); + } + // out of range index results in a failure to match + EXPECT_THAT(FieldRef(s.num_fields() * 2).FindAll(s), ElementsAre()); + + // lookup by name returns the Indices of both matching fields + EXPECT_THAT(FieldRef("alpha").FindAll(s), ElementsAre(FieldPath{0}, FieldPath{2})); + EXPECT_THAT(FieldRef("beta").FindAll(s), ElementsAre(FieldPath{1}, FieldPath{3})); +} + +TEST_F(TestFieldRef, FindAllForTable) { + constexpr int kNumRows = 100; + auto f0 = field("alpha", int32()); + auto f1 = field("beta", int32()); + auto f2 = field("alpha", int32()); + auto f3 = field("beta", int32()); + auto schema = arrow::schema({f0, f1, f2, f3}); + + arrow::random::RandomArrayGenerator gen_{42}; + auto a0 = gen_.ArrayOf(int32(), kNumRows); + auto a1 = gen_.ArrayOf(int32(), kNumRows); + auto a2 = gen_.ArrayOf(int32(), kNumRows); + auto a3 = gen_.ArrayOf(int32(), kNumRows); + + auto table_ptr = Table::Make(schema, {a0, a1, a2, a3}); + ASSERT_OK(table_ptr->ValidateFull()); + + // lookup by index returns Indices{index} + auto schema_num_fields = table_ptr->schema()->num_fields(); + for (int index = 0; index < schema_num_fields; ++index) { + EXPECT_THAT(FieldRef(index).FindAll(*table_ptr), ElementsAre(FieldPath{index})); + } + // out of range index results in a failure to match + EXPECT_THAT(FieldRef(schema_num_fields * 2).FindAll(*table_ptr), ElementsAre()); + + //// lookup by name returns the Indices of both matching fields + EXPECT_THAT(FieldRef("alpha").FindAll(*table_ptr), + ElementsAre(FieldPath{0}, FieldPath{2})); + EXPECT_THAT(FieldRef("beta").FindAll(*table_ptr), + ElementsAre(FieldPath{1}, FieldPath{3})); +} + +TEST_F(TestFieldRef, FindAllForRecordBatch) { + constexpr int kNumRows = 100; + auto f0 = field("alpha", int32()); + auto f1 = field("beta", int32()); + auto f2 = field("alpha", int32()); + auto f3 = field("beta", int32()); + auto schema = arrow::schema({f0, f1, f2, f3}); + + arrow::random::RandomArrayGenerator gen_{42}; + auto a0 = gen_.ArrayOf(int32(), kNumRows); + auto a1 = gen_.ArrayOf(int32(), kNumRows); + auto a2 = gen_.ArrayOf(int32(), kNumRows); + auto a3 = gen_.ArrayOf(int32(), kNumRows); + + auto record_batch_ptr = RecordBatch::Make(schema, kNumRows, {a0, a1, a2, a3}); + ASSERT_OK(record_batch_ptr->ValidateFull()); + + // lookup by index returns Indices{index} + auto schema_num_fields = record_batch_ptr->schema()->num_fields(); + for (int index = 0; index < schema_num_fields; ++index) { + EXPECT_THAT(FieldRef(index).FindAll(*record_batch_ptr), + ElementsAre(FieldPath{index})); + } + // out of range index results in a failure to match + EXPECT_THAT(FieldRef(schema_num_fields * 2).FindAll(*record_batch_ptr), ElementsAre()); + + //// lookup by name returns the Indices of both matching fields + EXPECT_THAT(FieldRef("alpha").FindAll(*record_batch_ptr), + ElementsAre(FieldPath{0}, FieldPath{2})); + EXPECT_THAT(FieldRef("beta").FindAll(*record_batch_ptr), + ElementsAre(FieldPath{1}, FieldPath{3})); +} + +TEST_F(TestFieldRef, FromDotPath) { + ASSERT_OK_AND_EQ(FieldRef("alpha"), FieldRef::FromDotPath(R"(.alpha)")); + + ASSERT_OK_AND_EQ(FieldRef("", ""), FieldRef::FromDotPath(R"(..)")); + + ASSERT_OK_AND_EQ(FieldRef(2), FieldRef::FromDotPath(R"([2])")); + + ASSERT_OK_AND_EQ(FieldRef("beta", 3), FieldRef::FromDotPath(R"(.beta[3])")); + + ASSERT_OK_AND_EQ(FieldRef(5, "gamma", "delta", 7), + FieldRef::FromDotPath(R"([5].gamma.delta[7])")); + + ASSERT_OK_AND_EQ(FieldRef("hello world"), FieldRef::FromDotPath(R"(.hello world)")); + + ASSERT_OK_AND_EQ(FieldRef(R"([y]\tho.\)"), FieldRef::FromDotPath(R"(.\[y\]\\tho\.\)")); + + ASSERT_OK_AND_EQ(FieldRef(), FieldRef::FromDotPath(R"()")); + + ASSERT_RAISES(Invalid, FieldRef::FromDotPath(R"(alpha)")); + ASSERT_RAISES(Invalid, FieldRef::FromDotPath(R"([134234)")); + ASSERT_RAISES(Invalid, FieldRef::FromDotPath(R"([1stuf])")); +} + +TEST_F(TestFieldRef, DotPathRoundTrip) { + auto check_roundtrip = [](const FieldRef& ref) { + auto dot_path = ref.ToDotPath(); + ASSERT_OK_AND_EQ(ref, FieldRef::FromDotPath(dot_path)); + }; + + check_roundtrip(FieldRef()); + check_roundtrip(FieldRef("foo")); + check_roundtrip(FieldRef("foo", 1, "bar", 2, 3)); + check_roundtrip(FieldRef(1, 2, 3)); + check_roundtrip(FieldRef("foo", 1, FieldRef("bar", 2, 3), FieldRef())); +} + +TEST_F(TestFieldRef, Nested) { + auto f0 = field("alpha", int32()); + auto f1_0 = field("alpha", int32()); + auto f1 = field("beta", struct_({f1_0})); + auto f2_0 = field("alpha", int32()); + auto f2_1_0 = field("alpha", int32()); + auto f2_1_1 = field("alpha", int32()); + auto f2_1 = field("gamma", struct_({f2_1_0, f2_1_1})); + auto f2 = field("beta", struct_({f2_0, f2_1})); + Schema s({f0, f1, f2}); + + EXPECT_THAT(FieldRef("beta", "alpha").FindAll(s), + ElementsAre(FieldPath{1, 0}, FieldPath{2, 0})); + EXPECT_THAT(FieldRef("beta", "gamma", "alpha").FindAll(s), + ElementsAre(FieldPath{2, 1, 0}, FieldPath{2, 1, 1})); +} + +TEST_F(TestFieldRef, Flatten) { + FieldRef ref; + + auto assert_name = [](const FieldRef& ref, const std::string& expected) { + ASSERT_TRUE(ref.IsName()); + ASSERT_EQ(*ref.name(), expected); + }; + + auto assert_path = [](const FieldRef& ref, const std::vector& expected) { + ASSERT_TRUE(ref.IsFieldPath()); + ASSERT_EQ(ref.field_path()->indices(), expected); + }; + + auto assert_nested = [](const FieldRef& ref, const std::vector& expected) { + ASSERT_TRUE(ref.IsNested()); + ASSERT_EQ(*ref.nested_refs(), expected); + }; + + assert_path(FieldRef(), {}); + assert_path(FieldRef(1, 2, 3), {1, 2, 3}); + // If all leaves are field paths, they are fully flattened + assert_path(FieldRef(1, FieldRef(2, 3)), {1, 2, 3}); + assert_path(FieldRef(1, FieldRef(2, 3), FieldRef(), FieldRef(FieldRef(4), FieldRef(5))), + {1, 2, 3, 4, 5}); + assert_path(FieldRef(FieldRef(), FieldRef(FieldRef(), FieldRef())), {}); + + assert_name(FieldRef("foo"), "foo"); + + // Nested empty field refs are optimized away + assert_nested(FieldRef("foo", 1, FieldRef(), FieldRef(FieldRef(), "bar")), + {FieldRef("foo"), FieldRef(1), FieldRef("bar")}); + // For now, subsequences of indices are not concatenated + assert_nested(FieldRef("foo", FieldRef("bar"), FieldRef(1, 2), FieldRef(3)), + {FieldRef("foo"), FieldRef("bar"), FieldRef(1, 2), FieldRef(3)}); +} + +TEST_F(TestFieldRef, GetFromSchema) { TestGet(); } +TEST_F(TestFieldRef, GetFromDataType) { TestGet(); } + +TEST_F(TestFieldRef, GetFromArray) { TestGet(); } +TEST_F(TestFieldRef, GetFromChunkedArray) { TestGet(); } +TEST_F(TestFieldRef, GetFromRecordBatch) { TestGet(); } +TEST_F(TestFieldRef, GetFromTable) { TestGet
(); } + +TEST_F(TestFieldRef, GetFlattenedFromArray) { TestGet(); } +TEST_F(TestFieldRef, GetFlattenedFromChunkedArray) { TestGet(); } +TEST_F(TestFieldRef, GetFlattenedFromRecordBatch) { TestGet(); } +TEST_F(TestFieldRef, GetFlattenedFromTable) { TestGet(); } + +} // namespace arrow \ No newline at end of file diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index cd52b263978..3c83da9f2e6 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -37,12 +37,9 @@ #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" #include "arrow/util/key_value_metadata.h" -#include "arrow/util/logging.h" namespace arrow { -using testing::ElementsAre; - using internal::checked_cast; using internal::checked_pointer_cast; @@ -365,646 +362,6 @@ TEST(TestField, TestMerge) { } } -struct FieldPathTestCase { - struct OutputValues { - explicit OutputValues(std::vector indices = {}) - : path(FieldPath(std::move(indices))) {} - - template - const auto& OutputAs() const { - if constexpr (std::is_same_v) { - return field; - } else if constexpr (std::is_same_v) { - return array; - } else if constexpr (std::is_same_v) { - return array->data(); - } else if constexpr (std::is_same_v) { - return chunked_array; - } - } - - FieldPath path; - std::shared_ptr field; - std::shared_ptr array; - std::shared_ptr chunked_array; - }; - - static constexpr int kNumColumns = 2; - static constexpr int kNumRows = 100; - static constexpr int kRandomSeed = 0xbeef; - - // Input for the FieldPath::Get functions in multiple forms - std::shared_ptr schema; - std::shared_ptr type; - std::shared_ptr array; - std::shared_ptr record_batch; - std::shared_ptr chunked_array; - std::shared_ptr
table; - - template - const auto& InputAs() const { - if constexpr (std::is_same_v) { - return schema; - } else if constexpr (std::is_same_v) { - return type; - } else if constexpr (std::is_same_v) { - return array; - } else if constexpr (std::is_same_v) { - return array->data(); - } else if constexpr (std::is_same_v) { - return record_batch; - } else if constexpr (std::is_same_v) { - return chunked_array; - } else if constexpr (std::is_same_v) { - return table; - } - } - - // Number of chunks for each column in the input Table - const std::array num_column_chunks = {15, 20}; - // Number of chunks in the input ChunkedArray - const int num_chunks = 15; - - // Expected outputs for each child; - OutputValues v0{{0}}, v1{{1}}; - OutputValues v1_0{{1, 0}}, v1_1{{1, 1}}; - OutputValues v1_1_0{{1, 1, 0}}, v1_1_1{{1, 1, 1}}; - // Expected outputs for nested children with null flattening applied - OutputValues v1_0_flat{{1, 0}}, v1_1_flat{{1, 1}}; - OutputValues v1_1_0_flat{{1, 1, 0}}, v1_1_1_flat{{1, 1, 1}}; - - static const FieldPathTestCase* Instance() { - static const auto maybe_instance = Make(); - return &maybe_instance.ValueOrDie(); - } - - static Result Make() { - // Generate test input based on a single schema. First by creating a StructArray, - // then deriving the other input types (ChunkedArray, RecordBatch, Table, etc) from - // it. We also compute the expected outputs for each child individually (for each - // output type). - FieldPathTestCase out; - random::RandomArrayGenerator gen(kRandomSeed); - - // Define child fields and input schema - - // Intentionally duplicated names for the FieldRef tests - out.v1_1_1.field = field("a", boolean()); - out.v1_1_0.field = field("a", float32()); - - out.v1_1.field = field("b", struct_({out.v1_1_0.field, out.v1_1_1.field})); - out.v1_0.field = field("a", int32()); - out.v1.field = field("b", struct_({out.v1_0.field, out.v1_1.field})); - out.v0.field = field("a", utf8()); - out.schema = arrow::schema({out.v0.field, out.v1.field}); - out.type = struct_(out.schema->fields()); - - // Create null bitmaps for the struct fields independent of its childrens' - // bitmaps. For FieldPath::GetFlattened, parent/child bitmaps should be combined - // - for FieldPath::Get, higher-level nulls are ignored. - auto bitmap1_1 = gen.NullBitmap(kNumRows, 0.15); - auto bitmap1 = gen.NullBitmap(kNumRows, 0.30); - - // Generate raw leaf arrays - out.v1_1_1.array = gen.ArrayOf(out.v1_1_1.field->type(), kNumRows); - out.v1_1_0.array = gen.ArrayOf(out.v1_1_0.field->type(), kNumRows); - out.v1_0.array = gen.ArrayOf(out.v1_0.field->type(), kNumRows); - out.v0.array = gen.ArrayOf(out.v0.field->type(), kNumRows); - // Make struct fields from leaf arrays (we use the custom bitmaps here) - ARROW_ASSIGN_OR_RAISE( - out.v1_1.array, - StructArray::Make({out.v1_1_0.array, out.v1_1_1.array}, - {out.v1_1_0.field, out.v1_1_1.field}, bitmap1_1)); - ARROW_ASSIGN_OR_RAISE(out.v1.array, - StructArray::Make({out.v1_0.array, out.v1_1.array}, - {out.v1_0.field, out.v1_1.field}, bitmap1)); - - // Not used to create the test input, but pre-compute flattened versions of nested - // arrays for comparisons in the GetFlattened tests. - ARROW_ASSIGN_OR_RAISE( - out.v1_0_flat.array, - checked_pointer_cast(out.v1.array)->GetFlattenedField(0)); - ARROW_ASSIGN_OR_RAISE( - out.v1_1_flat.array, - checked_pointer_cast(out.v1.array)->GetFlattenedField(1)); - ARROW_ASSIGN_OR_RAISE( - out.v1_1_0_flat.array, - checked_pointer_cast(out.v1_1_flat.array)->GetFlattenedField(0)); - ARROW_ASSIGN_OR_RAISE( - out.v1_1_1_flat.array, - checked_pointer_cast(out.v1_1_flat.array)->GetFlattenedField(1)); - // Sanity check - ARROW_CHECK(!out.v1_0_flat.array->Equals(out.v1_0.array)); - ARROW_CHECK(!out.v1_1_flat.array->Equals(out.v1_1.array)); - ARROW_CHECK(!out.v1_1_0_flat.array->Equals(out.v1_1_0.array)); - ARROW_CHECK(!out.v1_1_1_flat.array->Equals(out.v1_1_1.array)); - - // Finalize the input Array - ARROW_ASSIGN_OR_RAISE(out.array, StructArray::Make({out.v0.array, out.v1.array}, - {out.v0.field, out.v1.field})); - ARROW_RETURN_NOT_OK(out.array->ValidateFull()); - // Finalize the input RecordBatch - ARROW_ASSIGN_OR_RAISE(out.record_batch, RecordBatch::FromStructArray(out.array)); - ARROW_RETURN_NOT_OK(out.record_batch->ValidateFull()); - // Finalize the input ChunkedArray - out.chunked_array = SliceToChunkedArray(*out.array, out.num_chunks); - ARROW_RETURN_NOT_OK(out.chunked_array->ValidateFull()); - - // For each expected child array, create a chunked equivalent (we use a different - // chunk layout for each top-level column to make the Table test more interesting) - for (OutputValues* v : - {&out.v0, &out.v1, &out.v1_0, &out.v1_1, &out.v1_1_0, &out.v1_1_1, - &out.v1_0_flat, &out.v1_1_flat, &out.v1_1_0_flat, &out.v1_1_1_flat}) { - v->chunked_array = - SliceToChunkedArray(*v->array, out.num_column_chunks[v->path[0]]); - } - // Finalize the input Table - out.table = - Table::Make(out.schema, {out.v0.chunked_array, out.v1.chunked_array}, kNumRows); - ARROW_RETURN_NOT_OK(out.table->ValidateFull()); - - return std::move(out); - } - - static std::shared_ptr SliceToChunkedArray(const Array& array, - int num_chunks) { - ARROW_CHECK(num_chunks > 0 && array.length() >= num_chunks); - ArrayVector chunks; - chunks.reserve(num_chunks); - for (int64_t inc = array.length() / num_chunks, beg = 0, - end = inc + array.length() % num_chunks; - end <= array.length(); beg = end, end += inc) { - chunks.push_back(array.SliceSafe(beg, end - beg).ValueOrDie()); - } - ARROW_CHECK_EQ(static_cast(chunks.size()), num_chunks); - return ChunkedArray::Make(std::move(chunks)).ValueOrDie(); - } -}; - -class FieldPathTestFixture : public ::testing::Test { - public: - FieldPathTestFixture() : case_(FieldPathTestCase::Instance()) {} - - protected: - template - using OutputType = typename FieldRef::GetType::element_type; - - template - void AssertOutputsEqual(const std::shared_ptr& expected, - const std::shared_ptr& actual) const { - AssertFieldEqual(*expected, *actual); - } - template - void AssertOutputsEqual(const std::shared_ptr& expected, - const std::shared_ptr& actual) const { - ASSERT_OK(actual->ValidateFull()); - AssertArraysEqual(*expected, *actual); - } - template - void AssertOutputsEqual(const std::shared_ptr& expected, - const std::shared_ptr& actual) const { - ASSERT_OK(actual->ValidateFull()); - // We only do this dance due to the way the test inputs/outputs are generated. - // Basically, the "expected" output ChunkedArrays don't have an equal num_chunks since - // they're reused to create the input Table (which has a distinct chunking per - // column). However, if the input was the ChunkedArray, the returned outputs should - // always have the same num_chunks as the input. - if constexpr (std::is_same_v) { - EXPECT_EQ(case_->chunked_array->num_chunks(), actual->num_chunks()); - } else { - EXPECT_EQ(expected->num_chunks(), actual->num_chunks()); - } - AssertChunkedEquivalent(*expected, *actual); - } - - const FieldPathTestCase* case_; -}; - -class TestFieldPath : public FieldPathTestFixture { - protected: - template - static auto DoGet(const T& root, const FieldPath& path, MemoryPool* pool = nullptr) { - if constexpr (Flattened) { - return path.GetFlattened(root, pool); - } else { - return path.Get(root); - } - } - - template - void TestGetWithInvalidIndex() const { - const auto& input = case_->InputAs(); - for (const auto& path : - {FieldPath({2, 1, 0}), FieldPath({1, 2, 0}), FieldPath{1, 1, 2}}) { - EXPECT_RAISES_WITH_MESSAGE_THAT(IndexError, - ::testing::HasSubstr("index out of range"), - DoGet(*input, path)); - } - EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, ::testing::HasSubstr("empty indices cannot be traversed"), - DoGet(*input, FieldPath())); - } - - template - void TestIndexErrorMessage() const { - using O = OutputType; - auto result = DoGet(*case_->InputAs(), FieldPath({1, 1, 2})); - std::string substr = "index out of range. indices=[ 1 1 >2< ] "; - if constexpr (std::is_same_v) { - substr += "fields: { a: float, a: bool, }"; - } else { - substr += "column types: { float, bool, }"; - } - EXPECT_RAISES_WITH_MESSAGE_THAT(IndexError, ::testing::HasSubstr(substr), result); - } - - template - void TestGetWithNonStructArray() const { - EXPECT_RAISES_WITH_MESSAGE_THAT( - NotImplemented, ::testing::HasSubstr("Get child data of non-struct array"), - DoGet(*case_->v1_1_0.OutputAs(), FieldPath({1, 1, 0}))); - } - - template - void TestGet() const { - using O = OutputType; - const auto& input = case_->InputAs(); - ASSERT_OK_AND_ASSIGN(auto v0, DoGet(*input, FieldPath({0}))); - ASSERT_OK_AND_ASSIGN(auto v1, DoGet(*input, FieldPath({1}))); - ASSERT_OK_AND_ASSIGN(auto v1_0, DoGet(*input, FieldPath({1, 0}))); - ASSERT_OK_AND_ASSIGN(auto v1_1, DoGet(*input, FieldPath({1, 1}))); - ASSERT_OK_AND_ASSIGN(auto v1_1_0, DoGet(*input, FieldPath({1, 1, 0}))); - ASSERT_OK_AND_ASSIGN(auto v1_1_1, DoGet(*input, FieldPath({1, 1, 1}))); - - AssertOutputsEqual(case_->v0.OutputAs(), v0); - AssertOutputsEqual(case_->v1.OutputAs(), v1); - if constexpr (Flattened) { - AssertOutputsEqual(case_->v1_0_flat.OutputAs(), v1_0); - AssertOutputsEqual(case_->v1_1_flat.OutputAs(), v1_1); - AssertOutputsEqual(case_->v1_1_0_flat.OutputAs(), v1_1_0); - AssertOutputsEqual(case_->v1_1_1_flat.OutputAs(), v1_1_1); - } else { - AssertOutputsEqual(case_->v1_0.OutputAs(), v1_0); - AssertOutputsEqual(case_->v1_1.OutputAs(), v1_1); - AssertOutputsEqual(case_->v1_1_0.OutputAs(), v1_1_0); - AssertOutputsEqual(case_->v1_1_1.OutputAs(), v1_1_1); - } - } -}; - -TEST_F(TestFieldPath, GetWithInvalidIndex) { - TestGetWithInvalidIndex(); - TestGetWithInvalidIndex(); - TestGetWithInvalidIndex(); - TestGetWithInvalidIndex(); - TestGetWithInvalidIndex(); - TestGetWithInvalidIndex(); - TestGetWithInvalidIndex
(); - // With flattening - TestGetWithInvalidIndex(); - TestGetWithInvalidIndex(); - TestGetWithInvalidIndex(); - TestGetWithInvalidIndex(); - TestGetWithInvalidIndex(); -} - -TEST_F(TestFieldPath, IndexErrorMessage) { - TestIndexErrorMessage(); - TestIndexErrorMessage(); - TestIndexErrorMessage(); - TestIndexErrorMessage(); - TestIndexErrorMessage(); - TestIndexErrorMessage(); - TestIndexErrorMessage
(); -} - -TEST_F(TestFieldPath, GetWithNonStructArray) { - TestGetWithNonStructArray(); - TestGetWithNonStructArray(); - TestGetWithNonStructArray(); - // With flattening - TestGetWithNonStructArray(); - TestGetWithNonStructArray(); - TestGetWithNonStructArray(); -} - -TEST_F(TestFieldPath, Basics) { - auto f0 = field("alpha", int32()); - auto f1 = field("beta", int32()); - auto f2 = field("alpha", int32()); - auto f3 = field("beta", int32()); - Schema s({f0, f1, f2, f3}); - - // retrieving a field with single-element FieldPath is equivalent to Schema::field - for (int index = 0; index < s.num_fields(); ++index) { - ASSERT_OK_AND_EQ(s.field(index), FieldPath({index}).Get(s)); - } -} - -TEST_F(TestFieldPath, GetFromEmptyChunked) { - FieldVector fields = { - field("i", int32()), - field("s", struct_({field("b", boolean()), field("f", float32())}))}; - std::shared_ptr child; - - // Empty ChunkedArray with no chunks - ChunkedArray chunked_array({}, struct_(fields)); - ASSERT_OK(chunked_array.ValidateFull()); - ASSERT_EQ(chunked_array.num_chunks(), 0); - ASSERT_OK_AND_ASSIGN(child, FieldPath({1, 1}).Get(chunked_array)); - AssertTypeEqual(float32(), child->type()); - ASSERT_EQ(child->length(), 0); - - // Empty Table with no column chunks - ChunkedArrayVector table_columns; - for (const auto& f : fields) { - table_columns.push_back(std::make_shared(ArrayVector{}, f->type())); - } - auto table = Table::Make(schema(fields), table_columns, 0); - ASSERT_OK(table->ValidateFull()); - for (const auto& column : table->columns()) { - ASSERT_EQ(column->num_chunks(), 0); - } - ASSERT_OK_AND_ASSIGN(child, FieldPath({1, 1}).Get(*table)); - AssertTypeEqual(float32(), child->type()); - ASSERT_EQ(child->length(), 0); -} - -TEST_F(TestFieldPath, GetFromSchema) { TestGet(); } -TEST_F(TestFieldPath, GetFromDataType) { TestGet(); } - -TEST_F(TestFieldPath, GetFromArray) { TestGet(); } -TEST_F(TestFieldPath, GetFromChunkedArray) { TestGet(); } -TEST_F(TestFieldPath, GetFromRecordBatch) { TestGet(); } -TEST_F(TestFieldPath, GetFromTable) { TestGet
(); } - -TEST_F(TestFieldPath, GetFlattenedFromArray) { TestGet(); } -TEST_F(TestFieldPath, GetFlattenedFromChunkedArray) { TestGet(); } -TEST_F(TestFieldPath, GetFlattenedFromRecordBatch) { TestGet(); } -TEST_F(TestFieldPath, GetFlattenedFromTable) { TestGet(); } - -class TestFieldRef : public FieldPathTestFixture { - protected: - template - static auto DoGetOne(const T& root, const FieldRef& ref, MemoryPool* pool = nullptr) { - if constexpr (Flattened) { - return ref.GetOneFlattened(root, pool); - } else { - return ref.GetOne(root); - } - } - template - static auto DoGetOneOrNone(const T& root, const FieldRef& ref, - MemoryPool* pool = nullptr) { - if constexpr (Flattened) { - return ref.GetOneOrNoneFlattened(root, pool); - } else { - return ref.GetOneOrNone(root); - } - } - template - static auto DoGetAll(const T& root, const FieldRef& ref, MemoryPool* pool = nullptr) { - if constexpr (Flattened) { - return ref.GetAllFlattened(root, pool); - } else { - return ToResult(ref.GetAll(root)); - } - } - - template - void TestGet() const { - using O = OutputType; - const auto& input = case_->InputAs(); - ASSERT_OK_AND_ASSIGN(auto v0, DoGetOne(*input, FieldRef("a"))); - ASSERT_OK_AND_ASSIGN(auto v1, DoGetOne(*input, FieldRef("b"))); - ASSERT_OK_AND_ASSIGN(auto v1_0, DoGetOne(*input, FieldRef("b", "a"))); - ASSERT_OK_AND_ASSIGN(auto v1_1, DoGetOne(*input, FieldRef("b", "b"))); - ASSERT_OK_AND_ASSIGN(auto v1_1_0, DoGetOne(*input, FieldRef("b", "b", 0))); - ASSERT_OK_AND_ASSIGN(auto v1_1_1, DoGetOne(*input, FieldRef("b", "b", 1))); - - AssertOutputsEqual(case_->v0.OutputAs(), v0); - AssertOutputsEqual(case_->v1.OutputAs(), v1); - if constexpr (Flattened) { - AssertOutputsEqual(case_->v1_0_flat.OutputAs(), v1_0); - AssertOutputsEqual(case_->v1_1_flat.OutputAs(), v1_1); - AssertOutputsEqual(case_->v1_1_0_flat.OutputAs(), v1_1_0); - AssertOutputsEqual(case_->v1_1_1_flat.OutputAs(), v1_1_1); - } else { - AssertOutputsEqual(case_->v1_0.OutputAs(), v1_0); - AssertOutputsEqual(case_->v1_1.OutputAs(), v1_1); - AssertOutputsEqual(case_->v1_1_0.OutputAs(), v1_1_0); - AssertOutputsEqual(case_->v1_1_1.OutputAs(), v1_1_1); - } - - // Cases where multiple matches are found - EXPECT_OK_AND_ASSIGN(auto multiple_matches, - DoGetAll(*input, FieldRef("b", "b", "a"))); - EXPECT_EQ(multiple_matches.size(), 2); - EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, ::testing::HasSubstr("Multiple matches for "), - (DoGetOne(*input, FieldRef("b", "b", "a")))); - EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, ::testing::HasSubstr("Multiple matches for "), - (DoGetOneOrNone(*input, FieldRef("b", "b", "a")))); - - // Cases where no match is found - EXPECT_OK_AND_ASSIGN(auto no_matches, - DoGetAll(*input, FieldRef("b", "b", "b"))); - EXPECT_EQ(no_matches.size(), 0); - EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, ::testing::HasSubstr("No match for "), - (DoGetOne(*input, FieldRef("b", "b", "b")))); - ASSERT_OK_AND_EQ(nullptr, - (DoGetOneOrNone(*input, FieldRef("b", "b", "b")))); - } -}; - -TEST_F(TestFieldRef, Basics) { - auto f0 = field("alpha", int32()); - auto f1 = field("beta", int32()); - auto f2 = field("alpha", int32()); - auto f3 = field("beta", int32()); - Schema s({f0, f1, f2, f3}); - - // lookup by index returns Indices{index} - for (int index = 0; index < s.num_fields(); ++index) { - EXPECT_THAT(FieldRef(index).FindAll(s), ElementsAre(FieldPath{index})); - } - // out of range index results in a failure to match - EXPECT_THAT(FieldRef(s.num_fields() * 2).FindAll(s), ElementsAre()); - - // lookup by name returns the Indices of both matching fields - EXPECT_THAT(FieldRef("alpha").FindAll(s), ElementsAre(FieldPath{0}, FieldPath{2})); - EXPECT_THAT(FieldRef("beta").FindAll(s), ElementsAre(FieldPath{1}, FieldPath{3})); -} - -TEST_F(TestFieldRef, FindAllForTable) { - constexpr int kNumRows = 100; - auto f0 = field("alpha", int32()); - auto f1 = field("beta", int32()); - auto f2 = field("alpha", int32()); - auto f3 = field("beta", int32()); - auto schema = arrow::schema({f0, f1, f2, f3}); - - arrow::random::RandomArrayGenerator gen_{42}; - auto a0 = gen_.ArrayOf(int32(), kNumRows); - auto a1 = gen_.ArrayOf(int32(), kNumRows); - auto a2 = gen_.ArrayOf(int32(), kNumRows); - auto a3 = gen_.ArrayOf(int32(), kNumRows); - - auto table_ptr = Table::Make(schema, {a0, a1, a2, a3}); - ASSERT_OK(table_ptr->ValidateFull()); - - // lookup by index returns Indices{index} - auto schema_num_fields = table_ptr->schema()->num_fields(); - for (int index = 0; index < schema_num_fields; ++index) { - EXPECT_THAT(FieldRef(index).FindAll(*table_ptr), ElementsAre(FieldPath{index})); - } - // out of range index results in a failure to match - EXPECT_THAT(FieldRef(schema_num_fields * 2).FindAll(*table_ptr), ElementsAre()); - - //// lookup by name returns the Indices of both matching fields - EXPECT_THAT(FieldRef("alpha").FindAll(*table_ptr), - ElementsAre(FieldPath{0}, FieldPath{2})); - EXPECT_THAT(FieldRef("beta").FindAll(*table_ptr), - ElementsAre(FieldPath{1}, FieldPath{3})); -} - -TEST_F(TestFieldRef, FindAllForRecordBatch) { - constexpr int kNumRows = 100; - auto f0 = field("alpha", int32()); - auto f1 = field("beta", int32()); - auto f2 = field("alpha", int32()); - auto f3 = field("beta", int32()); - auto schema = arrow::schema({f0, f1, f2, f3}); - - arrow::random::RandomArrayGenerator gen_{42}; - auto a0 = gen_.ArrayOf(int32(), kNumRows); - auto a1 = gen_.ArrayOf(int32(), kNumRows); - auto a2 = gen_.ArrayOf(int32(), kNumRows); - auto a3 = gen_.ArrayOf(int32(), kNumRows); - - auto record_batch_ptr = RecordBatch::Make(schema, kNumRows, {a0, a1, a2, a3}); - ASSERT_OK(record_batch_ptr->ValidateFull()); - - // lookup by index returns Indices{index} - auto schema_num_fields = record_batch_ptr->schema()->num_fields(); - for (int index = 0; index < schema_num_fields; ++index) { - EXPECT_THAT(FieldRef(index).FindAll(*record_batch_ptr), - ElementsAre(FieldPath{index})); - } - // out of range index results in a failure to match - EXPECT_THAT(FieldRef(schema_num_fields * 2).FindAll(*record_batch_ptr), ElementsAre()); - - //// lookup by name returns the Indices of both matching fields - EXPECT_THAT(FieldRef("alpha").FindAll(*record_batch_ptr), - ElementsAre(FieldPath{0}, FieldPath{2})); - EXPECT_THAT(FieldRef("beta").FindAll(*record_batch_ptr), - ElementsAre(FieldPath{1}, FieldPath{3})); -} - -TEST_F(TestFieldRef, FromDotPath) { - ASSERT_OK_AND_EQ(FieldRef("alpha"), FieldRef::FromDotPath(R"(.alpha)")); - - ASSERT_OK_AND_EQ(FieldRef("", ""), FieldRef::FromDotPath(R"(..)")); - - ASSERT_OK_AND_EQ(FieldRef(2), FieldRef::FromDotPath(R"([2])")); - - ASSERT_OK_AND_EQ(FieldRef("beta", 3), FieldRef::FromDotPath(R"(.beta[3])")); - - ASSERT_OK_AND_EQ(FieldRef(5, "gamma", "delta", 7), - FieldRef::FromDotPath(R"([5].gamma.delta[7])")); - - ASSERT_OK_AND_EQ(FieldRef("hello world"), FieldRef::FromDotPath(R"(.hello world)")); - - ASSERT_OK_AND_EQ(FieldRef(R"([y]\tho.\)"), FieldRef::FromDotPath(R"(.\[y\]\\tho\.\)")); - - ASSERT_OK_AND_EQ(FieldRef(), FieldRef::FromDotPath(R"()")); - - ASSERT_RAISES(Invalid, FieldRef::FromDotPath(R"(alpha)")); - ASSERT_RAISES(Invalid, FieldRef::FromDotPath(R"([134234)")); - ASSERT_RAISES(Invalid, FieldRef::FromDotPath(R"([1stuf])")); -} - -TEST_F(TestFieldRef, DotPathRoundTrip) { - auto check_roundtrip = [](const FieldRef& ref) { - auto dot_path = ref.ToDotPath(); - ASSERT_OK_AND_EQ(ref, FieldRef::FromDotPath(dot_path)); - }; - - check_roundtrip(FieldRef()); - check_roundtrip(FieldRef("foo")); - check_roundtrip(FieldRef("foo", 1, "bar", 2, 3)); - check_roundtrip(FieldRef(1, 2, 3)); - check_roundtrip(FieldRef("foo", 1, FieldRef("bar", 2, 3), FieldRef())); -} - -TEST_F(TestFieldRef, Nested) { - auto f0 = field("alpha", int32()); - auto f1_0 = field("alpha", int32()); - auto f1 = field("beta", struct_({f1_0})); - auto f2_0 = field("alpha", int32()); - auto f2_1_0 = field("alpha", int32()); - auto f2_1_1 = field("alpha", int32()); - auto f2_1 = field("gamma", struct_({f2_1_0, f2_1_1})); - auto f2 = field("beta", struct_({f2_0, f2_1})); - Schema s({f0, f1, f2}); - - EXPECT_THAT(FieldRef("beta", "alpha").FindAll(s), - ElementsAre(FieldPath{1, 0}, FieldPath{2, 0})); - EXPECT_THAT(FieldRef("beta", "gamma", "alpha").FindAll(s), - ElementsAre(FieldPath{2, 1, 0}, FieldPath{2, 1, 1})); -} - -TEST_F(TestFieldRef, Flatten) { - FieldRef ref; - - auto assert_name = [](const FieldRef& ref, const std::string& expected) { - ASSERT_TRUE(ref.IsName()); - ASSERT_EQ(*ref.name(), expected); - }; - - auto assert_path = [](const FieldRef& ref, const std::vector& expected) { - ASSERT_TRUE(ref.IsFieldPath()); - ASSERT_EQ(ref.field_path()->indices(), expected); - }; - - auto assert_nested = [](const FieldRef& ref, const std::vector& expected) { - ASSERT_TRUE(ref.IsNested()); - ASSERT_EQ(*ref.nested_refs(), expected); - }; - - assert_path(FieldRef(), {}); - assert_path(FieldRef(1, 2, 3), {1, 2, 3}); - // If all leaves are field paths, they are fully flattened - assert_path(FieldRef(1, FieldRef(2, 3)), {1, 2, 3}); - assert_path(FieldRef(1, FieldRef(2, 3), FieldRef(), FieldRef(FieldRef(4), FieldRef(5))), - {1, 2, 3, 4, 5}); - assert_path(FieldRef(FieldRef(), FieldRef(FieldRef(), FieldRef())), {}); - - assert_name(FieldRef("foo"), "foo"); - - // Nested empty field refs are optimized away - assert_nested(FieldRef("foo", 1, FieldRef(), FieldRef(FieldRef(), "bar")), - {FieldRef("foo"), FieldRef(1), FieldRef("bar")}); - // For now, subsequences of indices are not concatenated - assert_nested(FieldRef("foo", FieldRef("bar"), FieldRef(1, 2), FieldRef(3)), - {FieldRef("foo"), FieldRef("bar"), FieldRef(1, 2), FieldRef(3)}); -} - -TEST_F(TestFieldRef, GetFromSchema) { TestGet(); } -TEST_F(TestFieldRef, GetFromDataType) { TestGet(); } - -TEST_F(TestFieldRef, GetFromArray) { TestGet(); } -TEST_F(TestFieldRef, GetFromChunkedArray) { TestGet(); } -TEST_F(TestFieldRef, GetFromRecordBatch) { TestGet(); } -TEST_F(TestFieldRef, GetFromTable) { TestGet
(); } - -TEST_F(TestFieldRef, GetFlattenedFromArray) { TestGet(); } -TEST_F(TestFieldRef, GetFlattenedFromChunkedArray) { TestGet(); } -TEST_F(TestFieldRef, GetFlattenedFromRecordBatch) { TestGet(); } -TEST_F(TestFieldRef, GetFlattenedFromTable) { TestGet(); } - using TestSchema = ::testing::Test; TEST_F(TestSchema, Basics) { From a1662aace86ade5aae22a382969cd335432a496e Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 22 May 2023 15:23:10 +0200 Subject: [PATCH 21/21] Add docstrings for new methods --- cpp/src/arrow/field_ref_test.cc | 2 +- cpp/src/arrow/type.h | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/field_ref_test.cc b/cpp/src/arrow/field_ref_test.cc index a3d17469967..10e2564ed18 100644 --- a/cpp/src/arrow/field_ref_test.cc +++ b/cpp/src/arrow/field_ref_test.cc @@ -688,4 +688,4 @@ TEST_F(TestFieldRef, GetFlattenedFromChunkedArray) { TestGet TEST_F(TestFieldRef, GetFlattenedFromRecordBatch) { TestGet(); } TEST_F(TestFieldRef, GetFlattenedFromTable) { TestGet(); } -} // namespace arrow \ No newline at end of file +} // namespace arrow diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 9bca82b1600..48228d43ef9 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1902,6 +1902,10 @@ class ARROW_EXPORT FieldRef : public util::EqualityComparable { } return out; } + /// \brief Get all children matching this FieldRef. + /// + /// Unlike `FieldRef::GetAll`, this variant is not zero-copy and the retrieved + /// children's null bitmaps are ANDed with their ancestors' template Result>> GetAllFlattened(const T& root, MemoryPool* pool = NULLPTR) const { @@ -1920,6 +1924,10 @@ class ARROW_EXPORT FieldRef : public util::EqualityComparable { ARROW_ASSIGN_OR_RAISE(auto match, FindOne(root)); return match.Get(root).ValueOrDie(); } + /// \brief Get the single child matching this FieldRef. + /// + /// Unlike `FieldRef::GetOne`, this variant is not zero-copy and the retrieved + /// child's null bitmap is ANDed with its ancestors' template Result> GetOneFlattened(const T& root, MemoryPool* pool = NULLPTR) const { ARROW_ASSIGN_OR_RAISE(auto match, FindOne(root)); @@ -1936,6 +1944,11 @@ class ARROW_EXPORT FieldRef : public util::EqualityComparable { } return match.Get(root).ValueOrDie(); } + /// \brief Get the single child matching this FieldRef. + /// + /// Return nullptr if none match, emit an error if multiple match. + /// Unlike `FieldRef::GetOneOrNone`, this variant is not zero-copy and the + /// retrieved child's null bitmap is ANDed with its ancestors' template Result> GetOneOrNoneFlattened(const T& root, MemoryPool* pool = NULLPTR) const {