diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index b6eb50099bf..4e6826bc61f 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -796,7 +796,7 @@ set_source_files_properties(public_api_test.cc PROPERTIES SKIP_PRECOMPILE_HEADER
                             SKIP_UNITY_BUILD_INCLUSION ON)
 
 add_arrow_test(scalar_test)
-add_arrow_test(type_test)
+add_arrow_test(type_test SOURCES field_ref_test.cc type_test.cc)
 
 add_arrow_test(table_test
                SOURCES
diff --git a/cpp/src/arrow/field_ref_test.cc b/cpp/src/arrow/field_ref_test.cc
new file mode 100644
index 00000000000..10e2564ed18
--- /dev/null
+++ b/cpp/src/arrow/field_ref_test.cc
@@ -0,0 +1,691 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <array>
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include <gmock/gmock.h>
+
+#include "arrow/array.h"
+#include "arrow/memory_pool.h"
+#include "arrow/table.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/testing/random.h"
+#include "arrow/testing/util.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+using testing::ElementsAre;
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+
+struct FieldPathTestCase {
+  struct OutputValues {
+    explicit OutputValues(std::vector<int> indices = {})
+        : path(FieldPath(std::move(indices))) {}
+
+    template <typename T>
+    const auto& OutputAs() const {
+      if constexpr (std::is_same_v<T, Field>) {
+        return field;
+      } else if constexpr (std::is_same_v<T, Array>) {
+        return array;
+      } else if constexpr (std::is_same_v<T, ArrayData>) {
+        return array->data();
+      } else if constexpr (std::is_same_v<T, ChunkedArray>) {
+        return chunked_array;
+      }
+    }
+
+    FieldPath path;
+    std::shared_ptr<Field> field;
+    std::shared_ptr<Array> array;
+    std::shared_ptr<ChunkedArray> chunked_array;
+  };
+
+  static constexpr int kNumColumns = 2;
+  static constexpr int kNumRows = 100;
+  static constexpr int kRandomSeed = 0xbeef;
+
+  // Input for the FieldPath::Get functions in multiple forms
+  std::shared_ptr<Schema> schema;
+  std::shared_ptr<DataType> type;
+  std::shared_ptr<Array> array;
+  std::shared_ptr<RecordBatch> record_batch;
+  std::shared_ptr<ChunkedArray> chunked_array;
+  std::shared_ptr<Table> table;
+
+  template <typename T>
+  const auto& InputAs() const {
+    if constexpr (std::is_same_v<T, Schema>) {
+      return schema;
+    } else if constexpr (std::is_same_v<T, DataType>) {
+      return type;
+    } else if constexpr (std::is_same_v<T, Array>) {
+      return array;
+    } else if constexpr (std::is_same_v<T, ArrayData>) {
+      return array->data();
+    } else if constexpr (std::is_same_v<T, RecordBatch>) {
+      return record_batch;
+    } else if constexpr (std::is_same_v<T, ChunkedArray>) {
+      return chunked_array;
+    } else if constexpr (std::is_same_v<T, Table>) {
+      return table;
+    }
+  }
+
+  // Number of chunks for each column in the input Table
+  const std::array num_column_chunks = {15, 20};
+  // Number of chunks in the input ChunkedArray
+  const int num_chunks = 15;
+
+  // Expected outputs for each child
+  OutputValues v0{{0}}, v1{{1}};
+  OutputValues v1_0{{1, 0}}, v1_1{{1, 1}};
+  OutputValues v1_1_0{{1, 1, 0}}, v1_1_1{{1, 1, 1}};
+  // Expected outputs for nested children with null flattening applied
+  OutputValues v1_0_flat{{1, 0}}, v1_1_flat{{1, 1}};
+  OutputValues v1_1_0_flat{{1, 1, 0}}, v1_1_1_flat{{1, 1, 1}};
+
+  static const FieldPathTestCase* Instance() {
+    static const auto maybe_instance = Make();
+    return &maybe_instance.ValueOrDie();
+  }
+
+  static Result<FieldPathTestCase> Make() {
+    // Generate test input based on a single schema: first create a StructArray,
+    // then derive the other input types (ChunkedArray, RecordBatch, Table, etc.)
+    // from it. We also compute the expected outputs for each child individually
+    // (for each output type).
+    FieldPathTestCase out;
+    random::RandomArrayGenerator gen(kRandomSeed);
+
+    // Define child fields and input schema
+
+    // Intentionally duplicated names for the FieldRef tests
+    out.v1_1_1.field = field("a", boolean());
+    out.v1_1_0.field = field("a", float32());
+
+    out.v1_1.field = field("b", struct_({out.v1_1_0.field, out.v1_1_1.field}));
+    out.v1_0.field = field("a", int32());
+    out.v1.field = field("b", struct_({out.v1_0.field, out.v1_1.field}));
+    out.v0.field = field("a", utf8());
+    out.schema = arrow::schema({out.v0.field, out.v1.field});
+    out.type = struct_(out.schema->fields());
+
+    // Create null bitmaps for the struct fields independent of their children's
+    // bitmaps. For FieldPath::GetFlattened, parent/child bitmaps should be combined
+    // - for FieldPath::Get, higher-level nulls are ignored.
+    auto bitmap1_1 = gen.NullBitmap(kNumRows, 0.15);
+    auto bitmap1 = gen.NullBitmap(kNumRows, 0.30);
+
+    // Generate raw leaf arrays
+    out.v1_1_1.array = gen.ArrayOf(out.v1_1_1.field->type(), kNumRows);
+    out.v1_1_0.array = gen.ArrayOf(out.v1_1_0.field->type(), kNumRows);
+    out.v1_0.array = gen.ArrayOf(out.v1_0.field->type(), kNumRows);
+    out.v0.array = gen.ArrayOf(out.v0.field->type(), kNumRows);
+    // Make struct fields from leaf arrays (we use the custom bitmaps here)
+    ARROW_ASSIGN_OR_RAISE(
+        out.v1_1.array,
+        StructArray::Make({out.v1_1_0.array, out.v1_1_1.array},
+                          {out.v1_1_0.field, out.v1_1_1.field}, bitmap1_1));
+    ARROW_ASSIGN_OR_RAISE(out.v1.array,
+                          StructArray::Make({out.v1_0.array, out.v1_1.array},
+                                            {out.v1_0.field, out.v1_1.field}, bitmap1));
+
+    // Not used to create the test input, but pre-compute flattened versions of nested
+    // arrays for comparisons in the GetFlattened tests.
+    ARROW_ASSIGN_OR_RAISE(
+        out.v1_0_flat.array,
+        checked_pointer_cast<StructArray>(out.v1.array)->GetFlattenedField(0));
+    ARROW_ASSIGN_OR_RAISE(
+        out.v1_1_flat.array,
+        checked_pointer_cast<StructArray>(out.v1.array)->GetFlattenedField(1));
+    ARROW_ASSIGN_OR_RAISE(
+        out.v1_1_0_flat.array,
+        checked_pointer_cast<StructArray>(out.v1_1_flat.array)->GetFlattenedField(0));
+    ARROW_ASSIGN_OR_RAISE(
+        out.v1_1_1_flat.array,
+        checked_pointer_cast<StructArray>(out.v1_1_flat.array)->GetFlattenedField(1));
+    // Sanity check
+    ARROW_CHECK(!out.v1_0_flat.array->Equals(out.v1_0.array));
+    ARROW_CHECK(!out.v1_1_flat.array->Equals(out.v1_1.array));
+    ARROW_CHECK(!out.v1_1_0_flat.array->Equals(out.v1_1_0.array));
+    ARROW_CHECK(!out.v1_1_1_flat.array->Equals(out.v1_1_1.array));
+
+    // Finalize the input Array
+    ARROW_ASSIGN_OR_RAISE(out.array, StructArray::Make({out.v0.array, out.v1.array},
+                                                       {out.v0.field, out.v1.field}));
+    ARROW_RETURN_NOT_OK(out.array->ValidateFull());
+    // Finalize the input RecordBatch
+    ARROW_ASSIGN_OR_RAISE(out.record_batch, RecordBatch::FromStructArray(out.array));
+    ARROW_RETURN_NOT_OK(out.record_batch->ValidateFull());
+    // Finalize the input ChunkedArray
+    out.chunked_array = SliceToChunkedArray(*out.array, out.num_chunks);
+    ARROW_RETURN_NOT_OK(out.chunked_array->ValidateFull());
+
+    // For each expected child array, create a chunked equivalent (we use a different
+    // chunk layout for each top-level column to make the Table test more interesting)
+    for (OutputValues* v :
+         {&out.v0, &out.v1, &out.v1_0, &out.v1_1, &out.v1_1_0, &out.v1_1_1,
+          &out.v1_0_flat, &out.v1_1_flat, &out.v1_1_0_flat, &out.v1_1_1_flat}) {
+      v->chunked_array =
+          SliceToChunkedArray(*v->array, out.num_column_chunks[v->path[0]]);
+    }
+    // Finalize the input Table
+    out.table =
+        Table::Make(out.schema, {out.v0.chunked_array, out.v1.chunked_array}, kNumRows);
+    ARROW_RETURN_NOT_OK(out.table->ValidateFull());
+
+    return std::move(out);
+  }
+
+ private:
+  static std::shared_ptr<ChunkedArray> SliceToChunkedArray(const Array& array,
+                                                           int num_chunks) {
+    ARROW_CHECK(num_chunks > 0 && array.length() >= num_chunks);
+    ArrayVector chunks;
+    chunks.reserve(num_chunks);
+    for (int64_t inc = array.length() / num_chunks, beg = 0,
+                 end = inc + array.length() % num_chunks;
+         end <= array.length(); beg = end, end += inc) {
+      chunks.push_back(array.SliceSafe(beg, end - beg).ValueOrDie());
+    }
+    ARROW_CHECK_EQ(static_cast<int>(chunks.size()), num_chunks);
+    return ChunkedArray::Make(std::move(chunks)).ValueOrDie();
+  }
+};
+
+class FieldPathTestFixture : public ::testing::Test {
+ public:
+  FieldPathTestFixture() : case_(FieldPathTestCase::Instance()) {}
+
+ protected:
+  template <typename T>
+  using OutputType = typename FieldRef::GetType<T>::element_type;
+
+  template <typename T>
+  void AssertOutputsEqual(const std::shared_ptr<Field>& expected,
+                          const std::shared_ptr<Field>& actual) const {
+    AssertFieldEqual(*expected, *actual);
+  }
+  template <typename T>
+  void AssertOutputsEqual(const std::shared_ptr<Array>& expected,
+                          const std::shared_ptr<Array>& actual) const {
+    ASSERT_OK(actual->ValidateFull());
+    AssertArraysEqual(*expected, *actual);
+  }
+  template <typename T>
+  void AssertOutputsEqual(const std::shared_ptr<ChunkedArray>& expected,
+                          const std::shared_ptr<ChunkedArray>& actual) const {
+    ASSERT_OK(actual->ValidateFull());
+    // We only do this dance due to the way the test inputs/outputs are generated.
+    // Basically, the "expected" output ChunkedArrays don't have a matching num_chunks
+    // since they're reused to create the input Table (which has a distinct chunking
+    // per column). However, if the input is a ChunkedArray, the returned outputs
+    // should always have the same num_chunks as the input.
+    if constexpr (std::is_same_v<T, ChunkedArray>) {
+      EXPECT_EQ(case_->chunked_array->num_chunks(), actual->num_chunks());
+    } else {
+      EXPECT_EQ(expected->num_chunks(), actual->num_chunks());
+    }
+    AssertChunkedEquivalent(*expected, *actual);
+  }
+
+  const FieldPathTestCase* case_;
+};
+
+class TestFieldPath : public FieldPathTestFixture {
+ protected:
+  template <bool Flattened, typename T>
+  static auto DoGet(const T& root, const FieldPath& path, MemoryPool* pool = nullptr) {
+    if constexpr (Flattened) {
+      return path.GetFlattened(root, pool);
+    } else {
+      return path.Get(root);
+    }
+  }
+
+  template <typename T, bool Flattened = false>
+  void TestGetWithInvalidIndex() const {
+    const auto& input = case_->InputAs<T>();
+    for (const auto& path :
+         {FieldPath({2, 1, 0}), FieldPath({1, 2, 0}), FieldPath{1, 1, 2}}) {
+      EXPECT_RAISES_WITH_MESSAGE_THAT(IndexError,
+                                      ::testing::HasSubstr("index out of range"),
+                                      DoGet<Flattened>(*input, path));
+    }
+    EXPECT_RAISES_WITH_MESSAGE_THAT(
+        Invalid, ::testing::HasSubstr("empty indices cannot be traversed"),
+        DoGet<Flattened>(*input, FieldPath()));
+  }
+
+  template <typename T>
+  void TestIndexErrorMessage() const {
+    using O = OutputType<T>;
+    auto result = DoGet<false>(*case_->InputAs<T>(), FieldPath({1, 1, 2}));
+    std::string substr = "index out of range. indices=[ 1 1 >2< ] ";
+    if constexpr (std::is_same_v<O, Field>) {
+      substr += "fields: { a: float, a: bool, }";
+    } else {
+      substr += "column types: { float, bool, }";
+    }
+    EXPECT_RAISES_WITH_MESSAGE_THAT(IndexError, ::testing::HasSubstr(substr), result);
+  }
+
+  template <typename T, bool Flattened = false>
+  void TestGetWithNonStructArray() const {
+    EXPECT_RAISES_WITH_MESSAGE_THAT(
+        NotImplemented, ::testing::HasSubstr("Get child data of non-struct array"),
+        DoGet<Flattened>(*case_->v1_1_0.OutputAs<T>(), FieldPath({1, 1, 0})));
+  }
+
+  template <typename T, bool Flattened = false>
+  void TestGet() const {
+    using O = OutputType<T>;
+    const auto& input = case_->InputAs<T>();
+    ASSERT_OK_AND_ASSIGN(auto v0, DoGet<Flattened>(*input, FieldPath({0})));
+    ASSERT_OK_AND_ASSIGN(auto v1, DoGet<Flattened>(*input, FieldPath({1})));
+    ASSERT_OK_AND_ASSIGN(auto v1_0, DoGet<Flattened>(*input, FieldPath({1, 0})));
+    ASSERT_OK_AND_ASSIGN(auto v1_1, DoGet<Flattened>(*input, FieldPath({1, 1})));
+    ASSERT_OK_AND_ASSIGN(auto v1_1_0, DoGet<Flattened>(*input, FieldPath({1, 1, 0})));
+    ASSERT_OK_AND_ASSIGN(auto v1_1_1, DoGet<Flattened>(*input, FieldPath({1, 1, 1})));
+
+    AssertOutputsEqual<T>(case_->v0.OutputAs<O>(), v0);
+    AssertOutputsEqual<T>(case_->v1.OutputAs<O>(), v1);
+    if constexpr (Flattened) {
+      AssertOutputsEqual<T>(case_->v1_0_flat.OutputAs<O>(), v1_0);
+      AssertOutputsEqual<T>(case_->v1_1_flat.OutputAs<O>(), v1_1);
+      AssertOutputsEqual<T>(case_->v1_1_0_flat.OutputAs<O>(), v1_1_0);
+      AssertOutputsEqual<T>(case_->v1_1_1_flat.OutputAs<O>(), v1_1_1);
+    } else {
+      AssertOutputsEqual<T>(case_->v1_0.OutputAs<O>(), v1_0);
+      AssertOutputsEqual<T>(case_->v1_1.OutputAs<O>(), v1_1);
+      AssertOutputsEqual<T>(case_->v1_1_0.OutputAs<O>(), v1_1_0);
+      AssertOutputsEqual<T>(case_->v1_1_1.OutputAs<O>(), v1_1_1);
+    }
+  }
+};
+
+class TestFieldRef : public FieldPathTestFixture {
+ protected:
+  template <bool Flattened, typename T>
+  static auto DoGetOne(const T& root, const FieldRef& ref, MemoryPool* pool = nullptr) {
+    if constexpr (Flattened) {
+      return ref.GetOneFlattened(root, pool);
+    } else {
+      return ref.GetOne(root);
+    }
+  }
+  template <bool Flattened, typename T>
+  static auto DoGetOneOrNone(const T& root, const FieldRef& ref,
+                             MemoryPool* pool = nullptr) {
+    if constexpr (Flattened) {
+      return ref.GetOneOrNoneFlattened(root, pool);
+    } else {
+      return ref.GetOneOrNone(root);
+    }
+  }
+  template <bool Flattened, typename T>
+  static auto DoGetAll(const T& root, const FieldRef& ref, MemoryPool* pool = nullptr) {
+    if constexpr (Flattened) {
+      return ref.GetAllFlattened(root, pool);
+    } else {
+      return ToResult(ref.GetAll(root));
+    }
+  }
+
+  template <typename T, bool Flattened = false>
+  void TestGet() const {
+    using O = OutputType<T>;
+    const auto& input = case_->InputAs<T>();
+    ASSERT_OK_AND_ASSIGN(auto v0, DoGetOne<Flattened>(*input, FieldRef("a")));
+    ASSERT_OK_AND_ASSIGN(auto v1, DoGetOne<Flattened>(*input, FieldRef("b")));
+    ASSERT_OK_AND_ASSIGN(auto v1_0, DoGetOne<Flattened>(*input, FieldRef("b", "a")));
+    ASSERT_OK_AND_ASSIGN(auto v1_1, DoGetOne<Flattened>(*input, FieldRef("b", "b")));
+    ASSERT_OK_AND_ASSIGN(auto v1_1_0, DoGetOne<Flattened>(*input, FieldRef("b", "b", 0)));
+    ASSERT_OK_AND_ASSIGN(auto v1_1_1, DoGetOne<Flattened>(*input, FieldRef("b", "b", 1)));
+
+    AssertOutputsEqual<T>(case_->v0.OutputAs<O>(), v0);
+    AssertOutputsEqual<T>(case_->v1.OutputAs<O>(), v1);
+    if constexpr (Flattened) {
+      AssertOutputsEqual<T>(case_->v1_0_flat.OutputAs<O>(), v1_0);
+      AssertOutputsEqual<T>(case_->v1_1_flat.OutputAs<O>(), v1_1);
+      AssertOutputsEqual<T>(case_->v1_1_0_flat.OutputAs<O>(), v1_1_0);
+      AssertOutputsEqual<T>(case_->v1_1_1_flat.OutputAs<O>(), v1_1_1);
+    } else {
+      AssertOutputsEqual<T>(case_->v1_0.OutputAs<O>(), v1_0);
+      AssertOutputsEqual<T>(case_->v1_1.OutputAs<O>(), v1_1);
+      AssertOutputsEqual<T>(case_->v1_1_0.OutputAs<O>(), v1_1_0);
+      AssertOutputsEqual<T>(case_->v1_1_1.OutputAs<O>(), v1_1_1);
+    }
+
+    // Cases where multiple matches are found
+    EXPECT_OK_AND_ASSIGN(auto multiple_matches,
+                         DoGetAll<Flattened>(*input, FieldRef("b", "b", "a")));
+    EXPECT_EQ(multiple_matches.size(), 2);
+    EXPECT_RAISES_WITH_MESSAGE_THAT(
+        Invalid, ::testing::HasSubstr("Multiple matches for "),
+        (DoGetOne<Flattened>(*input, FieldRef("b", "b", "a"))));
+    EXPECT_RAISES_WITH_MESSAGE_THAT(
+        Invalid, ::testing::HasSubstr("Multiple matches for "),
+        (DoGetOneOrNone<Flattened>(*input, FieldRef("b", "b", "a"))));
+
+    // Cases where no match is found
+    EXPECT_OK_AND_ASSIGN(auto no_matches,
+                         DoGetAll<Flattened>(*input, FieldRef("b", "b", "b")));
+    EXPECT_EQ(no_matches.size(), 0);
+    EXPECT_RAISES_WITH_MESSAGE_THAT(
+        Invalid, ::testing::HasSubstr("No match for "),
+        (DoGetOne<Flattened>(*input, FieldRef("b", "b", "b"))));
+    ASSERT_OK_AND_EQ(nullptr,
+                     (DoGetOneOrNone<Flattened>(*input, FieldRef("b", "b", "b"))));
+  }
+};
+
+// ----------------------------------------------------------------------
+// FieldPath
+
+TEST_F(TestFieldPath, Basics) {
+  auto f0 = field("alpha", int32());
+  auto f1 = field("beta", int32());
+  auto f2 = field("alpha", int32());
+  auto f3 = field("beta", int32());
+  Schema s({f0, f1, f2, f3});
+
+  // retrieving a field with single-element FieldPath is equivalent to Schema::field
+  for (int index = 0; index < s.num_fields(); ++index) {
+    ASSERT_OK_AND_EQ(s.field(index), FieldPath({index}).Get(s));
+  }
+}
+
+TEST_F(TestFieldPath, GetFromEmptyChunked) {
+  FieldVector fields = {
+      field("i", int32()),
+      field("s", struct_({field("b", boolean()), field("f", float32())}))};
+  std::shared_ptr<ChunkedArray> child;
+
+  // Empty ChunkedArray with no chunks
+  ChunkedArray chunked_array({}, struct_(fields));
+  ASSERT_OK(chunked_array.ValidateFull());
+  ASSERT_EQ(chunked_array.num_chunks(), 0);
+  ASSERT_OK_AND_ASSIGN(child, FieldPath({1, 1}).Get(chunked_array));
+  AssertTypeEqual(float32(), child->type());
+  ASSERT_EQ(child->length(), 0);
+
+  // Empty Table with no column chunks
+  ChunkedArrayVector table_columns;
+  for (const auto& f : fields) {
+    table_columns.push_back(std::make_shared<ChunkedArray>(ArrayVector{}, f->type()));
+  }
+  auto table = Table::Make(schema(fields), table_columns, 0);
+  ASSERT_OK(table->ValidateFull());
+  for (const auto& column : table->columns()) {
+    ASSERT_EQ(column->num_chunks(), 0);
+  }
+  ASSERT_OK_AND_ASSIGN(child, FieldPath({1, 1}).Get(*table));
+  AssertTypeEqual(float32(), child->type());
+  ASSERT_EQ(child->length(), 0);
+}
+
+TEST_F(TestFieldPath, GetWithInvalidIndex) {
+  TestGetWithInvalidIndex<Schema>();
+  TestGetWithInvalidIndex<DataType>();
+  TestGetWithInvalidIndex<Array>();
+  TestGetWithInvalidIndex<ArrayData>();
+  TestGetWithInvalidIndex<ChunkedArray>();
+  TestGetWithInvalidIndex<RecordBatch>();
+  TestGetWithInvalidIndex<Table>();
+  // With flattening
+  TestGetWithInvalidIndex<Array, true>();
+  TestGetWithInvalidIndex<ArrayData, true>();
+  TestGetWithInvalidIndex<ChunkedArray, true>();
+  TestGetWithInvalidIndex<RecordBatch, true>();
+  TestGetWithInvalidIndex<Table, true>();
+}
+
+TEST_F(TestFieldPath, IndexErrorMessage) {
+  TestIndexErrorMessage<Schema>();
+  TestIndexErrorMessage<DataType>();
+  TestIndexErrorMessage<Array>();
+  TestIndexErrorMessage<ArrayData>();
+  TestIndexErrorMessage<ChunkedArray>();
+  TestIndexErrorMessage<RecordBatch>();
+  TestIndexErrorMessage<Table>();
+}
+
+TEST_F(TestFieldPath, GetWithNonStructArray) {
+  TestGetWithNonStructArray<Array>();
+  TestGetWithNonStructArray<ArrayData>();
+  TestGetWithNonStructArray<ChunkedArray>();
+  // With flattening
+  TestGetWithNonStructArray<Array, true>();
+  TestGetWithNonStructArray<ArrayData, true>();
+  TestGetWithNonStructArray<ChunkedArray, true>();
+}
+
+TEST_F(TestFieldPath, GetFromSchema) { TestGet<Schema>(); }
+TEST_F(TestFieldPath, GetFromDataType) { TestGet<DataType>(); }
+
+TEST_F(TestFieldPath, GetFromArray) { TestGet<Array>(); }
+TEST_F(TestFieldPath, GetFromChunkedArray) { TestGet<ChunkedArray>(); }
+TEST_F(TestFieldPath, GetFromRecordBatch) { TestGet<RecordBatch>(); }
+TEST_F(TestFieldPath, GetFromTable) { TestGet<Table>(); }
+
+TEST_F(TestFieldPath, GetFlattenedFromArray) { TestGet<Array, true>(); }
+TEST_F(TestFieldPath, GetFlattenedFromChunkedArray) { TestGet<ChunkedArray, true>(); }
+TEST_F(TestFieldPath, GetFlattenedFromRecordBatch) { TestGet<RecordBatch, true>(); }
+TEST_F(TestFieldPath, GetFlattenedFromTable) { TestGet<Table, true>(); }
+
+// ----------------------------------------------------------------------
+// FieldRef
+
+TEST_F(TestFieldRef, Basics) {
+  auto f0 = field("alpha", int32());
+  auto f1 = field("beta", int32());
+  auto f2 = field("alpha", int32());
+  auto f3 = field("beta", int32());
+  Schema s({f0, f1, f2, f3});
+
+  // lookup by index returns Indices{index}
+  for (int index = 0; index < s.num_fields(); ++index) {
+    EXPECT_THAT(FieldRef(index).FindAll(s), ElementsAre(FieldPath{index}));
+  }
+  // out of range index results in a failure to match
+  EXPECT_THAT(FieldRef(s.num_fields() * 2).FindAll(s), ElementsAre());
+
+  // lookup by name returns the Indices of both matching fields
+  EXPECT_THAT(FieldRef("alpha").FindAll(s), ElementsAre(FieldPath{0}, FieldPath{2}));
+  EXPECT_THAT(FieldRef("beta").FindAll(s), ElementsAre(FieldPath{1}, FieldPath{3}));
+}
+
+TEST_F(TestFieldRef, FindAllForTable) {
+  constexpr int kNumRows = 100;
+  auto f0 = field("alpha", int32());
+  auto f1 = field("beta", int32());
+  auto f2 = field("alpha", int32());
+  auto f3 = field("beta", int32());
+  auto schema = arrow::schema({f0, f1, f2, f3});
+
+  arrow::random::RandomArrayGenerator gen_{42};
+  auto a0 = gen_.ArrayOf(int32(), kNumRows);
+  auto a1 = gen_.ArrayOf(int32(), kNumRows);
+  auto a2 = gen_.ArrayOf(int32(), kNumRows);
+  auto a3 = gen_.ArrayOf(int32(), kNumRows);
+
+  auto table_ptr = Table::Make(schema, {a0, a1, a2, a3});
+  ASSERT_OK(table_ptr->ValidateFull());
+
+  // lookup by index returns Indices{index}
+  auto schema_num_fields = table_ptr->schema()->num_fields();
+  for (int index = 0; index < schema_num_fields; ++index) {
+    EXPECT_THAT(FieldRef(index).FindAll(*table_ptr), ElementsAre(FieldPath{index}));
+  }
+  // out of range index results in a failure to match
+  EXPECT_THAT(FieldRef(schema_num_fields * 2).FindAll(*table_ptr), ElementsAre());
+
+  // lookup by name returns the Indices of both matching fields
+  EXPECT_THAT(FieldRef("alpha").FindAll(*table_ptr),
+              ElementsAre(FieldPath{0}, FieldPath{2}));
+  EXPECT_THAT(FieldRef("beta").FindAll(*table_ptr),
+              ElementsAre(FieldPath{1}, FieldPath{3}));
+}
+
+TEST_F(TestFieldRef, FindAllForRecordBatch) {
+  constexpr int kNumRows = 100;
+  auto f0 = field("alpha", int32());
+  auto f1 = field("beta", int32());
+  auto f2 = field("alpha", int32());
+  auto f3 = field("beta", int32());
+  auto schema = arrow::schema({f0, f1, f2, f3});
+
+  arrow::random::RandomArrayGenerator gen_{42};
+  auto a0 = gen_.ArrayOf(int32(), kNumRows);
+  auto a1 = gen_.ArrayOf(int32(), kNumRows);
+  auto a2 = gen_.ArrayOf(int32(), kNumRows);
+  auto a3 = gen_.ArrayOf(int32(), kNumRows);
+
+  auto record_batch_ptr = RecordBatch::Make(schema, kNumRows, {a0, a1, a2, a3});
+  ASSERT_OK(record_batch_ptr->ValidateFull());
+
+  // lookup by index returns Indices{index}
+  auto schema_num_fields = record_batch_ptr->schema()->num_fields();
+  for (int index = 0; index < schema_num_fields; ++index) {
+    EXPECT_THAT(FieldRef(index).FindAll(*record_batch_ptr),
+                ElementsAre(FieldPath{index}));
+  }
+  // out of range index results in a failure to match
+  EXPECT_THAT(FieldRef(schema_num_fields * 2).FindAll(*record_batch_ptr), ElementsAre());
+
+  // lookup by name returns the Indices of both matching fields
+  EXPECT_THAT(FieldRef("alpha").FindAll(*record_batch_ptr),
+              ElementsAre(FieldPath{0}, FieldPath{2}));
+  EXPECT_THAT(FieldRef("beta").FindAll(*record_batch_ptr),
+              ElementsAre(FieldPath{1}, FieldPath{3}));
+}
+
+TEST_F(TestFieldRef, FromDotPath) {
+  ASSERT_OK_AND_EQ(FieldRef("alpha"), FieldRef::FromDotPath(R"(.alpha)"));
+
+  ASSERT_OK_AND_EQ(FieldRef("", ""), FieldRef::FromDotPath(R"(..)"));
+
+  ASSERT_OK_AND_EQ(FieldRef(2), FieldRef::FromDotPath(R"([2])"));
+
+  ASSERT_OK_AND_EQ(FieldRef("beta", 3), FieldRef::FromDotPath(R"(.beta[3])"));
+
+  ASSERT_OK_AND_EQ(FieldRef(5, "gamma", "delta", 7),
+                   FieldRef::FromDotPath(R"([5].gamma.delta[7])"));
+
+  ASSERT_OK_AND_EQ(FieldRef("hello world"), FieldRef::FromDotPath(R"(.hello world)"));
+
+  ASSERT_OK_AND_EQ(FieldRef(R"([y]\tho.\)"), FieldRef::FromDotPath(R"(.\[y\]\\tho\.\)"));
+
+  ASSERT_OK_AND_EQ(FieldRef(), FieldRef::FromDotPath(R"()"));
+
+  ASSERT_RAISES(Invalid, FieldRef::FromDotPath(R"(alpha)"));
+  ASSERT_RAISES(Invalid, FieldRef::FromDotPath(R"([134234)"));
+  ASSERT_RAISES(Invalid, FieldRef::FromDotPath(R"([1stuf])"));
+}
+
+TEST_F(TestFieldRef, DotPathRoundTrip) {
+  auto check_roundtrip = [](const FieldRef& ref) {
+    auto dot_path = ref.ToDotPath();
+    ASSERT_OK_AND_EQ(ref, FieldRef::FromDotPath(dot_path));
+  };
+
+  check_roundtrip(FieldRef());
+  check_roundtrip(FieldRef("foo"));
+  check_roundtrip(FieldRef("foo", 1, "bar", 2, 3));
+  check_roundtrip(FieldRef(1, 2, 3));
+  check_roundtrip(FieldRef("foo", 1, FieldRef("bar", 2, 3), FieldRef()));
+}
+
+TEST_F(TestFieldRef, Nested) {
+  auto f0 = field("alpha", int32());
+  auto f1_0 = field("alpha", int32());
+  auto f1 = field("beta", struct_({f1_0}));
+  auto f2_0 = field("alpha", int32());
+  auto f2_1_0 = field("alpha", int32());
+  auto f2_1_1 = field("alpha", int32());
+  auto f2_1 = field("gamma", struct_({f2_1_0, f2_1_1}));
+  auto f2 = field("beta", struct_({f2_0, f2_1}));
+  Schema s({f0, f1, f2});
+
+  EXPECT_THAT(FieldRef("beta", "alpha").FindAll(s),
+              ElementsAre(FieldPath{1, 0}, FieldPath{2, 0}));
+  EXPECT_THAT(FieldRef("beta", "gamma", "alpha").FindAll(s),
+              ElementsAre(FieldPath{2, 1, 0}, FieldPath{2, 1, 1}));
+}
+
+TEST_F(TestFieldRef, Flatten) {
+  FieldRef ref;
+
+  auto assert_name = [](const FieldRef& ref, const std::string& expected) {
+    ASSERT_TRUE(ref.IsName());
+    ASSERT_EQ(*ref.name(), expected);
+  };
+
+  auto assert_path = [](const FieldRef& ref, const std::vector<int>& expected) {
+    ASSERT_TRUE(ref.IsFieldPath());
+    ASSERT_EQ(ref.field_path()->indices(), expected);
+  };
+
+  auto assert_nested = [](const FieldRef& ref, const std::vector<FieldRef>& expected) {
+    ASSERT_TRUE(ref.IsNested());
+    ASSERT_EQ(*ref.nested_refs(), expected);
+  };
+
+  assert_path(FieldRef(), {});
+  assert_path(FieldRef(1, 2, 3), {1, 2, 3});
+  // If all leaves are field paths, they are fully flattened
+  assert_path(FieldRef(1, FieldRef(2, 3)), {1, 2, 3});
+  assert_path(FieldRef(1, FieldRef(2, 3), FieldRef(), FieldRef(FieldRef(4), FieldRef(5))),
+              {1, 2, 3, 4, 5});
+  assert_path(FieldRef(FieldRef(), FieldRef(FieldRef(), FieldRef())), {});
+
+  assert_name(FieldRef("foo"), "foo");
+
+  // Nested empty field refs are optimized away
+  assert_nested(FieldRef("foo", 1, FieldRef(), FieldRef(FieldRef(), "bar")),
+                {FieldRef("foo"), FieldRef(1), FieldRef("bar")});
+  // For now, subsequences of indices are not concatenated
+  assert_nested(FieldRef("foo", FieldRef("bar"), FieldRef(1, 2), FieldRef(3)),
+                {FieldRef("foo"), FieldRef("bar"), FieldRef(1, 2), FieldRef(3)});
+}
+
+TEST_F(TestFieldRef, GetFromSchema) { TestGet<Schema>(); }
+TEST_F(TestFieldRef, GetFromDataType) { TestGet<DataType>(); }
+
+TEST_F(TestFieldRef, GetFromArray) { TestGet<Array>(); }
+TEST_F(TestFieldRef, GetFromChunkedArray) { TestGet<ChunkedArray>(); }
+TEST_F(TestFieldRef, GetFromRecordBatch) { TestGet<RecordBatch>(); }
+TEST_F(TestFieldRef, GetFromTable) { TestGet<Table>(); }
+
+TEST_F(TestFieldRef, GetFlattenedFromArray) { TestGet<Array, true>(); }
+TEST_F(TestFieldRef, GetFlattenedFromChunkedArray) { TestGet<ChunkedArray, true>(); }
+TEST_F(TestFieldRef, GetFlattenedFromRecordBatch) { TestGet<RecordBatch, true>(); }
+TEST_F(TestFieldRef, GetFlattenedFromTable) { TestGet<Table, true>(); }
+
+}  // namespace arrow
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index 606b231f6f7..68dc2aabe96 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -1047,7 +1047,7 @@ std::string DictionaryType::ToString() const {
 std::string NullType::ToString() const { return name(); }
 
 // ----------------------------------------------------------------------
-// FieldRef
+// FieldPath
 
 size_t FieldPath::hash() const {
   return internal::ComputeStringHash<0>(indices().data(),
                                         indices().size() * sizeof(int));
@@ -1066,236 +1066,215 @@ std::string FieldPath::ToString() const {
   return repr;
 }
 
-class ChunkedColumn;
-using ChunkedColumnVector = std::vector<std::shared_ptr<ChunkedColumn>>;
+static Status NonStructError() {
+  return Status::NotImplemented("Get child data of non-struct array");
+}
 
-class ChunkedColumn {
+// Utility class for retrieving a child field/column from a top-level Field, Array, or
+// ChunkedArray. The "root" value can either be a single parent or a vector of its
+// children.
+template <typename T, bool IsFlattening = false>
+class NestedSelector {
  public:
-  virtual ~ChunkedColumn() = default;
-
-  explicit ChunkedColumn(const std::shared_ptr<DataType>& type = nullptr) : type_(type) {}
-
-  virtual int num_chunks() const = 0;
-  virtual const std::shared_ptr<ArrayData>& chunk(int i) const = 0;
+  using ArrowType = T;
+
+  explicit NestedSelector(const std::vector<std::shared_ptr<T>>& children)
+      : parent_or_children_(&children) {}
+  explicit NestedSelector(const T& parent) : parent_or_children_(&parent) {}
+  explicit NestedSelector(std::shared_ptr<T> parent)
+      : owned_parent_(std::move(parent)), parent_or_children_(owned_parent_.get()) {}
+  template <typename Arg>
+  NestedSelector(Arg&& arg, MemoryPool* pool) : NestedSelector(std::forward<Arg>(arg)) {
+    if (pool) {
+      pool_ = pool;
+    }
+  }
 
-  const std::shared_ptr<DataType>& type() const { return type_; }
+  // If the index is out of bounds, this returns an invalid selector rather than an
+  // error.
+  Result<NestedSelector> GetChild(int i) const {
+    std::shared_ptr<T> child;
+    if (auto parent = get_parent()) {
+      ARROW_ASSIGN_OR_RAISE(child, GetChild(*parent, i, pool_));
+    } else if (auto children = get_children()) {
+      if (ARROW_PREDICT_TRUE(i >= 0 && static_cast<size_t>(i) < children->size())) {
+        child = (*children)[i];
+      }
+    }
+    return NestedSelector(std::move(child), pool_);
+  }
 
-  ChunkedColumnVector Flatten() const;
+  Result<std::shared_ptr<T>> Finish() const {
+    DCHECK(get_parent() && owned_parent_);
+    return owned_parent_;
+  }
 
-  Result<std::shared_ptr<ChunkedArray>> ToChunkedArray() const {
-    if (num_chunks() == 0) {
-      return ChunkedArray::MakeEmpty(type());
-    }
-    ArrayVector chunks(num_chunks());
-    for (int i = 0; i < num_chunks(); ++i) {
-      chunks[i] = MakeArray(chunk(i));
-    }
-    return ChunkedArray::Make(std::move(chunks), type());
+  template <typename OStream, typename U = T>
+  std::enable_if_t<std::is_same_v<U, Field>> Summarize(OStream* os) const {
+    const FieldVector* fields = get_children();
+    if (!fields && get_parent()) {
+      fields = &get_parent()->type()->fields();
+    }
+    *os << "fields: { ";
+    if (fields) {
+      for (const auto& field : *fields) {
+        *os << field->ToString() << ", ";
+      }
+    }
+    *os << "}";
   }
 
- private:
-  const std::shared_ptr<DataType>& type_;
-};
-
-// References a chunk vector owned by another ChunkedArray.
-// This can be used to avoid transforming a top-level ChunkedArray's ArrayVector into an
-// ArrayDataVector if flattening isn't needed.
-class ChunkedArrayRef : public ChunkedColumn {
- public:
-  explicit ChunkedArrayRef(const ChunkedArray& chunked_array)
-      : ChunkedColumn(chunked_array.type()), chunks_(chunked_array.chunks()) {}
-
-  int num_chunks() const override { return static_cast<int>(chunks_.size()); }
-  const std::shared_ptr<ArrayData>& chunk(int i) const override {
-    return chunks_[i]->data();
-  }
+  template <typename OStream, typename U = T>
+  std::enable_if_t<!std::is_same_v<U, Field>> Summarize(OStream* os) const {
+    *os << "column types: { ";
+    if (auto children = get_children()) {
+      for (const auto& child : *children) {
+        *os << *child->type() << ", ";
+      }
+    } else if (auto parent = get_parent()) {
+      for (const auto& field : parent->type()->fields()) {
+        *os << *field->type() << ", ";
+      }
+    }
+    *os << "}";
+  }
 
- private:
-  const ArrayVector& chunks_;
-};
-
-// Owns a chunked ArrayDataVector (created after flattening its parent).
-class ChunkedArrayData : public ChunkedColumn {
- public:
-  explicit ChunkedArrayData(const std::shared_ptr<DataType>& type,
-                            ArrayDataVector chunks = {})
-      : ChunkedColumn(type), chunks_(std::move(chunks)) {}
-
-  int num_chunks() const override { return static_cast<int>(chunks_.size()); }
-  const std::shared_ptr<ArrayData>& chunk(int i) const override { return chunks_[i]; }
+  bool is_valid() const { return get_parent() || get_children(); }
+  operator bool() const { return is_valid(); }
 
  private:
-  ArrayDataVector chunks_;
-};
+  // Accessors for the variant
+  auto get_parent() const { return get_value<const T*>(); }
+  auto get_children() const {
+    return get_value<const std::vector<std::shared_ptr<T>>*>();
+  }
+  template <typename U>
+  U get_value() const {
+    auto ptr = std::get_if<U>(&parent_or_children_);
+    return ptr ? *ptr : nullptr;
+  }
 
-// Return a vector of ChunkedColumns - one for each struct field.
-// Unlike ChunkedArray::Flatten, this is zero-copy and doesn't merge parent/child
-// validity bitmaps.
-ChunkedColumnVector ChunkedColumn::Flatten() const {
-  DCHECK_EQ(type()->id(), Type::STRUCT);
-
-  ChunkedColumnVector columns(type()->num_fields());
-  for (int column_idx = 0; column_idx < type()->num_fields(); ++column_idx) {
-    const auto& child_type = type()->field(column_idx)->type();
-    ArrayDataVector chunks(num_chunks());
-    for (int chunk_idx = 0; chunk_idx < num_chunks(); ++chunk_idx) {
-      const auto& child_data = chunk(chunk_idx)->child_data;
-      DCHECK_EQ(columns.size(), child_data.size());
-      DCHECK(child_type->Equals(child_data[column_idx]->type));
-      chunks[chunk_idx] = child_data[column_idx];
-    }
-    columns[column_idx] =
-        std::make_shared<ChunkedArrayData>(child_type, std::move(chunks));
-  }
-  return columns;
-}
-
-struct FieldPathGetImpl {
-  static const DataType& GetType(const ArrayData& data) { return *data.type; }
-  static const DataType& GetType(const ChunkedColumn& column) { return *column.type(); }
+  static Result<std::shared_ptr<Field>> GetChild(const Field& field, int i, MemoryPool*) {
+    if (ARROW_PREDICT_FALSE(i < 0 || i >= field.type()->num_fields())) {
+      return nullptr;
+    }
+    return field.type()->field(i);
+  }
 
-  static void Summarize(const FieldVector& fields, std::stringstream* ss) {
-    *ss << "{ ";
-    for (const auto& field : fields) {
-      *ss << field->ToString() << ", ";
-    }
-    *ss << "}";
-  }
+  static Result<std::shared_ptr<Array>> GetChild(const Array& array, int i,
+                                                 MemoryPool* pool) {
+    if (ARROW_PREDICT_FALSE(array.type_id() != Type::STRUCT)) {
+      return NonStructError();
+    }
+    if (ARROW_PREDICT_FALSE(i < 0 || i >= array.num_fields())) {
+      return nullptr;
+    }
+
+    const auto& struct_array = checked_cast<const StructArray&>(array);
+    if constexpr (IsFlattening) {
+      return struct_array.GetFlattenedField(i, pool);
+    } else {
+      return struct_array.field(i);
+    }
+  }
 
-  template <typename T>
-  static void Summarize(const std::vector<T>& columns, std::stringstream* ss) {
-    *ss << "{ ";
-    for (const auto& column : columns) {
-      *ss << GetType(*column) << ", ";
-    }
-    *ss << "}";
-  }
+  static Result<std::shared_ptr<ChunkedArray>> GetChild(const ChunkedArray& chunked_array,
+                                                        int i, MemoryPool* pool) {
+    const auto& type = *chunked_array.type();
+    if (ARROW_PREDICT_FALSE(type.id() != Type::STRUCT)) {
+      return NonStructError();
+    }
+    if (ARROW_PREDICT_FALSE(i < 0 || i >= type.num_fields())) {
+      return nullptr;
+    }
+
+    ArrayVector chunks;
+    chunks.reserve(chunked_array.num_chunks());
+    for (const auto& parent_chunk : chunked_array.chunks()) {
+      ARROW_ASSIGN_OR_RAISE(auto chunk, GetChild(*parent_chunk, i, pool));
+      if (!chunk) return nullptr;
+      chunks.push_back(std::move(chunk));
+    }
+
+    return ChunkedArray::Make(std::move(chunks), type.field(i)->type());
+  }
 
+  std::shared_ptr<T> owned_parent_;
+  std::variant<const T*, const std::vector<std::shared_ptr<T>>*> parent_or_children_;
+  MemoryPool* pool_ = default_memory_pool();
+};
+
+using FieldSelector = NestedSelector<Field>;
+template <typename T>
+using ZeroCopySelector = NestedSelector<T, false>;
+template <typename T>
+using FlatteningSelector = NestedSelector<T, true>;
+
+struct FieldPathGetImpl {
+  template <typename Selector>
   static Status IndexError(const FieldPath* path, int out_of_range_depth,
-                           const std::vector<T>& children) {
+                           const Selector& selector) {
     std::stringstream ss;
     ss << "index out of range. ";
     ss << "indices=[ ";
     int depth = 0;
     for (int i : path->indices()) {
-      if (depth != out_of_range_depth) {
+      if (depth++ != out_of_range_depth) {
         ss << i << " ";
-        continue;
+      } else {
+        ss << ">" << i << "< ";
       }
-      ss << ">" << i << "< ";
-      ++depth;
     }
     ss << "] ";
-    if (std::is_same<T, std::shared_ptr<Field>>::value) {
-      ss << "fields were: ";
-    } else {
-      ss << "columns had types: ";
-    }
-    Summarize(children, &ss);
+    selector.Summarize(&ss);
     return Status::IndexError(ss.str());
   }
 
-  template <typename T, typename GetChildren>
-  static Result<T> Get(const FieldPath* path, const std::vector<T>* children,
-                       GetChildren&& get_children, int* out_of_range_depth) {
-    if (path->indices().empty()) {
+  template <typename Selector, typename T = typename Selector::ArrowType>
+  static Result<std::shared_ptr<T>> Get(const FieldPath* path, Selector selector,
+                                        int* out_of_range_depth = nullptr) {
+    if (path->empty()) {
       return Status::Invalid("empty indices cannot be traversed");
     }
 
     int depth = 0;
-    const T* out;
-    while (true) {
-      if (children == nullptr) {
-        return Status::NotImplemented("Get child data of non-struct array");
-      }
-
-      auto index = (*path)[depth];
-      if (index < 0 || static_cast<size_t>(index) >= children->size()) {
-        *out_of_range_depth = depth;
-        return nullptr;
+    for (auto index : *path) {
+      ARROW_ASSIGN_OR_RAISE(auto next_selector, selector.GetChild(index));
+
+      // Handle failed bounds check
+      if (!next_selector) {
+        if (out_of_range_depth) {
+          *out_of_range_depth = depth;
+          return nullptr;
+        }
+        return IndexError(path, depth, selector);
       }
-      out = &children->at(index);
-      if (static_cast<size_t>(++depth) == path->indices().size()) {
-        break;
-      }
-      children = get_children(*out);
+      selector = std::move(next_selector);
+      ++depth;
     }
-    return *out;
-  }
-
-  template <typename T, typename GetChildren>
-  static Result<T> Get(const FieldPath* path, const std::vector<T>* children,
-                       GetChildren&& get_children) {
-    int out_of_range_depth = -1;
-    ARROW_ASSIGN_OR_RAISE(auto child,
-                          Get(path, children, std::forward<GetChildren>(get_children),
-                              &out_of_range_depth));
-    if (child != nullptr) {
-      return std::move(child);
-    }
-    return IndexError(path, out_of_range_depth, *children);
-  }
-
-  static Result<std::shared_ptr<Field>> Get(const FieldPath* path,
-                                            const FieldVector& fields) {
-    return FieldPathGetImpl::Get(path, &fields, [](const std::shared_ptr<Field>& field) {
-      return &field->type()->fields();
-    });
-  }
-
-  static Result<std::shared_ptr<ChunkedArray>> Get(
-      const FieldPath* path, const ChunkedColumnVector& toplevel_children) {
-    ChunkedColumnVector children;
-
-    ARROW_ASSIGN_OR_RAISE(
-        auto child,
-        FieldPathGetImpl::Get(path, &toplevel_children,
-                              [&children](const std::shared_ptr<ChunkedColumn>& parent)
-                                  -> const ChunkedColumnVector* {
-                                if (parent->type()->id() != Type::STRUCT) {
-                                  return nullptr;
-                                }
-                                children = parent->Flatten();
-                                return &children;
-                              }));
-
-    return child->ToChunkedArray();
-  }
-
-  static Result<std::shared_ptr<ArrayData>> Get(const FieldPath* path,
-                                                const ArrayDataVector& child_data) {
-    return FieldPathGetImpl::Get(
-        path, &child_data,
-        [](const std::shared_ptr<ArrayData>& data) -> const ArrayDataVector* {
-          if (data->type->id() != Type::STRUCT) {
-            return nullptr;
-          }
-          return &data->child_data;
-        });
+    return selector.Finish();
   }
 };
 
 Result<std::shared_ptr<Field>> FieldPath::Get(const Schema& schema) const {
-  return FieldPathGetImpl::Get(this, schema.fields());
+  return Get(schema.fields());
 }
 
 Result<std::shared_ptr<Field>> FieldPath::Get(const Field& field) const {
-  return FieldPathGetImpl::Get(this, field.type()->fields());
+  return Get(field.type()->fields());
 }
 
 Result<std::shared_ptr<Field>> FieldPath::Get(const DataType& type) const {
-  return FieldPathGetImpl::Get(this, type.fields());
+  return Get(type.fields());
 }
 
 Result<std::shared_ptr<Field>> FieldPath::Get(const FieldVector& fields) const {
-  return FieldPathGetImpl::Get(this, fields);
+  return FieldPathGetImpl::Get(this, FieldSelector(fields));
 }
 
 Result<std::vector<FieldPath>> FieldPath::GetAll(const Schema& schm,
@@ -1310,40 +1289,61 @@ Result<std::vector<FieldPath>> FieldPath::GetAll(const Schema& schm,
 }
 
 Result<std::shared_ptr<Array>> FieldPath::Get(const RecordBatch& batch) const {
-  ARROW_ASSIGN_OR_RAISE(auto data, FieldPathGetImpl::Get(this, batch.column_data()));
-  return MakeArray(std::move(data));
+  return FieldPathGetImpl::Get(this, ZeroCopySelector<Array>(batch.columns()));
 }
 
 Result<std::shared_ptr<ChunkedArray>> FieldPath::Get(const Table& table) const {
-  ChunkedColumnVector columns(table.num_columns());
-  std::transform(table.columns().cbegin(), table.columns().cend(), columns.begin(),
-                 [](const std::shared_ptr<ChunkedArray>& chunked_array) {
-                   return std::make_shared<ChunkedArrayRef>(*chunked_array);
-                 });
-  return FieldPathGetImpl::Get(this, columns);
+  return FieldPathGetImpl::Get(this, ZeroCopySelector<ChunkedArray>(table.columns()));
 }
 
 Result<std::shared_ptr<Array>> FieldPath::Get(const Array& array) const {
-  ARROW_ASSIGN_OR_RAISE(auto data, Get(*array.data()));
-  return MakeArray(std::move(data));
+  return FieldPathGetImpl::Get(this, ZeroCopySelector<Array>(array));
}
 
 Result<std::shared_ptr<ArrayData>> FieldPath::Get(const ArrayData& data) const {
-  if (data.type->id() != Type::STRUCT) {
-    return Status::NotImplemented("Get child data of non-struct array");
-  }
-  return FieldPathGetImpl::Get(this, data.child_data);
+  // We indirect from ArrayData to Array rather than vice-versa because, when selecting
+  // a nested column, the StructArray::field method does the work of adjusting the
+  // data's offset/length if necessary.
+  ARROW_ASSIGN_OR_RAISE(auto array, Get(*MakeArray(data.Copy())));
+  return array->data();
 }
 
 Result<std::shared_ptr<ChunkedArray>> FieldPath::Get(
     const ChunkedArray& chunked_array) const {
-  if (chunked_array.type()->id() != Type::STRUCT) {
-    return Status::NotImplemented("Get child data of non-struct chunked array");
-  }
-  auto columns = ChunkedArrayRef(chunked_array).Flatten();
-  return FieldPathGetImpl::Get(this, columns);
+  return FieldPathGetImpl::Get(this, ZeroCopySelector<ChunkedArray>(chunked_array));
+}
+
+Result<std::shared_ptr<Array>> FieldPath::GetFlattened(const Array& array,
+                                                       MemoryPool* pool) const {
+  return FieldPathGetImpl::Get(this, FlatteningSelector<Array>(array, pool));
+}
+
+Result<std::shared_ptr<ArrayData>> FieldPath::GetFlattened(const ArrayData& data,
+                                                           MemoryPool* pool) const {
+  ARROW_ASSIGN_OR_RAISE(auto array, GetFlattened(*MakeArray(data.Copy()), pool));
+  return array->data();
+}
 
+Result<std::shared_ptr<ChunkedArray>> FieldPath::GetFlattened(
+    const ChunkedArray& chunked_array, MemoryPool* pool) const {
+  return FieldPathGetImpl::Get(this,
+                               FlatteningSelector<ChunkedArray>(chunked_array, pool));
+}
+
+Result<std::shared_ptr<Array>> FieldPath::GetFlattened(const RecordBatch& batch,
+                                                       MemoryPool* pool) const {
+  return FieldPathGetImpl::Get(this, FlatteningSelector<Array>(batch.columns(), pool));
+}
+
+Result<std::shared_ptr<ChunkedArray>> FieldPath::GetFlattened(const Table& table,
+                                                              MemoryPool* pool) const {
+  return FieldPathGetImpl::Get(this,
+                               FlatteningSelector<ChunkedArray>(table.columns(), pool));
+}
+
+// ----------------------------------------------------------------------
+// FieldRef
+
 FieldRef::FieldRef(FieldPath indices) : impl_(std::move(indices)) {}
 
 void FieldRef::Flatten(std::vector<FieldRef> children) {
@@ -1573,10 +1573,8 @@ std::vector<FieldPath> FieldRef::FindAll(const FieldVector& fields) const {
     std::vector<FieldPath> operator()(const FieldPath& path) {
       // skip long IndexError construction if path is out of range
       int out_of_range_depth;
-      auto maybe_field = FieldPathGetImpl::Get(
-          &path, &fields_,
-          [](const std::shared_ptr<Field>& field) { return &field->type()->fields(); },
-          &out_of_range_depth);
+      auto maybe_field =
+          FieldPathGetImpl::Get(&path, FieldSelector(fields_), &out_of_range_depth);
 
       DCHECK_OK(maybe_field.status());
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index a3e78eeb722..48228d43ef9 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -1699,6 +1699,22 @@ class ARROW_EXPORT FieldPath {
   /// \brief Retrieve the referenced child from a ChunkedArray
   Result<std::shared_ptr<ChunkedArray>> Get(const ChunkedArray& chunked_array) const;
 
+  /// \brief Retrieve the referenced child/column from an Array, ArrayData, ChunkedArray,
+  /// RecordBatch, or Table
+  ///
+  /// Unlike `FieldPath::Get`, these variants are not zero-copy and the retrieved child's
+  /// null bitmap is ANDed with its ancestors'
+  Result<std::shared_ptr<Array>> GetFlattened(const Array& array,
+                                              MemoryPool* pool = NULLPTR) const;
+  Result<std::shared_ptr<ArrayData>> GetFlattened(const ArrayData& data,
+                                                  MemoryPool* pool = NULLPTR) const;
+  Result<std::shared_ptr<ChunkedArray>> GetFlattened(const ChunkedArray& chunked_array,
+                                                     MemoryPool* pool = NULLPTR) const;
+  Result<std::shared_ptr<Array>> GetFlattened(const RecordBatch& batch,
+                                              MemoryPool* pool = NULLPTR) const;
+  Result<std::shared_ptr<ChunkedArray>> GetFlattened(const Table& table,
+                                                     MemoryPool* pool = NULLPTR) const;
+
  private:
   std::vector<int> indices_;
 };
@@ -1886,6 +1902,20 @@ class ARROW_EXPORT FieldRef : public util::EqualityComparable<FieldRef> {
     }
     return out;
   }
+  /// \brief Get all children matching this FieldRef.
+  ///
+  /// Unlike `FieldRef::GetAll`, this variant is not zero-copy and the retrieved
+  /// children's null bitmaps are ANDed with their ancestors'
+  template <typename T>
+  Result<std::vector<GetType<T>>> GetAllFlattened(const T& root,
+                                                  MemoryPool* pool = NULLPTR) const {
+    std::vector<GetType<T>> out;
+    for (const auto& match : FindAll(root)) {
+      ARROW_ASSIGN_OR_RAISE(auto child, match.GetFlattened(root, pool));
+      out.push_back(std::move(child));
+    }
+    return out;
+  }
 
   /// \brief Get the single child matching this FieldRef.
   /// Emit an error if none or multiple match.
@@ -1894,6 +1924,15 @@ class ARROW_EXPORT FieldRef : public util::EqualityComparable<FieldRef> {
     ARROW_ASSIGN_OR_RAISE(auto match, FindOne(root));
     return match.Get(root).ValueOrDie();
   }
+  /// \brief Get the single child matching this FieldRef.
+  ///
+  /// Unlike `FieldRef::GetOne`, this variant is not zero-copy and the retrieved
+  /// child's null bitmap is ANDed with its ancestors'
+  template <typename T>
+  Result<GetType<T>> GetOneFlattened(const T& root, MemoryPool* pool = NULLPTR) const {
+    ARROW_ASSIGN_OR_RAISE(auto match, FindOne(root));
+    return match.GetFlattened(root, pool);
+  }
 
   /// \brief Get the single child matching this FieldRef.
   /// Return nullptr if none match, emit an error if multiple match.
@@ -1905,6 +1944,20 @@ class ARROW_EXPORT FieldRef : public util::EqualityComparable<FieldRef> {
     }
     return match.Get(root).ValueOrDie();
   }
+  /// \brief Get the single child matching this FieldRef.
+  ///
+  /// Return nullptr if none match, emit an error if multiple match.
+  /// Unlike `FieldRef::GetOneOrNone`, this variant is not zero-copy and the
+  /// retrieved child's null bitmap is ANDed with its ancestors'
+  template <typename T>
+  Result<GetType<T>> GetOneOrNoneFlattened(const T& root,
+                                           MemoryPool* pool = NULLPTR) const {
+    ARROW_ASSIGN_OR_RAISE(auto match, FindOneOrNone(root));
+    if (match.empty()) {
+      return static_cast<GetType<T>>(NULLPTR);
+    }
+    return match.GetFlattened(root, pool);
+  }
 
  private:
   void Flatten(std::vector<FieldRef> children);
diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc
index 4fb9598f936..3c83da9f2e6 100644
--- a/cpp/src/arrow/type_test.cc
+++ b/cpp/src/arrow/type_test.cc
@@ -40,8 +40,6 @@
 
 namespace arrow {
 
-using testing::ElementsAre;
-
 using internal::checked_cast;
 using internal::checked_pointer_cast;
 
@@ -364,407 +362,6 @@ TEST(TestField, TestMerge) {
   }
 }
 
-TEST(TestFieldPath, Basics) {
-  auto f0 = field("alpha", int32());
-  auto f1 = field("beta", int32());
-  auto f2 = field("alpha", int32());
-  auto f3 = field("beta", int32());
-  Schema s({f0, f1, f2, f3});
-
-  // retrieving a field with single-element FieldPath is equivalent to Schema::field
-  for (int index = 0; index < s.num_fields(); ++index) {
-    ASSERT_OK_AND_EQ(s.field(index), FieldPath({index}).Get(s));
-  }
-  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid,
-                                  testing::HasSubstr("empty indices cannot be traversed"),
-                                  FieldPath().Get(s));
-  EXPECT_RAISES_WITH_MESSAGE_THAT(IndexError, testing::HasSubstr("index out of range"),
-                                  FieldPath({s.num_fields() * 2}).Get(s));
-}
-
-TEST(TestFieldPath, GetForTable) {
-  using testing::HasSubstr;
-
-  constexpr int kNumRows = 4;
-  auto f0 = field("a", int32());
-  auto f1 = field("b", int32());
-  auto f2 = field("c", struct_({f1}));
-  auto f3 = field("d", struct_({f0, f2}));
-  auto table_schema = schema({f0, f1, f2, f3});
-
-  // Each column has a different chunking
-  ChunkedArrayVector columns(4);
-  columns[0] = ChunkedArrayFromJSON(f0->type(), {"[0,1,2,3]"});
-  columns[1] = ChunkedArrayFromJSON(f1->type(), {"[3,2,1]", "[0]"});
-  columns[2] =
-      ChunkedArrayFromJSON(f2->type(), {R"([{"b":3},{"b":2}])",
R"([{"b":1},{"b":0}])"}); - columns[3] = ChunkedArrayFromJSON( - f3->type(), {R"([{"a":0,"c":{"b":3}},{"a":1,"c":{"b":2}}])", - R"([{"a":2,"c":{"b":1}}])", R"([{"a":3,"c":{"b":0}}])"}); - auto table = Table::Make(table_schema, columns, kNumRows); - ASSERT_OK(table->ValidateFull()); - - ASSERT_OK_AND_ASSIGN(auto v0, FieldPath({0}).Get(*table)); - ASSERT_OK_AND_ASSIGN(auto v1, FieldPath({1}).Get(*table)); - ASSERT_OK_AND_ASSIGN(auto v2, FieldPath({2}).Get(*table)); - ASSERT_OK_AND_ASSIGN(auto v2_0, FieldPath({2, 0}).Get(*table)); - ASSERT_OK_AND_ASSIGN(auto v3, FieldPath({3}).Get(*table)); - ASSERT_OK_AND_ASSIGN(auto v3_0, FieldPath({3, 0}).Get(*table)); - ASSERT_OK_AND_ASSIGN(auto v3_1, FieldPath({3, 1}).Get(*table)); - ASSERT_OK_AND_ASSIGN(auto v3_1_0, FieldPath({3, 1, 0}).Get(*table)); - - EXPECT_EQ(v0->num_chunks(), columns[0]->num_chunks()); - EXPECT_EQ(v1->num_chunks(), columns[1]->num_chunks()); - EXPECT_EQ(v2->num_chunks(), columns[2]->num_chunks()); - EXPECT_EQ(v2_0->num_chunks(), columns[2]->num_chunks()); - EXPECT_EQ(v3->num_chunks(), columns[3]->num_chunks()); - EXPECT_EQ(v3_0->num_chunks(), columns[3]->num_chunks()); - EXPECT_EQ(v3_1->num_chunks(), columns[3]->num_chunks()); - EXPECT_EQ(v3_1_0->num_chunks(), columns[3]->num_chunks()); - - EXPECT_TRUE(columns[0]->Equals(v0)); - EXPECT_TRUE(columns[0]->Equals(v3_0)); - - EXPECT_TRUE(columns[1]->Equals(v1)); - EXPECT_TRUE(columns[1]->Equals(v2_0)); - EXPECT_TRUE(columns[1]->Equals(v3_1_0)); - - EXPECT_TRUE(columns[2]->Equals(v2)); - EXPECT_TRUE(columns[2]->Equals(v3_1)); - - EXPECT_TRUE(columns[3]->Equals(v3)); - - for (const auto& path : - {FieldPath({4, 1, 0}), FieldPath({3, 2, 0}), FieldPath{3, 1, 1}}) { - EXPECT_RAISES_WITH_MESSAGE_THAT(IndexError, HasSubstr("index out of range"), - path.Get(*table)); - } - EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, HasSubstr("empty indices cannot be traversed"), - FieldPath().Get(*table)); -} - -TEST(TestFieldPath, GetForChunkedArray) { - using testing::HasSubstr; - - auto f0 = field("a", int32()); - auto f1 = field("b", int32()); - auto f2 = field("c", struct_({f1})); - auto f3 = field("d", struct_({f0, f2})); - auto type = struct_({f0, f1, f3}); - - auto column0 = ChunkedArrayFromJSON(f0->type(), {"[0,1,2,3]"}); - auto column1 = ChunkedArrayFromJSON(f1->type(), {"[3,2,1,0]"}); - auto column2_1 = - ChunkedArrayFromJSON(f2->type(), {R"([{"b":3},{"b":2},{"b":1},{"b":0}])"}); - auto chunked_array = ChunkedArrayFromJSON( - type, - { - R"([{"a":0,"b":3,"d":{"a":0,"c":{"b":3}}}])", - R"([{"a":1,"b":2,"d":{"a":1,"c":{"b":2}}},{"a":2,"b":1,"d":{"a":2,"c":{"b":1}}}])", - R"([{"a":3,"b":0,"d":{"a":3,"c":{"b":0}}}])", - }); - ASSERT_OK(chunked_array->ValidateFull()); - - ASSERT_OK_AND_ASSIGN(auto v0, FieldPath({0}).Get(*chunked_array)); - ASSERT_OK_AND_ASSIGN(auto v1, FieldPath({1}).Get(*chunked_array)); - ASSERT_OK_AND_ASSIGN(auto v2_0, FieldPath({2, 0}).Get(*chunked_array)); - ASSERT_OK_AND_ASSIGN(auto v2_1, FieldPath({2, 1}).Get(*chunked_array)); - ASSERT_OK_AND_ASSIGN(auto v2_1_0, FieldPath({2, 1, 0}).Get(*chunked_array)); - - for (const auto& v : {v0, v1, v2_0, v2_1, v2_1_0}) { - EXPECT_EQ(v->num_chunks(), chunked_array->num_chunks()); - } - - EXPECT_TRUE(column0->Equals(v0)); - EXPECT_TRUE(column0->Equals(v2_0)); - - EXPECT_TRUE(column1->Equals(v1)); - EXPECT_TRUE(column1->Equals(v2_1_0)); - EXPECT_FALSE(column1->Equals(v2_1)); - - EXPECT_TRUE(column2_1->Equals(v2_1)); - - EXPECT_RAISES_WITH_MESSAGE_THAT(NotImplemented, - HasSubstr("Get child data of non-struct chunked array"), - 
FieldPath({0}).Get(*column0)); -} - -TEST(TestFieldPath, GetForChunkedArrayWithNulls) { - auto int_field = field("i", int32()); - auto int_chunked_array = - ChunkedArrayFromJSON(int_field->type(), {"[0,1]", "[2,null]", "[3,4]"}); - - ASSERT_OK_AND_ASSIGN(auto null_bitmap, AllocateEmptyBitmap(2)); - ArrayVector struct_chunks; - for (const auto& int_chunk : int_chunked_array->chunks()) { - ASSERT_OK_AND_ASSIGN(auto chunk, - StructArray::Make({int_chunk}, {int_field}, null_bitmap, 2)); - struct_chunks.push_back(chunk); - } - - ASSERT_OK_AND_ASSIGN(auto struct_chunked_array, ChunkedArray::Make(struct_chunks)); - ASSERT_OK(struct_chunked_array->ValidateFull()); - // The top-level null bitmap shouldn't affect the validity of the returned child field. - ASSERT_OK_AND_ASSIGN(auto int_child, FieldPath({0}).Get(*struct_chunked_array)); - ASSERT_TRUE(int_chunked_array->Equals(int_child)); -} - -TEST(TestFieldPath, GetForEmptyChunked) { - FieldVector fields = { - field("i", int32()), - field("s", struct_({field("b", boolean()), field("f", float32())}))}; - std::shared_ptr child; - - // Empty ChunkedArray with no chunks - ChunkedArray chunked_array({}, struct_(fields)); - ASSERT_OK(chunked_array.ValidateFull()); - ASSERT_EQ(chunked_array.num_chunks(), 0); - ASSERT_OK_AND_ASSIGN(child, FieldPath({1, 1}).Get(chunked_array)); - AssertTypeEqual(float32(), child->type()); - ASSERT_EQ(child->length(), 0); - - // Empty Table with no column chunks - ChunkedArrayVector table_columns; - for (const auto& f : fields) { - table_columns.push_back(std::make_shared(ArrayVector{}, f->type())); - } - auto table = Table::Make(schema(fields), table_columns, 0); - ASSERT_OK(table->ValidateFull()); - for (const auto& column : table->columns()) { - ASSERT_EQ(column->num_chunks(), 0); - } - ASSERT_OK_AND_ASSIGN(child, FieldPath({1, 1}).Get(*table)); - AssertTypeEqual(float32(), child->type()); - ASSERT_EQ(child->length(), 0); -} - -TEST(TestFieldPath, GetForRecordBatch) { - using testing::HasSubstr; - - constexpr int kNumRows = 100; - auto f0 = field("alpha", int32()); - auto f1 = field("beta", int32()); - auto f2 = field("alpha", int32()); - auto f3 = field("beta", int32()); - auto schema = arrow::schema({f0, f1, f2, f3}); - - arrow::random::RandomArrayGenerator gen_{42}; - auto a0 = gen_.ArrayOf(int32(), kNumRows); - auto a1 = gen_.ArrayOf(int32(), kNumRows); - auto a2 = gen_.ArrayOf(int32(), kNumRows); - auto a3 = gen_.ArrayOf(int32(), kNumRows); - auto array_vector = ArrayVector({a0, a1, a2, a3}); - - auto record_batch_ptr = arrow::RecordBatch::Make(schema, kNumRows, array_vector); - ASSERT_OK(record_batch_ptr->ValidateFull()); - - // retrieving an array FieldPath is equivalent to RecordBatch::column - auto num_columns = record_batch_ptr->num_columns(); - auto record_batch_schema = record_batch_ptr->schema(); - for (int index = 0; index < num_columns; ++index) { - ASSERT_OK_AND_EQ(record_batch_schema->field(index), FieldPath({index}).Get(*schema)); - ASSERT_OK_AND_ASSIGN(auto field_path_column, - FieldPath({index}).Get(*record_batch_ptr)); - EXPECT_TRUE(field_path_column->Equals(record_batch_ptr->column(index))); - } - EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, HasSubstr("empty indices cannot be traversed"), - FieldPath().Get(*record_batch_ptr)); - EXPECT_RAISES_WITH_MESSAGE_THAT(IndexError, HasSubstr("index out of range"), - FieldPath({num_columns * 2}).Get(*record_batch_ptr)); -} - -TEST(TestFieldRef, Basics) { - auto f0 = field("alpha", int32()); - auto f1 = field("beta", int32()); - auto f2 = field("alpha", int32()); - 
auto f3 = field("beta", int32()); - Schema s({f0, f1, f2, f3}); - - // lookup by index returns Indices{index} - for (int index = 0; index < s.num_fields(); ++index) { - EXPECT_THAT(FieldRef(index).FindAll(s), ElementsAre(FieldPath{index})); - } - // out of range index results in a failure to match - EXPECT_THAT(FieldRef(s.num_fields() * 2).FindAll(s), ElementsAre()); - - // lookup by name returns the Indices of both matching fields - EXPECT_THAT(FieldRef("alpha").FindAll(s), ElementsAre(FieldPath{0}, FieldPath{2})); - EXPECT_THAT(FieldRef("beta").FindAll(s), ElementsAre(FieldPath{1}, FieldPath{3})); -} - -TEST(TestFieldRef, FindAllForTable) { - constexpr int kNumRows = 100; - auto f0 = field("alpha", int32()); - auto f1 = field("beta", int32()); - auto f2 = field("alpha", int32()); - auto f3 = field("beta", int32()); - auto schema = arrow::schema({f0, f1, f2, f3}); - - arrow::random::RandomArrayGenerator gen_{42}; - auto a0 = gen_.ArrayOf(int32(), kNumRows); - auto a1 = gen_.ArrayOf(int32(), kNumRows); - auto a2 = gen_.ArrayOf(int32(), kNumRows); - auto a3 = gen_.ArrayOf(int32(), kNumRows); - - auto table_ptr = Table::Make(schema, {a0, a1, a2, a3}); - ASSERT_OK(table_ptr->ValidateFull()); - - // lookup by index returns Indices{index} - auto schema_num_fields = table_ptr->schema()->num_fields(); - for (int index = 0; index < schema_num_fields; ++index) { - EXPECT_THAT(FieldRef(index).FindAll(*table_ptr), ElementsAre(FieldPath{index})); - } - // out of range index results in a failure to match - EXPECT_THAT(FieldRef(schema_num_fields * 2).FindAll(*table_ptr), ElementsAre()); - - //// lookup by name returns the Indices of both matching fields - EXPECT_THAT(FieldRef("alpha").FindAll(*table_ptr), - ElementsAre(FieldPath{0}, FieldPath{2})); - EXPECT_THAT(FieldRef("beta").FindAll(*table_ptr), - ElementsAre(FieldPath{1}, FieldPath{3})); -} - -TEST(TestFieldRef, FindAllForRecordBatch) { - constexpr int kNumRows = 100; - auto f0 = field("alpha", int32()); - auto f1 = field("beta", int32()); - auto f2 = field("alpha", int32()); - auto f3 = field("beta", int32()); - auto schema = arrow::schema({f0, f1, f2, f3}); - - arrow::random::RandomArrayGenerator gen_{42}; - auto a0 = gen_.ArrayOf(int32(), kNumRows); - auto a1 = gen_.ArrayOf(int32(), kNumRows); - auto a2 = gen_.ArrayOf(int32(), kNumRows); - auto a3 = gen_.ArrayOf(int32(), kNumRows); - - auto record_batch_ptr = RecordBatch::Make(schema, kNumRows, {a0, a1, a2, a3}); - ASSERT_OK(record_batch_ptr->ValidateFull()); - - // lookup by index returns Indices{index} - auto schema_num_fields = record_batch_ptr->schema()->num_fields(); - for (int index = 0; index < schema_num_fields; ++index) { - EXPECT_THAT(FieldRef(index).FindAll(*record_batch_ptr), - ElementsAre(FieldPath{index})); - } - // out of range index results in a failure to match - EXPECT_THAT(FieldRef(schema_num_fields * 2).FindAll(*record_batch_ptr), ElementsAre()); - - //// lookup by name returns the Indices of both matching fields - EXPECT_THAT(FieldRef("alpha").FindAll(*record_batch_ptr), - ElementsAre(FieldPath{0}, FieldPath{2})); - EXPECT_THAT(FieldRef("beta").FindAll(*record_batch_ptr), - ElementsAre(FieldPath{1}, FieldPath{3})); -} - -TEST(TestFieldRef, FromDotPath) { - ASSERT_OK_AND_EQ(FieldRef("alpha"), FieldRef::FromDotPath(R"(.alpha)")); - - ASSERT_OK_AND_EQ(FieldRef("", ""), FieldRef::FromDotPath(R"(..)")); - - ASSERT_OK_AND_EQ(FieldRef(2), FieldRef::FromDotPath(R"([2])")); - - ASSERT_OK_AND_EQ(FieldRef("beta", 3), FieldRef::FromDotPath(R"(.beta[3])")); - - 
ASSERT_OK_AND_EQ(FieldRef(5, "gamma", "delta", 7), - FieldRef::FromDotPath(R"([5].gamma.delta[7])")); - - ASSERT_OK_AND_EQ(FieldRef("hello world"), FieldRef::FromDotPath(R"(.hello world)")); - - ASSERT_OK_AND_EQ(FieldRef(R"([y]\tho.\)"), FieldRef::FromDotPath(R"(.\[y\]\\tho\.\)")); - - ASSERT_OK_AND_EQ(FieldRef(), FieldRef::FromDotPath(R"()")); - - ASSERT_RAISES(Invalid, FieldRef::FromDotPath(R"(alpha)")); - ASSERT_RAISES(Invalid, FieldRef::FromDotPath(R"([134234)")); - ASSERT_RAISES(Invalid, FieldRef::FromDotPath(R"([1stuf])")); -} - -TEST(TestFieldRef, DotPathRoundTrip) { - auto check_roundtrip = [](const FieldRef& ref) { - auto dot_path = ref.ToDotPath(); - ASSERT_OK_AND_EQ(ref, FieldRef::FromDotPath(dot_path)); - }; - - check_roundtrip(FieldRef()); - check_roundtrip(FieldRef("foo")); - check_roundtrip(FieldRef("foo", 1, "bar", 2, 3)); - check_roundtrip(FieldRef(1, 2, 3)); - check_roundtrip(FieldRef("foo", 1, FieldRef("bar", 2, 3), FieldRef())); -} - -TEST(TestFieldPath, Nested) { - auto f0 = field("alpha", int32()); - auto f1_0 = field("alpha", int32()); - auto f1 = field("beta", struct_({f1_0})); - auto f2_0 = field("alpha", int32()); - auto f2_1_0 = field("alpha", int32()); - auto f2_1_1 = field("alpha", int32()); - auto f2_1 = field("gamma", struct_({f2_1_0, f2_1_1})); - auto f2 = field("beta", struct_({f2_0, f2_1})); - Schema s({f0, f1, f2}); - - // retrieving fields with nested indices - EXPECT_EQ(FieldPath({0}).Get(s), f0); - EXPECT_EQ(FieldPath({1, 0}).Get(s), f1_0); - EXPECT_EQ(FieldPath({2, 0}).Get(s), f2_0); - EXPECT_EQ(FieldPath({2, 1, 0}).Get(s), f2_1_0); - EXPECT_EQ(FieldPath({2, 1, 1}).Get(s), f2_1_1); -} - -TEST(TestFieldRef, Nested) { - auto f0 = field("alpha", int32()); - auto f1_0 = field("alpha", int32()); - auto f1 = field("beta", struct_({f1_0})); - auto f2_0 = field("alpha", int32()); - auto f2_1_0 = field("alpha", int32()); - auto f2_1_1 = field("alpha", int32()); - auto f2_1 = field("gamma", struct_({f2_1_0, f2_1_1})); - auto f2 = field("beta", struct_({f2_0, f2_1})); - Schema s({f0, f1, f2}); - - EXPECT_THAT(FieldRef("beta", "alpha").FindAll(s), - ElementsAre(FieldPath{1, 0}, FieldPath{2, 0})); - EXPECT_THAT(FieldRef("beta", "gamma", "alpha").FindAll(s), - ElementsAre(FieldPath{2, 1, 0}, FieldPath{2, 1, 1})); -} - -TEST(TestFieldRef, Flatten) { - FieldRef ref; - - auto assert_name = [](const FieldRef& ref, const std::string& expected) { - ASSERT_TRUE(ref.IsName()); - ASSERT_EQ(*ref.name(), expected); - }; - - auto assert_path = [](const FieldRef& ref, const std::vector& expected) { - ASSERT_TRUE(ref.IsFieldPath()); - ASSERT_EQ(ref.field_path()->indices(), expected); - }; - - auto assert_nested = [](const FieldRef& ref, const std::vector& expected) { - ASSERT_TRUE(ref.IsNested()); - ASSERT_EQ(*ref.nested_refs(), expected); - }; - - assert_path(FieldRef(), {}); - assert_path(FieldRef(1, 2, 3), {1, 2, 3}); - // If all leaves are field paths, they are fully flattened - assert_path(FieldRef(1, FieldRef(2, 3)), {1, 2, 3}); - assert_path(FieldRef(1, FieldRef(2, 3), FieldRef(), FieldRef(FieldRef(4), FieldRef(5))), - {1, 2, 3, 4, 5}); - assert_path(FieldRef(FieldRef(), FieldRef(FieldRef(), FieldRef())), {}); - - assert_name(FieldRef("foo"), "foo"); - - // Nested empty field refs are optimized away - assert_nested(FieldRef("foo", 1, FieldRef(), FieldRef(FieldRef(), "bar")), - {FieldRef("foo"), FieldRef(1), FieldRef("bar")}); - // For now, subsequences of indices are not concatenated - assert_nested(FieldRef("foo", FieldRef("bar"), FieldRef(1, 2), FieldRef(3)), - 
{FieldRef("foo"), FieldRef("bar"), FieldRef(1, 2), FieldRef(3)}); -} - using TestSchema = ::testing::Test; TEST_F(TestSchema, Basics) {