From 05f4c39935a9ad22f724e085c5f1c1eb0a710ecf Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 23 Nov 2023 01:19:08 +0800 Subject: [PATCH 1/3] finish the path part impl --- cpp/src/parquet/arrow/path_internal.cc | 63 +++++++++++++++------ cpp/src/parquet/arrow/path_internal_test.cc | 2 + cpp/src/parquet/arrow/schema.cc | 2 + 3 files changed, 51 insertions(+), 16 deletions(-) diff --git a/cpp/src/parquet/arrow/path_internal.cc b/cpp/src/parquet/arrow/path_internal.cc index 2d20403eac0..2e04069829a 100644 --- a/cpp/src/parquet/arrow/path_internal.cc +++ b/cpp/src/parquet/arrow/path_internal.cc @@ -312,7 +312,7 @@ struct NullableTerminalNode { // at least one other node). // // Type parameters: -// |RangeSelector| - A strategy for determine the the range of the child node to +// |RangeSelector| - A strategy for determine the range of the child node to // process. // this varies depending on the type of list (int32_t* offsets, int64_t* offsets of // fixed. @@ -450,6 +450,17 @@ struct FixedSizedRangeSelector { int list_size; }; +template +struct VarRangeViewSelector { + ElementRange GetRange(int64_t index) const { + return ElementRange{offsets[index], offsets[index] + sizes[index]}; + } + + // Either int32_t* or int64_t*. + const OffsetType* offsets; + const OffsetType* sizes; +}; + // An intermediate node that handles null values. class NullableNode { public: @@ -510,6 +521,8 @@ class NullableNode { using ListNode = ListPathNode>; using LargeListNode = ListPathNode>; +using ListViewNode = ListPathNode>; +using LargeListViewNode = ListPathNode>; using FixedSizeListNode = ListPathNode; // Contains static information derived from traversing the schema. @@ -517,9 +530,9 @@ struct PathInfo { // The vectors are expected to the same length info. // Note index order matters here. - using Node = - std::variant; + using Node = std::variant; std::vector path; std::shared_ptr primitive_array; @@ -579,15 +592,21 @@ Status WritePath(ElementRange root_range, PathInfo* path_info, IterationResult operator()(NullableNode& node) { return node.Run(stack_position, stack_position + 1, context); } - IterationResult operator()(ListNode& node) { - return node.Run(stack_position, stack_position + 1, context); - } IterationResult operator()(NullableTerminalNode& node) { return node.Run(*stack_position, context); } + IterationResult operator()(ListNode& node) { + return node.Run(stack_position, stack_position + 1, context); + } IterationResult operator()(FixedSizeListNode& node) { return node.Run(stack_position, stack_position + 1, context); } + IterationResult operator()(ListViewNode& node) { + return node.Run(stack_position, stack_position + 1, context); + } + IterationResult operator()(LargeListViewNode& node) { + return node.Run(stack_position, stack_position + 1, context); + } IterationResult operator()(AllPresentTerminalNode& node) { return node.Run(*stack_position, context); } @@ -651,6 +670,8 @@ struct FixupVisitor { void operator()(ListNode& node) { HandleListNode(node); } void operator()(LargeListNode& node) { HandleListNode(node); } void operator()(FixedSizeListNode& node) { HandleListNode(node); } + void operator()(ListViewNode& node) { HandleListNode(node); } + void operator()(LargeListViewNode& node) { HandleListNode(node); } // For non-list intermediate nodes. template @@ -724,19 +745,31 @@ class PathBuilder { template ::arrow::enable_if_t::value || - std::is_same<::arrow::LargeListArray, T>::value, + std::is_same<::arrow::LargeListArray, T>::value || + std::is_same<::arrow::ListViewArray, T>::value || + std::is_same<::arrow::LargeListViewArray, T>::value, Status> Visit(const T& array) { MaybeAddNullable(array); // Increment necessary due to empty lists. info_.max_def_level++; info_.max_rep_level++; - // raw_value_offsets() accounts for any slice offset. - ListPathNode> node( - VarRangeSelector{array.raw_value_offsets()}, - info_.max_rep_level, info_.max_def_level - 1); - info_.path.emplace_back(std::move(node)); - nullable_in_parent_ = array.list_type()->value_field()->nullable(); + // raw_value_offsets() and raw_value_sizes() accounts for any slice offset/size. + if constexpr (std::is_same<::arrow::ListViewArray, T>::value || + std::is_same<::arrow::LargeListViewArray, T>::value) { + ListPathNode> node( + VarRangeViewSelector{array.raw_value_offsets(), + array.raw_value_sizes()}, + info_.max_rep_level, info_.max_def_level - 1); + info_.path.emplace_back(std::move(node)); + nullable_in_parent_ = array.list_view_type()->value_field()->nullable(); + } else { + ListPathNode> node( + VarRangeSelector{array.raw_value_offsets()}, + info_.max_rep_level, info_.max_def_level - 1); + info_.path.emplace_back(std::move(node)); + nullable_in_parent_ = array.list_type()->value_field()->nullable(); + } return VisitInline(*array.values()); } @@ -830,8 +863,6 @@ class PathBuilder { // Types not yet supported in Parquet. NOT_IMPLEMENTED_VISIT(Union) NOT_IMPLEMENTED_VISIT(RunEndEncoded); - NOT_IMPLEMENTED_VISIT(ListView); - NOT_IMPLEMENTED_VISIT(LargeListView); #undef NOT_IMPLEMENTED_VISIT std::vector& paths() { return paths_; } diff --git a/cpp/src/parquet/arrow/path_internal_test.cc b/cpp/src/parquet/arrow/path_internal_test.cc index fb9c404247f..7d866adae4e 100644 --- a/cpp/src/parquet/arrow/path_internal_test.cc +++ b/cpp/src/parquet/arrow/path_internal_test.cc @@ -643,4 +643,6 @@ TEST_F(MultipathLevelBuilderTest, TestPrimitiveNonNullable) { EXPECT_THAT(results_[0].post_list_elements[0].end, Eq(4)); } +// TODO(mwish): testing ListView and LargeListView. + } // namespace parquet::arrow diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index f5484f131eb..45e4da44efe 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -78,6 +78,7 @@ Status FieldToNode(const std::string& name, const std::shared_ptr& field, const WriterProperties& properties, const ArrowWriterProperties& arrow_properties, NodePtr* out); +// TODO(mwish): handle LIST_VIEW Status ListToNode(const std::shared_ptr<::arrow::BaseListType>& type, const std::string& name, bool nullable, int field_id, const WriterProperties& properties, @@ -839,6 +840,7 @@ Status GetOriginSchema(const std::shared_ptr& metadata, Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* inferred); +// TODO(mwish): handle {LARGE_}LIST_VIEW here. std::function(FieldVector)> GetNestedFactory( const ArrowType& origin_type, const ArrowType& inferred_type) { switch (inferred_type.id()) { From f5cac8c9bcd9d395e0f443b729e23db02a3f0043 Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 30 Nov 2023 16:34:55 +0800 Subject: [PATCH 2/3] add test in path_internal_test --- cpp/src/parquet/arrow/path_internal_test.cc | 245 ++++++++++++-------- cpp/src/parquet/arrow/schema.cc | 23 +- 2 files changed, 162 insertions(+), 106 deletions(-) diff --git a/cpp/src/parquet/arrow/path_internal_test.cc b/cpp/src/parquet/arrow/path_internal_test.cc index 7d866adae4e..30ee676e6b6 100644 --- a/cpp/src/parquet/arrow/path_internal_test.cc +++ b/cpp/src/parquet/arrow/path_internal_test.cc @@ -34,6 +34,7 @@ namespace parquet::arrow { using ::arrow::default_memory_pool; using ::arrow::field; using ::arrow::fixed_size_list; +using ::arrow::list_view; using ::arrow::Status; using ::testing::ElementsAre; using ::testing::ElementsAreArray; @@ -113,9 +114,50 @@ class MultipathLevelBuilderTest : public testing::Test { ArrowWriteContext(default_memory_pool(), arrow_properties_.get()); }; -TEST_F(MultipathLevelBuilderTest, NonNullableSingleListNonNullableEntries) { +class MultipathLevelListTool { + public: + static std::shared_ptr<::arrow::DataType> get_list( + const std::shared_ptr<::arrow::Field>& element) { + return ::arrow::list(element); + } + static std::shared_ptr<::arrow::DataType> get_list( + const std::shared_ptr<::arrow::DataType>& element) { + return ::arrow::list(element); + } + static std::shared_ptr<::arrow::DataType> get_large_list( + const std::shared_ptr<::arrow::Field>& element) { + return ::arrow::large_list(element); + } +}; + +class MultipathLevelListViewTool { + public: + static std::shared_ptr<::arrow::DataType> get_list( + const std::shared_ptr<::arrow::Field>& element) { + return ::arrow::list_view(element); + } + static std::shared_ptr<::arrow::DataType> get_list( + const std::shared_ptr<::arrow::DataType>& element) { + return ::arrow::list_view(element); + } + static std::shared_ptr<::arrow::DataType> get_large_list( + const std::shared_ptr<::arrow::Field>& element) { + return ::arrow::large_list_view(element); + } +}; + +template +class MultipathLevelBuilderListTest : public MultipathLevelBuilderTest { + using MultipathLevelBuilderTest::MultipathLevelBuilderTest; +}; + +using TestListTypes = + ::testing::Types; +TYPED_TEST_SUITE(MultipathLevelBuilderListTest, TestListTypes); + +TYPED_TEST(MultipathLevelBuilderListTest, NonNullableSingleListNonNullableEntries) { auto entries = field("Entries", ::arrow::int64(), /*nullable=*/false); - auto list_type = large_list(entries); + auto list_type = TypeParam::get_large_list(entries); // Translates to parquet schema: // required group bag { // repeated group [unseen] (List) { @@ -127,11 +169,11 @@ TEST_F(MultipathLevelBuilderTest, NonNullableSingleListNonNullableEntries) { // def level 1: a non-null entry auto array = ::arrow::ArrayFromJSON(list_type, R"([[1], [2, 3], [4, 5, 6]])"); - ASSERT_OK( - MultipathLevelBuilder::Write(*array, /*nullable=*/false, &context_, callback_)); + ASSERT_OK(MultipathLevelBuilder::Write(*array, /*nullable=*/false, &this->context_, + this->callback_)); - ASSERT_THAT(results_, SizeIs(1)); - const CapturedResult& result = results_[0]; + ASSERT_THAT(this->results_, SizeIs(1)); + const CapturedResult& result = this->results_[0]; result.CheckLevels(/*def_levels=*/std::vector(/*count=*/6, 1), /*rep_levels=*/{0, 0, 1, 0, 1, 1}); @@ -141,9 +183,9 @@ TEST_F(MultipathLevelBuilderTest, NonNullableSingleListNonNullableEntries) { EXPECT_THAT(result.post_list_elements[0].end, Eq(6)); } -TEST_F(MultipathLevelBuilderTest, NullableSingleListWithAllNullsLists) { +TYPED_TEST(MultipathLevelBuilderListTest, NullableSingleListWithAllNullsLists) { auto entries = field("Entries", ::arrow::int64(), /*nullable=*/false); - auto list_type = list(entries); + auto list_type = TypeParam::get_list(entries); // Translates to parquet schema: // optional group bag { // repeated group [unseen] (List) { @@ -157,18 +199,18 @@ TEST_F(MultipathLevelBuilderTest, NullableSingleListWithAllNullsLists) { auto array = ::arrow::ArrayFromJSON(list_type, R"([null, null, null, null])"); - ASSERT_OK( - MultipathLevelBuilder::Write(*array, /*nullable=*/true, &context_, callback_)); + ASSERT_OK(MultipathLevelBuilder::Write(*array, /*nullable=*/true, &this->context_, + this->callback_)); - ASSERT_THAT(results_, SizeIs(1)); - const CapturedResult& result = results_[0]; + ASSERT_THAT(this->results_, SizeIs(1)); + const CapturedResult& result = this->results_[0]; result.CheckLevels(/*def_levels=*/std::vector(/*count=*/4, 0), /*rep_levels=*/std::vector(4, 0)); } -TEST_F(MultipathLevelBuilderTest, NullableSingleListWithMixedElements) { +TYPED_TEST(MultipathLevelBuilderListTest, NullableSingleListWithMixedElements) { auto entries = field("Entries", ::arrow::int64(), /*nullable=*/false); - auto list_type = list(entries); + auto list_type = TypeParam::get_list(entries); // Translates to parquet schema: // optional group bag { // repeated group [unseen] (List) { @@ -182,19 +224,19 @@ TEST_F(MultipathLevelBuilderTest, NullableSingleListWithMixedElements) { auto array = ::arrow::ArrayFromJSON(list_type, R"([null, [], null, [1]])"); - ASSERT_OK( - MultipathLevelBuilder::Write(*array, /*nullable=*/true, &context_, callback_)); + ASSERT_OK(MultipathLevelBuilder::Write(*array, /*nullable=*/true, &this->context_, + this->callback_)); - ASSERT_THAT(results_, SizeIs(1)); - const CapturedResult& result = results_[0]; + ASSERT_THAT(this->results_, SizeIs(1)); + const CapturedResult& result = this->results_[0]; result.CheckLevels(/*def_levels=*/std::vector{0, 1, 0, 2}, /*rep_levels=*/std::vector(/*count=*/4, 0)); } -TEST_F(MultipathLevelBuilderTest, EmptyLists) { +TYPED_TEST(MultipathLevelBuilderListTest, EmptyLists) { // ARROW-13676 - ensure no out of bounds list memory accesses. auto entries = field("Entries", ::arrow::int64()); - auto list_type = list(entries); + auto list_type = TypeParam::get_list(entries); // Number of elements is important, to work past buffer padding hiding // the issue. auto array = ::arrow::ArrayFromJSON(list_type, R"([ @@ -212,18 +254,18 @@ TEST_F(MultipathLevelBuilderTest, EmptyLists) { // def level 2: a null entry // def level 3: a non-null entry - ASSERT_OK( - MultipathLevelBuilder::Write(*array, /*nullable=*/true, &context_, callback_)); + ASSERT_OK(MultipathLevelBuilder::Write(*array, /*nullable=*/true, &this->context_, + this->callback_)); - ASSERT_THAT(results_, SizeIs(1)); - const CapturedResult& result = results_[0]; + ASSERT_THAT(this->results_, SizeIs(1)); + const CapturedResult& result = this->results_[0]; result.CheckLevels(/*def_levels=*/std::vector(/*count=*/15, 1), /*rep_levels=*/std::vector(15, 0)); } -TEST_F(MultipathLevelBuilderTest, NullableSingleListWithAllEmptyLists) { +TYPED_TEST(MultipathLevelBuilderListTest, NullableSingleListWithAllEmptyLists) { auto entries = field("Entries", ::arrow::int64(), /*nullable=*/false); - auto list_type = list(entries); + auto list_type = TypeParam::get_list(entries); // Translates to parquet schema: // optional group bag { // repeated group [unseen] (List) { @@ -237,11 +279,11 @@ TEST_F(MultipathLevelBuilderTest, NullableSingleListWithAllEmptyLists) { auto array = ::arrow::ArrayFromJSON(list_type, R"([[], [], [], []])"); - ASSERT_OK( - MultipathLevelBuilder::Write(*array, /*nullable=*/true, &context_, callback_)); + ASSERT_OK(MultipathLevelBuilder::Write(*array, /*nullable=*/true, &this->context_, + this->callback_)); - ASSERT_THAT(results_, SizeIs(1)); - const CapturedResult& result = results_[0]; + ASSERT_THAT(this->results_, SizeIs(1)); + const CapturedResult& result = this->results_[0]; result.CheckLevels(/*def_levels=*/std::vector(/*count=*/4, 1), /*rep_levels=*/std::vector(4, 0)); } @@ -259,17 +301,17 @@ TEST_F(MultipathLevelBuilderTest, NullableSingleListWithAllEmptyLists) { // def level 2: a null entry // def level 3: a non-null entry -TEST_F(MultipathLevelBuilderTest, NullableSingleListWithAllNullEntries) { +TYPED_TEST(MultipathLevelBuilderListTest, NullableSingleListWithAllNullEntries) { auto entries = field("Entries", ::arrow::int64(), /*nullable=*/true); - auto list_type = list(entries); + auto list_type = TypeParam::get_list(entries); auto array = ::arrow::ArrayFromJSON(list_type, R"([[null], [null], [null], [null]])"); - ASSERT_OK( - MultipathLevelBuilder::Write(*array, /*nullable=*/true, &context_, callback_)); + ASSERT_OK(MultipathLevelBuilder::Write(*array, /*nullable=*/true, &this->context_, + this->callback_)); - ASSERT_THAT(results_, SizeIs(1)); - const CapturedResult& result = results_[0]; + ASSERT_THAT(this->results_, SizeIs(1)); + const CapturedResult& result = this->results_[0]; result.CheckLevels(/*def_levels=*/std::vector(/*count=*/4, 2), /*rep_levels=*/std::vector(4, 0)); ASSERT_THAT(result.post_list_elements, SizeIs(1)); @@ -277,17 +319,17 @@ TEST_F(MultipathLevelBuilderTest, NullableSingleListWithAllNullEntries) { EXPECT_THAT(result.post_list_elements[0].end, Eq(4)); } -TEST_F(MultipathLevelBuilderTest, NullableSingleListWithAllPresentEntries) { +TYPED_TEST(MultipathLevelBuilderListTest, NullableSingleListWithAllPresentEntries) { auto entries = field("Entries", ::arrow::int64(), /*nullable=*/true); - auto list_type = list(entries); + auto list_type = TypeParam::get_list(entries); auto array = ::arrow::ArrayFromJSON(list_type, R"([[], [], [1], [], [2, 3]])"); - ASSERT_OK( - MultipathLevelBuilder::Write(*array, /*nullable=*/true, &context_, callback_)); + ASSERT_OK(MultipathLevelBuilder::Write(*array, /*nullable=*/true, &this->context_, + this->callback_)); - ASSERT_THAT(results_, SizeIs(1)); - const CapturedResult& result = results_[0]; + ASSERT_THAT(this->results_, SizeIs(1)); + const CapturedResult& result = this->results_[0]; result.CheckLevels(/*def_levels=*/std::vector{1, 1, 3, 1, 3, 3}, /*rep_levels=*/std::vector{0, 0, 0, 0, 0, 1}); @@ -296,33 +338,34 @@ TEST_F(MultipathLevelBuilderTest, NullableSingleListWithAllPresentEntries) { EXPECT_THAT(result.post_list_elements[0].end, Eq(3)); } -TEST_F(MultipathLevelBuilderTest, NullableSingleListWithAllEmptyEntries) { +TYPED_TEST(MultipathLevelBuilderListTest, NullableSingleListWithAllEmptyEntries) { auto entries = field("Entries", ::arrow::int64(), /*nullable=*/true); - auto list_type = list(entries); + auto list_type = TypeParam::get_list(entries); auto array = ::arrow::ArrayFromJSON(list_type, R"([[], [], [], [], []])"); - ASSERT_OK( - MultipathLevelBuilder::Write(*array, /*nullable=*/true, &context_, callback_)); + ASSERT_OK(MultipathLevelBuilder::Write(*array, /*nullable=*/true, &this->context_, + this->callback_)); - ASSERT_THAT(results_, SizeIs(1)); - const CapturedResult& result = results_[0]; + ASSERT_THAT(this->results_, SizeIs(1)); + const CapturedResult& result = this->results_[0]; result.CheckLevels(/*def_levels=*/std::vector(/*count=*/5, 1), /*rep_levels=*/std::vector(/*count=*/5, 0)); } -TEST_F(MultipathLevelBuilderTest, NullableSingleListWithSomeNullEntriesAndSomeNullLists) { +TYPED_TEST(MultipathLevelBuilderListTest, + NullableSingleListWithSomeNullEntriesAndSomeNullLists) { auto entries = field("Entries", ::arrow::int64(), /*nullable=*/true); - auto list_type = list(entries); + auto list_type = TypeParam::get_list(entries); auto array = ::arrow::ArrayFromJSON( list_type, R"([null, [1 , 2, 3], [], [], null, null, [4, 5], [null]])"); - ASSERT_OK( - MultipathLevelBuilder::Write(*array, /*nullable=*/true, &context_, callback_)); + ASSERT_OK(MultipathLevelBuilder::Write(*array, /*nullable=*/true, &this->context_, + this->callback_)); - ASSERT_THAT(results_, SizeIs(1)); - const CapturedResult& result = results_[0]; + ASSERT_THAT(this->results_, SizeIs(1)); + const CapturedResult& result = this->results_[0]; result.CheckLevels( /*def_levels=*/std::vector{0, 3, 3, 3, 1, 1, 0, 0, 3, 3, 2}, @@ -348,51 +391,51 @@ TEST_F(MultipathLevelBuilderTest, NullableSingleListWithSomeNullEntriesAndSomeNu // def level 4: a null entry // def level 5: a non-null entry -TEST_F(MultipathLevelBuilderTest, NestedListsWithSomeEntries) { +TYPED_TEST(MultipathLevelBuilderListTest, NestedListsWithSomeEntries) { auto entries = field("Entries", ::arrow::int64(), /*nullable=*/true); - auto list_field = field("list", list(entries), /*nullable=*/true); - auto nested_list_type = list(list_field); + auto list_field = field("list", TypeParam::get_list(entries), /*nullable=*/true); + auto nested_list_type = TypeParam::get_list(list_field); auto array = ::arrow::ArrayFromJSON( nested_list_type, R"([null, [[1 , 2, 3], [4, 5]], [[], [], []], []])"); - ASSERT_OK( - MultipathLevelBuilder::Write(*array, /*nullable=*/true, &context_, callback_)); + ASSERT_OK(MultipathLevelBuilder::Write(*array, /*nullable=*/true, &this->context_, + this->callback_)); - ASSERT_THAT(results_, SizeIs(1)); - const CapturedResult& result = results_[0]; + ASSERT_THAT(this->results_, SizeIs(1)); + const CapturedResult& result = this->results_[0]; result.CheckLevels(/*def_levels=*/std::vector{0, 5, 5, 5, 5, 5, 3, 3, 3, 1}, /*rep_levels=*/std::vector{0, 0, 2, 2, 1, 2, 0, 1, 1, 0}); } -TEST_F(MultipathLevelBuilderTest, NestedListsWithSomeNulls) { +TYPED_TEST(MultipathLevelBuilderListTest, NestedListsWithSomeNulls) { auto entries = field("Entries", ::arrow::int64(), /*nullable=*/true); - auto list_field = field("list", list(entries), /*nullable=*/true); - auto nested_list_type = list(list_field); + auto list_field = field("list", TypeParam::get_list(entries), /*nullable=*/true); + auto nested_list_type = TypeParam::get_list(list_field); auto array = ::arrow::ArrayFromJSON(nested_list_type, R"([null, [[1, null, 3], null, null], [[4, 5]]])"); - ASSERT_OK( - MultipathLevelBuilder::Write(*array, /*nullable=*/true, &context_, callback_)); + ASSERT_OK(MultipathLevelBuilder::Write(*array, /*nullable=*/true, &this->context_, + this->callback_)); - ASSERT_THAT(results_, SizeIs(1)); - const CapturedResult& result = results_[0]; + ASSERT_THAT(this->results_, SizeIs(1)); + const CapturedResult& result = this->results_[0]; result.CheckLevels(/*def_levels=*/std::vector{0, 5, 4, 5, 2, 2, 5, 5}, /*rep_levels=*/std::vector{0, 0, 2, 2, 1, 1, 0, 2}); } -TEST_F(MultipathLevelBuilderTest, NestedListsWithSomeNullsSomeEmptys) { +TYPED_TEST(MultipathLevelBuilderListTest, NestedListsWithSomeNullsSomeEmptys) { auto entries = field("Entries", ::arrow::int64(), /*nullable=*/true); - auto list_field = field("list", list(entries), /*nullable=*/true); - auto nested_list_type = list(list_field); + auto list_field = field("list", TypeParam::get_list(entries), /*nullable=*/true); + auto nested_list_type = TypeParam::get_list(list_field); auto array = ::arrow::ArrayFromJSON(nested_list_type, R"([null, [[1 , null, 3], [], []], [[4, 5]]])"); - ASSERT_OK( - MultipathLevelBuilder::Write(*array, /*nullable=*/true, &context_, callback_)); + ASSERT_OK(MultipathLevelBuilder::Write(*array, /*nullable=*/true, &this->context_, + this->callback_)); - ASSERT_THAT(results_, SizeIs(1)); - const CapturedResult& result = results_[0]; + ASSERT_THAT(this->results_, SizeIs(1)); + const CapturedResult& result = this->results_[0]; result.CheckLevels(/*def_levels=*/std::vector{0, 5, 4, 5, 3, 3, 5, 5}, /*rep_levels=*/std::vector{0, 0, 2, 2, 1, 1, 0, 2}); } @@ -422,31 +465,31 @@ TEST_F(MultipathLevelBuilderTest, NestedListsWithSomeNullsSomeEmptys) { // def level 6: a null entry // def level 7: a non-null entry -TEST_F(MultipathLevelBuilderTest, TripleNestedListsAllPresent) { +TYPED_TEST(MultipathLevelBuilderListTest, TripleNestedListsAllPresent) { auto entries = field("Entries", ::arrow::int64(), /*nullable=*/true); - auto list_field = field("list", list(entries), /*nullable=*/true); - auto nested_list_type = list(list_field); - auto double_nested_list_type = list(nested_list_type); + auto list_field = field("list", TypeParam::get_list(entries), /*nullable=*/true); + auto nested_list_type = TypeParam::get_list(list_field); + auto double_nested_list_type = TypeParam::get_list(nested_list_type); auto array = ::arrow::ArrayFromJSON(double_nested_list_type, R"([ [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9]]] ])"); - ASSERT_OK( - MultipathLevelBuilder::Write(*array, /*nullable=*/true, &context_, callback_)); + ASSERT_OK(MultipathLevelBuilder::Write(*array, /*nullable=*/true, &this->context_, + this->callback_)); - ASSERT_THAT(results_, SizeIs(1)); - const CapturedResult& result = results_[0]; + ASSERT_THAT(this->results_, SizeIs(1)); + const CapturedResult& result = this->results_[0]; result.CheckLevels(/*def_levels=*/std::vector(/*counter=*/9, 7), /*rep_levels=*/std::vector{ 0, 3, 3, 2, 3, 3, 1, 3, 3 // first row }); } -TEST_F(MultipathLevelBuilderTest, TripleNestedListsWithSomeNullsSomeEmptys) { +TYPED_TEST(MultipathLevelBuilderListTest, TripleNestedListsWithSomeNullsSomeEmptys) { auto entries = field("Entries", ::arrow::int64(), /*nullable=*/true); - auto list_field = field("list", list(entries), /*nullable=*/true); - auto nested_list_type = list(list_field); - auto double_nested_list_type = list(nested_list_type); + auto list_field = field("list", TypeParam::get_list(entries), /*nullable=*/true); + auto nested_list_type = TypeParam::get_list(list_field); + auto double_nested_list_type = TypeParam::get_list(nested_list_type); auto array = ::arrow::ArrayFromJSON(double_nested_list_type, R"([ [null, [[1 , null, 3], []], []], @@ -455,11 +498,11 @@ TEST_F(MultipathLevelBuilderTest, TripleNestedListsWithSomeNullsSomeEmptys) { [] ])"); - ASSERT_OK( - MultipathLevelBuilder::Write(*array, /*nullable=*/true, &context_, callback_)); + ASSERT_OK(MultipathLevelBuilder::Write(*array, /*nullable=*/true, &this->context_, + this->callback_)); - ASSERT_THAT(results_, SizeIs(1)); - const CapturedResult& result = results_[0]; + ASSERT_THAT(this->results_, SizeIs(1)); + const CapturedResult& result = this->results_[0]; result.CheckLevels(/*def_levels=*/std::vector{2, 7, 6, 7, 5, 3, // first row 5, 5, 7, 7, 2, 7, // second row 0, // third row @@ -469,22 +512,22 @@ TEST_F(MultipathLevelBuilderTest, TripleNestedListsWithSomeNullsSomeEmptys) { 0, 0}); } -TEST_F(MultipathLevelBuilderTest, QuadNestedListsAllPresent) { +TYPED_TEST(MultipathLevelBuilderListTest, QuadNestedListsAllPresent) { auto entries = field("Entries", ::arrow::int64(), /*nullable=*/true); - auto list_field = field("list", list(entries), /*nullable=*/true); - auto nested_list_type = list(list_field); - auto double_nested_list_type = list(nested_list_type); - auto triple_nested_list_type = list(double_nested_list_type); + auto list_field = field("list", TypeParam::get_list(entries), /*nullable=*/true); + auto nested_list_type = TypeParam::get_list(list_field); + auto double_nested_list_type = TypeParam::get_list(nested_list_type); + auto triple_nested_list_type = TypeParam::get_list(double_nested_list_type); auto array = ::arrow::ArrayFromJSON(triple_nested_list_type, R"([ [[[[1, 2], [3, 4]], [[5]]], [[[6, 7, 8]]]], [[[[1, 2], [3, 4]], [[5]]], [[[6, 7, 8]]]] ])"); - ASSERT_OK( - MultipathLevelBuilder::Write(*array, /*nullable=*/true, &context_, callback_)); + ASSERT_OK(MultipathLevelBuilder::Write(*array, /*nullable=*/true, &this->context_, + this->callback_)); - ASSERT_THAT(results_, SizeIs(1)); - const CapturedResult& result = results_[0]; + ASSERT_THAT(this->results_, SizeIs(1)); + const CapturedResult& result = this->results_[0]; result.CheckLevels(/*def_levels=*/std::vector(16, 9), /*rep_levels=*/std::vector{ 0, 4, 3, 4, 2, 1, 4, 4, // @@ -643,6 +686,4 @@ TEST_F(MultipathLevelBuilderTest, TestPrimitiveNonNullable) { EXPECT_THAT(results_[0].post_list_elements[0].end, Eq(4)); } -// TODO(mwish): testing ListView and LargeListView. - } // namespace parquet::arrow diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 45e4da44efe..84bbd489071 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -78,7 +78,6 @@ Status FieldToNode(const std::string& name, const std::shared_ptr& field, const WriterProperties& properties, const ArrowWriterProperties& arrow_properties, NodePtr* out); -// TODO(mwish): handle LIST_VIEW Status ListToNode(const std::shared_ptr<::arrow::BaseListType>& type, const std::string& name, bool nullable, int field_id, const WriterProperties& properties, @@ -410,7 +409,9 @@ Status FieldToNode(const std::string& name, const std::shared_ptr& field, } case ArrowTypeId::FIXED_SIZE_LIST: case ArrowTypeId::LARGE_LIST: - case ArrowTypeId::LIST: { + case ArrowTypeId::LIST: + case ArrowTypeId::LARGE_LIST_VIEW: + case ArrowTypeId::LIST_VIEW: { auto list_type = std::static_pointer_cast<::arrow::BaseListType>(field->type()); return ListToNode(list_type, name, field->nullable(), field_id, properties, arrow_properties, out); @@ -840,7 +841,6 @@ Status GetOriginSchema(const std::shared_ptr& metadata, Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* inferred); -// TODO(mwish): handle {LARGE_}LIST_VIEW here. std::function(FieldVector)> GetNestedFactory( const ArrowType& origin_type, const ArrowType& inferred_type) { switch (inferred_type.id()) { @@ -870,6 +870,20 @@ std::function(FieldVector)> GetNestedFactory( return ::arrow::fixed_size_list(std::move(fields[0]), list_size); }; } + if (origin_type.id() == ::arrow::Type::LIST_VIEW) { + // Reading LIST_VIEW as LIST as a workaround. + return [](FieldVector fields) { + DCHECK_EQ(fields.size(), 1); + return ::arrow::list(std::move(fields[0])); + }; + } + if (origin_type.id() == ::arrow::Type::LARGE_LIST_VIEW) { + // Reading LARGE_LIST_VIEW as LARGE_LIST as a workaround. + return [](FieldVector fields) { + DCHECK_EQ(fields.size(), 1); + return ::arrow::large_list_view(std::move(fields[0])); + }; + } break; default: break; @@ -890,7 +904,8 @@ Result ApplyOriginalStorageMetadata(const Field& origin_field, DCHECK_EQ(static_cast(inferred->children.size()), num_children); const auto factory = GetNestedFactory(*origin_type, *inferred_type); if (factory) { - // The type may be modified (e.g. LargeList) while the children stay the same + // The type may be modified (e.g. LargeList, ListView, LargeListView) + // while the children stay the same. modified |= origin_type->id() != inferred_type->id(); // Apply original metadata recursively to children From 409a4aecb42bb22f0fc884952ecdfcb0adba7959 Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 30 Nov 2023 17:06:24 +0800 Subject: [PATCH 3/3] update schema test --- cpp/src/parquet/arrow/arrow_schema_test.cc | 41 ++++++++++++++++++++-- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index 5443214f930..63eb4de9958 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -999,6 +999,17 @@ TEST_F(TestConvertArrowSchema, ParquetLists) { auto arrow_list = ::arrow::list(arrow_element); arrow_fields.push_back(::arrow::field("my_list", arrow_list, false)); } + // Same as above but using list_view as arrow type. + { + auto element = PrimitiveNode::Make("element", Repetition::OPTIONAL, + ParquetType::BYTE_ARRAY, ConvertedType::UTF8); + auto list = GroupNode::Make("list", Repetition::REPEATED, {element}); + parquet_fields.push_back( + GroupNode::Make("my_list", Repetition::REQUIRED, {list}, ConvertedType::LIST)); + auto arrow_element = ::arrow::field("string", UTF8, true); + auto arrow_list = ::arrow::list_view(arrow_element); + arrow_fields.push_back(::arrow::field("my_list", arrow_list, false)); + } // // List (list nullable, elements non-null) // optional group my_list (LIST) { @@ -1016,6 +1027,17 @@ TEST_F(TestConvertArrowSchema, ParquetLists) { auto arrow_list = ::arrow::list(arrow_element); arrow_fields.push_back(::arrow::field("my_list", arrow_list, true)); } + // Same as above but using list_view as arrow type. + { + auto element = PrimitiveNode::Make("element", Repetition::REQUIRED, + ParquetType::BYTE_ARRAY, ConvertedType::UTF8); + auto list = GroupNode::Make("list", Repetition::REPEATED, {element}); + parquet_fields.push_back( + GroupNode::Make("my_list", Repetition::OPTIONAL, {list}, ConvertedType::LIST)); + auto arrow_element = ::arrow::field("string", UTF8, false); + auto arrow_list = ::arrow::list_view(arrow_element); + arrow_fields.push_back(::arrow::field("my_list", arrow_list, true)); + } ASSERT_OK(ConvertSchema(arrow_fields)); @@ -1081,20 +1103,33 @@ TEST_F(TestConvertArrowSchema, ParquetOtherLists) { // parquet_arrow will always generate 3-level LIST encodings - // // LargeList (list-like non-null, elements nullable) + // // LargeList/ListView/LargeListView + // // (list-like non-null, elements nullable) // required group my_list (LIST) { // repeated group list { // optional binary element (UTF8); // } // } - { + std::vector( + const std::shared_ptr<::arrow::Field>)>> + list_cast_fns; + list_cast_fns.push_back([](const std::shared_ptr<::arrow::Field> field) { + return ::arrow::large_list(field->type()); + }); + list_cast_fns.push_back([](const std::shared_ptr<::arrow::Field> field) { + return ::arrow::list_view(field->type()); + }); + list_cast_fns.push_back([](const std::shared_ptr<::arrow::Field> field) { + return ::arrow::large_list_view(field->type()); + }); + for (auto list_type_fn : list_cast_fns) { auto element = PrimitiveNode::Make("element", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, ConvertedType::UTF8); auto list = GroupNode::Make("list", Repetition::REPEATED, {element}); parquet_fields.push_back( GroupNode::Make("my_list", Repetition::REQUIRED, {list}, ConvertedType::LIST)); auto arrow_element = ::arrow::field("string", UTF8, true); - auto arrow_list = ::arrow::large_list(arrow_element); + auto arrow_list = list_type_fn(arrow_element); arrow_fields.push_back(::arrow::field("my_list", arrow_list, false)); } // // FixedSizeList[10] (list-like non-null, elements nullable)