Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 38 additions & 3 deletions cpp/src/parquet/arrow/arrow_schema_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -999,6 +999,17 @@ TEST_F(TestConvertArrowSchema, ParquetLists) {
auto arrow_list = ::arrow::list(arrow_element);
arrow_fields.push_back(::arrow::field("my_list", arrow_list, false));
}
// Same as above but using list_view as arrow type.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The only thing that changes below is the list factory function; this could perhaps be factored out (along with list_cast_fns below), e.g.:

auto list_type_factory = [](const std::shared_ptr<::arrow::Field> field) {
    return ::arrow::list(field->type());
};

...

for (const auto& list_factory : {list_type_factory, list_view_type_factory}) { ... }

{
auto element = PrimitiveNode::Make("element", Repetition::OPTIONAL,
ParquetType::BYTE_ARRAY, ConvertedType::UTF8);
auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
parquet_fields.push_back(
GroupNode::Make("my_list", Repetition::REQUIRED, {list}, ConvertedType::LIST));
auto arrow_element = ::arrow::field("string", UTF8, true);
auto arrow_list = ::arrow::list_view(arrow_element);
arrow_fields.push_back(::arrow::field("my_list", arrow_list, false));
}

// // List<String> (list nullable, elements non-null)
// optional group my_list (LIST) {
Expand All @@ -1016,6 +1027,17 @@ TEST_F(TestConvertArrowSchema, ParquetLists) {
auto arrow_list = ::arrow::list(arrow_element);
arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
}
// Same as above but using list_view as arrow type.
{
auto element = PrimitiveNode::Make("element", Repetition::REQUIRED,
ParquetType::BYTE_ARRAY, ConvertedType::UTF8);
auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
parquet_fields.push_back(
GroupNode::Make("my_list", Repetition::OPTIONAL, {list}, ConvertedType::LIST));
auto arrow_element = ::arrow::field("string", UTF8, false);
auto arrow_list = ::arrow::list_view(arrow_element);
arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
}

ASSERT_OK(ConvertSchema(arrow_fields));

Expand Down Expand Up @@ -1081,20 +1103,33 @@ TEST_F(TestConvertArrowSchema, ParquetOtherLists) {

// parquet_arrow will always generate 3-level LIST encodings

// // LargeList<String> (list-like non-null, elements nullable)
// // LargeList<String>/ListView<String>/LargeListView<String>
// // (list-like non-null, elements nullable)
// required group my_list (LIST) {
// repeated group list {
// optional binary element (UTF8);
// }
// }
{
std::vector<std::function<std::shared_ptr<::arrow::DataType>(
const std::shared_ptr<::arrow::Field>)>>
list_cast_fns;
list_cast_fns.push_back([](const std::shared_ptr<::arrow::Field> field) {
return ::arrow::large_list(field->type());
});
list_cast_fns.push_back([](const std::shared_ptr<::arrow::Field> field) {
return ::arrow::list_view(field->type());
});
list_cast_fns.push_back([](const std::shared_ptr<::arrow::Field> field) {
return ::arrow::large_list_view(field->type());
});
for (auto list_type_fn : list_cast_fns) {
auto element = PrimitiveNode::Make("element", Repetition::OPTIONAL,
ParquetType::BYTE_ARRAY, ConvertedType::UTF8);
auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
parquet_fields.push_back(
GroupNode::Make("my_list", Repetition::REQUIRED, {list}, ConvertedType::LIST));
auto arrow_element = ::arrow::field("string", UTF8, true);
auto arrow_list = ::arrow::large_list(arrow_element);
auto arrow_list = list_type_fn(arrow_element);
arrow_fields.push_back(::arrow::field("my_list", arrow_list, false));
}
// // FixedSizeList[10]<String> (list-like non-null, elements nullable)
Expand Down
61 changes: 46 additions & 15 deletions cpp/src/parquet/arrow/path_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,17 @@ struct FixedSizedRangeSelector {
int list_size;
};

// Range selector for list-view arrays: every entry is described by its own
// (offset, size) pair instead of by two consecutive offsets.
template <typename OffsetType>
struct VarRangeViewSelector {
  // Builds the element range for the entry at `index`: it starts at
  // offsets[index] and ends at offsets[index] + sizes[index].
  ElementRange GetRange(int64_t index) const {
    const OffsetType first = offsets[index];
    const OffsetType count = sizes[index];
    return ElementRange{first, first + count};
  }

  // Both pointers are either int32_t* or int64_t*, depending on OffsetType.
  // Member order matters: callers aggregate-initialize as {offsets, sizes}.
  const OffsetType* offsets;
  const OffsetType* sizes;
};

// An intermediate node that handles null values.
class NullableNode {
public:
Expand Down Expand Up @@ -510,16 +521,18 @@ class NullableNode {

// Concrete list path node types: ListPathNode instantiated with the range
// selector matching each list-like array kind.
// - List / LargeList use VarRangeSelector with 32-bit / 64-bit offsets.
// - ListView / LargeListView use VarRangeViewSelector (offsets plus sizes)
//   with 32-bit / 64-bit offsets.
// - FixedSizeList uses FixedSizedRangeSelector.
using ListNode = ListPathNode<VarRangeSelector<int32_t>>;
using LargeListNode = ListPathNode<VarRangeSelector<int64_t>>;
using ListViewNode = ListPathNode<VarRangeViewSelector<int32_t>>;
using LargeListViewNode = ListPathNode<VarRangeViewSelector<int64_t>>;
using FixedSizeListNode = ListPathNode<FixedSizedRangeSelector>;

// Contains static information derived from traversing the schema.
struct PathInfo {
// The vectors are expected to be the same length.

// Note index order matters here.
using Node =
std::variant<NullableTerminalNode, ListNode, LargeListNode, FixedSizeListNode,
NullableNode, AllPresentTerminalNode, AllNullsTerminalNode>;
using Node = std::variant<NullableTerminalNode, ListNode, LargeListNode, ListViewNode,
LargeListViewNode, FixedSizeListNode, NullableNode,
AllPresentTerminalNode, AllNullsTerminalNode>;

std::vector<Node> path;
std::shared_ptr<Array> primitive_array;
Expand Down Expand Up @@ -579,15 +592,21 @@ Status WritePath(ElementRange root_range, PathInfo* path_info,
IterationResult operator()(NullableNode& node) {
return node.Run(stack_position, stack_position + 1, context);
}
IterationResult operator()(ListNode& node) {
return node.Run(stack_position, stack_position + 1, context);
}
IterationResult operator()(NullableTerminalNode& node) {
return node.Run(*stack_position, context);
}
IterationResult operator()(ListNode& node) {
return node.Run(stack_position, stack_position + 1, context);
}
IterationResult operator()(FixedSizeListNode& node) {
return node.Run(stack_position, stack_position + 1, context);
}
IterationResult operator()(ListViewNode& node) {
return node.Run(stack_position, stack_position + 1, context);
}
IterationResult operator()(LargeListViewNode& node) {
return node.Run(stack_position, stack_position + 1, context);
}
IterationResult operator()(AllPresentTerminalNode& node) {
return node.Run(*stack_position, context);
}
Expand Down Expand Up @@ -651,6 +670,8 @@ struct FixupVisitor {
void operator()(ListNode& node) { HandleListNode(node); }
void operator()(LargeListNode& node) { HandleListNode(node); }
void operator()(FixedSizeListNode& node) { HandleListNode(node); }
void operator()(ListViewNode& node) { HandleListNode(node); }
void operator()(LargeListViewNode& node) { HandleListNode(node); }

// For non-list intermediate nodes.
template <typename T>
Expand Down Expand Up @@ -724,19 +745,31 @@ class PathBuilder {

template <typename T>
::arrow::enable_if_t<std::is_same<::arrow::ListArray, T>::value ||
std::is_same<::arrow::LargeListArray, T>::value,
std::is_same<::arrow::LargeListArray, T>::value ||
std::is_same<::arrow::ListViewArray, T>::value ||
std::is_same<::arrow::LargeListViewArray, T>::value,
Status>
Visit(const T& array) {
MaybeAddNullable(array);
// Increment necessary due to empty lists.
info_.max_def_level++;
info_.max_rep_level++;
// raw_value_offsets() accounts for any slice offset.
ListPathNode<VarRangeSelector<typename T::offset_type>> node(
VarRangeSelector<typename T::offset_type>{array.raw_value_offsets()},
info_.max_rep_level, info_.max_def_level - 1);
info_.path.emplace_back(std::move(node));
nullable_in_parent_ = array.list_type()->value_field()->nullable();
// raw_value_offsets() and raw_value_sizes() account for any slice offset/size.
if constexpr (std::is_same<::arrow::ListViewArray, T>::value ||
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can use is_list_view_type from arrow/type_traits.h.

std::is_same<::arrow::LargeListViewArray, T>::value) {
ListPathNode<VarRangeViewSelector<typename T::offset_type>> node(
VarRangeViewSelector<typename T::offset_type>{array.raw_value_offsets(),
array.raw_value_sizes()},
info_.max_rep_level, info_.max_def_level - 1);
info_.path.emplace_back(std::move(node));
nullable_in_parent_ = array.list_view_type()->value_field()->nullable();
} else {
ListPathNode<VarRangeSelector<typename T::offset_type>> node(
VarRangeSelector<typename T::offset_type>{array.raw_value_offsets()},
info_.max_rep_level, info_.max_def_level - 1);
info_.path.emplace_back(std::move(node));
nullable_in_parent_ = array.list_type()->value_field()->nullable();
}
return VisitInline(*array.values());
}

Expand Down Expand Up @@ -830,8 +863,6 @@ class PathBuilder {
// Types not yet supported in Parquet.
NOT_IMPLEMENTED_VISIT(Union)
NOT_IMPLEMENTED_VISIT(RunEndEncoded);
NOT_IMPLEMENTED_VISIT(ListView);
NOT_IMPLEMENTED_VISIT(LargeListView);

#undef NOT_IMPLEMENTED_VISIT
std::vector<PathInfo>& paths() { return paths_; }
Expand Down
Loading