From 614ab2cb28f65b7d748d37582c834bc14a347ccb Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 18 Apr 2023 17:37:41 -0300 Subject: [PATCH 01/91] [Large]ListViewType: Add the list-view type classes and [LARGE_]LIST_VIEW type IDs --- cpp/src/arrow/array/data.cc | 2 + cpp/src/arrow/type.cc | 60 +++++++++++++++++ cpp/src/arrow/type.h | 65 ++++++++++++++++++ cpp/src/arrow/type_fwd.h | 23 +++++++ cpp/src/arrow/type_test.cc | 127 +++++++++++++++++++++++++++++++++++ cpp/src/arrow/type_traits.cc | 20 +++--- cpp/src/arrow/type_traits.h | 70 +++++++++++++++++++ 7 files changed, 358 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 678513fd4d1..1af373a19ea 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -261,6 +261,8 @@ int GetNumBuffers(const DataType& type) { case Type::STRING_VIEW: case Type::BINARY_VIEW: case Type::DENSE_UNION: + case Type::LIST_VIEW: + case Type::LARGE_LIST_VIEW: return 3; case Type::EXTENSION: // The number of buffers depends on the storage type diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index f378bd97404..45625845d28 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -992,6 +992,18 @@ std::string LargeListType::ToString() const { return s.str(); } +std::string ListViewType::ToString() const { + std::stringstream s; + s << "list_view<" << value_field()->ToString() << ">"; + return s.str(); +} + +std::string LargeListViewType::ToString() const { + std::stringstream s; + s << "large_list_view<" << value_field()->ToString() << ">"; + return s.str(); +} + MapType::MapType(std::shared_ptr key_type, std::shared_ptr item_type, bool keys_sorted) : MapType(::arrow::field("key", std::move(key_type), false), @@ -2888,6 +2900,38 @@ std::string LargeListType::ComputeFingerprint() const { return ""; } +std::string ListViewType::ComputeFingerprint() const { + const auto& child_fingerprint = value_type()->fingerprint(); + if 
(!child_fingerprint.empty()) { + std::stringstream ss; + ss << TypeIdFingerprint(*this); + if (value_field()->nullable()) { + ss << 'n'; + } else { + ss << 'N'; + } + ss << '{' << child_fingerprint << '}'; + return ss.str(); + } + return ""; +} + +std::string LargeListViewType::ComputeFingerprint() const { + const auto& child_fingerprint = value_type()->fingerprint(); + if (!child_fingerprint.empty()) { + std::stringstream ss; + ss << TypeIdFingerprint(*this); + if (value_field()->nullable()) { + ss << 'n'; + } else { + ss << 'N'; + } + ss << '{' << child_fingerprint << '}'; + return ss.str(); + } + return ""; +} + std::string MapType::ComputeFingerprint() const { const auto& key_fingerprint = key_type()->fingerprint(); const auto& item_fingerprint = item_type()->fingerprint(); @@ -3138,6 +3182,22 @@ std::shared_ptr fixed_size_list(const std::shared_ptr& value_fi return std::make_shared(value_field, list_size); } +std::shared_ptr list_view(std::shared_ptr value_type) { + return std::make_shared(std::move(value_type)); +} + +std::shared_ptr list_view(std::shared_ptr value_field) { + return std::make_shared(std::move(value_field)); +} + +std::shared_ptr large_list_view(std::shared_ptr value_type) { + return std::make_shared(std::move(value_type)); +} + +std::shared_ptr large_list_view(std::shared_ptr value_field) { + return std::make_shared(std::move(value_field)); +} + std::shared_ptr struct_(const FieldVector& fields) { return std::make_shared(fields); } diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index a905192e4a5..5b1331ab669 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1174,6 +1174,71 @@ class ARROW_EXPORT LargeListType : public BaseListType { std::string ComputeFingerprint() const override; }; +/// \brief Type class for array of list views +class ARROW_EXPORT ListViewType : public BaseListType { + public: + static constexpr Type::type type_id = Type::LIST_VIEW; + using offset_type = int32_t; + + static constexpr const char* 
type_name() { return "list_view"; } + + // ListView can contain any other logical value type + explicit ListViewType(const std::shared_ptr& value_type) + : ListViewType(std::make_shared("item", value_type)) {} + + explicit ListViewType(const std::shared_ptr& value_field) + : BaseListType(type_id) { + children_ = {value_field}; + } + + DataTypeLayout layout() const override { + return DataTypeLayout({DataTypeLayout::Bitmap(), + DataTypeLayout::FixedWidth(sizeof(offset_type)), + DataTypeLayout::FixedWidth(sizeof(offset_type))}); + } + + std::string ToString() const override; + + std::string name() const override { return "list_view"; } + + protected: + std::string ComputeFingerprint() const override; +}; + +/// \brief Concrete type class for large list-view data +/// +/// LargeListViewType is like ListViewType but with 64-bit rather than 32-bit offsets and +/// sizes. +class ARROW_EXPORT LargeListViewType : public BaseListType { + public: + static constexpr Type::type type_id = Type::LARGE_LIST_VIEW; + using offset_type = int64_t; + + static constexpr const char* type_name() { return "large_list_view"; } + + // LargeListView can contain any other logical value type + explicit LargeListViewType(const std::shared_ptr& value_type) + : LargeListViewType(std::make_shared("item", value_type)) {} + + explicit LargeListViewType(const std::shared_ptr& value_field) + : BaseListType(type_id) { + children_ = {value_field}; + } + + DataTypeLayout layout() const override { + return DataTypeLayout({DataTypeLayout::Bitmap(), + DataTypeLayout::FixedWidth(sizeof(offset_type)), + DataTypeLayout::FixedWidth(sizeof(offset_type))}); + } + + std::string ToString() const override; + + std::string name() const override { return "large_list_view"; } + + protected: + std::string ComputeFingerprint() const override; +}; + /// \brief Concrete type class for map data /// /// Map data is nested data where each value is a variable number of diff --git a/cpp/src/arrow/type_fwd.h 
b/cpp/src/arrow/type_fwd.h index ca263b71031..7d5a7029c67 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -150,6 +150,10 @@ class LargeListArray; class LargeListBuilder; struct LargeListScalar; +class ListViewType; + +class LargeListViewType; + class MapType; class MapArray; class MapBuilder; @@ -432,6 +436,12 @@ struct Type { /// Bytes view type with 4-byte prefix and inline small string optimization BINARY_VIEW = 40, + /// A list of some logical data type represented by offset and size. + LIST_VIEW = 41, + + /// Like LIST_VIEW, but with 64-bit offsets and sizes + LARGE_LIST_VIEW = 42, + // Leave this at the end MAX_ID }; @@ -523,6 +533,19 @@ std::shared_ptr large_list(const std::shared_ptr& value_type); ARROW_EXPORT std::shared_ptr large_list(const std::shared_ptr& value_type); +/// \brief Create a ListViewType instance +ARROW_EXPORT std::shared_ptr list_view(std::shared_ptr value_type); + +/// \brief Create a ListViewType instance from its child Field type +ARROW_EXPORT std::shared_ptr list_view(std::shared_ptr value_type); + +/// \brief Create a LargeListViewType instance +ARROW_EXPORT std::shared_ptr large_list_view( + std::shared_ptr value_type); + +/// \brief Create a LargeListViewType instance from its child Field type +ARROW_EXPORT std::shared_ptr large_list_view(std::shared_ptr value_type); + /// \brief Create a MapType instance from its key and value DataTypes ARROW_EXPORT std::shared_ptr map(std::shared_ptr key_type, diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index 273f8933fa5..dadb15a68a1 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -1553,6 +1553,46 @@ TEST(TestLargeListType, Basics) { ASSERT_EQ("large_list>", lt2.ToString()); } +TEST(TestListViewType, Basics) { + std::shared_ptr vt = std::make_shared(); + + ListViewType list_view_type(vt); + ASSERT_EQ(list_view_type.id(), Type::LIST_VIEW); + + ASSERT_EQ("list_view", list_view_type.name()); + ASSERT_EQ("list_view", 
list_view_type.ToString()); + + ASSERT_EQ(list_view_type.value_type()->id(), vt->id()); + ASSERT_EQ(list_view_type.value_type()->id(), vt->id()); + + std::shared_ptr st = std::make_shared(); + std::shared_ptr lt = std::make_shared(st); + ASSERT_EQ("list_view", lt->ToString()); + + ListViewType lt2(lt); + ASSERT_EQ("list_view>", lt2.ToString()); +} + +TEST(TestLargeListViewType, Basics) { + std::shared_ptr vt = std::make_shared(); + + LargeListViewType list_view_type(vt); + ASSERT_EQ(list_view_type.id(), Type::LARGE_LIST_VIEW); + + ASSERT_EQ("large_list_view", list_view_type.name()); + ASSERT_EQ("large_list_view", list_view_type.ToString()); + + ASSERT_EQ(list_view_type.value_type()->id(), vt->id()); + ASSERT_EQ(list_view_type.value_type()->id(), vt->id()); + + std::shared_ptr st = std::make_shared(); + std::shared_ptr lt = std::make_shared(st); + ASSERT_EQ("large_list_view", lt->ToString()); + + LargeListViewType lt2(lt); + ASSERT_EQ("large_list_view>", lt2.ToString()); +} + TEST(TestMapType, Basics) { auto md = key_value_metadata({"foo"}, {"foo value"}); @@ -1829,6 +1869,32 @@ TEST(TestListType, Equals) { ASSERT_FALSE(list_type.Equals(list_type_named, /*check_metadata=*/true)); } +TEST(TestListViewType, Equals) { + auto t1 = list_view(utf8()); + auto t2 = list_view(utf8()); + auto t3 = list_view(binary()); + auto t4 = list_view(field("item", utf8(), /*nullable=*/false)); + auto tl1 = large_list_view(binary()); + auto tl2 = large_list_view(binary()); + auto tl3 = large_list_view(float64()); + + AssertTypeEqual(*t1, *t2); + AssertTypeNotEqual(*t1, *t3); + AssertTypeNotEqual(*t1, *t4); + AssertTypeNotEqual(*t3, *tl1); + AssertTypeEqual(*tl1, *tl2); + AssertTypeNotEqual(*tl2, *tl3); + + std::shared_ptr vt = std::make_shared(); + std::shared_ptr inner_field = std::make_shared("non_default_name", vt); + + ListViewType list_view_type(vt); + ListViewType list_view_type_named(inner_field); + + AssertTypeEqual(list_view_type, list_view_type_named); + 
ASSERT_FALSE(list_view_type.Equals(list_view_type_named, /*check_metadata=*/true)); +} + TEST(TestListType, Metadata) { auto md1 = key_value_metadata({"foo", "bar"}, {"foo value", "bar value"}); auto md2 = key_value_metadata({"foo", "bar"}, {"foo value", "bar value"}); @@ -1859,6 +1925,66 @@ TEST(TestListType, Metadata) { AssertTypeNotEqual(*t1, *t5, /*check_metadata =*/true); } +TEST(TestListViewType, Metadata) { + auto md1 = key_value_metadata({"foo", "bar"}, {"foo value", "bar value"}); + auto md2 = key_value_metadata({"foo", "bar"}, {"foo value", "bar value"}); + auto md3 = key_value_metadata({"foo"}, {"foo value"}); + + auto f1 = field("item", utf8(), /*nullable =*/true, md1); + auto f2 = field("item", utf8(), /*nullable =*/true, md2); + auto f3 = field("item", utf8(), /*nullable =*/true, md3); + auto f4 = field("item", utf8()); + auto f5 = field("item", utf8(), /*nullable =*/false, md1); + + auto t1 = list_view(f1); + auto t2 = list_view(f2); + auto t3 = list_view(f3); + auto t4 = list_view(f4); + auto t5 = list_view(f5); + + AssertTypeEqual(*t1, *t2); + AssertTypeEqual(*t1, *t2, /*check_metadata =*/false); + + AssertTypeEqual(*t1, *t3); + AssertTypeNotEqual(*t1, *t3, /*check_metadata =*/true); + + AssertTypeEqual(*t1, *t4); + AssertTypeNotEqual(*t1, *t4, /*check_metadata =*/true); + + AssertTypeNotEqual(*t1, *t5); + AssertTypeNotEqual(*t1, *t5, /*check_metadata =*/true); +} + +TEST(TestLargeListViewType, Metadata) { + auto md1 = key_value_metadata({"foo", "bar"}, {"foo value", "bar value"}); + auto md2 = key_value_metadata({"foo", "bar"}, {"foo value", "bar value"}); + auto md3 = key_value_metadata({"foo"}, {"foo value"}); + + auto f1 = field("item", utf8(), /*nullable =*/true, md1); + auto f2 = field("item", utf8(), /*nullable =*/true, md2); + auto f3 = field("item", utf8(), /*nullable =*/true, md3); + auto f4 = field("item", utf8()); + auto f5 = field("item", utf8(), /*nullable =*/false, md1); + + auto t1 = large_list_view(f1); + auto t2 = 
large_list_view(f2); + auto t3 = large_list_view(f3); + auto t4 = large_list_view(f4); + auto t5 = large_list_view(f5); + + AssertTypeEqual(*t1, *t2); + AssertTypeEqual(*t1, *t2, /*check_metadata =*/false); + + AssertTypeEqual(*t1, *t3); + AssertTypeNotEqual(*t1, *t3, /*check_metadata =*/true); + + AssertTypeEqual(*t1, *t4); + AssertTypeNotEqual(*t1, *t4, /*check_metadata =*/true); + + AssertTypeNotEqual(*t1, *t5); + AssertTypeNotEqual(*t1, *t5, /*check_metadata =*/true); +} + TEST(TestNestedType, Equals) { auto create_struct = [](std::string inner_name, std::string struct_name) -> std::shared_ptr { @@ -2296,6 +2422,7 @@ TEST(TypesTest, TestMembership) { TEST_PREDICATE(all_types, is_fixed_width); TEST_PREDICATE(all_types, is_var_length_list); TEST_PREDICATE(all_types, is_list_like); + TEST_PREDICATE(all_types, is_var_length_list_like); TEST_PREDICATE(all_types, is_nested); TEST_PREDICATE(all_types, is_union); } diff --git a/cpp/src/arrow/type_traits.cc b/cpp/src/arrow/type_traits.cc index de328f322ad..ded54aff463 100644 --- a/cpp/src/arrow/type_traits.cc +++ b/cpp/src/arrow/type_traits.cc @@ -67,21 +67,23 @@ int RequiredValueAlignmentForBuffer(Type::type type_id, int buffer_index) { case Type::BINARY: // Offsets may be cast to int32_t* case Type::DATE32: case Type::TIME32: - case Type::LIST: // Offsets may be cast to int32_t*, data is in child array - case Type::MAP: // This is a list array + case Type::LIST: // Offsets may be cast to int32_t* + case Type::LIST_VIEW: // Offsets and sizes may be cast to int32_t* + case Type::MAP: // Same as LIST case Type::INTERVAL_MONTHS: // Stored as int32_t* case Type::INTERVAL_DAY_TIME: // Stored as two contiguous 32-bit integers return 4; case Type::INT64: case Type::UINT64: case Type::DOUBLE: - case Type::DECIMAL128: // May be cast to GenericBasicDecimal* which requires - // alignment of 8 - case Type::DECIMAL256: // May be cast to GenericBasicDecimal* which requires - // alignment of 8 - case Type::LARGE_BINARY: // Offsets 
may be cast to int64_t* - case Type::LARGE_LIST: // Offsets may be cast to int64_t* - case Type::LARGE_STRING: // Offsets may be cast to int64_t* + case Type::DECIMAL128: // May be cast to GenericBasicDecimal* which requires + // alignment of 8 + case Type::DECIMAL256: // May be cast to GenericBasicDecimal* which requires + // alignment of 8 + case Type::LARGE_BINARY: // Offsets may be cast to int64_t* + case Type::LARGE_STRING: // Offsets may be cast to int64_t* + case Type::LARGE_LIST: // Offsets may be cast to int64_t* + case Type::LARGE_LIST_VIEW: // Offsets and sizes may be cast to int64_t* case Type::DATE64: case Type::TIME64: case Type::TIMESTAMP: diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 9d8cafacf39..5b3fd63be1d 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -750,6 +750,13 @@ using is_list_type = template using enable_if_list_type = enable_if_t::value, R>; +template +using is_list_view_type = + std::disjunction, std::is_same>; + +template +using enable_if_list_view = enable_if_t::value, R>; + template using is_list_like_type = std::integral_constant::value || @@ -758,6 +765,14 @@ using is_list_like_type = template using enable_if_list_like = enable_if_t::value, R>; +template +using is_var_length_list_like_type = + std::disjunction, is_list_view_type>; + +template +using enable_if_var_length_list_like = + enable_if_t::value, R>; + template using is_struct_type = std::is_base_of; @@ -1303,6 +1318,39 @@ constexpr bool is_list_like(Type::type type_id) { return false; } +/// \brief Check for a var-length list or list-view like type +/// +/// \param[in] type_id the type-id to check +/// \return whether type-id is a var-length list or list-view like type +constexpr bool is_var_length_list_like(Type::type type_id) { + switch (type_id) { + case Type::LIST: + case Type::LARGE_LIST: + case Type::LIST_VIEW: + case Type::LARGE_LIST_VIEW: + case Type::MAP: + return true; + default: + break; + } + return 
false; +} + +/// \brief Check for a list-view type +/// +/// \param[in] type_id the type-id to check +/// \return whether type-id is a list-view type one +constexpr bool is_list_view(Type::type type_id) { + switch (type_id) { + case Type::LIST_VIEW: + case Type::LARGE_LIST_VIEW: + return true; + default: + break; + } + return false; +} + /// \brief Check for a nested type /// /// \param[in] type_id the type-id to check @@ -1311,6 +1359,8 @@ constexpr bool is_nested(Type::type type_id) { switch (type_id) { case Type::LIST: case Type::LARGE_LIST: + case Type::LIST_VIEW: + case Type::LARGE_LIST_VIEW: case Type::FIXED_SIZE_LIST: case Type::MAP: case Type::STRUCT: @@ -1403,12 +1453,14 @@ static inline int offset_bit_width(Type::type type_id) { case Type::STRING: case Type::BINARY: case Type::LIST: + case Type::LIST_VIEW: case Type::MAP: case Type::DENSE_UNION: return 32; case Type::LARGE_STRING: case Type::LARGE_BINARY: case Type::LARGE_LIST: + case Type::LARGE_LIST_VIEW: return 64; default: break; @@ -1609,6 +1661,24 @@ static inline bool is_var_length_list(const DataType& type) { /// Convenience for checking using the type's id static inline bool is_list_like(const DataType& type) { return is_list_like(type.id()); } +/// \brief Check for a var-length list or list-view like type +/// +/// \param[in] type the type to check +/// \return whether type is a var-length list or list-view like type +/// +/// Convenience for checking using the type's id +static inline bool is_var_length_list_like(const DataType& type) { + return is_var_length_list_like(type.id()); +} + +/// \brief Check for a list-view type +/// +/// \param[in] type the type to check +/// \return whether type is a list-view type +/// +/// Convenience for checking using the type's id +static inline bool is_list_view(const DataType& type) { return is_list_view(type.id()); } + /// \brief Check for a nested type /// /// \param[in] type the type to check From 03ab0725e2358efedec167682b9f99d775e9b41e Mon Sep 17 
00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 18 Apr 2023 18:46:53 -0300 Subject: [PATCH 02/91] [Large]ListViewArray: Add the list-view array classes --- cpp/src/arrow/array/array_base.cc | 2 +- cpp/src/arrow/array/array_nested.cc | 211 ++++++++++++++++++++++++++- cpp/src/arrow/array/array_nested.h | 212 ++++++++++++++++++++++++++-- cpp/src/arrow/type_fwd.h | 2 + cpp/src/arrow/type_traits.h | 16 +++ 5 files changed, 420 insertions(+), 23 deletions(-) diff --git a/cpp/src/arrow/array/array_base.cc b/cpp/src/arrow/array/array_base.cc index eab71de27b1..b483ec420cc 100644 --- a/cpp/src/arrow/array/array_base.cc +++ b/cpp/src/arrow/array/array_base.cc @@ -95,7 +95,7 @@ struct ScalarFromArraySlotImpl { Status Visit(const MonthDayNanoIntervalArray& a) { return Finish(a.Value(index_)); } template - Status Visit(const BaseListArray& a) { + Status Visit(const VarLengthListLikeArray& a) { return Finish(a.value_slice(index_)); } diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index d8308c82495..19739f7e015 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -48,7 +48,7 @@ using internal::checked_pointer_cast; using internal::CopyBitmap; // ---------------------------------------------------------------------- -// ListArray / LargeListArray (common utilities) +// ListArray / LargeListArray / ListViewArray / LargeListViewArray (common utilities) namespace { @@ -137,6 +137,77 @@ Result::ArrayType>> ListArrayFromArray return std::make_shared(std::move(data)); } +template +Result::ArrayType>> ListViewArrayFromArrays( + std::shared_ptr type, const Array& offsets, const Array& sizes, + const Array& values, MemoryPool* pool, std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount) { + using offset_type = typename TYPE::offset_type; + using ArrayType = typename TypeTraits::ArrayType; + using OffsetArrowType = typename CTypeTraits::ArrowType; + + if (offsets.type_id() != 
OffsetArrowType::type_id) { + return Status::TypeError("List offsets must be ", OffsetArrowType::type_name()); + } + + if (sizes.length() != offsets.length() && sizes.length() != offsets.length() - 1) { + return Status::Invalid( + "List sizes must have the same length as offsets or one less than offsets"); + } + if (sizes.type_id() != OffsetArrowType::type_id) { + return Status::TypeError("List sizes must be ", OffsetArrowType::type_name()); + } + + if (offsets.offset() != sizes.offset()) { + return Status::Invalid("List offsets and sizes must have the same offset"); + } + const int64_t array_offset = sizes.offset(); + + if (null_bitmap) { + if (offsets.null_count() > 0 || sizes.null_count() > 0) { + return Status::Invalid( + "Ambiguous to specify both validity map and offsets or sizes with nulls"); + } + if (array_offset != 0) { + return Status::Invalid( + "List offsets and sizes must not be slices if a validity map is specified"); + } + } else { + if (offsets.null_count() > 0 && sizes.null_count() > 0) { + return Status::Invalid("Ambiguous to specify both offsets and sizes with nulls"); + } + } + + DCHECK(offsets.length() == sizes.length() || offsets.length() - 1 == sizes.length()); + + using OffsetArrayType = typename TypeTraits::ArrayType; + const auto& typed_offsets = checked_cast(offsets); + const auto& typed_sizes = checked_cast(sizes); + + auto derived_validity_buffer = std::move(null_bitmap); + if (offsets.null_count() > 0) { + derived_validity_buffer = offsets.null_bitmap(); + null_count = offsets.null_count(); + // We allow construction from an offsets array containing one extra value. + // If that is the case, we might need to discount one null from null_count. 
+ if (offsets.length() - 1 == sizes.length() && !offsets.IsValid(sizes.length())) { + null_count -= 1; + } + } else if (sizes.null_count() > 0) { + derived_validity_buffer = sizes.null_bitmap(); + null_count = sizes.null_count(); + } + + auto buffers = BufferVector({ + std::move(derived_validity_buffer), + typed_offsets.values(), + typed_sizes.values(), + }); + auto data = ArrayData::Make(type, sizes.length(), std::move(buffers), {values.data()}, + null_count, array_offset); + return std::make_shared(std::move(data)); +} + static std::shared_ptr SliceArrayWithOffsets(const Array& array, int64_t begin, int64_t end) { return array.Slice(begin, end - begin); @@ -191,21 +262,34 @@ Result> FlattenListArray(const ListArrayT& list_array, std::shared_ptr BoxOffsets(const std::shared_ptr& boxed_type, const ArrayData& data) { + const int64_t num_offsets = + is_list_view(data.type->id()) ? data.length : data.length + 1; std::vector> buffers = {nullptr, data.buffers[1]}; auto offsets_data = - std::make_shared(boxed_type, data.length + 1, std::move(buffers), + std::make_shared(boxed_type, /*length=*/num_offsets, std::move(buffers), /*null_count=*/0, data.offset); return MakeArray(offsets_data); } +std::shared_ptr BoxSizes(const std::shared_ptr& boxed_type, + const ArrayData& data) { + DCHECK(is_list_view(data.type->id())); + std::vector> buffers = {nullptr, data.buffers[2]}; + auto sizes_data = + std::make_shared(boxed_type, data.length, std::move(buffers), + /*null_count=*/0, data.offset); + return MakeArray(sizes_data); +} + } // namespace namespace internal { template -inline void SetListData(BaseListArray* self, const std::shared_ptr& data, +inline void SetListData(VarLengthListLikeArray* self, + const std::shared_ptr& data, Type::type expected_type_id) { - ARROW_CHECK_EQ(data->buffers.size(), 2); + ARROW_CHECK_EQ(data->buffers.size(), is_list_view(TYPE::type_id) ? 
3 : 2); ARROW_CHECK_EQ(data->type->id(), expected_type_id); ARROW_CHECK_EQ(data->child_data.size(), 1); @@ -214,6 +298,7 @@ inline void SetListData(BaseListArray* self, const std::shared_ptrlist_type_ = checked_cast(data->type.get()); self->raw_value_offsets_ = data->GetValuesSafe(1, /*offset=*/0); + // BaseListViewArray::SetData takes care of setting raw_value_sizes_. ARROW_CHECK_EQ(self->list_type_->value_type()->id(), data->child_data[0]->type->id()); DCHECK(self->list_type_->value_type()->Equals(data->child_data[0]->type)); @@ -225,7 +310,9 @@ inline void SetListData(BaseListArray* self, const std::shared_ptr data) { SetData(std::move(data)); } +ListArray::ListArray(std::shared_ptr data) { + ListArray::SetData(std::move(data)); +} ListArray::ListArray(std::shared_ptr type, int64_t length, std::shared_ptr value_offsets, std::shared_ptr values, @@ -273,7 +360,9 @@ std::shared_ptr ListArray::offsets() const { return BoxOffsets(int32(), * // ---------------------------------------------------------------------- // LargeListArray -LargeListArray::LargeListArray(const std::shared_ptr& data) { SetData(data); } +LargeListArray::LargeListArray(const std::shared_ptr& data) { + LargeListArray::SetData(data); +} LargeListArray::LargeListArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& value_offsets, @@ -284,7 +373,7 @@ LargeListArray::LargeListArray(const std::shared_ptr& type, int64_t le auto internal_data = ArrayData::Make(type, length, {null_bitmap, value_offsets}, null_count, offset); internal_data->child_data.emplace_back(values->data()); - SetData(internal_data); + LargeListArray::SetData(internal_data); } void LargeListArray::SetData(const std::shared_ptr& data) { @@ -321,6 +410,114 @@ std::shared_ptr LargeListArray::offsets() const { return BoxOffsets(int64(), *data_); } +// ---------------------------------------------------------------------- +// ListViewArray + +ListViewArray::ListViewArray(std::shared_ptr data) { + 
ListViewArray::SetData(std::move(data)); +} + +ListViewArray::ListViewArray(std::shared_ptr type, int64_t length, + std::shared_ptr value_offsets, + std::shared_ptr value_sizes, + std::shared_ptr values, + std::shared_ptr null_bitmap, int64_t null_count, + int64_t offset) { + ListViewArray::SetData(ArrayData::Make( + std::move(type), length, + {std::move(null_bitmap), std::move(value_offsets), std::move(value_sizes)}, + /*child_data=*/{values->data()}, null_count, offset)); +} + +void ListViewArray::SetData(const std::shared_ptr& data) { + internal::SetListData(this, data); + raw_value_sizes_ = data->GetValuesSafe(2, /*offset=*/0); +} + +Result> ListViewArray::FromArrays( + const Array& offsets, const Array& sizes, const Array& values, MemoryPool* pool, + std::shared_ptr null_bitmap, int64_t null_count) { + return ListViewArrayFromArrays( + std::make_shared(values.type()), offsets, sizes, values, pool, + null_bitmap, null_count); +} + +Result> ListViewArray::FromArrays( + std::shared_ptr type, const Array& offsets, const Array& sizes, + const Array& values, MemoryPool* pool, std::shared_ptr null_bitmap, + int64_t null_count) { + if (type->id() != Type::LIST_VIEW) { + return Status::TypeError("Expected list-view type, got ", type->ToString()); + } + const auto& list_view_type = checked_cast(*type); + if (!list_view_type.value_type()->Equals(values.type())) { + return Status::TypeError("Mismatching list-view value type"); + } + return ListViewArrayFromArrays(std::move(type), offsets, sizes, values, + pool, null_bitmap, null_count); +} + +std::shared_ptr ListViewArray::offsets() const { + return BoxOffsets(int32(), *data_); +} + +std::shared_ptr ListViewArray::sizes() const { return BoxSizes(int32(), *data_); } + +// ---------------------------------------------------------------------- +// LargeListViewArray + +LargeListViewArray::LargeListViewArray(std::shared_ptr data) { + LargeListViewArray::SetData(std::move(data)); +} + 
+LargeListViewArray::LargeListViewArray(std::shared_ptr type, int64_t length, + std::shared_ptr value_offsets, + std::shared_ptr value_sizes, + std::shared_ptr values, + std::shared_ptr null_bitmap, + int64_t null_count, int64_t offset) { + LargeListViewArray::SetData(ArrayData::Make( + type, length, + {std::move(null_bitmap), std::move(value_offsets), std::move(value_sizes)}, + /*child_data=*/{values->data()}, null_count, offset)); +} + +void LargeListViewArray::SetData(const std::shared_ptr& data) { + internal::SetListData(this, data); + raw_value_sizes_ = data->GetValuesSafe(2, /*offset=*/0); +} + +Result> LargeListViewArray::FromArrays( + const Array& offsets, const Array& sizes, const Array& values, MemoryPool* pool, + std::shared_ptr null_bitmap, int64_t null_count) { + return ListViewArrayFromArrays( + std::make_shared(values.type()), offsets, sizes, values, pool, + null_bitmap, null_count); +} + +Result> LargeListViewArray::FromArrays( + std::shared_ptr type, const Array& offsets, const Array& sizes, + const Array& values, MemoryPool* pool, std::shared_ptr null_bitmap, + int64_t null_count) { + if (type->id() != Type::LARGE_LIST_VIEW) { + return Status::TypeError("Expected large list-view type, got ", type->ToString()); + } + const auto& large_list_view_type = checked_cast(*type); + if (!large_list_view_type.value_type()->Equals(values.type())) { + return Status::TypeError("Mismatching large list-view value type"); + } + return ListViewArrayFromArrays( + std::move(type), offsets, sizes, values, pool, null_bitmap, null_count); +} + +std::shared_ptr LargeListViewArray::offsets() const { + return BoxOffsets(int64(), *data_); +} + +std::shared_ptr LargeListViewArray::sizes() const { + return BoxSizes(int64(), *data_); +} + // ---------------------------------------------------------------------- // MapArray diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index 8d5cc95fec0..6987b4950b7 100644 --- 
a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. -// Array accessor classes for List, LargeList, FixedSizeList, Map, Struct, and -// Union +// Array accessor classes for List, LargeList, ListView, LargeListView, FixedSizeList, +// Map, Struct, and Union #pragma once @@ -43,30 +43,31 @@ namespace arrow { /// @{ // ---------------------------------------------------------------------- -// ListArray +// VarLengthListLikeArray template -class BaseListArray; +class VarLengthListLikeArray; namespace internal { -// Private helper for ListArray::SetData. -// Unfortunately, trying to define BaseListArray::SetData outside of this header +// Private helper for [Large]List[View]Array::SetData. +// Unfortunately, trying to define VarLengthListLikeArray::SetData outside of this header // doesn't play well with MSVC. template -void SetListData(BaseListArray* self, const std::shared_ptr& data, +void SetListData(VarLengthListLikeArray* self, + const std::shared_ptr& data, Type::type expected_type_id = TYPE::type_id); } // namespace internal -/// Base class for variable-sized list arrays, regardless of offset size. +/// Base class for variable-sized list and list-view arrays, regardless of offset size. 
template -class BaseListArray : public Array { +class VarLengthListLikeArray : public Array { public: using TypeClass = TYPE; using offset_type = typename TypeClass::offset_type; - const TypeClass* list_type() const { return list_type_; } + const TypeClass* var_length_list_like_type() const { return this->list_type_; } /// \brief Return array object containing the list's values /// @@ -87,16 +88,13 @@ class BaseListArray : public Array { offset_type value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; } - offset_type value_length(int64_t i) const { - i += data_->offset; - return raw_value_offsets_[i + 1] - raw_value_offsets_[i]; - } + virtual offset_type value_length(int64_t i) const = 0; std::shared_ptr value_slice(int64_t i) const { return values_->Slice(value_offset(i), value_length(i)); } protected: - friend void internal::SetListData(BaseListArray* self, + friend void internal::SetListData(VarLengthListLikeArray* self, const std::shared_ptr& data, Type::type expected_type_id); @@ -105,6 +103,23 @@ class BaseListArray : public Array { const offset_type* raw_value_offsets_ = NULLPTR; }; +// ---------------------------------------------------------------------- +// ListArray / LargeListArray + +template +class BaseListArray : public VarLengthListLikeArray { + public: + using TypeClass = TYPE; + using offset_type = typename TYPE::offset_type; + + const TypeClass* list_type() const { return this->var_length_list_like_type(); } + + offset_type value_length(int64_t i) const final { + i += this->data_->offset; + return this->raw_value_offsets_[i + 1] - this->raw_value_offsets_[i]; + } +}; + /// Concrete Array class for list data class ARROW_EXPORT ListArray : public BaseListArray { public: @@ -216,6 +231,173 @@ class ARROW_EXPORT LargeListArray : public BaseListArray { void SetData(const std::shared_ptr& data); }; +// ---------------------------------------------------------------------- +// ListViewArray / LargeListViewArray + +template 
+class BaseListViewArray : public VarLengthListLikeArray { + public: + using TypeClass = TYPE; + using offset_type = typename TYPE::offset_type; + + const TypeClass* list_view_type() const { return this->var_length_list_like_type(); } + + /// Note that this buffer does not account for any slice offset or length. + const std::shared_ptr& value_sizes() const { return this->data_->buffers[2]; } + + /// Return pointer to raw value sizes accounting for any slice offset + const offset_type* raw_value_sizes() const { + return raw_value_sizes_ + this->data_->offset; + } + + offset_type value_length(int64_t i) const final { + return this->raw_value_sizes_[i + this->data_->offset]; + } + + protected: + const offset_type* raw_value_sizes_ = NULLPTR; +}; + +/// \brief Concrete Array class for list-view data +class ARROW_EXPORT ListViewArray : public BaseListViewArray { + public: + explicit ListViewArray(std::shared_ptr data); + + ListViewArray(std::shared_ptr type, int64_t length, + std::shared_ptr value_offsets, + std::shared_ptr value_sizes, std::shared_ptr values, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Construct ListViewArray from array of offsets, sizes, and child + /// value array + /// + /// Construct a ListViewArray using buffers from offsets and sizes arrays + /// that project views into the child values array. + /// + /// This function does the bare minimum of validation of the offsets/sizes and + /// input types. + /// + /// Offsets of an Array's null bitmap can be present or an explicit + /// null_bitmap, but not both. + /// + /// \param[in] offsets An array of int32 offsets into the values array. NULL values are + /// supported if the corresponding values in sizes is NULL or 0. + /// \param[in] sizes An array containing the int32 sizes of every view. NULL values are + /// taken to represent a NULL list-view in the array being created.
+ /// \param[in] values Array containing list values + /// \param[in] pool MemoryPool + /// \param[in] null_bitmap Optional validity bitmap + /// \param[in] null_count Optional null count in null_bitmap + static Result> FromArrays( + const Array& offsets, const Array& sizes, const Array& values, + MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + static Result> FromArrays( + std::shared_ptr type, const Array& offsets, const Array& sizes, + const Array& values, MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + /// \brief Return an Array that is a concatenation of the list-views in this array. + /// + /// Note that it's different from `values()` in that it takes into + /// consideration this array's offsets (which can be in any order) + /// and sizes. Nulls are skipped. + /// TODO: implement ListViewArray::Flatten + // Result> Flatten( + // MemoryPool* memory_pool = default_memory_pool()) const; + + /// \brief Return list-view offsets as an Int32Array + /// + /// The returned array will not have a validity bitmap, so you cannot expect + /// to pass it to ListViewArray::FromArrays() and get back the same list-view array + /// if the original one has nulls. + std::shared_ptr offsets() const; + + /// \brief Return list-view sizes as an Int32Array + /// + /// The returned array will not have a validity bitmap, so you cannot expect + /// to pass it to ListViewArray::FromArrays() and get back the same list-view array + /// if the original one has nulls.
+ std::shared_ptr sizes() const; + + protected: + // This constructor defers SetData to a derived array class + ListViewArray() = default; + + void SetData(const std::shared_ptr& data); +}; + +/// \brief Concrete Array class for large list-view data (with 64-bit offsets +/// and sizes) +class ARROW_EXPORT LargeListViewArray : public BaseListViewArray { + public: + explicit LargeListViewArray(std::shared_ptr data); + + LargeListViewArray(std::shared_ptr type, int64_t length, + std::shared_ptr value_offsets, + std::shared_ptr value_sizes, std::shared_ptr values, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + /// \brief Construct LargeListViewArray from array of offsets, sizes, and child + /// value array + /// + /// Construct a LargeListViewArray using buffers from offsets and sizes arrays + /// that project views into the values array. + /// + /// This function does the bare minimum of validation of the offsets/sizes and + /// input types. + /// + /// Offsets of an Array's null bitmap can be present or an explicit + /// null_bitmap, but not both. + /// + /// \param[in] offsets An array of int64 offsets into the values array. NULL values are + /// supported if the corresponding values in sizes is NULL or 0. + /// \param[in] sizes An array containing the int64 sizes of every view. NULL values are + /// taken to represent a NULL list-view in the array being created.
+ /// \param[in] values Array containing list values + /// \param[in] pool MemoryPool + /// \param[in] null_bitmap Optional validity bitmap + /// \param[in] null_count Optional null count in null_bitmap + static Result> FromArrays( + const Array& offsets, const Array& sizes, const Array& values, + MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + static Result> FromArrays( + std::shared_ptr type, const Array& offsets, const Array& sizes, + const Array& values, MemoryPool* pool = default_memory_pool(), + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); + + /// \brief Return an Array that is a concatenation of the large list-views in this + /// array. + /// + /// Note that it's different from `values()` in that it takes into + /// consideration this array's offsets (which can be in any order) + /// and sizes. Nulls are skipped. + /// TODO: implement LargeListViewArray::Flatten + // Result> Flatten( + // MemoryPool* memory_pool = default_memory_pool()) const; + + /// \brief Return list-view offsets as an Int64Array + std::shared_ptr offsets() const; + + /// \brief Return list-view sizes as an Int64Array + std::shared_ptr sizes() const; + + protected: + // This constructor defers SetData to a derived array class + LargeListViewArray() = default; + + void SetData(const std::shared_ptr& data); +}; + // ---------------------------------------------------------------------- // MapArray diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 7d5a7029c67..9cddf7d703e 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -151,8 +151,10 @@ class LargeListBuilder; struct LargeListScalar; class ListViewType; +class ListViewArray; class LargeListViewType; +class LargeListViewArray; class MapType; class MapArray; diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 5b3fd63be1d..391a5716492 100644 --- 
a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -463,6 +463,22 @@ struct TypeTraits { constexpr static bool is_parameter_free = false; }; +template <> +struct TypeTraits { + using ArrayType = ListViewArray; + // TODO(felipecrv): Add BuilderType + + constexpr static bool is_parameter_free = false; +}; + +template <> +struct TypeTraits { + using ArrayType = LargeListViewArray; + // TODO(felipecrv): Add BuilderType + + constexpr static bool is_parameter_free = false; +}; + template <> struct TypeTraits { using ArrayType = MapArray; From 0e70f57a27fecbbe911a747c38493ab1c355786c Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 18 Apr 2023 19:06:13 -0300 Subject: [PATCH 03/91] [Large]ListViewScalar: Add list-view scalar classes --- cpp/src/arrow/array/array_test.cc | 4 ++-- cpp/src/arrow/array/data.cc | 22 +++++++++++++++++++++- cpp/src/arrow/scalar.cc | 6 ++++++ cpp/src/arrow/scalar.h | 14 ++++++++++++++ cpp/src/arrow/type_fwd.h | 2 ++ cpp/src/arrow/type_traits.h | 2 ++ 6 files changed, 47 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 46908439ef5..1ce306f752b 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -752,9 +752,9 @@ TEST_F(TestArray, TestFillFromScalar) { ArraySpan span(*scalar); auto roundtripped_array = span.ToArray(); - AssertArraysEqual(*array, *roundtripped_array); - ASSERT_OK(roundtripped_array->ValidateFull()); + + AssertArraysEqual(*array, *roundtripped_array); ASSERT_OK_AND_ASSIGN(auto roundtripped_scalar, roundtripped_array->GetScalar(0)); AssertScalarsEqual(*scalar, *roundtripped_scalar); } diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 1af373a19ea..3ea5ca88523 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -244,9 +244,22 @@ BufferSpan OffsetsForScalar(uint8_t* scratch_space, offset_type value_size) { auto* offsets = 
reinterpret_cast(scratch_space); offsets[0] = 0; offsets[1] = static_cast(value_size); + static_assert(2 * sizeof(offset_type) <= 16); return {scratch_space, sizeof(offset_type) * 2}; } +template +std::pair OffsetsAndSizesForScalar(uint8_t* scratch_space, + offset_type value_size) { + auto* offsets = scratch_space; + auto* sizes = scratch_space + sizeof(offset_type); + reinterpret_cast(offsets)[0] = 0; + reinterpret_cast(sizes)[0] = value_size; + static_assert(2 * sizeof(offset_type) <= 16); + return {BufferSpan{offsets, sizeof(offset_type)}, + BufferSpan{sizes, sizeof(offset_type)}}; +} + int GetNumBuffers(const DataType& type) { switch (type.id()) { case Type::NA: @@ -383,7 +396,7 @@ void ArraySpan::FillFromScalar(const Scalar& value) { const auto& scalar = checked_cast(value); this->buffers[1].data = const_cast(scalar.value->data()); this->buffers[1].size = scalar.value->size(); - } else if (is_list_like(type_id)) { + } else if (is_var_length_list_like(type_id) || type_id == Type::FIXED_SIZE_LIST) { const auto& scalar = checked_cast(value); int64_t value_length = 0; @@ -404,7 +417,14 @@ void ArraySpan::FillFromScalar(const Scalar& value) { OffsetsForScalar(scalar.scratch_space_, static_cast(value_length)); } else if (type_id == Type::LARGE_LIST) { this->buffers[1] = OffsetsForScalar(scalar.scratch_space_, value_length); + } else if (type_id == Type::LIST_VIEW) { + std::tie(this->buffers[1], this->buffers[2]) = OffsetsAndSizesForScalar( + scalar.scratch_space_, static_cast(value_length)); + } else if (type_id == Type::LARGE_LIST_VIEW) { + std::tie(this->buffers[1], this->buffers[2]) = + OffsetsAndSizesForScalar(scalar.scratch_space_, value_length); } else { + DCHECK_EQ(type_id, Type::FIXED_SIZE_LIST); // FIXED_SIZE_LIST: does not have a second buffer this->buffers[1] = {}; } diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc index 167e2727052..08ab5ae2cd6 100644 --- a/cpp/src/arrow/scalar.cc +++ b/cpp/src/arrow/scalar.cc @@ -587,6 +587,12 @@ 
ListScalar::ListScalar(std::shared_ptr value, bool is_valid) LargeListScalar::LargeListScalar(std::shared_ptr value, bool is_valid) : BaseListScalar(value, large_list(value->type()), is_valid) {} +ListViewScalar::ListViewScalar(std::shared_ptr value, bool is_valid) + : BaseListScalar(value, list_view(value->type()), is_valid) {} + +LargeListViewScalar::LargeListViewScalar(std::shared_ptr value, bool is_valid) + : BaseListScalar(value, large_list_view(value->type()), is_valid) {} + inline std::shared_ptr MakeMapType(const std::shared_ptr& pair_type) { ARROW_CHECK_EQ(pair_type->id(), Type::STRUCT); ARROW_CHECK_EQ(pair_type->num_fields(), 2); diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index 5175b012852..65c5ee4df0a 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -531,6 +531,20 @@ struct ARROW_EXPORT LargeListScalar : public BaseListScalar { explicit LargeListScalar(std::shared_ptr value, bool is_valid = true); }; +struct ARROW_EXPORT ListViewScalar : public BaseListScalar { + using TypeClass = ListViewType; + using BaseListScalar::BaseListScalar; + + explicit ListViewScalar(std::shared_ptr value, bool is_valid = true); +}; + +struct ARROW_EXPORT LargeListViewScalar : public BaseListScalar { + using TypeClass = LargeListViewType; + using BaseListScalar::BaseListScalar; + + explicit LargeListViewScalar(std::shared_ptr value, bool is_valid = true); +}; + struct ARROW_EXPORT MapScalar : public BaseListScalar { using TypeClass = MapType; using BaseListScalar::BaseListScalar; diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 9cddf7d703e..6668073a22a 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -152,9 +152,11 @@ struct LargeListScalar; class ListViewType; class ListViewArray; +struct ListViewScalar; class LargeListViewType; class LargeListViewArray; +struct LargeListViewScalar; class MapType; class MapArray; diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 
391a5716492..ad63239f8e8 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -467,6 +467,7 @@ template <> struct TypeTraits { using ArrayType = ListViewArray; // TODO(felipecrv): Add BuilderType + using ScalarType = ListViewScalar; constexpr static bool is_parameter_free = false; }; @@ -475,6 +476,7 @@ template <> struct TypeTraits { using ArrayType = LargeListViewArray; // TODO(felipecrv): Add BuilderType + using ScalarType = LargeListViewScalar; constexpr static bool is_parameter_free = false; }; From 7cbf4f1aef482c2def2cecefcb36931146a23202 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 18 Apr 2023 19:31:07 -0300 Subject: [PATCH 04/91] Parquet: [Large]ListViewArray: Add placeholder for list-view writing into Parquet --- cpp/src/parquet/arrow/path_internal.cc | 2 ++ cpp/src/parquet/column_writer.cc | 2 ++ 2 files changed, 4 insertions(+) diff --git a/cpp/src/parquet/arrow/path_internal.cc b/cpp/src/parquet/arrow/path_internal.cc index 919c97f4323..2d20403eac0 100644 --- a/cpp/src/parquet/arrow/path_internal.cc +++ b/cpp/src/parquet/arrow/path_internal.cc @@ -830,6 +830,8 @@ class PathBuilder { // Types not yet supported in Parquet. 
NOT_IMPLEMENTED_VISIT(Union) NOT_IMPLEMENTED_VISIT(RunEndEncoded); + NOT_IMPLEMENTED_VISIT(ListView); + NOT_IMPLEMENTED_VISIT(LargeListView); #undef NOT_IMPLEMENTED_VISIT std::vector& paths() { return paths_; } diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 5dff533c1cc..8f4ffc67935 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -129,6 +129,8 @@ struct ValueBufferSlicer { NOT_IMPLEMENTED_VISIT(Union); NOT_IMPLEMENTED_VISIT(List); NOT_IMPLEMENTED_VISIT(LargeList); + NOT_IMPLEMENTED_VISIT(ListView); + NOT_IMPLEMENTED_VISIT(LargeListView); NOT_IMPLEMENTED_VISIT(Struct); NOT_IMPLEMENTED_VISIT(FixedSizeList); NOT_IMPLEMENTED_VISIT(Dictionary); From eb1ce8ddb4f76635477f08d6a082fbd39c3df5e2 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 18 Apr 2023 19:32:12 -0300 Subject: [PATCH 05/91] Python: [Large]ListViewArray: Disable writing list-view data into Pandas --- python/pyarrow/src/arrow/python/arrow_to_pandas.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index 8ed5d4e216e..e979342b886 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -1350,6 +1350,8 @@ struct ObjectWriterVisitor { std::is_same::value || std::is_same::value || std::is_same::value || + std::is_same::value || + std::is_same::value || std::is_same::value || (std::is_base_of::value && !std::is_same::value) || From a97cf860b595a21ac48b39bda540aa43f001e55e Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 25 Apr 2023 23:01:43 -0300 Subject: [PATCH 06/91] visitor_generate.h: Add [Large]ListView to generated visitor code And all the placeholders necessary to make all the code compile --- cpp/src/arrow/array/concatenate.cc | 8 +++++++ cpp/src/arrow/array/diff.cc | 18 ++++++++++++++ cpp/src/arrow/array/util.cc | 22 
+++++++++++++++++ cpp/src/arrow/array/validate.cc | 8 +++++++ cpp/src/arrow/compare.cc | 24 +++++++++++++++++++ .../engine/substrait/expression_internal.cc | 8 +++++++ .../arrow/engine/substrait/type_internal.cc | 4 ++++ cpp/src/arrow/integration/json_internal.cc | 22 +++++++++++++++++ cpp/src/arrow/ipc/metadata_internal.cc | 8 +++++++ cpp/src/arrow/ipc/reader.cc | 8 +++++++ cpp/src/arrow/ipc/writer.cc | 8 +++++++ cpp/src/arrow/json/test_common.h | 4 ++++ cpp/src/arrow/pretty_print.cc | 10 ++++++++ cpp/src/arrow/scalar.cc | 24 +++++++++++++++++++ cpp/src/arrow/type.cc | 4 ++++ cpp/src/arrow/visitor.cc | 6 +++++ cpp/src/arrow/visitor.h | 6 +++++ cpp/src/arrow/visitor_generate.h | 2 ++ 18 files changed, 194 insertions(+) diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index 37c7271b5b9..c7f3a23476e 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -288,6 +288,14 @@ class ConcatenateImpl { return ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[0]); } + Status Visit(const ListViewType& type) { + return Status::NotImplemented("concatenation of ", type); + } + + Status Visit(const LargeListViewType& type) { + return Status::NotImplemented("concatenation of ", type); + } + Status Visit(const FixedSizeListType& fixed_size_list) { ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(0, fixed_size_list.list_size())); return ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[0]); diff --git a/cpp/src/arrow/array/diff.cc b/cpp/src/arrow/array/diff.cc index be9597e59b3..f9714eda34c 100644 --- a/cpp/src/arrow/array/diff.cc +++ b/cpp/src/arrow/array/diff.cc @@ -289,6 +289,13 @@ class ValueComparatorFactory { Status Visit(const NullType&, const Array&, const Array&) { return Status::NotImplemented("null type"); } + Status Visit(const ListViewType&, const Array&, const Array&) { + return Status::NotImplemented("list-view type"); + } + + Status Visit(const LargeListViewType&, 
const Array&, const Array&) { + return Status::NotImplemented("list-view type"); + } Status Visit(const ExtensionType&, const Array&, const Array&) { return Status::NotImplemented("extension type"); @@ -589,6 +596,9 @@ Result> Diff(const Array& base, const Array& target return Diff(*base_storage, *target_storage, pool); } else if (base.type()->id() == Type::DICTIONARY) { return Status::NotImplemented("diffing arrays of type ", *base.type()); + } else if (base.type()->id() == Type::LIST_VIEW || + base.type()->id() == Type::LARGE_LIST_VIEW) { + return Status::NotImplemented("diffing arrays of type ", *base.type()); } else { return QuadraticSpaceMyersDiff(base, target, pool).Diff(); } @@ -732,6 +742,14 @@ class MakeFormatterImpl { return Status::OK(); } + Status Visit(const ListViewType& t) { + return Status::NotImplemented("formatting diffs between arrays of type ", t); + } + + Status Visit(const LargeListViewType& t) { + return Status::NotImplemented("formatting diffs between arrays of type ", t); + } + // TODO(bkietz) format maps better Status Visit(const StructType& t) { diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 9ea2fc2b6f0..3b6fd01204f 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -289,6 +289,12 @@ class ArrayDataEndianSwapper { RETURN_NOT_OK(SwapOffsets(1)); return Status::OK(); } + Status Visit(const ListViewType& type) { + return Status::NotImplemented("swapping endianness of list-view array"); + } + Status Visit(const LargeListViewType& type) { + return Status::NotImplemented("swapping endianness of large list-view array"); + } Status Visit(const DictionaryType& type) { // dictionary was already swapped in ReadDictionary() in ipc/reader.cc @@ -524,6 +530,14 @@ class NullArrayFactory { return Status::OK(); } + Status Visit(const ListViewType& type) { + return Status::NotImplemented("construction of all-null ", type); + } + + Status Visit(const LargeListViewType& type) { + return 
Status::NotImplemented("construction of all-null ", type); + } + Status Visit(const FixedSizeListType& type) { ARROW_ASSIGN_OR_RAISE(out_->child_data[0], CreateChild(type, 0, length_ * type.list_size())); @@ -704,6 +718,14 @@ class RepeatedArrayFactory { return Status::OK(); } + Status Visit(const ListViewType& type) { + return Status::NotImplemented("construction from scalar of type ", *scalar_.type); + } + + Status Visit(const LargeListViewType& type) { + return Status::NotImplemented("construction from scalar of type ", *scalar_.type); + } + Status Visit(const FixedSizeListType& type) { auto value = checked_cast(scalar_).value; diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 3dde41b1450..802ce64b260 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -264,6 +264,14 @@ struct ValidateArrayImpl { Status Visit(const LargeListType& type) { return ValidateListLike(type); } + Status Visit(const ListViewType& type) { + return Status::NotImplemented("ListViewType validation not implemented"); + } + + Status Visit(const LargeListViewType& type) { + return Status::NotImplemented("LargeListViewType validation not implemented"); + } + Status Visit(const MapType& type) { RETURN_NOT_OK(ValidateListLike(type)); return MapArray::ValidateChildData(data.child_data); diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 50cfdd05a14..2e0fa966cf5 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -308,6 +308,14 @@ class RangeDataEqualsImpl { Status Visit(const LargeListType& type) { return CompareList(type); } + Status Visit(const ListViewType& type) { + return Status::NotImplemented("comparing ListViewType"); + } + + Status Visit(const LargeListViewType& type) { + return Status::NotImplemented("comparing LargeListViewType"); + } + Status Visit(const FixedSizeListType& type) { const auto list_size = type.list_size(); const ArrayData& left_data = *left_.child_data[0]; @@ 
-712,6 +720,14 @@ class TypeEqualsVisitor { return Status::OK(); } + Status Visit(const ListViewType& left) { + return Status::NotImplemented("list-view type comparison"); + } + + Status Visit(const LargeListViewType& left) { + return Status::NotImplemented("large list-view type comparison"); + } + template enable_if_t::value, Status> Visit(const T& left) { return VisitChildren(left); @@ -857,6 +873,14 @@ class ScalarEqualsVisitor { return Status::OK(); } + Status Visit(const ListViewScalar& left) { + return Status::NotImplemented("list-view comparison"); + } + + Status Visit(const LargeListViewScalar& left) { + return Status::NotImplemented("large list-view comparison"); + } + Status Visit(const MapScalar& left) { const auto& right = checked_cast(right_); result_ = ArrayEquals(*left.value, *right.value, options_, floating_approximate_); diff --git a/cpp/src/arrow/engine/substrait/expression_internal.cc b/cpp/src/arrow/engine/substrait/expression_internal.cc index d3952615976..5d892af9a39 100644 --- a/cpp/src/arrow/engine/substrait/expression_internal.cc +++ b/cpp/src/arrow/engine/substrait/expression_internal.cc @@ -807,6 +807,14 @@ struct ScalarToProtoImpl { return Status::OK(); } + Status Visit(const ListViewScalar& s) { + return Status::NotImplemented("list-view to proto"); + } + + Status Visit(const LargeListViewScalar& s) { + return Status::NotImplemented("list-view to proto"); + } + Status Visit(const StructScalar& s) { lit_->set_allocated_struct_(new Lit::Struct()); diff --git a/cpp/src/arrow/engine/substrait/type_internal.cc b/cpp/src/arrow/engine/substrait/type_internal.cc index d3fb058137e..f4a2e6800eb 100644 --- a/cpp/src/arrow/engine/substrait/type_internal.cc +++ b/cpp/src/arrow/engine/substrait/type_internal.cc @@ -313,6 +313,10 @@ struct DataTypeToProtoImpl { return Status::OK(); } + Status Visit(const ListViewType& t) { return NotImplemented(t); } + + Status Visit(const LargeListViewType& t) { return NotImplemented(t); } + Status Visit(const 
StructType& t) { auto types = SetWithThen(&substrait::Type::set_allocated_struct_)->mutable_types(); diff --git a/cpp/src/arrow/integration/json_internal.cc b/cpp/src/arrow/integration/json_internal.cc index 59749c36a95..0d05abbd7f9 100644 --- a/cpp/src/arrow/integration/json_internal.cc +++ b/cpp/src/arrow/integration/json_internal.cc @@ -422,6 +422,12 @@ class SchemaWriter { return Status::OK(); } + Status Visit(const ListViewType& type) { return Status::NotImplemented(type.name()); } + + Status Visit(const LargeListViewType& type) { + return Status::NotImplemented(type.name()); + } + Status Visit(const MapType& type) { WriteName("map", type); return Status::OK(); @@ -777,6 +783,14 @@ class ArrayWriter { return WriteChildren(array.type()->fields(), {array.values()}); } + Status Visit(const ListViewArray& array) { + return Status::NotImplemented("list-view array in JSON"); + } + + Status Visit(const LargeListViewArray& array) { + return Status::NotImplemented("large list-view array in JSON"); + } + Status Visit(const FixedSizeListArray& array) { WriteValidityField(array); const auto& type = checked_cast(*array.type()); @@ -1651,6 +1665,14 @@ class ArrayReader { return CreateList(type_); } + Status Visit(const ListViewType& type) { + return Status::NotImplemented("list-view in JSON"); + } + + Status Visit(const LargeListViewType& type) { + return Status::NotImplemented("large list-view in JSON"); + } + Status Visit(const MapType& type) { auto list_type = std::make_shared(type.value_field()); RETURN_NOT_OK(CreateList(list_type)); diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index ab1a58dd1df..c89c9e3f0d1 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -669,6 +669,14 @@ class FieldToFlatbufferVisitor { return Status::OK(); } + Status Visit(const ListViewType& type) { + return Status::NotImplemented("list-view type in IPC"); + } + + Status Visit(const LargeListViewType& 
type) { + return Status::NotImplemented("large list-view type in IPC"); + } + Status Visit(const MapType& type) { fb_type_ = flatbuf::Type::Map; RETURN_NOT_OK(VisitChildFields(type)); diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index d603062d81d..d195687c955 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -392,6 +392,14 @@ class ArrayLoader { return LoadList(type); } + Status Visit(const ListViewType& type) { + return Status::NotImplemented("list-view array in IPC"); + } + + Status Visit(const LargeListViewType& type) { + return Status::NotImplemented("large list-view array in IPC"); + } + Status Visit(const MapType& type) { RETURN_NOT_OK(LoadList(type)); return MapArray::ValidateChildData(out_->child_data); diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index 9668f459d0d..649d9bc4068 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -447,6 +447,14 @@ class RecordBatchSerializer { return Status::OK(); } + Status Visit(const ListViewArray& array) { + return Status::NotImplemented("list-view array in IPC"); + } + + Status Visit(const LargeListViewArray& array) { + return Status::NotImplemented("large list-view array in IPC"); + } + Status Visit(const FixedSizeListArray& array) { --max_recursion_depth_; auto size = array.list_type()->list_size(); diff --git a/cpp/src/arrow/json/test_common.h b/cpp/src/arrow/json/test_common.h index f7ab6fd1027..2f819779bdb 100644 --- a/cpp/src/arrow/json/test_common.h +++ b/cpp/src/arrow/json/test_common.h @@ -135,6 +135,10 @@ struct GenerateImpl { return OK(writer.EndArray(size)); } + Status Visit(const ListViewType& t) { return NotImplemented(t); } + + Status Visit(const LargeListViewType& t) { return NotImplemented(t); } + Status Visit(const StructType& t) { return Generate(t.fields(), e, &writer, options); } Status Visit(const DayTimeIntervalType& t) { return NotImplemented(t); } diff --git a/cpp/src/arrow/pretty_print.cc 
b/cpp/src/arrow/pretty_print.cc index b392e027a6b..fea353d18e0 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -266,6 +266,14 @@ class ArrayPrinter : public PrettyPrinter { /*is_container=*/true); } + Status WriteDataValues(const ListViewArray& array) { + return Status::NotImplemented("writing data values of a list-view array"); + } + + Status WriteDataValues(const LargeListViewArray& array) { + return Status::NotImplemented("writing data values of a large list-view array"); + } + Status WriteDataValues(const MapArray& array) { const auto keys = array.keys(); const auto items = array.items(); @@ -300,6 +308,8 @@ class ArrayPrinter : public PrettyPrinter { std::is_base_of::value || std::is_base_of::value || std::is_base_of::value || + std::is_base_of::value || + std::is_base_of::value || std::is_base_of::value || std::is_base_of::value, Status> diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc index 08ab5ae2cd6..4691a66b7b7 100644 --- a/cpp/src/arrow/scalar.cc +++ b/cpp/src/arrow/scalar.cc @@ -62,6 +62,14 @@ namespace { // Implementation of Scalar::hash() struct ScalarHashImpl { + Status Visit(const ListViewScalar& s) { + return Status::NotImplemented("list-view scalar hashing"); + } + + Status Visit(const LargeListViewScalar& s) { + return Status::NotImplemented("large list-view scalar hashing"); + } + Status Visit(const NullScalar& s) { return Status::OK(); } template @@ -326,6 +334,14 @@ struct ScalarValidateImpl { return Status::OK(); } + Status Visit(const ListViewScalar& s) { + return Status::NotImplemented("list-view scalar validation"); + } + + Status Visit(const LargeListViewScalar& s) { + return Status::NotImplemented("large list-view scalar validation"); + } + Status Visit(const FixedSizeListScalar& s) { RETURN_NOT_OK(Visit(static_cast(s))); const auto& list_type = checked_cast(*s.type); @@ -806,6 +822,14 @@ struct MakeNullImpl { Status Visit(const LargeListType& type) { return VisitListLike(type); } + Status 
Visit(const ListViewType& type) { + return Status::NotImplemented("making null array of list-view"); + } + + Status Visit(const LargeListViewType& type) { + return Status::NotImplemented("making null array of large list-view"); + } + Status Visit(const FixedSizeListType& type) { return VisitListLike(type, type.list_size()); } diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 45625845d28..62d2d61598d 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -140,6 +140,8 @@ std::vector AllTypeIds() { Type::STRUCT, Type::LIST, Type::LARGE_LIST, + Type::LIST_VIEW, + Type::LARGE_LIST_VIEW, Type::FIXED_SIZE_LIST, Type::MAP, Type::DENSE_UNION, @@ -209,6 +211,8 @@ std::string ToString(Type::type id) { TO_STRING_CASE(STRUCT) TO_STRING_CASE(LIST) TO_STRING_CASE(LARGE_LIST) + TO_STRING_CASE(LIST_VIEW) + TO_STRING_CASE(LARGE_LIST_VIEW) TO_STRING_CASE(FIXED_SIZE_LIST) TO_STRING_CASE(MAP) TO_STRING_CASE(DENSE_UNION) diff --git a/cpp/src/arrow/visitor.cc b/cpp/src/arrow/visitor.cc index e057f6b12fb..cca99033c93 100644 --- a/cpp/src/arrow/visitor.cc +++ b/cpp/src/arrow/visitor.cc @@ -63,6 +63,8 @@ ARRAY_VISITOR_DEFAULT(MonthIntervalArray) ARRAY_VISITOR_DEFAULT(DurationArray) ARRAY_VISITOR_DEFAULT(ListArray) ARRAY_VISITOR_DEFAULT(LargeListArray) +ARRAY_VISITOR_DEFAULT(ListViewArray) +ARRAY_VISITOR_DEFAULT(LargeListViewArray) ARRAY_VISITOR_DEFAULT(MapArray) ARRAY_VISITOR_DEFAULT(FixedSizeListArray) ARRAY_VISITOR_DEFAULT(StructArray) @@ -117,6 +119,8 @@ TYPE_VISITOR_DEFAULT(Decimal128Type) TYPE_VISITOR_DEFAULT(Decimal256Type) TYPE_VISITOR_DEFAULT(ListType) TYPE_VISITOR_DEFAULT(LargeListType) +TYPE_VISITOR_DEFAULT(ListViewType) +TYPE_VISITOR_DEFAULT(LargeListViewType) TYPE_VISITOR_DEFAULT(MapType) TYPE_VISITOR_DEFAULT(FixedSizeListType) TYPE_VISITOR_DEFAULT(StructType) @@ -170,6 +174,8 @@ SCALAR_VISITOR_DEFAULT(Decimal128Scalar) SCALAR_VISITOR_DEFAULT(Decimal256Scalar) SCALAR_VISITOR_DEFAULT(ListScalar) SCALAR_VISITOR_DEFAULT(LargeListScalar) 
+SCALAR_VISITOR_DEFAULT(ListViewScalar) +SCALAR_VISITOR_DEFAULT(LargeListViewScalar) SCALAR_VISITOR_DEFAULT(MapScalar) SCALAR_VISITOR_DEFAULT(FixedSizeListScalar) SCALAR_VISITOR_DEFAULT(StructScalar) diff --git a/cpp/src/arrow/visitor.h b/cpp/src/arrow/visitor.h index 650b0e7ee0a..75ef46ae4e5 100644 --- a/cpp/src/arrow/visitor.h +++ b/cpp/src/arrow/visitor.h @@ -64,6 +64,8 @@ class ARROW_EXPORT ArrayVisitor { virtual Status Visit(const Decimal256Array& array); virtual Status Visit(const ListArray& array); virtual Status Visit(const LargeListArray& array); + virtual Status Visit(const ListViewArray& array); + virtual Status Visit(const LargeListViewArray& array); virtual Status Visit(const MapArray& array); virtual Status Visit(const FixedSizeListArray& array); virtual Status Visit(const StructArray& array); @@ -115,6 +117,8 @@ class ARROW_EXPORT TypeVisitor { virtual Status Visit(const Decimal256Type& type); virtual Status Visit(const ListType& type); virtual Status Visit(const LargeListType& type); + virtual Status Visit(const ListViewType& scalar); + virtual Status Visit(const LargeListViewType& scalar); virtual Status Visit(const MapType& type); virtual Status Visit(const FixedSizeListType& type); virtual Status Visit(const StructType& type); @@ -166,6 +170,8 @@ class ARROW_EXPORT ScalarVisitor { virtual Status Visit(const Decimal256Scalar& scalar); virtual Status Visit(const ListScalar& scalar); virtual Status Visit(const LargeListScalar& scalar); + virtual Status Visit(const ListViewScalar& scalar); + virtual Status Visit(const LargeListViewScalar& scalar); virtual Status Visit(const MapScalar& scalar); virtual Status Visit(const FixedSizeListScalar& scalar); virtual Status Visit(const StructScalar& scalar); diff --git a/cpp/src/arrow/visitor_generate.h b/cpp/src/arrow/visitor_generate.h index 4b57abe53ff..cbb081bfed3 100644 --- a/cpp/src/arrow/visitor_generate.h +++ b/cpp/src/arrow/visitor_generate.h @@ -59,6 +59,8 @@ namespace arrow { ACTION(Decimal256); \ 
ACTION(List); \ ACTION(LargeList); \ + ACTION(ListView); \ + ACTION(LargeListView); \ ACTION(Map); \ ACTION(FixedSizeList); \ ACTION(Struct); \ From 05787703b253dd1f40910a33186589a7e0fda156 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 5 Jul 2023 12:52:00 -0300 Subject: [PATCH 07/91] [Large]ListViewType: Implement type Compare + tests --- cpp/src/arrow/compare.cc | 11 ++--------- cpp/src/arrow/type_test.cc | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 2e0fa966cf5..9869688ef70 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -707,7 +707,8 @@ class TypeEqualsVisitor { } template - enable_if_t::value, Status> Visit(const T& left) { + enable_if_t::value || is_list_view_type::value, Status> Visit( + const T& left) { std::shared_ptr left_field = left.field(0); std::shared_ptr right_field = checked_cast(right_).field(0); bool equal_names = !check_metadata_ || (left_field->name() == right_field->name()); @@ -720,14 +721,6 @@ class TypeEqualsVisitor { return Status::OK(); } - Status Visit(const ListViewType& left) { - return Status::NotImplemented("list-view type comparison"); - } - - Status Visit(const LargeListViewType& left) { - return Status::NotImplemented("large list-view type comparison"); - } - template enable_if_t::value, Status> Visit(const T& left) { return VisitChildren(left); diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index dadb15a68a1..009e557f82f 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -2384,6 +2384,44 @@ TEST(TypesTest, TestRunEndEncodedType) { "run_end_encoded>"); } +TEST(TypesTest, TestListViewType) { + auto int32_expected = std::make_shared(int32()); + auto int32_list_view_type = list_view(int32()); + + ASSERT_EQ(*int32_expected, *int32_list_view_type); + + auto int32_list_view_type_cast = + 
std::dynamic_pointer_cast(int32_list_view_type); + ASSERT_EQ(*int32_list_view_type_cast->value_type(), *int32()); + + ASSERT_TRUE(int32_list_view_type->field(0)->Equals(Field("item", int32(), true))); + + auto int64_list_view_type = list_view(int64()); + ASSERT_NE(*int32_list_view_type, *int64_list_view_type); + + ASSERT_EQ(int32_list_view_type->ToString(), "list_view"); + ASSERT_EQ(int64_list_view_type->ToString(), "list_view"); +} + +TEST(TypesTest, TestLargeListViewType) { + auto int32_expected = std::make_shared(int32()); + auto int32_list_view_type = large_list_view(int32()); + + ASSERT_EQ(*int32_expected, *int32_list_view_type); + + auto int32_list_view_type_cast = + std::dynamic_pointer_cast(int32_list_view_type); + ASSERT_EQ(*int32_list_view_type_cast->value_type(), *int32()); + + ASSERT_TRUE(int32_list_view_type->field(0)->Equals(Field("item", int32(), true))); + + auto int64_list_view_type = large_list_view(int64()); + ASSERT_NE(*int32_list_view_type, *int64_list_view_type); + + ASSERT_EQ(int32_list_view_type->ToString(), "large_list_view"); + ASSERT_EQ(int64_list_view_type->ToString(), "large_list_view"); +} + #define TEST_PREDICATE(all_types, type_predicate) \ for (auto type : all_types) { \ ASSERT_EQ(type_predicate(type->id()), type_predicate(*type)); \ From fe96002a27636a3e797d2a189ed4c951ba32a11c Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 6 Jun 2023 22:54:06 -0300 Subject: [PATCH 08/91] BaseListBuilder: Make base builder mostly compatible with ListViews --- cpp/src/arrow/array/builder_nested.h | 143 +++++++++++++++++---------- 1 file changed, 93 insertions(+), 50 deletions(-) diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index d0b17c23048..c23015f0c8b 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -43,28 +43,32 @@ namespace arrow { // List builder template -class BaseListBuilder : public ArrayBuilder { +class 
BaseVarLengthListLikeBuilder : public ArrayBuilder { public: using TypeClass = TYPE; using offset_type = typename TypeClass::offset_type; /// Use this constructor to incrementally build the value array along with offsets and /// null bitmap. - BaseListBuilder(MemoryPool* pool, std::shared_ptr const& value_builder, - const std::shared_ptr& type, - int64_t alignment = kDefaultBufferAlignment) + BaseVarLengthListLikeBuilder(MemoryPool* pool, + std::shared_ptr const& value_builder, + const std::shared_ptr& type, + int64_t alignment = kDefaultBufferAlignment) : ArrayBuilder(pool, alignment), offsets_builder_(pool, alignment), value_builder_(value_builder), value_field_(type->field(0)->WithType(NULLPTR)) {} - BaseListBuilder(MemoryPool* pool, std::shared_ptr const& value_builder, - int64_t alignment = kDefaultBufferAlignment) - : BaseListBuilder(pool, value_builder, list(value_builder->type()), alignment) {} + BaseVarLengthListLikeBuilder(MemoryPool* pool, + std::shared_ptr const& value_builder, + int64_t alignment = kDefaultBufferAlignment) + : BaseVarLengthListLikeBuilder(pool, value_builder, list(value_builder->type()), + alignment) {} Status Resize(int64_t capacity) override { if (ARROW_PREDICT_FALSE(capacity > maximum_elements())) { - return Status::CapacityError("List array cannot reserve space for more than ", + return Status::CapacityError(type_name(), + " array cannot reserve space for more than ", maximum_elements(), " got ", capacity); } ARROW_RETURN_NOT_OK(CheckCapacity(capacity)); @@ -99,7 +103,7 @@ class BaseListBuilder : public ArrayBuilder { Status Append(bool is_valid = true) { ARROW_RETURN_NOT_OK(Reserve(1)); UnsafeAppendToBitmap(is_valid); - UnsafeAppendNextOffset(); + UnsafeAppendDimensions(/*offset=*/value_builder_->length(), /*size=*/0); return Status::OK(); } @@ -108,10 +112,7 @@ class BaseListBuilder : public ArrayBuilder { Status AppendNulls(int64_t length) final { ARROW_RETURN_NOT_OK(Reserve(length)); UnsafeAppendToBitmap(length, false); - const 
int64_t num_values = value_builder_->length(); - for (int64_t i = 0; i < length; ++i) { - offsets_builder_.UnsafeAppend(static_cast(num_values)); - } + UnsafeAppendEmptyDimensions(/*num_values=*/length); return Status::OK(); } @@ -120,16 +121,17 @@ class BaseListBuilder : public ArrayBuilder { Status AppendEmptyValues(int64_t length) final { ARROW_RETURN_NOT_OK(Reserve(length)); UnsafeAppendToBitmap(length, true); - const int64_t num_values = value_builder_->length(); - for (int64_t i = 0; i < length; ++i) { - offsets_builder_.UnsafeAppend(static_cast(num_values)); - } + UnsafeAppendEmptyDimensions(/*num_values=*/length); return Status::OK(); } Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { const offset_type* offsets = array.GetValues(1); + [[maybe_unused]] const offset_type* sizes = NULLPTR; + if constexpr (is_list_view(TYPE::type_id)) { + sizes = array.GetValues(2); + } const bool all_valid = !array.MayHaveLogicalNulls(); const uint8_t* validity = array.HasValidityBitmap() ? 
array.buffers[0].data : NULLPTR; ARROW_RETURN_NOT_OK(Reserve(length)); @@ -137,43 +139,28 @@ class BaseListBuilder : public ArrayBuilder { const bool is_valid = all_valid || (validity && bit_util::GetBit(validity, array.offset + row)) || array.IsValid(row); + int64_t size = 0; + if (is_valid) { + if constexpr (is_list_view(TYPE::type_id)) { + size = sizes[row]; + } else { + size = offsets[row + 1] - offsets[row]; + } + } UnsafeAppendToBitmap(is_valid); - UnsafeAppendNextOffset(); + UnsafeAppendDimensions(/*offset=*/value_builder_->length(), size); if (is_valid) { - int64_t slot_length = offsets[row + 1] - offsets[row]; - ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(array.child_data[0], - offsets[row], slot_length)); + ARROW_RETURN_NOT_OK( + value_builder_->AppendArraySlice(array.child_data[0], offsets[row], size)); } } return Status::OK(); } - Status FinishInternal(std::shared_ptr* out) override { - ARROW_RETURN_NOT_OK(AppendNextOffset()); - - // Offset padding zeroed by BufferBuilder - std::shared_ptr offsets, null_bitmap; - ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets)); - ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap)); - - if (value_builder_->length() == 0) { - // Try to make sure we get a non-null values buffer (ARROW-2744) - ARROW_RETURN_NOT_OK(value_builder_->Resize(0)); - } - - std::shared_ptr items; - ARROW_RETURN_NOT_OK(value_builder_->FinishInternal(&items)); - - *out = ArrayData::Make(type(), length_, {null_bitmap, offsets}, {std::move(items)}, - null_count_); - Reset(); - return Status::OK(); - } - Status ValidateOverflow(int64_t new_elements) const { auto new_length = value_builder_->length() + new_elements; if (ARROW_PREDICT_FALSE(new_length > maximum_elements())) { - return Status::CapacityError("List array cannot contain more than ", + return Status::CapacityError(type_name(), " array cannot contain more than ", maximum_elements(), " elements, have ", new_elements); } else { return Status::OK(); @@ -191,20 +178,76 @@ 
class BaseListBuilder : public ArrayBuilder { return std::make_shared(value_field_->WithType(value_builder_->type())); } + private: + static constexpr const char* type_name() { + if constexpr (is_list_view(TYPE::type_id)) { + return "ListView"; + } else { + return "List"; + } + } + protected: + /// \brief Append dimensions for num_values empty list slots. + /// + /// ListViewBuilder overrides this to also append the sizes. + virtual void UnsafeAppendEmptyDimensions(int64_t num_values) { + const int64_t offset = value_builder_->length(); + for (int64_t i = 0; i < num_values; ++i) { + offsets_builder_.UnsafeAppend(static_cast(offset)); + } + } + + /// \brief Append dimensions for a single list slot. + /// + /// ListViewBuilder overrides this to also append the size. + virtual void UnsafeAppendDimensions(int64_t offset, int64_t size) { + offsets_builder_.UnsafeAppend(static_cast(offset)); + } + TypedBufferBuilder offsets_builder_; std::shared_ptr value_builder_; std::shared_ptr value_field_; +}; + +template +class BaseListBuilder : public BaseVarLengthListLikeBuilder { + private: + using BASE = BaseVarLengthListLikeBuilder; + + public: + using TypeClass = TYPE; + using offset_type = typename BASE::offset_type; + + using BASE::BASE; Status AppendNextOffset() { - ARROW_RETURN_NOT_OK(ValidateOverflow(0)); - const int64_t num_values = value_builder_->length(); - return offsets_builder_.Append(static_cast(num_values)); + ARROW_RETURN_NOT_OK(this->ValidateOverflow(0)); + const int64_t num_values = this->value_builder_->length(); + return this->offsets_builder_.Append(static_cast(num_values)); } - void UnsafeAppendNextOffset() { - const int64_t num_values = value_builder_->length(); - offsets_builder_.UnsafeAppend(static_cast(num_values)); + Status FinishInternal(std::shared_ptr* out) override { + ARROW_RETURN_NOT_OK(AppendNextOffset()); + + // Offset padding zeroed by BufferBuilder + std::shared_ptr offsets; + std::shared_ptr null_bitmap; + 
ARROW_RETURN_NOT_OK(this->offsets_builder_.Finish(&offsets)); + ARROW_RETURN_NOT_OK(this->null_bitmap_builder_.Finish(&null_bitmap)); + + if (this->value_builder_->length() == 0) { + // Try to make sure we get a non-null values buffer (ARROW-2744) + ARROW_RETURN_NOT_OK(this->value_builder_->Resize(0)); + } + + std::shared_ptr items; + ARROW_RETURN_NOT_OK(this->value_builder_->FinishInternal(&items)); + + *out = ArrayData::Make(this->type(), this->length_, {null_bitmap, offsets}, + {std::move(items)}, this->null_count_); + this->Reset(); + return Status::OK(); } }; From ad39a9d9aee5f01b69cbcc88890f93729666fe95 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 6 Jul 2023 23:15:54 -0300 Subject: [PATCH 09/91] BaseVarLengthListLikeBuilder: Add a version of Append() that takes a size as well --- cpp/src/arrow/array/builder_nested.h | 48 ++++++++++++++++++---------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index c23015f0c8b..c73c83d860b 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -43,27 +43,27 @@ namespace arrow { // List builder template -class BaseVarLengthListLikeBuilder : public ArrayBuilder { +class ARROW_EXPORT VarLengthListLikeBuilder : public ArrayBuilder { public: using TypeClass = TYPE; using offset_type = typename TypeClass::offset_type; /// Use this constructor to incrementally build the value array along with offsets and /// null bitmap. 
- BaseVarLengthListLikeBuilder(MemoryPool* pool, - std::shared_ptr const& value_builder, - const std::shared_ptr& type, - int64_t alignment = kDefaultBufferAlignment) + VarLengthListLikeBuilder(MemoryPool* pool, + std::shared_ptr const& value_builder, + const std::shared_ptr& type, + int64_t alignment = kDefaultBufferAlignment) : ArrayBuilder(pool, alignment), offsets_builder_(pool, alignment), value_builder_(value_builder), value_field_(type->field(0)->WithType(NULLPTR)) {} - BaseVarLengthListLikeBuilder(MemoryPool* pool, - std::shared_ptr const& value_builder, - int64_t alignment = kDefaultBufferAlignment) - : BaseVarLengthListLikeBuilder(pool, value_builder, list(value_builder->type()), - alignment) {} + VarLengthListLikeBuilder(MemoryPool* pool, + std::shared_ptr const& value_builder, + int64_t alignment = kDefaultBufferAlignment) + : VarLengthListLikeBuilder(pool, value_builder, list(value_builder->type()), + alignment) {} Status Resize(int64_t capacity) override { if (ARROW_PREDICT_FALSE(capacity > maximum_elements())) { @@ -99,15 +99,19 @@ class BaseVarLengthListLikeBuilder : public ArrayBuilder { /// \brief Start a new variable-length list slot /// /// This function should be called before beginning to append elements to the - /// value builder - Status Append(bool is_valid = true) { + /// value builder. Elements appended to the value builder before this function is + /// called, will not be members of any list value. 
+ /// + /// \param list_length The number of elements in the list (necessary on + /// list-view builders) + Status Append(bool is_valid, int64_t list_length) { ARROW_RETURN_NOT_OK(Reserve(1)); UnsafeAppendToBitmap(is_valid); - UnsafeAppendDimensions(/*offset=*/value_builder_->length(), /*size=*/0); + UnsafeAppendDimensions(/*offset=*/value_builder_->length(), /*size=*/list_length); return Status::OK(); } - Status AppendNull() final { return Append(false); } + Status AppendNull() final { return Append(false, 0); } Status AppendNulls(int64_t length) final { ARROW_RETURN_NOT_OK(Reserve(length)); @@ -116,7 +120,7 @@ class BaseVarLengthListLikeBuilder : public ArrayBuilder { return Status::OK(); } - Status AppendEmptyValue() final { return Append(true); } + Status AppendEmptyValue() final { return Append(true, 0); } Status AppendEmptyValues(int64_t length) final { ARROW_RETURN_NOT_OK(Reserve(length)); @@ -211,9 +215,9 @@ class BaseVarLengthListLikeBuilder : public ArrayBuilder { }; template -class BaseListBuilder : public BaseVarLengthListLikeBuilder { +class ARROW_EXPORT BaseListBuilder : public VarLengthListLikeBuilder { private: - using BASE = BaseVarLengthListLikeBuilder; + using BASE = VarLengthListLikeBuilder; public: using TypeClass = TYPE; @@ -221,6 +225,16 @@ class BaseListBuilder : public BaseVarLengthListLikeBuilder { using BASE::BASE; + /// \brief Start a new variable-length list slot + /// + /// This function should be called before beginning to append elements to the + /// value builder + Status Append(bool is_valid = true) { + // The value_length parameter to BASE::Append(bool, int64_t) is ignored when + // building a list array, so we can pass 0 here. 
+ return BASE::Append(is_valid, 0); + } + Status AppendNextOffset() { ARROW_RETURN_NOT_OK(this->ValidateOverflow(0)); const int64_t num_values = this->value_builder_->length(); From 06f9072edbc63335d678ceb71ca4b53dc677044d Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 27 Apr 2023 15:02:59 -0300 Subject: [PATCH 10/91] [Large]ListViewArrayBuilder: Add list-view builder classes --- cpp/src/arrow/array/builder_nested.cc | 14 ++ cpp/src/arrow/array/builder_nested.h | 196 +++++++++++++++++++++++--- cpp/src/arrow/builder.cc | 14 ++ cpp/src/arrow/ipc/json_simple.cc | 17 ++- cpp/src/arrow/type_fwd.h | 2 + cpp/src/arrow/type_traits.h | 14 +- 6 files changed, 228 insertions(+), 29 deletions(-) diff --git a/cpp/src/arrow/array/builder_nested.cc b/cpp/src/arrow/array/builder_nested.cc index fbba1fd0564..5bdc76d96c8 100644 --- a/cpp/src/arrow/array/builder_nested.cc +++ b/cpp/src/arrow/array/builder_nested.cc @@ -30,6 +30,20 @@ namespace arrow { +// ---------------------------------------------------------------------- +// VarLengthListLikeBuilder / BaseListBuilder / BaseListViewBuilder + +template class VarLengthListLikeBuilder; +template class VarLengthListLikeBuilder; +template class VarLengthListLikeBuilder; +template class VarLengthListLikeBuilder; + +template class BaseListBuilder; +template class BaseListBuilder; + +template class BaseListViewBuilder; +template class BaseListViewBuilder; + // ---------------------------------------------------------------------- // MapBuilder diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index c73c83d860b..67fb16cebfa 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -40,7 +40,7 @@ namespace arrow { /// @{ // ---------------------------------------------------------------------- -// List builder +// VarLengthListLikeBuilder template class ARROW_EXPORT VarLengthListLikeBuilder : public ArrayBuilder { @@ -62,7 +62,8 @@ class 
ARROW_EXPORT VarLengthListLikeBuilder : public ArrayBuilder { VarLengthListLikeBuilder(MemoryPool* pool, std::shared_ptr const& value_builder, int64_t alignment = kDefaultBufferAlignment) - : VarLengthListLikeBuilder(pool, value_builder, list(value_builder->type()), + : VarLengthListLikeBuilder(pool, value_builder, + std::make_shared(value_builder->type()), alignment) {} Status Resize(int64_t capacity) override { @@ -73,8 +74,10 @@ class ARROW_EXPORT VarLengthListLikeBuilder : public ArrayBuilder { } ARROW_RETURN_NOT_OK(CheckCapacity(capacity)); - // One more than requested for offsets - ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1)); + // One more than requested for list offsets + const int64_t offsets_capacity = + is_list_view(TYPE::type_id) ? capacity : capacity + 1; + ARROW_RETURN_NOT_OK(offsets_builder_.Resize(offsets_capacity)); return ArrayBuilder::Resize(capacity); } @@ -84,28 +87,18 @@ class ARROW_EXPORT VarLengthListLikeBuilder : public ArrayBuilder { value_builder_->Reset(); } - /// \brief Vector append - /// - /// If passed, valid_bytes is of equal length to values, and any zero byte - /// will be considered as a null for that slot - Status AppendValues(const offset_type* offsets, int64_t length, - const uint8_t* valid_bytes = NULLPTR) { - ARROW_RETURN_NOT_OK(Reserve(length)); - UnsafeAppendToBitmap(valid_bytes, length); - offsets_builder_.UnsafeAppend(offsets, length); - return Status::OK(); - } - /// \brief Start a new variable-length list slot /// /// This function should be called before beginning to append elements to the /// value builder. Elements appended to the value builder before this function is /// called, will not be members of any list value. 
/// - /// \param list_length The number of elements in the list (necessary on - /// list-view builders) + /// \pre if is_valid is false, list_length MUST be 0 + /// \param is_valid Whether the new list slot is valid + /// \param list_length The number of elements in the list Status Append(bool is_valid, int64_t list_length) { ARROW_RETURN_NOT_OK(Reserve(1)); + assert(is_valid || list_length == 0); UnsafeAppendToBitmap(is_valid); UnsafeAppendDimensions(/*offset=*/value_builder_->length(), /*size=*/list_length); return Status::OK(); @@ -129,6 +122,21 @@ class ARROW_EXPORT VarLengthListLikeBuilder : public ArrayBuilder { return Status::OK(); } + /// \brief Vector append + /// + /// For list-array builders, the sizes are inferred from the offsets. + /// BaseListBuilder provides an implementation that doesn't take sizes, but + /// this virtual function allows dispatching calls to both list-array and + /// list-view-array builders (which need the sizes) + /// + /// \param offsets The offsets of the variable-length lists + /// \param sizes The sizes of the variable-length lists + /// \param length The number of offsets, sizes, and validity bits to append + /// \param valid_bytes If passed, valid_bytes is of equal length to values, + /// and any zero byte will be considered as a null for that slot + virtual Status AppendValues(const offset_type* offsets, const offset_type* sizes, + int64_t length, const uint8_t* valid_bytes) = 0; + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { const offset_type* offsets = array.GetValues(1); @@ -214,6 +222,9 @@ class ARROW_EXPORT VarLengthListLikeBuilder : public ArrayBuilder { std::shared_ptr value_field_; }; +// ---------------------------------------------------------------------- +// ListBuilder / LargeListBuilder + template class ARROW_EXPORT BaseListBuilder : public VarLengthListLikeBuilder { private: @@ -225,6 +236,8 @@ class ARROW_EXPORT BaseListBuilder : public VarLengthListLikeBuilder 
{ using BASE::BASE; + using BASE::Append; + /// \brief Start a new variable-length list slot /// /// This function should be called before beginning to append elements to the @@ -235,6 +248,42 @@ class ARROW_EXPORT BaseListBuilder : public VarLengthListLikeBuilder { return BASE::Append(is_valid, 0); } + /// \brief Vector append + /// + /// If passed, valid_bytes is of equal length to values, and any zero byte + /// will be considered as a null for that slot + Status AppendValues(const offset_type* offsets, int64_t length, + const uint8_t* valid_bytes = NULLPTR) { + ARROW_RETURN_NOT_OK(this->Reserve(length)); + this->UnsafeAppendToBitmap(valid_bytes, length); + this->offsets_builder_.UnsafeAppend(offsets, length); + return Status::OK(); + } + + Status AppendValues(const offset_type* offsets, const offset_type* sizes, + int64_t length, const uint8_t* valid_bytes) final { + // offsets are assumed to be valid, but the first length-1 sizes have to be + // consistent with the offsets to rule out the possibility that the caller + // is passing sizes that could work if building a list-view, but don't work + // on building a list that requires offsets to be non-decreasing. 
+ if (sizes) { + for (int64_t i = 0; i < length - 1; ++i) { + if (ARROW_PREDICT_FALSE(offsets[i] != offsets[i + 1] - sizes[i])) { + if (!valid_bytes || valid_bytes[i]) { + return Status::Invalid( + "BaseListBuilder: sizes are inconsistent with offsets provided"); + } + } + } + } + return AppendValues(offsets, length, valid_bytes); + } + + Status AppendValues(const offset_type* offsets, const offset_type* sizes, + int64_t length) { + return AppendValues(offsets, sizes, length, /*valid_bytes=*/NULLPTR); + } + Status AppendNextOffset() { ARROW_RETURN_NOT_OK(this->ValidateOverflow(0)); const int64_t num_values = this->value_builder_->length(); @@ -258,7 +307,8 @@ class ARROW_EXPORT BaseListBuilder : public VarLengthListLikeBuilder { std::shared_ptr items; ARROW_RETURN_NOT_OK(this->value_builder_->FinishInternal(&items)); - *out = ArrayData::Make(this->type(), this->length_, {null_bitmap, offsets}, + *out = ArrayData::Make(this->type(), this->length_, + {std::move(null_bitmap), std::move(offsets)}, {std::move(items)}, this->null_count_); this->Reset(); return Status::OK(); @@ -304,6 +354,114 @@ class ARROW_EXPORT LargeListBuilder : public BaseListBuilder { Status Finish(std::shared_ptr* out) { return FinishTyped(out); } }; +// ---------------------------------------------------------------------- +// ListViewBuilder / LargeListViewBuilder + +template +class ARROW_EXPORT BaseListViewBuilder : public VarLengthListLikeBuilder { + private: + using BASE = VarLengthListLikeBuilder; + + public: + using TypeClass = TYPE; + using offset_type = typename BASE::offset_type; + + using BASE::BASE; + + Status Resize(int64_t capacity) override { + ARROW_RETURN_NOT_OK(BASE::Resize(capacity)); + return sizes_builder_.Resize(capacity); + } + + void Reset() override { + BASE::Reset(); + sizes_builder_.Reset(); + } + + /// \brief Vector append + /// + /// If passed, valid_bytes is of equal length to values, and any zero byte + /// will be considered as a null for that slot + Status 
AppendValues(const offset_type* offsets, const offset_type* sizes, + int64_t length, const uint8_t* valid_bytes) final { + ARROW_RETURN_NOT_OK(this->Reserve(length)); + this->UnsafeAppendToBitmap(valid_bytes, length); + this->offsets_builder_.UnsafeAppend(offsets, length); + this->sizes_builder_.UnsafeAppend(sizes, length); + return Status::OK(); + } + + Status AppendValues(const offset_type* offsets, const offset_type* sizes, + int64_t length) { + return AppendValues(offsets, sizes, length, /*valid_bytes=*/NULLPTR); + } + + Status FinishInternal(std::shared_ptr* out) override { + // Offset and sizes padding zeroed by BufferBuilder + std::shared_ptr null_bitmap; + std::shared_ptr offsets; + std::shared_ptr sizes; + ARROW_RETURN_NOT_OK(this->null_bitmap_builder_.Finish(&null_bitmap)); + ARROW_RETURN_NOT_OK(this->offsets_builder_.Finish(&offsets)); + ARROW_RETURN_NOT_OK(this->sizes_builder_.Finish(&sizes)); + + if (this->value_builder_->length() == 0) { + // Try to make sure we get a non-null values buffer (ARROW-2744) + ARROW_RETURN_NOT_OK(this->value_builder_->Resize(0)); + } + + std::shared_ptr items; + ARROW_RETURN_NOT_OK(this->value_builder_->FinishInternal(&items)); + + *out = ArrayData::Make(this->type(), this->length_, + {std::move(null_bitmap), std::move(offsets), std::move(sizes)}, + {std::move(items)}, this->null_count_); + this->Reset(); + return Status::OK(); + } + + protected: + void UnsafeAppendEmptyDimensions(int64_t num_values) override { + for (int64_t i = 0; i < num_values; ++i) { + this->offsets_builder_.UnsafeAppend(0); + } + for (int64_t i = 0; i < num_values; ++i) { + this->sizes_builder_.UnsafeAppend(0); + } + } + + void UnsafeAppendDimensions(int64_t offset, int64_t size) override { + this->offsets_builder_.UnsafeAppend(static_cast(offset)); + this->sizes_builder_.UnsafeAppend(static_cast(size)); + } + + private: + TypedBufferBuilder sizes_builder_; +}; + +class ARROW_EXPORT ListViewBuilder final : public BaseListViewBuilder { + public: + 
using BaseListViewBuilder::BaseListViewBuilder; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } +}; + +class ARROW_EXPORT LargeListViewBuilder final + : public BaseListViewBuilder { + public: + using BaseListViewBuilder::BaseListViewBuilder; + + /// \cond FALSE + using ArrayBuilder::Finish; + /// \endcond + + Status Finish(std::shared_ptr* out) { return FinishTyped(out); } +}; + // ---------------------------------------------------------------------- // Map builder diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index c7e6207bfef..7042d9818c6 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -221,6 +221,20 @@ struct MakeBuilderImpl { return Status::OK(); } + Status Visit(const ListViewType& list_view_type) { + std::shared_ptr value_type = list_view_type.value_type(); + ARROW_ASSIGN_OR_RAISE(auto value_builder, ChildBuilder(value_type)); + out.reset(new ListViewBuilder(pool, std::move(value_builder), std::move(type))); + return Status::OK(); + } + + Status Visit(const LargeListViewType& large_list_view_type) { + std::shared_ptr value_type = large_list_view_type.value_type(); + ARROW_ASSIGN_OR_RAISE(auto value_builder, ChildBuilder(value_type)); + out.reset(new LargeListViewBuilder(pool, std::move(value_builder), std::move(type))); + return Status::OK(); + } + Status Visit(const MapType& map_type) { ARROW_ASSIGN_OR_RAISE(auto key_builder, ChildBuilder(map_type.key_type())); ARROW_ASSIGN_OR_RAISE(auto item_builder, ChildBuilder(map_type.item_type())); diff --git a/cpp/src/arrow/ipc/json_simple.cc b/cpp/src/arrow/ipc/json_simple.cc index 4d2d803f3f6..d40f0dbbd87 100644 --- a/cpp/src/arrow/ipc/json_simple.cc +++ b/cpp/src/arrow/ipc/json_simple.cc @@ -123,12 +123,16 @@ Status GetConverter(const std::shared_ptr&, std::shared_ptr template class ConcreteConverter : public Converter { public: - Status AppendValues(const rj::Value& json_array) override 
{ - auto self = static_cast(this); - if (!json_array.IsArray()) { - return JSONTypeError("array", json_array.GetType()); + Result SizeOfJSONArray(const rj::Value& json_obj) { + if (!json_obj.IsArray()) { + return JSONTypeError("array", json_obj.GetType()); } - auto size = json_array.Size(); + return json_obj.Size(); + } + + Status AppendValues(const rj::Value& json_array) final { + auto self = static_cast(this); + ARROW_ASSIGN_OR_RAISE(auto size, SizeOfJSONArray(json_array)); for (uint32_t i = 0; i < size; ++i) { RETURN_NOT_OK(self->AppendValue(json_array[i])); } @@ -555,8 +559,9 @@ class ListConverter final : public ConcreteConverter> { if (json_obj.IsNull()) { return this->AppendNull(); } - RETURN_NOT_OK(builder_->Append()); // Extend the child converter with this JSON array + ARROW_ASSIGN_OR_RAISE(auto size, this->SizeOfJSONArray(json_obj)); + RETURN_NOT_OK(builder_->Append(true, size)); return child_converter_->AppendValues(json_obj); } diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 6668073a22a..63eec10bf72 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -152,10 +152,12 @@ struct LargeListScalar; class ListViewType; class ListViewArray; +class ListViewBuilder; struct ListViewScalar; class LargeListViewType; class LargeListViewArray; +class LargeListViewBuilder; struct LargeListViewScalar; class MapType; diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index ad63239f8e8..bf2cc71c745 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -466,18 +466,24 @@ struct TypeTraits { template <> struct TypeTraits { using ArrayType = ListViewArray; - // TODO(felipecrv): Add BuilderType + using BuilderType = ListViewBuilder; using ScalarType = ListViewScalar; - + using OffsetType = Int32Type; + using OffsetArrayType = Int32Array; + using OffsetBuilderType = Int32Builder; + using OffsetScalarType = Int32Scalar; constexpr static bool is_parameter_free = false; }; template <> struct 
TypeTraits { using ArrayType = LargeListViewArray; - // TODO(felipecrv): Add BuilderType + using BuilderType = LargeListViewBuilder; using ScalarType = LargeListViewScalar; - + using OffsetType = Int64Type; + using OffsetArrayType = Int64Array; + using OffsetBuilderType = Int64Builder; + using OffsetScalarType = Int64Scalar; constexpr static bool is_parameter_free = false; }; From 08d541b8d9cb93423be8aa6be567184a7b220fc1 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Mon, 24 Apr 2023 22:30:42 -0300 Subject: [PATCH 11/91] [Large]ListViewArray: Buffers validation, creation from JSON, and basic tests --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/array/array_list_view_test.cc | 83 +++++++++++++++++++++ cpp/src/arrow/ipc/json_simple.cc | 19 +++-- 3 files changed, 97 insertions(+), 6 deletions(-) create mode 100644 cpp/src/arrow/array/array_list_view_test.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 9a611701153..4790b7494d4 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -789,6 +789,7 @@ add_arrow_test(array_test array/array_binary_test.cc array/array_dict_test.cc array/array_list_test.cc + array/array_list_view_test.cc array/array_run_end_test.cc array/array_struct_test.cc array/array_union_test.cc diff --git a/cpp/src/arrow/array/array_list_view_test.cc b/cpp/src/arrow/array/array_list_view_test.cc new file mode 100644 index 00000000000..8f9f9cc1d3a --- /dev/null +++ b/cpp/src/arrow/array/array_list_view_test.cc @@ -0,0 +1,83 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "arrow/array/array_nested.h" +#include "arrow/array/util.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type_fwd.h" +#include "arrow/util/checked_cast.h" + +namespace arrow { + +using internal::checked_cast; + +// ---------------------------------------------------------------------- +// List-view array tests + +namespace { + +class TestListViewArray : public ::testing::Test { + public: + std::shared_ptr string_values; + std::shared_ptr int32_values; + std::shared_ptr int16_values; + + void SetUp() override { + string_values = ArrayFromJSON(utf8(), R"(["Hello", "World", null])"); + int32_values = ArrayFromJSON(int32(), "[1, 20, 3]"); + int16_values = ArrayFromJSON(int16(), "[10, 2, 30]"); + } + + static std::shared_ptr Offsets(std::string_view json) { + return ArrayFromJSON(int32(), json); + } + + static std::shared_ptr Sizes(std::string_view json) { + return ArrayFromJSON(int32(), json); + } +}; + +} // namespace + +TEST_F(TestListViewArray, MakeArray) { + ASSERT_OK_AND_ASSIGN(auto list_view_array, + ListViewArray::FromArrays(*Offsets("[0, 0, 1, 2]"), + *Sizes("[2, 1, 1, 1]"), *string_values)); + auto array_data = list_view_array->data(); + auto new_array = MakeArray(array_data); + ASSERT_ARRAYS_EQUAL(*new_array, *list_view_array); + // Should be the exact same ArrayData object + ASSERT_EQ(new_array->data(), array_data); + ASSERT_NE(std::dynamic_pointer_cast(new_array), NULLPTR); +} + +TEST_F(TestListViewArray, FromOffsetsAndSizes) { + std::shared_ptr list_view_array; + + ASSERT_OK_AND_ASSIGN(list_view_array, 
ListViewArray::FromArrays( + *Offsets("[0, 0, 1, 1000]"), + *Sizes("[2, 1, 1, null]"), *int32_values)); + ASSERT_EQ(list_view_array->length(), 4); + ASSERT_ARRAYS_EQUAL(*list_view_array->values(), *int32_values); + ASSERT_EQ(list_view_array->offset(), 0); + ASSERT_EQ(list_view_array->data()->GetNullCount(), 1); + ASSERT_EQ(list_view_array->data()->buffers.size(), 3); +} + +} // namespace arrow diff --git a/cpp/src/arrow/ipc/json_simple.cc b/cpp/src/arrow/ipc/json_simple.cc index d40f0dbbd87..ceeabe01677 100644 --- a/cpp/src/arrow/ipc/json_simple.cc +++ b/cpp/src/arrow/ipc/json_simple.cc @@ -540,15 +540,19 @@ class FixedSizeBinaryConverter final // Converter for list arrays template -class ListConverter final : public ConcreteConverter> { +class VarLengthListLikeConverter final + : public ConcreteConverter> { public: using BuilderType = typename TypeTraits::BuilderType; - explicit ListConverter(const std::shared_ptr& type) { this->type_ = type; } + explicit VarLengthListLikeConverter(const std::shared_ptr& type) { + this->type_ = type; + } Status Init() override { - const auto& list_type = checked_cast(*this->type_); - RETURN_NOT_OK(GetConverter(list_type.value_type(), &child_converter_)); + const auto& var_length_list_like_type = checked_cast(*this->type_); + RETURN_NOT_OK( + GetConverter(var_length_list_like_type.value_type(), &child_converter_)); auto child_builder = child_converter_->builder(); builder_ = std::make_shared(default_memory_pool(), child_builder, this->type_); @@ -903,8 +907,11 @@ Status GetConverter(const std::shared_ptr& type, SIMPLE_CONVERTER_CASE(Type::HALF_FLOAT, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::FLOAT, FloatConverter) SIMPLE_CONVERTER_CASE(Type::DOUBLE, FloatConverter) - SIMPLE_CONVERTER_CASE(Type::LIST, ListConverter) - SIMPLE_CONVERTER_CASE(Type::LARGE_LIST, ListConverter) + SIMPLE_CONVERTER_CASE(Type::LIST, VarLengthListLikeConverter) + SIMPLE_CONVERTER_CASE(Type::LARGE_LIST, VarLengthListLikeConverter) + 
SIMPLE_CONVERTER_CASE(Type::LIST_VIEW, VarLengthListLikeConverter) + SIMPLE_CONVERTER_CASE(Type::LARGE_LIST_VIEW, + VarLengthListLikeConverter) SIMPLE_CONVERTER_CASE(Type::MAP, MapConverter) SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_LIST, FixedSizeListConverter) SIMPLE_CONVERTER_CASE(Type::STRUCT, StructConverter) From 1d04ed83528cbbee798beb13a1d0a98d60527b37 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 28 Apr 2023 12:36:07 -0300 Subject: [PATCH 12/91] [Large]ListViewScalar: Implement all operations --- cpp/src/arrow/array/builder_base.cc | 9 ++++-- cpp/src/arrow/array/util.cc | 46 ++++++++++++++++++----------- cpp/src/arrow/compare.cc | 8 +++-- cpp/src/arrow/scalar.cc | 42 ++++++++------------------ cpp/src/arrow/scalar_test.cc | 8 +++++ 5 files changed, 62 insertions(+), 51 deletions(-) diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index d3502a0ab64..40e705aa3e4 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -150,7 +150,8 @@ struct AppendScalarImpl { } template - enable_if_list_like Visit(const T&) { + enable_if_t::value || is_list_like_type::value, Status> Visit( + const T&) { auto builder = checked_cast::BuilderType*>(builder_); int64_t num_children = 0; for (auto it = scalars_begin_; it != scalars_end_; ++it) { @@ -162,8 +163,12 @@ struct AppendScalarImpl { for (int64_t i = 0; i < n_repeats_; i++) { for (auto it = scalars_begin_; it != scalars_end_; ++it) { if (it->is_valid) { - RETURN_NOT_OK(builder->Append()); const Array& list = *checked_cast(*it).value; + if constexpr (T::type_id == Type::MAP || T::type_id == Type::FIXED_SIZE_LIST) { + RETURN_NOT_OK(builder->Append()); + } else { + RETURN_NOT_OK(builder->Append(/*is_valid=*/true, list.length())); + } for (int64_t i = 0; i < list.length(); i++) { ARROW_ASSIGN_OR_RAISE(auto scalar, list.GetScalar(i)); RETURN_NOT_OK(builder->value_builder()->AppendScalar(*scalar)); diff --git 
a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 3b6fd01204f..cfcdadfa9a4 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -134,7 +134,6 @@ class ArrayDataEndianSwapper { out_->buffers[index] = data_->buffers[index]; return Status::OK(); } - // Except union, offset has one more element rather than data->length ARROW_ASSIGN_OR_RAISE(out_->buffers[index], ByteSwapBuffer(data_->buffers[index])); return Status::OK(); @@ -389,6 +388,12 @@ class NullArrayFactory { return Status::OK(); } + template + enable_if_list_view Visit(const T&) { + buffer_length_ = length_ * sizeof(typename T::offset_type); + return Status::OK(); + } + template enable_if_base_binary Visit(const T&) { // values buffer may be empty, but there must be at least one offset of 0 @@ -524,20 +529,12 @@ class NullArrayFactory { } template - enable_if_var_size_list Visit(const T& type) { - out_->buffers.resize(2, buffer_); + enable_if_var_length_list_like Visit(const T& type) { + out_->buffers.resize(is_list_view(T::type_id) ? 
3 : 2, buffer_); ARROW_ASSIGN_OR_RAISE(out_->child_data[0], CreateChild(type, 0, /*length=*/0)); return Status::OK(); } - Status Visit(const ListViewType& type) { - return Status::NotImplemented("construction of all-null ", type); - } - - Status Visit(const LargeListViewType& type) { - return Status::NotImplemented("construction of all-null ", type); - } - Status Visit(const FixedSizeListType& type) { ARROW_ASSIGN_OR_RAISE(out_->child_data[0], CreateChild(type, 0, length_ * type.list_size())); @@ -712,18 +709,26 @@ class RepeatedArrayFactory { std::shared_ptr offsets_buffer; auto size = static_cast(scalar().value->length()); RETURN_NOT_OK(CreateOffsetsBuffer(size, &offsets_buffer)); - out_ = std::make_shared(scalar_.type, length_, offsets_buffer, value_array); return Status::OK(); } - Status Visit(const ListViewType& type) { - return Status::NotImplemented("construction from scalar of type ", *scalar_.type); - } + template + enable_if_list_view Visit(const T& type) { + using ScalarType = typename TypeTraits::ScalarType; + using ArrayType = typename TypeTraits::ArrayType; - Status Visit(const LargeListViewType& type) { - return Status::NotImplemented("construction from scalar of type ", *scalar_.type); + auto value = checked_cast(scalar_).value; + + auto size = static_cast(value->length()); + std::shared_ptr offsets_buffer; + std::shared_ptr sizes_buffer; + RETURN_NOT_OK(CreateIntBuffer(0, &offsets_buffer)); + RETURN_NOT_OK(CreateIntBuffer(size, &sizes_buffer)); + out_ = std::make_shared(scalar_.type, length_, std::move(offsets_buffer), + std::move(sizes_buffer), value); + return Status::OK(); } Status Visit(const FixedSizeListType& type) { @@ -875,6 +880,13 @@ class RepeatedArrayFactory { return builder.Finish(out); } + template + Status CreateIntBuffer(IntType value, std::shared_ptr* out) { + TypedBufferBuilder builder(pool_); + RETURN_NOT_OK(builder.Append(/*num_copies=*/length_, value)); + return builder.Finish(out); + } + Status CreateBufferOf(const void* data, 
size_t data_length, std::shared_ptr* out) { BufferBuilder builder(pool_); diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 9869688ef70..2578f8cf0e4 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -867,11 +867,15 @@ class ScalarEqualsVisitor { } Status Visit(const ListViewScalar& left) { - return Status::NotImplemented("list-view comparison"); + const auto& right = checked_cast(right_); + result_ = ArrayEquals(*left.value, *right.value, options_, floating_approximate_); + return Status::OK(); } Status Visit(const LargeListViewScalar& left) { - return Status::NotImplemented("large list-view comparison"); + const auto& right = checked_cast(right_); + result_ = ArrayEquals(*left.value, *right.value, options_, floating_approximate_); + return Status::OK(); } Status Visit(const MapScalar& left) { diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc index 4691a66b7b7..6996b46c8b6 100644 --- a/cpp/src/arrow/scalar.cc +++ b/cpp/src/arrow/scalar.cc @@ -62,14 +62,6 @@ namespace { // Implementation of Scalar::hash() struct ScalarHashImpl { - Status Visit(const ListViewScalar& s) { - return Status::NotImplemented("list-view scalar hashing"); - } - - Status Visit(const LargeListViewScalar& s) { - return Status::NotImplemented("large list-view scalar hashing"); - } - Status Visit(const NullScalar& s) { return Status::OK(); } template @@ -334,14 +326,6 @@ struct ScalarValidateImpl { return Status::OK(); } - Status Visit(const ListViewScalar& s) { - return Status::NotImplemented("list-view scalar validation"); - } - - Status Visit(const LargeListViewScalar& s) { - return Status::NotImplemented("large list-view scalar validation"); - } - Status Visit(const FixedSizeListScalar& s) { RETURN_NOT_OK(Visit(static_cast(s))); const auto& list_type = checked_cast(*s.type); @@ -798,14 +782,6 @@ struct MakeNullImpl { return Status::OK(); } - template ::ScalarType> - Status VisitListLike(const T& type, int64_t value_size = 0) { - 
ARROW_ASSIGN_OR_RAISE(std::shared_ptr value, - MakeArrayOfNull(type.value_type(), value_size)); - out_ = std::make_shared(std::move(value), type_, /*is_valid=*/false); - return Status::OK(); - } - Status Visit(const FixedSizeBinaryType& type) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr value, AllocateBuffer(type.byte_width())); @@ -816,18 +792,24 @@ struct MakeNullImpl { return Status::OK(); } - Status Visit(const ListType& type) { return VisitListLike(type); } + template ::ScalarType> + Status VisitListLike(const T& type, int64_t list_size = 0) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr value, + MakeArrayOfNull(type.value_type(), list_size)); + out_ = std::make_shared(std::move(value), type_, /*is_valid=*/false); + return Status::OK(); + } - Status Visit(const MapType& type) { return VisitListLike(type); } + Status Visit(const ListType& type) { return VisitListLike(type); } Status Visit(const LargeListType& type) { return VisitListLike(type); } - Status Visit(const ListViewType& type) { - return Status::NotImplemented("making null array of list-view"); - } + Status Visit(const MapType& type) { return VisitListLike(type); } + + Status Visit(const ListViewType& type) { return VisitListLike(type); } Status Visit(const LargeListViewType& type) { - return Status::NotImplemented("making null array of large list-view"); + return VisitListLike(type); } Status Visit(const FixedSizeListType& type) { diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index a188aea1669..97260aed91c 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -394,6 +394,10 @@ class TestRealScalar : public ::testing::Test { void TestLargeListOf() { TestListOf(large_list(type_)); } + void TestListViewOf() { TestListOf(list_view(type_)); } + + void TestLargeListViewOf() { TestListOf(large_list_view(type_)); } + protected: std::shared_ptr type_; std::shared_ptr scalar_val_, scalar_other_, scalar_nan_, scalar_other_nan_, @@ -414,6 +418,10 @@ 
TYPED_TEST(TestRealScalar, ListOf) { this->TestListOf(); } TYPED_TEST(TestRealScalar, LargeListOf) { this->TestLargeListOf(); } +TYPED_TEST(TestRealScalar, ListViewOf) { this->TestListViewOf(); } + +TYPED_TEST(TestRealScalar, LargeListViewOf) { this->TestLargeListViewOf(); } + template class TestDecimalScalar : public ::testing::Test { public: From 5bfd1696faff9b7016ce26bdb94210b92ed1a5b5 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 28 Apr 2023 15:08:33 -0300 Subject: [PATCH 13/91] [Large]ListViewArray: Implement Validate --- cpp/src/arrow/array/validate.cc | 168 +++++++++++++++++++++++++------- 1 file changed, 132 insertions(+), 36 deletions(-) diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 802ce64b260..35aa9548274 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -23,7 +23,7 @@ #include "arrow/extension_type.h" #include "arrow/type.h" #include "arrow/type_traits.h" -#include "arrow/util/bit_block_counter.h" +#include "arrow/util/bit_run_reader.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" @@ -264,19 +264,14 @@ struct ValidateArrayImpl { Status Visit(const LargeListType& type) { return ValidateListLike(type); } - Status Visit(const ListViewType& type) { - return Status::NotImplemented("ListViewType validation not implemented"); - } - - Status Visit(const LargeListViewType& type) { - return Status::NotImplemented("LargeListViewType validation not implemented"); - } - Status Visit(const MapType& type) { RETURN_NOT_OK(ValidateListLike(type)); return MapArray::ValidateChildData(data.child_data); } + Status Visit(const ListViewType& type) { return ValidateListView(type); } + Status Visit(const LargeListViewType& type) { return ValidateListView(type); } + Status Visit(const FixedSizeListType& type) { const ArrayData& values = *data.child_data[0]; const int64_t list_size = type.list_size(); @@ -743,6 +738,17 @@ 
struct ValidateArrayImpl { return Status::OK(); } + template + Status ValidateListView(const ListViewType& type) { + const ArrayData& values = *data.child_data[0]; + const Status child_valid = RecurseInto(values); + if (!child_valid.ok()) { + return Status::Invalid("List-view child array invalid: ", child_valid.ToString()); + } + // For list-views, sizes are validated together with offsets. + return ValidateOffsetsAndSizes(type, values.offset + values.length); + } + template Status ValidateRunEndEncoded(const RunEndEncodedType& type) { if (data.child_data.size() != 2) { @@ -805,20 +811,102 @@ struct ValidateArrayImpl { return Status::OK(); } + private: + /// \pre basic validation has already been performed + template + Status FullyValidateOffsets(int64_t offset_limit) { + const auto* offsets = data.GetValues(1); + auto prev_offset = offsets[0]; + if (prev_offset < 0) { + return Status::Invalid("Offset invariant failure: array starts at negative offset ", + prev_offset); + } + for (int64_t i = 1; i <= data.length; ++i) { + const auto current_offset = offsets[i]; + if (current_offset < prev_offset) { + return Status::Invalid("Offset invariant failure: non-monotonic offset at slot ", + i, ": ", current_offset, " < ", prev_offset); + } + if (current_offset > offset_limit) { + return Status::Invalid("Offset invariant failure: offset for slot ", i, + " out of bounds: ", current_offset, " > ", offset_limit); + } + prev_offset = current_offset; + } + return Status::OK(); + } + + template + Status OutOfBoundsListViewOffset(int64_t slot, int64_t offset_limit) { + const auto* offsets = data.GetValues(1); + const auto offset = offsets[slot]; + return Status::Invalid("Offset invariant failure: offset for slot ", slot, + " out of bounds. 
Expected ", offset, + " to be at least 0 and less than ", offset_limit); + } + + template + Status OutOfBoundsListViewSize(int64_t slot, int64_t offset_limit) { + const auto* offsets = data.GetValues(1); + const auto* sizes = data.GetValues(2); + const auto size = sizes[slot]; + if (size < 0) { + return Status::Invalid("Offset invariant failure: size for slot ", slot, + " out of bounds: ", size, " < 0"); + } else { + const auto offset = offsets[slot]; + return Status::Invalid("Offset invariant failure: size for slot ", slot, + " out of bounds: ", offset, " + ", size, " > ", + offset_limit); + } + } + + /// \pre basic validation has already been performed + template + Status FullyValidateOffsetsAndSizes(int64_t offset_limit) { + const auto* validity = data.GetValues(0, 0); + const auto* offsets = data.GetValues(1); + const auto* sizes = data.GetValues(2); + + return VisitSetBitRuns( + validity, data.offset, data.length, [&](int64_t run_start, int64_t run_length) { + for (int64_t i = 0; i < run_length; ++i) { + auto slot = run_start + i; + const auto size = sizes[slot]; + if (size > 0) { + const auto offset = offsets[slot]; + if (offset < 0 || offset > offset_limit) { + return OutOfBoundsListViewOffset(slot, offset_limit); + } + if (size > offset_limit - offset) { + return OutOfBoundsListViewSize(slot, offset_limit); + } + } else if (size < 0) { + return OutOfBoundsListViewSize(slot, offset_limit); + } + } + return Status::OK(); + }); + } + template - Status ValidateOffsets(const TypeClass& type, int64_t offset_limit) { + Status ValidateOffsetsAndMaybeSizes(const TypeClass&, int64_t offset_limit) { using offset_type = typename TypeClass::offset_type; + constexpr bool is_list_view = is_list_view_type::value; + const bool non_empty = data.length > 0; if (!IsBufferValid(1)) { - // For length 0, an empty offsets buffer seems accepted as a special case - // (ARROW-544) - if (data.length > 0) { - return Status::Invalid("Non-empty array but offsets are null"); + // For 
length 0, an empty offsets buffer is accepted (ARROW-544). + return non_empty ? Status::Invalid("Non-empty array but offsets are null") + : Status::OK(); + } + if constexpr (is_list_view) { + if (!IsBufferValid(2)) { + return non_empty ? Status::Invalid("Non-empty array but sizes are null") + : Status::OK(); } - return Status::OK(); } - // An empty list array can have 0 offsets const auto offsets_byte_size = data.buffers[1]->size(); const auto required_offsets = ((data.length > 0) || (offsets_byte_size > 0)) ? data.length + data.offset + 1 @@ -829,33 +917,41 @@ struct ValidateArrayImpl { " isn't large enough for length: ", data.length, " and offset: ", data.offset); } + if constexpr (is_list_view) { + const auto required_sizes = data.length + data.offset; + const auto sizes_bytes_size = data.buffers[2]->size(); + if (sizes_bytes_size / static_cast(sizeof(offset_type)) < required_sizes) { + return Status::Invalid("Sizes buffer size (bytes): ", sizes_bytes_size, + " isn't large enough for length: ", data.length, + " and offset: ", data.offset); + } + } if (full_validation && required_offsets > 0) { - // Validate all offset values - const offset_type* offsets = data.GetValues(1); - - auto prev_offset = offsets[0]; - if (prev_offset < 0) { - return Status::Invalid( - "Offset invariant failure: array starts at negative offset ", prev_offset); - } - for (int64_t i = 1; i <= data.length; ++i) { - const auto current_offset = offsets[i]; - if (current_offset < prev_offset) { - return Status::Invalid( - "Offset invariant failure: non-monotonic offset at slot ", i, ": ", - current_offset, " < ", prev_offset); - } - if (current_offset > offset_limit) { - return Status::Invalid("Offset invariant failure: offset for slot ", i, - " out of bounds: ", current_offset, " > ", offset_limit); - } - prev_offset = current_offset; + if constexpr (is_list_view) { + return FullyValidateOffsetsAndSizes(offset_limit); + } else { + return FullyValidateOffsets(offset_limit); } } return 
Status::OK(); } + public: + template + enable_if_list_view ValidateOffsetsAndSizes(const TypeClass& type, + int64_t offset_limit) { + return ValidateOffsetsAndMaybeSizes(type, offset_limit); + } + + template + std::enable_if_t::value || + is_base_binary_like(TypeClass::type_id), + Status> + ValidateOffsets(const TypeClass& type, int64_t offset_limit) { + return ValidateOffsetsAndMaybeSizes(type, offset_limit); + } + template Status ValidateDecimals(const DecimalType& type) { using CType = typename TypeTraits::CType; From 40e9e5f84cabfaa68cb0a3eaba52ac43ce6b665c Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 11 Jul 2023 22:33:44 -0300 Subject: [PATCH 14/91] [Large]ListViewArray: Implement Flatten() --- cpp/src/arrow/array/array_nested.cc | 95 +++++++++++++++++++++++++++++ cpp/src/arrow/array/array_nested.h | 10 ++- 2 files changed, 99 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index 19739f7e015..e9193c9708a 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -260,6 +260,92 @@ Result> FlattenListArray(const ListArrayT& list_array, return Concatenate(non_null_fragments, memory_pool); } +template +Result> FlattenListViewArray(const ListViewArrayT& list_view_array, + MemoryPool* memory_pool) { + using offset_type = typename ListViewArrayT::offset_type; + const int64_t list_view_array_length = list_view_array.length(); + std::shared_ptr value_array = list_view_array.values(); + + if (list_view_array_length == 0) { + return SliceArrayWithOffsets(*value_array, 0, 0); + } + + // If the list array is *all* nulls, then just return an empty array. 
+ if (list_view_array.null_count() == list_view_array.length()) { + return MakeEmptyArray(value_array->type(), memory_pool); + } + + const auto* validity = list_view_array.data()->template GetValues(0); + const auto* offsets = list_view_array.data()->template GetValues(1); + const auto* sizes = list_view_array.data()->template GetValues(2); + + // If a ListViewArray: + // + // 1) does not contain nulls + // 2) has sorted offsets + // 3) has disjoint views which completely cover the values array + // + // then simply slice its value array with the first offset and end of the last list + // view. + if (list_view_array.null_count() == 0) { + bool sorted_and_disjoint = true; + for (int64_t i = 1; sorted_and_disjoint && i < list_view_array_length; ++i) { + sorted_and_disjoint &= + sizes[i - 1] == 0 || offsets[i] - offsets[i - 1] == sizes[i - 1]; + } + + if (sorted_and_disjoint) { + const auto begin_offset = list_view_array.value_offset(0); + const auto end_offset = list_view_array.value_offset(list_view_array_length - 1) + + list_view_array.value_length(list_view_array_length - 1); + return SliceArrayWithOffsets(*value_array, begin_offset, end_offset); + } + } + + std::vector> non_null_fragments; + // Index of first valid, non-empty list-view and last offset + // of the current contiguous fragment in values. 
+ int64_t first_i = -1; + offset_type end_offset = -1; + int64_t i = 0; + for (; i < list_view_array_length; i++) { + if ((validity && !bit_util::GetBit(validity, i)) || sizes[i] == 0) { + continue; + } + first_i = i; + end_offset = offsets[i] + sizes[i]; + break; + } + i += 1; + for (; i < list_view_array_length; i++) { + if ((validity && !bit_util::GetBit(validity, i)) || sizes[i] == 0) { + continue; + } + if (offsets[i] == end_offset) { + end_offset += sizes[i]; + } else { + non_null_fragments.push_back( + SliceArrayWithOffsets(*value_array, offsets[first_i], end_offset)); + first_i = i; + end_offset = offsets[i] + sizes[i]; + } + } + if (first_i >= 0) { + non_null_fragments.push_back( + SliceArrayWithOffsets(*value_array, offsets[first_i], end_offset)); + } + + // Final attempt to avoid invoking Concatenate(). + if (non_null_fragments.size() == 1) { + return non_null_fragments[0]; + } else if (non_null_fragments.size() == 0) { + return MakeEmptyArray(value_array->type(), memory_pool); + } + + return Concatenate(non_null_fragments, memory_pool); +} + std::shared_ptr BoxOffsets(const std::shared_ptr& boxed_type, const ArrayData& data) { const int64_t num_offsets = @@ -457,6 +543,10 @@ Result> ListViewArray::FromArrays( pool, null_bitmap, null_count); } +Result> ListViewArray::Flatten(MemoryPool* memory_pool) const { + return FlattenListViewArray(*this, memory_pool); +} + std::shared_ptr ListViewArray::offsets() const { return BoxOffsets(int32(), *data_); } @@ -510,6 +600,11 @@ Result> LargeListViewArray::FromArrays( std::move(type), offsets, sizes, values, pool, null_bitmap, null_count); } +Result> LargeListViewArray::Flatten( + MemoryPool* memory_pool) const { + return FlattenListViewArray(*this, memory_pool); +} + std::shared_ptr LargeListViewArray::offsets() const { return BoxOffsets(int64(), *data_); } diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index 6987b4950b7..e14a374251a 100644 --- 
a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -306,9 +306,8 @@ class ARROW_EXPORT ListViewArray : public BaseListViewArray { /// Note that it's different from `values()` in that it takes into /// consideration this array's offsets (which can be in any order) /// and sizes. Nulls are skipped. - /// TODO: implement ListViewArray::Flatten - // Result> Flatten( - // MemoryPool* memory_pool = default_memory_pool()) const; + Result> Flatten( + MemoryPool* memory_pool = default_memory_pool()) const; /// \brief Return list-view offsets as an Int32Array /// @@ -381,9 +380,8 @@ class ARROW_EXPORT LargeListViewArray : public BaseListViewArray> Flatten( - // MemoryPool* memory_pool = default_memory_pool()) const; + Result> Flatten( + MemoryPool* memory_pool = default_memory_pool()) const; /// \brief Return list-view offsets as an Int64Array std::shared_ptr offsets() const; From 7e3f5383a7bde90c32977a679cf5593a7a96aedb Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 26 Jul 2023 11:10:13 -0300 Subject: [PATCH 15/91] [Large]ListViewArray: Implement Compare + most of the unit tests --- cpp/src/arrow/array/array_list_test.cc | 322 +++++++++++++++++++++---- cpp/src/arrow/array/array_test.cc | 4 + cpp/src/arrow/compare.cc | 40 ++- cpp/src/arrow/testing/random.cc | 171 +++++++++++-- cpp/src/arrow/testing/random.h | 19 ++ cpp/src/arrow/testing/random_test.cc | 78 +++++- 6 files changed, 555 insertions(+), 79 deletions(-) diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index a3a2f99851b..0ca7c5d1f37 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -41,10 +41,11 @@ namespace arrow { using internal::checked_cast; using internal::checked_pointer_cast; -using ListTypes = ::testing::Types; +using ListAndListViewTypes = + ::testing::Types; // ---------------------------------------------------------------------- -// List tests +// List and 
ListView tests template class TestListArray : public ::testing::Test { @@ -57,7 +58,9 @@ class TestListArray : public ::testing::Test { using OffsetArrayType = typename TypeTraits::OffsetArrayType; using OffsetBuilderType = typename TypeTraits::OffsetBuilderType; - void SetUp() { + static constexpr bool kTypeClassIsListView = is_list_view_type::value; + + void SetUp() override { value_type_ = int16(); type_ = std::make_shared(value_type_); @@ -72,8 +75,10 @@ class TestListArray : public ::testing::Test { result_ = std::dynamic_pointer_cast(out); } - void ValidateBasicListArray(const ArrayType* result, const std::vector& values, - const std::vector& is_valid) { + private: + void DoValidateBasicListArray(const ArrayType* result, + const std::vector& values, + const std::vector& is_valid) { ASSERT_OK(result->ValidateFull()); ASSERT_EQ(1, result->null_count()); ASSERT_EQ(0, result->values()->null_count()); @@ -108,6 +113,58 @@ class TestListArray : public ::testing::Test { result_->raw_value_offsets()[result->length()]); } + void DoValidateBasicListViewArray(const ArrayType* result, + const std::vector& values, + const std::vector& is_valid) { + ASSERT_OK(result->ValidateFull()); + ASSERT_EQ(1, result->null_count()); + ASSERT_EQ(0, result->values()->null_count()); + + ASSERT_EQ(3, result->length()); + std::vector ex_offsets = {0, 3, 3}; + std::vector ex_sizes = {3, 0}; + for (size_t i = 0; i < ex_sizes.size(); ++i) { + ASSERT_EQ(ex_offsets[i], result->value_offset(i)); + ASSERT_EQ(ex_sizes[i], result->value_length(i)); + } + ASSERT_EQ(ex_offsets[ex_sizes.size()], result->value_offset(ex_sizes.size())); + + for (int i = 0; i < result->length(); ++i) { + ASSERT_EQ(is_valid[i] == 0, result->IsNull(i)); + } + + ASSERT_EQ(7, result->values()->length()); + auto varr = std::dynamic_pointer_cast(result->values()); + + for (size_t i = 0; i < values.size(); ++i) { + ASSERT_EQ(values[i], varr->Value(i)); + } + + auto offsets = std::dynamic_pointer_cast(result->offsets()); + auto 
sizes = std::dynamic_pointer_cast(result->sizes()); + ASSERT_EQ(offsets->length(), result->length()); + ASSERT_EQ(offsets->null_count(), 0); + AssertTypeEqual(*offsets->type(), OffsetType()); + ASSERT_EQ(sizes->length(), result->length()); + ASSERT_EQ(sizes->null_count(), 0); + AssertTypeEqual(*sizes->type(), OffsetType()); + + for (int64_t i = 0; i < result->length(); ++i) { + ASSERT_EQ(offsets->Value(i), result_->raw_value_offsets()[i]); + ASSERT_EQ(sizes->Value(i), result_->raw_value_sizes()[i]); + } + } + + void ValidateBasicListArray(const ArrayType* result, const std::vector& values, + const std::vector& is_valid) { + if constexpr (kTypeClassIsListView) { + return DoValidateBasicListViewArray(result, values, is_valid); + } else { + return DoValidateBasicListArray(result, values, is_valid); + } + } + + public: void TestBasics() { std::vector values = {0, 1, 2, 3, 4, 5, 6}; std::vector lengths = {3, 0, 4}; @@ -120,7 +177,7 @@ class TestListArray : public ::testing::Test { int pos = 0; for (size_t i = 0; i < lengths.size(); ++i) { - ASSERT_OK(builder_->Append(is_valid[i] > 0)); + ASSERT_OK(builder_->Append(is_valid[i] > 0, lengths[i])); for (int j = 0; j < lengths[i]; ++j) { ASSERT_OK(vb->Append(values[pos++])); } @@ -133,25 +190,29 @@ class TestListArray : public ::testing::Test { void TestEquality() { auto vb = checked_cast(builder_->value_builder()); - std::shared_ptr array, equal_array, unequal_array; + std::shared_ptr array, equal_array; std::vector equal_offsets = {0, 1, 2, 5, 6, 7, 8, 10}; + std::vector equal_sizes = {1, 1, 3, 1, 1, 1, 2, 0}; std::vector equal_values = {1, 2, 3, 4, 5, 2, 2, 2, 5, 6}; + + std::shared_ptr unequal_array; std::vector unequal_offsets = {0, 1, 4, 7}; + std::vector unequal_sizes = {1, 3, 3, 0}; std::vector unequal_values = {1, 2, 2, 2, 3, 4, 5}; - // setup two equal arrays - ASSERT_OK(builder_->AppendValues(equal_offsets.data(), equal_offsets.size())); + ASSERT_OK(builder_->AppendValues(equal_offsets.data(), equal_sizes.data(), 
+ equal_offsets.size())); ASSERT_OK(vb->AppendValues(equal_values.data(), equal_values.size())); - ASSERT_OK(builder_->Finish(&array)); - ASSERT_OK(builder_->AppendValues(equal_offsets.data(), equal_offsets.size())); - ASSERT_OK(vb->AppendValues(equal_values.data(), equal_values.size())); + ASSERT_OK(builder_->AppendValues(equal_offsets.data(), equal_sizes.data(), + equal_offsets.size())); + ASSERT_OK(vb->AppendValues(equal_values.data(), equal_values.size())); ASSERT_OK(builder_->Finish(&equal_array)); - // now an unequal one - ASSERT_OK(builder_->AppendValues(unequal_offsets.data(), unequal_offsets.size())); - ASSERT_OK(vb->AppendValues(unequal_values.data(), unequal_values.size())); + ASSERT_OK(builder_->AppendValues(unequal_offsets.data(), unequal_sizes.data(), + unequal_offsets.size())); + ASSERT_OK(vb->AppendValues(unequal_values.data(), unequal_values.size())); ASSERT_OK(builder_->Finish(&unequal_array)); // Test array equality @@ -197,16 +258,37 @@ class TestListArray : public ::testing::Test { EXPECT_FALSE(left->Slice(offset)->Equals(right->Slice(offset))); } - void TestFromArraysWithNullBitMap() { - std::shared_ptr offsets_w_nulls, offsets_wo_nulls, values; + private: + Result> FromArrays(const Array& offsets, const Array& sizes, + const Array& values, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount) { + if constexpr (kTypeClassIsListView) { + return ArrayType::FromArrays(offsets, sizes, values, pool_, null_bitmap, + null_count); + } else { + return ArrayType::FromArrays(offsets, values, pool_, null_bitmap, null_count); + } + } + + void TestFromArraysWithNullBitmap() { + std::shared_ptr offsets_w_nulls, offsets_wo_nulls; + std::shared_ptr sizes_w_nulls, sizes_wo_nulls; + std::shared_ptr values; std::vector offsets = {0, 1, 1, 3, 4}; + std::vector sizes = {1, 0, 2, 1}; std::vector offsets_w_nulls_is_valid = {true, false, true, true, true}; + std::vector sizes_w_nulls_is_valid = {true, false, true, true}; 
ArrayFromVector(offsets_w_nulls_is_valid, offsets, &offsets_w_nulls); ArrayFromVector(offsets, &offsets_wo_nulls); + ArrayFromVector(sizes_w_nulls_is_valid, sizes, + &sizes_w_nulls); + ArrayFromVector(sizes, &sizes_wo_nulls); + auto type = std::make_shared(int32()); auto expected = std::dynamic_pointer_cast( ArrayFromJSON(type, "[[0], null, [0, null], [0]]")); @@ -214,29 +296,41 @@ class TestListArray : public ::testing::Test { // Offsets with nulls will match. ASSERT_OK_AND_ASSIGN(auto result, - ArrayType::FromArrays(*offsets_w_nulls, *values, pool_)); + FromArrays(*offsets_w_nulls, *sizes_wo_nulls, *values)); ASSERT_OK(result->ValidateFull()); AssertArraysEqual(*result, *expected); // Offets without nulls, will replace null with empty list - ASSERT_OK_AND_ASSIGN(result, - ArrayType::FromArrays(*offsets_wo_nulls, *values, pool_)); + ASSERT_OK_AND_ASSIGN(result, FromArrays(*offsets_wo_nulls, *sizes_wo_nulls, *values)); ASSERT_OK(result->ValidateFull()); AssertArraysEqual(*result, *std::dynamic_pointer_cast( ArrayFromJSON(type, "[[0], [], [0, null], [0]]"))); // Specify non-null offsets with null_bitmap - ASSERT_OK_AND_ASSIGN(result, ArrayType::FromArrays(*offsets_wo_nulls, *values, pool_, - expected->null_bitmap())); + ASSERT_OK_AND_ASSIGN(result, FromArrays(*offsets_wo_nulls, *sizes_wo_nulls, *values, + expected->null_bitmap())); ASSERT_OK(result->ValidateFull()); AssertArraysEqual(*result, *expected); // Cannot specify both null offsets with null_bitmap - ASSERT_RAISES(Invalid, ArrayType::FromArrays(*offsets_w_nulls, *values, pool_, - expected->null_bitmap())); + ASSERT_RAISES(Invalid, FromArrays(*offsets_w_nulls, *sizes_wo_nulls, *values, + expected->null_bitmap())); + + if constexpr (kTypeClassIsListView) { + // Sizes with nulls will match. 
+ ASSERT_OK_AND_ASSIGN(auto result, + FromArrays(*offsets_wo_nulls, *sizes_w_nulls, *values)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*result, *expected); + + // Cannot specify both null sizes with null_bitmap + ASSERT_RAISES(Invalid, FromArrays(*offsets_wo_nulls, *sizes_w_nulls, *values, + expected->null_bitmap())); + } } - void TestFromArraysWithSlicedOffsets() { + template + std::enable_if_t TestFromArraysWithSlicedOffsets() { std::vector offsets = {-1, -1, 0, 1, 2, 4}; std::shared_ptr offsets_wo_nulls; @@ -261,7 +355,8 @@ class TestListArray : public ::testing::Test { AssertArraysEqual(*result, *expected->Slice(1, 2)); } - void TestFromArraysWithSlicedNullOffsets() { + template + std::enable_if_t TestFromArraysWithSlicedNullOffsets() { std::vector offsets = {-1, -1, 0, 1, 1, 3}; std::vector offsets_w_nulls_is_valid = {true, true, true, false, true, true}; @@ -288,7 +383,17 @@ class TestListArray : public ::testing::Test { AssertArraysEqual(*result, *expected->Slice(1, 2)); } - void TestFromArrays() { + public: + void TestFromArraysNullHandling() { + this->TestFromArraysWithNullBitmap(); + if constexpr (!kTypeClassIsListView) { + this->TestFromArraysWithSlicedOffsets(); + this->TestFromArraysWithSlicedNullOffsets(); + } + } + + private: + void DoTestListFromArrays() { std::shared_ptr offsets1, offsets2, offsets3, offsets4, offsets5, values; std::vector offsets_is_valid3 = {true, false, true, true}; @@ -373,6 +478,87 @@ class TestListArray : public ::testing::Test { } } + template + std::enable_if_t DoTestListViewFromArrays() { + std::shared_ptr offsets1, offsets2; + std::shared_ptr sizes1, sizes2, sizes3, sizes4, sizes5; + std::shared_ptr values; + + std::vector sizes_is_valid3 = {true, false, true, true}; + std::vector sizes_is_valid4 = {true, true, false, true}; + std::vector sizes_is_valid5 = {true, true, false, false}; + + std::vector values_is_valid = {true, false, true, true, true, true}; + + std::vector offset1_values = {2, 0, 2}; + 
std::vector offset2_values = {2, 0, 6}; + std::vector size1_values = {0, 2, 4}; + std::vector size2_values = {4, 2, 0}; + + std::vector values_values = {0, 1, 2, 3, 4, 5}; + const int length = 3; + + ArrayFromVector(offset1_values, &offsets1); + ArrayFromVector(offset2_values, &offsets2); + + ArrayFromVector(size1_values, &sizes1); + ArrayFromVector(size2_values, &sizes2); + ArrayFromVector(sizes_is_valid3, size1_values, &sizes3); + ArrayFromVector(sizes_is_valid4, size2_values, &sizes4); + ArrayFromVector(sizes_is_valid5, size2_values, &sizes5); + + ArrayFromVector(values_is_valid, values_values, &values); + + auto list_type = std::make_shared(int8()); + + ASSERT_OK_AND_ASSIGN(auto list_view1, + ArrayType::FromArrays(*offsets1, *sizes1, *values, pool_)); + ASSERT_OK_AND_ASSIGN(auto list_view3, + ArrayType::FromArrays(*offsets1, *sizes3, *values, pool_)); + ASSERT_OK_AND_ASSIGN(auto list_view4, + ArrayType::FromArrays(*offsets2, *sizes4, *values, pool_)); + ASSERT_OK(list_view1->ValidateFull()); + ASSERT_OK(list_view3->ValidateFull()); + ASSERT_OK(list_view4->ValidateFull()); + + ArrayType expected1(list_type, length, offsets1->data()->buffers[1], + sizes1->data()->buffers[1], values, offsets1->data()->buffers[0], + 0); + AssertArraysEqual(expected1, *list_view1); + + // Use null bitmap from sizes3, but clean sizes from non-null version + ArrayType expected3(list_type, length, offsets1->data()->buffers[1], + sizes1->data()->buffers[1], values, sizes3->data()->buffers[0], + 1); + AssertArraysEqual(expected3, *list_view3); + + ArrayType expected4(list_type, length, offsets2->data()->buffers[1], + sizes2->data()->buffers[1], values, sizes4->data()->buffers[0], + 1); + AssertArraysEqual(expected4, *list_view4); + + // Test failure modes + + std::shared_ptr tmp; + + // Zero-length offsets (not a failure mode for ListViews) + ASSERT_OK(ArrayType::FromArrays(*offsets1->Slice(0, 0), *sizes1->Slice(0, 0), *values, + pool_)); + + // Offsets not the right type + 
ASSERT_RAISES(TypeError, + ArrayType::FromArrays(/*offsets=*/*values, *sizes1, *values, pool_)); + } + + public: + void TestFromArrays() { + if constexpr (kTypeClassIsListView) { + DoTestListViewFromArrays(); + } else { + DoTestListFromArrays(); + } + } + void TestAppendNull() { ASSERT_OK(builder_->AppendNull()); ASSERT_OK(builder_->AppendNull()); @@ -420,11 +606,13 @@ class TestListArray : public ::testing::Test { std::vector values = {0, 1, 2, 3, 4, 5, 6}; std::vector is_valid = {1, 0, 1}; std::vector offsets = {0, 3, 3}; + std::vector sizes = {3, 0, 1}; Int16Builder* vb = checked_cast(builder_->value_builder()); ASSERT_OK(vb->Reserve(values.size())); - ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size(), is_valid.data())); + ASSERT_OK(builder_->AppendValues(offsets.data(), sizes.data(), offsets.size(), + is_valid.data())); for (int16_t value : values) { ASSERT_OK(vb->Append(value)); } @@ -434,16 +622,18 @@ class TestListArray : public ::testing::Test { void TestBulkAppendInvalid() { std::vector values = {0, 1, 2, 3, 4, 5, 6}; - std::vector lengths = {3, 0, 4}; std::vector is_valid = {1, 0, 1}; // Should be {0, 3, 3} given the is_valid array std::vector offsets = {0, 2, 4}; + std::vector sizes = {2, 2, 4}; Int16Builder* vb = checked_cast(builder_->value_builder()); ASSERT_OK(vb->Reserve(values.size())); - ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size(), is_valid.data())); - ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size(), is_valid.data())); + ASSERT_OK(builder_->AppendValues(offsets.data(), sizes.data(), offsets.size(), + is_valid.data())); + ASSERT_OK(builder_->AppendValues(offsets.data(), sizes.data(), offsets.size(), + is_valid.data())); for (int16_t value : values) { ASSERT_OK(vb->Append(value)); } @@ -466,7 +656,12 @@ class TestListArray : public ::testing::Test { builder_.reset(checked_cast(tmp.release())); std::vector offsets = {1, 2, 4, 8}; - ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size())); + 
std::vector sizes = {1, 2, 4}; + if constexpr (kTypeClassIsListView) { + ASSERT_OK(builder_->AppendValues(offsets.data(), sizes.data(), sizes.size())); + } else { + ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size())); + } std::shared_ptr list_array; ASSERT_OK(builder_->Finish(&list_array)); @@ -520,7 +715,7 @@ class TestListArray : public ::testing::Test { std::dynamic_pointer_cast( ArrayFromJSON(type, "[[1, 2], [3], null, [5, 6], [7, 8], [], [9]]")) ->data(); - ASSERT_EQ(2, array_data->buffers.size()); + ASSERT_EQ(kTypeClassIsListView ? 3 : 2, array_data->buffers.size()); auto null_bitmap_buffer = array_data->buffers[0]; ASSERT_NE(nullptr, null_bitmap_buffer); bit_util::ClearBit(null_bitmap_buffer->mutable_data(), 1); @@ -534,15 +729,37 @@ class TestListArray : public ::testing::Test { << flattened->ToString(); } - Status ValidateOffsets(int64_t length, std::vector offsets, - const std::shared_ptr& values, int64_t offset = 0) { + Status ValidateOffsetsAndSizes(int64_t length, std::vector offsets, + std::vector sizes, + std::shared_ptr values, int64_t offset = 0) { auto type = std::make_shared(values->type()); - ArrayType arr(type, length, Buffer::Wrap(offsets), values, + auto offsets_buffer = Buffer::Wrap(offsets.data(), sizes.size()); + auto sizes_buffer = Buffer::Wrap(sizes); + ArrayType arr(type, length, std::move(offsets_buffer), std::move(sizes_buffer), + std::move(values), /*null_bitmap=*/nullptr, /*null_count=*/0, offset); return arr.ValidateFull(); } - void TestValidateOffsets() { + Status ValidateOffsets(int64_t length, std::vector offsets, + std::shared_ptr values, int64_t offset = 0) { + if constexpr (kTypeClassIsListView) { + std::vector sizes; + sizes.reserve(offsets.empty() ? 
0 : offsets.size() - 1); + for (size_t i = 1; i < offsets.size(); ++i) { + sizes.push_back(offsets[i] - offsets[i - 1]); + } + return ValidateOffsetsAndSizes(length, std::move(offsets), std::move(sizes), + std::move(values), offset); + } else { + auto type = std::make_shared(values->type()); + ArrayType arr(type, length, Buffer::Wrap(offsets), std::move(values), + /*null_bitmap=*/nullptr, /*null_count=*/0, offset); + return arr.ValidateFull(); + } + } + + void TestValidateDimensions() { auto empty_values = ArrayFromJSON(int16(), "[]"); auto values = ArrayFromJSON(int16(), "[1, 2, 3, 4, 5, 6, 7]"); @@ -564,13 +781,24 @@ class TestListArray : public ::testing::Test { // Offset out of bounds ASSERT_RAISES(Invalid, ValidateOffsets(1, {0, 8}, values)); - ASSERT_RAISES(Invalid, ValidateOffsets(1, {0, 8, 8}, values, 1)); + if constexpr (kTypeClassIsListView) { + ASSERT_RAISES(Invalid, ValidateOffsets(1, {0, 8, 8}, values, 2)); + } else { + ASSERT_RAISES(Invalid, ValidateOffsets(1, {0, 8, 8}, values, 1)); + } // Negative offset ASSERT_RAISES(Invalid, ValidateOffsets(1, {-1, 0}, values)); ASSERT_RAISES(Invalid, ValidateOffsets(1, {0, -1}, values)); - ASSERT_RAISES(Invalid, ValidateOffsets(2, {0, -1, -1}, values, 1)); // Offsets non-monotonic ASSERT_RAISES(Invalid, ValidateOffsets(2, {0, 7, 4}, values)); + + if constexpr (kTypeClassIsListView) { + // Out of order offsets + ASSERT_OK(ValidateOffsetsAndSizes(2, {4, 1, 2}, {3, 6, 5}, values)); + + // Sizes out of bounds + ASSERT_RAISES(Invalid, ValidateOffsetsAndSizes(2, {4, 1, 2}, {3, 7, 5}, values)); + } } void TestCornerCases() { @@ -581,7 +809,7 @@ class TestListArray : public ::testing::Test { AssertArraysEqual(*result_, *expected); SetUp(); - ASSERT_OK(builder_->Append()); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 0)); Done(); expected = ArrayFromJSON(type_, "[[]]"); AssertArraysEqual(*result_, *expected); @@ -602,7 +830,7 @@ class TestListArray : public ::testing::Test { 
ASSERT_OK(builder_->ValidateOverflow(max_elements)); ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_elements + 1)); - ASSERT_OK(builder_->Append()); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 2)); ASSERT_OK(vb->Append(1)); ASSERT_OK(vb->Append(2)); ASSERT_OK(builder_->ValidateOverflow(max_elements - 2)); @@ -612,7 +840,7 @@ class TestListArray : public ::testing::Test { ASSERT_OK(builder_->ValidateOverflow(max_elements - 2)); ASSERT_RAISES(CapacityError, builder_->ValidateOverflow(max_elements - 1)); - ASSERT_OK(builder_->Append()); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 3)); ASSERT_OK(vb->Append(1)); ASSERT_OK(vb->Append(2)); ASSERT_OK(vb->Append(3)); @@ -629,7 +857,7 @@ class TestListArray : public ::testing::Test { std::shared_ptr result_; }; -TYPED_TEST_SUITE(TestListArray, ListTypes); +TYPED_TEST_SUITE(TestListArray, ListAndListViewTypes); TYPED_TEST(TestListArray, Basics) { this->TestBasics(); } @@ -639,11 +867,7 @@ TYPED_TEST(TestListArray, ValuesEquality) { this->TestValuesEquality(); } TYPED_TEST(TestListArray, FromArrays) { this->TestFromArrays(); } -TYPED_TEST(TestListArray, FromArraysWithNullBitMap) { - this->TestFromArraysWithNullBitMap(); - this->TestFromArraysWithSlicedOffsets(); - this->TestFromArraysWithSlicedNullOffsets(); -} +TYPED_TEST(TestListArray, FromArraysNullHandling) { this->TestFromArraysNullHandling(); } TYPED_TEST(TestListArray, AppendNull) { this->TestAppendNull(); } @@ -666,7 +890,7 @@ TYPED_TEST(TestListArray, TestFlattenNonEmptyBackingNulls) { this->TestFlattenNonEmptyBackingNulls(); } -TYPED_TEST(TestListArray, ValidateOffsets) { this->TestValidateOffsets(); } +TYPED_TEST(TestListArray, ValidateDimensions) { this->TestValidateDimensions(); } TYPED_TEST(TestListArray, CornerCases) { this->TestCornerCases(); } diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 1ce306f752b..7b93bd07a82 100644 --- a/cpp/src/arrow/array/array_test.cc +++ 
b/cpp/src/arrow/array/array_test.cc @@ -398,6 +398,8 @@ static std::vector> TestArrayUtilitiesAgainstTheseType large_list(list(large_utf8())), fixed_size_list(utf8(), 3), fixed_size_list(int64(), 4), + list_view(utf8()), + large_list_view(utf8()), dictionary(int32(), utf8()), struct_({field("a", utf8()), field("b", int32())}), sparse_union(union_fields1, union_type_codes), @@ -616,6 +618,8 @@ static ScalarVector GetScalars() { ScalarFromJSON(map(int8(), utf8()), R"([[1, "foo"], [2, "bar"]])"), std::make_shared(ArrayFromJSON(int8(), "[1, 1, 2, 2, 3, 3]")), std::make_shared(ArrayFromJSON(int8(), "[1, 2, 3, 4]")), + std::make_shared(ArrayFromJSON(int8(), "[1, 2, 3]")), + std::make_shared(ArrayFromJSON(int8(), "[1, 1, 2, 2, 3, 3]")), std::make_shared( ScalarVector{ std::make_shared(2), diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 2578f8cf0e4..bb632e2eb91 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -308,13 +308,9 @@ class RangeDataEqualsImpl { Status Visit(const LargeListType& type) { return CompareList(type); } - Status Visit(const ListViewType& type) { - return Status::NotImplemented("comparing ListViewType"); - } + Status Visit(const ListViewType& type) { return CompareListView(type); } - Status Visit(const LargeListViewType& type) { - return Status::NotImplemented("comparing LargeListViewType"); - } + Status Visit(const LargeListViewType& type) { return CompareListView(type); } Status Visit(const FixedSizeListType& type) { const auto list_size = type.list_size(); @@ -501,6 +497,38 @@ class RangeDataEqualsImpl { return Status::OK(); } + template + Status CompareListView(const TypeClass& type) { + const ArrayData& left_values = *left_.child_data[0]; + const ArrayData& right_values = *right_.child_data[0]; + + using offset_type = typename TypeClass::offset_type; + const auto* left_offsets = left_.GetValues(1) + left_start_idx_; + const auto* right_offsets = right_.GetValues(1) + right_start_idx_; + const auto* 
left_sizes = left_.GetValues(2) + left_start_idx_; + const auto* right_sizes = right_.GetValues(2) + right_start_idx_; + + auto compare_view = [&](int64_t i, int64_t length) -> bool { + for (int64_t j = i; j < i + length; ++j) { + if (left_sizes[j] != right_sizes[j]) { + return false; + } + const offset_type size = left_sizes[j]; + if (size == 0) { + continue; + } + RangeDataEqualsImpl impl(options_, floating_approximate_, left_values, + right_values, left_offsets[j], right_offsets[j], size); + if (!impl.Compare()) { + return false; + } + } + return true; + }; + VisitValidRuns(std::move(compare_view)); + return Status::OK(); + } + template Status CompareRunEndEncoded() { auto left_span = ArraySpan(left_); diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index 1386075397e..24e9c95f437 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -499,6 +499,7 @@ std::shared_ptr RandomArrayGenerator::FixedSizeBinary(int64_t size, } namespace { + template std::shared_ptr GenerateOffsets(SeedType seed, int64_t size, typename OffsetArrayType::value_type first_offset, @@ -608,6 +609,122 @@ std::shared_ptr OffsetsFromLengthsArray(OffsetArrayType* lengths, std::make_shared(), size, buffers, null_count); return std::make_shared(array_data); } + +// Helper for RandomArrayGenerator::ArrayOf: extract some C value from +// a given metadata key. 
+template ::ArrowType> +enable_if_parameter_free GetMetadata(const KeyValueMetadata* metadata, + const std::string& key, + T default_value) { + if (!metadata) return default_value; + const auto index = metadata->FindKey(key); + if (index < 0) return default_value; + const auto& value = metadata->value(index); + T output{}; + if (!internal::ParseValue(value.data(), value.length(), &output)) { + ABORT_NOT_OK(Status::Invalid("Could not parse ", key, " = ", value, " as ", + ArrowType::type_name())); + } + return output; +} + +/// Try to pass sizes such that every non-null sizes[i] <= values_size. +template +std::shared_ptr ViewOffsetsFromLengthsArray( + SeedType seed, offset_type avg_length, offset_type values_length, + OffsetArrayType& mutable_sizes_array, bool force_empty_nulls, + bool zero_undefined_offsets, int64_t alignment, MemoryPool* memory_pool) { + using TypeClass = typename OffsetArrayType::TypeClass; + constexpr offset_type kZero = 0; + + auto* sizes = mutable_sizes_array.data()->template GetMutableValues(1); + + BufferVector buffers{2}; + buffers[0] = NULLPTR; // sizes can have nulls, offsets don't have to + buffers[1] = *AllocateBuffer(sizeof(offset_type) * mutable_sizes_array.length(), + alignment, memory_pool); + auto offsets = buffers[1]->mutable_data_as(); + + pcg32_fast rng(seed); + std::uniform_int_distribution offset_delta_dist(-avg_length, avg_length); + offset_type offset_base = 0; + for (int64_t i = 0; i < mutable_sizes_array.length(); ++i) { + // We want to always sample the offset_delta_dist(rng) to make sure + // different options regarding nulls and empty views don't affect + // the other offsets. + offset_type offset = offset_base + offset_delta_dist(rng); + if (mutable_sizes_array.IsNull(i)) { + if (force_empty_nulls) { + sizes[i] = 0; + } + offsets[i] = zero_undefined_offsets ? 0 : offset; + continue; + } + offset_type size = sizes[i]; + if (size == 0) { + offsets[i] = zero_undefined_offsets ? 
0 : offset; + } else { + // Ensure that the size is not too large. + if (ARROW_PREDICT_FALSE(size > values_length)) { + size = values_length; + sizes[i] = size; // Fix the size. + } + // Ensure the offset is not negative or too large. + offset = std::max(offset, kZero); + if (offset > values_length - size) { + offset = values_length - size; + } + offsets[i] = offset; + } + offset_base += avg_length; + } + + auto array_data = + ArrayData::Make(TypeTraits::type_singleton(), + mutable_sizes_array.length(), std::move(buffers), /*null_count=*/0); + return std::make_shared(std::move(array_data)); +} + +template +Result> ArrayOfListView(RAG& self, const Field& field, + int64_t length, int64_t alignment, + MemoryPool* memory_pool, + double null_probability) { + using TypeClass = typename ArrayType::TypeClass; + using offset_type = typename ArrayType::offset_type; + using OffsetArrayType = typename CTypeTraits::ArrayType; + using OffsetArrowType = typename CTypeTraits::ArrowType; + + const auto min_length = + GetMetadata(field.metadata().get(), "min_length", 0); + const auto max_length = + GetMetadata(field.metadata().get(), "max_length", 20); + const auto force_empty_nulls = + GetMetadata(field.metadata().get(), "force_empty_nulls", false); + const auto zero_undefined_offsets = + GetMetadata(field.metadata().get(), "zero_undefined_offsets", false); + const auto lengths = internal::checked_pointer_cast( + self.RAG::template Numeric( + length, min_length, max_length, null_probability)); + + // List views don't have to be disjoint, so let's make the values_length a + // multiple of the average list-view size. To make sure every list view + // into the values array can fit, it should be at least max_length. 
+ const offset_type avg_length = min_length + (max_length - min_length) / 2; + const int64_t values_length = std::max(avg_length * (length - lengths->null_count()), + static_cast(max_length)); + DCHECK_LT(values_length, std::numeric_limits::max()); + const auto values = self.RAG::ArrayOf( + *internal::checked_pointer_cast(field.type())->value_field(), + values_length, alignment, memory_pool); + + const auto offsets = ViewOffsetsFromLengthsArray( + self.seed(), avg_length, static_cast(values_length), *lengths, + force_empty_nulls, zero_undefined_offsets, alignment, memory_pool); + + return ArrayType::FromArrays(field.type(), *offsets, *lengths, *values); +} + } // namespace std::shared_ptr RandomArrayGenerator::Offsets( @@ -637,6 +754,31 @@ std::shared_ptr RandomArrayGenerator::List(const Array& values, int64_t s return *::arrow::ListArray::FromArrays(*offsets, values); } +std::shared_ptr RandomArrayGenerator::ListView( + const Array& values, int64_t size, double null_probability, bool force_empty_nulls, + bool zero_undefined_offsets, int64_t alignment, MemoryPool* memory_pool) { + using offset_type = int32_t; + using OffsetArrayType = Int32Array; + using OffsetArrowType = Int32Type; + + DCHECK_LE(values.length(), std::numeric_limits::max()); + DCHECK_LE(size, std::numeric_limits::max()); + const auto values_length = static_cast(values.length()); + + const offset_type avg_length = (values_length - 1) / static_cast(size) + 1; + const offset_type min_length = 0; + const offset_type max_length = std::min(std::max(2 * avg_length, 1), values_length); + const auto lengths = internal::checked_pointer_cast( + Numeric(size, min_length, max_length, + null_probability)); + + const auto offsets = ViewOffsetsFromLengthsArray( + seed(), avg_length, values_length, *lengths, force_empty_nulls, + zero_undefined_offsets, alignment, memory_pool); + + return *ListViewArray::FromArrays(*offsets, *lengths, values, memory_pool); +} + std::shared_ptr RandomArrayGenerator::Map(const 
std::shared_ptr& keys, const std::shared_ptr& items, int64_t size, double null_probability, @@ -713,27 +855,6 @@ std::shared_ptr RandomArrayGenerator::DenseUnion(const ArrayVector& field return *DenseUnionArray::Make(*type_ids, *offsets, fields, type_codes); } -namespace { - -// Helper for RandomArrayGenerator::ArrayOf: extract some C value from -// a given metadata key. -template ::ArrowType> -enable_if_parameter_free GetMetadata(const KeyValueMetadata* metadata, - const std::string& key, - T default_value) { - if (!metadata) return default_value; - const auto index = metadata->FindKey(key); - if (index < 0) return default_value; - const auto& value = metadata->value(index); - T output{}; - if (!internal::ParseValue(value.data(), value.length(), &output)) { - ABORT_NOT_OK(Status::Invalid("Could not parse ", key, " = ", value)); - } - return output; -} - -} // namespace - std::shared_ptr RandomArrayGenerator::ArrayOf(std::shared_ptr type, int64_t size, double null_probability, @@ -811,6 +932,12 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t return *ARRAY_TYPE::FromArrays(field.type(), *offsets, *values); \ } +#define GENERATE_LIST_VIEW_CASE(ARRAY_TYPE) \ + case ARRAY_TYPE::TypeClass::type_id: { \ + return *ArrayOfListView(*this, field, length, alignment, memory_pool, \ + null_probability); \ + } + const double null_probability = field.nullable() ? 
GetMetadata(field.metadata().get(), "null_probability", 0.01) @@ -946,6 +1073,7 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t } GENERATE_LIST_CASE(ListArray); + GENERATE_LIST_VIEW_CASE(ListViewArray); case Type::type::STRUCT: { ArrayVector child_arrays(field.type()->num_fields()); @@ -1069,6 +1197,7 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t } GENERATE_LIST_CASE(LargeListArray); + GENERATE_LIST_VIEW_CASE(LargeListViewArray); default: break; diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index cbdac3baa01..3bb7b5d3603 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -458,6 +458,25 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { int64_t alignment = kDefaultBufferAlignment, MemoryPool* memory_pool = default_memory_pool()); + /// \brief Generate a random ListViewArray + /// + /// \param[in] values The underlying values array + /// \param[in] size The size of the generated list array + /// \param[in] null_probability the probability of a list value being null + /// \param[in] force_empty_nulls if true, null list entries must have 0 length + /// \param[in] zero_undefined_offsets if true, offsets of 0-length lists + /// must be set to 0 + /// \param[in] alignment alignment for memory allocations (in bytes) + /// \param[in] memory_pool memory pool to allocate memory from + /// + /// \return a generated Array + std::shared_ptr ListView(const Array& values, int64_t size, + double null_probability = 0, + bool force_empty_nulls = false, + bool zero_undefined_offsets = false, + int64_t alignment = kDefaultBufferAlignment, + MemoryPool* memory_pool = default_memory_pool()); + /// \brief Generate a random MapArray /// /// \param[in] keys The underlying keys array diff --git a/cpp/src/arrow/testing/random_test.cc b/cpp/src/arrow/testing/random_test.cc index 951b654e56f..a92ecf4e9c4 100644 --- a/cpp/src/arrow/testing/random_test.cc +++ 
b/cpp/src/arrow/testing/random_test.cc @@ -70,7 +70,7 @@ class RandomArrayTest : public ::testing::TestWithParam { } bool HasList(const DataType& type) { - if (is_var_length_list(type.id())) { + if (is_var_length_list_like(type.id())) { return true; } for (const auto& child : type.fields()) { @@ -99,7 +99,7 @@ TEST_P(RandomArrayTest, GenerateArrayAlignment) { const int64_t alignment = 1024; auto field = GetField(); if (HasList(*field->type())) { - GTEST_SKIP() << "ListArray::FromArrays does not conserve buffer alignment"; + GTEST_SKIP() << "List[View]Array::FromArrays does not conserve buffer alignment"; } auto array = GenerateArray(*field, /*size=*/13, 0xDEADBEEF, alignment); AssertTypeEqual(field->type(), array->type()); @@ -177,6 +177,13 @@ auto values = ::testing::Values( key_value_metadata({{"force_empty_nulls", "true"}})), field("listint81024values", list(int8()), true, key_value_metadata({{"values", "1024"}})), + field("listviewint8", list_view(int8())), + field("listviewlistviewint8", list_view(list_view(int8()))), + field("listviewint8emptynulls", list_view(int8()), true, + key_value_metadata( + {{"force_empty_nulls", "true"}, {"zero_undefined_offsets", "true"}})), + field("listviewint81024values", list_view(int8()), true, + key_value_metadata({{"values", "1024"}})), field("structints", struct_({ field("int8", int8()), field("int16", int16()), @@ -201,7 +208,8 @@ auto values = ::testing::Values( field("fixedsizelist", fixed_size_list(int8(), 4)), field("durationns", duration(TimeUnit::NANO)), field("largestring", large_utf8()), field("largebinary", large_binary()), - field("largelistlistint8", large_list(list(int8())))); + field("largelistlistint8", large_list(list(int8()))), + field("largelistviewlistviewint8", large_list_view(list_view(int8())))); INSTANTIATE_TEST_SUITE_P( TestRandomArrayGeneration, RandomArrayTest, values, @@ -400,6 +408,39 @@ TEST(TypeSpecificTests, ListLengths) { } } +TEST(TypeSpecificTests, ListViewLengths) { + { + auto field = + 
arrow::field("list_view", list_view(int8()), + key_value_metadata({{"min_length", "1"}, {"max_length", "1"}})); + auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + AssertTypeEqual(field->type(), base_array->type()); + auto array = internal::checked_pointer_cast(base_array); + ASSERT_OK(array->ValidateFull()); + ASSERT_EQ(array->length(), kExpectedLength); + for (int i = 0; i < kExpectedLength; i++) { + if (!array->IsNull(i)) { + ASSERT_EQ(1, array->value_length(i)); + } + } + } + { + auto field = + arrow::field("list_view", large_list_view(int8()), + key_value_metadata({{"min_length", "10"}, {"max_length", "10"}})); + auto base_array = GenerateArray(*field, kExpectedLength, 0xDEADBEEF); + AssertTypeEqual(field->type(), base_array->type()); + auto array = internal::checked_pointer_cast(base_array); + ASSERT_EQ(array->length(), kExpectedLength); + ASSERT_OK(array->ValidateFull()); + for (int i = 0; i < kExpectedLength; i++) { + if (!array->IsNull(i)) { + ASSERT_EQ(10, array->value_length(i)); + } + } + } +} + TEST(TypeSpecificTests, MapValues) { auto field = arrow::field("map", map(int8(), int8()), key_value_metadata({{"values", "4"}})); @@ -500,6 +541,24 @@ TEST(RandomList, Basics) { } } +TEST(RandomListView, Basics) { + random::RandomArrayGenerator rng(42); + for (const double null_probability : {0.0, 0.1, 0.98}) { + SCOPED_TRACE("null_probability = " + std::to_string(null_probability)); + auto values = rng.Int16(1234, 0, 10000, null_probability); + auto array = rng.ListView(*values, 45, null_probability); + ASSERT_OK(array->ValidateFull()); + ASSERT_EQ(array->length(), 45); + const auto& list_view_array = checked_cast(*array); + ASSERT_EQ(list_view_array.values()->length(), 1234); + int64_t null_count = 0; + for (int64_t i = 0; i < array->length(); ++i) { + null_count += array->IsNull(i); + } + ASSERT_EQ(null_count, array->data()->null_count); + } +} + TEST(RandomChildFieldNullablity, List) { random::RandomArrayGenerator rng(42); @@ -513,6 
+572,19 @@ TEST(RandomChildFieldNullablity, List) { ARROW_EXPECT_OK(batch->ValidateFull()); } +TEST(RandomChildFieldNullablity, ListView) { + random::RandomArrayGenerator rng(42); + + auto item = arrow::field("item", arrow::int8(), true); + auto nest_list_view_field = arrow::field("list_view", list_view(item), false); + auto list_view_field = arrow::field("list_view", list_view(nest_list_view_field), true); + auto array = rng.ArrayOf(*list_view_field, 428); + ARROW_EXPECT_OK(array->ValidateFull()); + + auto batch = rng.BatchOf({list_view_field}, 428); + ARROW_EXPECT_OK(batch->ValidateFull()); +} + TEST(RandomChildFieldNullablity, Struct) { random::RandomArrayGenerator rng(42); From 21cf4228c7c15c4580ad29c76c5381ec16667560 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 3 May 2023 16:59:16 -0300 Subject: [PATCH 16/91] [Large]ListViewArray: Implement PrettyPrint + tests --- cpp/src/arrow/array/array_list_view_test.cc | 1 + cpp/src/arrow/pretty_print.cc | 11 +-- cpp/src/arrow/pretty_print_test.cc | 99 +++++++++++++++++++-- 3 files changed, 97 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/array/array_list_view_test.cc b/cpp/src/arrow/array/array_list_view_test.cc index 8f9f9cc1d3a..3e48191cedd 100644 --- a/cpp/src/arrow/array/array_list_view_test.cc +++ b/cpp/src/arrow/array/array_list_view_test.cc @@ -19,6 +19,7 @@ #include "arrow/array/array_nested.h" #include "arrow/array/util.h" +#include "arrow/pretty_print.h" #include "arrow/testing/gtest_util.h" #include "arrow/type_fwd.h" #include "arrow/util/checked_cast.h" diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index fea353d18e0..e666ec70f94 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -249,7 +249,8 @@ class ArrayPrinter : public PrettyPrinter { } template - enable_if_list_like WriteDataValues(const ArrayType& array) { + enable_if_t::value || is_list_view_type::value, Status> + WriteDataValues(const ArrayType& array) { 
const auto values = array.values(); const auto child_options = ChildOptions(); ArrayPrinter values_printer(child_options, sink_); @@ -266,14 +267,6 @@ class ArrayPrinter : public PrettyPrinter { /*is_container=*/true); } - Status WriteDataValues(const ListViewArray& array) { - return Status::NotImplemented("writing data values of a list-view array"); - } - - Status WriteDataValues(const LargeListViewArray& array) { - return Status::NotImplemented("writing data values of a large list-view array"); - } - Status WriteDataValues(const MapArray& array) { const auto keys = array.keys(); const auto items = array.items(); diff --git a/cpp/src/arrow/pretty_print_test.cc b/cpp/src/arrow/pretty_print_test.cc index 9217e190d5b..0db6ae48672 100644 --- a/cpp/src/arrow/pretty_print_test.cc +++ b/cpp/src/arrow/pretty_print_test.cc @@ -774,8 +774,11 @@ TEST_F(TestPrettyPrint, BinaryNoNewlines) { CheckPrimitive(options, is_valid, values, expected, false); } -TEST_F(TestPrettyPrint, ListType) { - auto list_type = list(int64()); +template +void TestPrettyPrintVarLengthListLike() { + using LargeTypeClass = typename TypeTraits::LargeType; + auto var_list_type = std::make_shared(int64()); + auto var_large_list_type = std::make_shared(int64()); static const char* ex = R"expected([ [ @@ -836,7 +839,7 @@ TEST_F(TestPrettyPrint, ListType) { ] ])expected"; - auto array = ArrayFromJSON(list_type, "[[null], [], null, [4, 6, 7], [2, 3]]"); + auto array = ArrayFromJSON(var_list_type, "[[null], [], null, [4, 6, 7], [2, 3]]"); auto make_options = [](int indent, int window, int container_window) { auto options = PrettyPrintOptions(indent, window); options.container_window = container_window; @@ -850,8 +853,7 @@ TEST_F(TestPrettyPrint, ListType) { ex_3); CheckArray(*array, {0, 10}, ex_4); - list_type = large_list(int64()); - array = ArrayFromJSON(list_type, "[[null], [], null, [4, 6, 7], [2, 3]]"); + array = ArrayFromJSON(var_large_list_type, "[[null], [], null, [4, 6, 7], [2, 3]]"); 
CheckStream(*array, make_options(/*indent=*/0, /*window=*/10, /*container_window=*/5), ex); CheckStream(*array, make_options(/*indent=*/2, /*window=*/10, /*container_window=*/5), @@ -861,6 +863,93 @@ TEST_F(TestPrettyPrint, ListType) { CheckArray(*array, {0, 10}, ex_4); } +TEST_F(TestPrettyPrint, ListType) { TestPrettyPrintVarLengthListLike(); } + +template +void TestListViewSpecificPrettyPrinting() { + using ArrayType = typename TypeTraits::ArrayType; + using OffsetType = typename TypeTraits::OffsetType; + + auto string_values = ArrayFromJSON(utf8(), R"(["Hello", "World", null])"); + auto int32_values = ArrayFromJSON(int32(), "[1, 20, 3]"); + auto int16_values = ArrayFromJSON(int16(), "[10, 2, 30]"); + + auto Offsets = [](std::string_view json) { + return ArrayFromJSON(TypeTraits::type_singleton(), json); + }; + auto Sizes = Offsets; + + ASSERT_OK_AND_ASSIGN(auto int_list_view_array, + ArrayType::FromArrays(*Offsets("[0, 0, 1, 2]"), + *Sizes("[2, 1, 1, 1]"), *int32_values)); + ASSERT_OK(int_list_view_array->ValidateFull()); + static const char* ex1 = + "[\n" + " [\n" + " 1,\n" + " 20\n" + " ],\n" + " [\n" + " 1\n" + " ],\n" + " [\n" + " 20\n" + " ],\n" + " [\n" + " 3\n" + " ]\n" + "]"; + CheckStream(*int_list_view_array, {}, ex1); + + ASSERT_OK_AND_ASSIGN(auto string_list_view_array, + ArrayType::FromArrays(*Offsets("[0, 0, 1, 2]"), + *Sizes("[2, 1, 1, 1]"), *string_values)); + ASSERT_OK(string_list_view_array->ValidateFull()); + static const char* ex2 = + "[\n" + " [\n" + " \"Hello\",\n" + " \"World\"\n" + " ],\n" + " [\n" + " \"Hello\"\n" + " ],\n" + " [\n" + " \"World\"\n" + " ],\n" + " [\n" + " null\n" + " ]\n" + "]"; + CheckStream(*string_list_view_array, {}, ex2); + + auto sliced_array = string_list_view_array->Slice(1, 2); + static const char* ex3 = + "[\n" + " [\n" + " \"Hello\"\n" + " ],\n" + " [\n" + " \"World\"\n" + " ]\n" + "]"; + CheckStream(*sliced_array, {}, ex3); + + ASSERT_OK_AND_ASSIGN( + auto empty_array, + ArrayType::FromArrays(*Offsets("[]"), 
*Sizes("[]"), *int16_values)); + ASSERT_OK(empty_array->ValidateFull()); + static const char* ex4 = "[]"; + CheckStream(*empty_array, {}, ex4); +} + +TEST_F(TestPrettyPrint, ListViewType) { + TestPrettyPrintVarLengthListLike(); + + TestListViewSpecificPrettyPrinting(); + TestListViewSpecificPrettyPrinting(); +} + TEST_F(TestPrettyPrint, ListTypeNoNewlines) { auto list_type = list(int64()); auto empty_array = ArrayFromJSON(list_type, "[]"); From eda45cc3852b0abfe0f780ccb87aa3526d701034 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 14 Jul 2023 12:39:08 -0300 Subject: [PATCH 17/91] type_traits.h: Add LargeType to TypeTraits of list and list-view types --- cpp/src/arrow/type_traits.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index bf2cc71c745..ed66c9367dc 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -449,6 +449,7 @@ struct TypeTraits { using OffsetBuilderType = Int32Builder; using OffsetScalarType = Int32Scalar; constexpr static bool is_parameter_free = false; + using LargeType = LargeListType; }; template <> @@ -473,6 +474,7 @@ struct TypeTraits { using OffsetBuilderType = Int32Builder; using OffsetScalarType = Int32Scalar; constexpr static bool is_parameter_free = false; + using LargeType = LargeListViewType; }; template <> From 91c1c00c8258844515a93ce14451d04af5a12fbf Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 3 May 2023 22:26:31 -0300 Subject: [PATCH 18/91] concatenate.cc: Extract a SumBufferSizes() function and make some tweaks --- cpp/src/arrow/array/concatenate.cc | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index c7f3a23476e..3e36a78fd35 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -98,6 +98,14 @@ Status ConcatenateBitmaps(const std::vector& bitmaps, 
MemoryPool* pool, return Status::OK(); } +int64_t SumBufferSizes(const BufferVector& buffers) { + int64_t size = 0; + for (const auto& buffer : buffers) { + size += buffer->size(); + } + return size; +} + // Write offsets in src into dst, adjusting them such that first_offset // will be the first offset written. template @@ -113,26 +121,23 @@ Status ConcatenateOffsets(const BufferVector& buffers, MemoryPool* pool, values_ranges->resize(buffers.size()); // allocate output buffer - int64_t out_length = 0; - for (const auto& buffer : buffers) { - out_length += buffer->size() / sizeof(Offset); - } - ARROW_ASSIGN_OR_RAISE(*out, AllocateBuffer((out_length + 1) * sizeof(Offset), pool)); - auto dst = reinterpret_cast((*out)->mutable_data()); + const int64_t out_size = SumBufferSizes(buffers); + ARROW_ASSIGN_OR_RAISE(*out, AllocateBuffer(sizeof(Offset) + out_size, pool)); + auto* out_data = reinterpret_cast((*out)->mutable_data()); int64_t elements_length = 0; Offset values_length = 0; for (size_t i = 0; i < buffers.size(); ++i) { // the first offset from buffers[i] will be adjusted to values_length // (the cumulative length of values spanned by offsets in previous buffers) - RETURN_NOT_OK(PutOffsets(buffers[i], values_length, &dst[elements_length], - &(*values_ranges)[i])); + RETURN_NOT_OK(PutOffsets(buffers[i], values_length, + out_data + elements_length, &(*values_ranges)[i])); elements_length += buffers[i]->size() / sizeof(Offset); values_length += static_cast((*values_ranges)[i].length); } - // the final element in dst is the length of all values spanned by the offsets - dst[out_length] = values_length; + // the final element in out_data is the length of all values spanned by the offsets + out_data[out_size / sizeof(Offset)] = values_length; return Status::OK(); } From 05570add3a432528f7465159a66f5ad920a76870 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 3 May 2023 22:31:12 -0300 Subject: [PATCH 19/91] concatenate.cc: Pass Buffer pointer already 
dereferenced to PutOffsets --- cpp/src/arrow/array/concatenate.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index 3e36a78fd35..e8c3d016bfb 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -109,7 +109,7 @@ int64_t SumBufferSizes(const BufferVector& buffers) { // Write offsets in src into dst, adjusting them such that first_offset // will be the first offset written. template -Status PutOffsets(const std::shared_ptr& src, Offset first_offset, Offset* dst, +Status PutOffsets(const Buffer& src, Offset first_offset, Offset* dst, Range* values_range); // Concatenate buffers holding offsets into a single buffer of offsets, @@ -130,7 +130,7 @@ Status ConcatenateOffsets(const BufferVector& buffers, MemoryPool* pool, for (size_t i = 0; i < buffers.size(); ++i) { // the first offset from buffers[i] will be adjusted to values_length // (the cumulative length of values spanned by offsets in previous buffers) - RETURN_NOT_OK(PutOffsets(buffers[i], values_length, + RETURN_NOT_OK(PutOffsets(*buffers[i], values_length, out_data + elements_length, &(*values_ranges)[i])); elements_length += buffers[i]->size() / sizeof(Offset); values_length += static_cast((*values_ranges)[i].length); @@ -142,9 +142,9 @@ Status ConcatenateOffsets(const BufferVector& buffers, MemoryPool* pool, } template -Status PutOffsets(const std::shared_ptr& src, Offset first_offset, Offset* dst, +Status PutOffsets(const Buffer& src, Offset first_offset, Offset* dst, Range* values_range) { - if (src->size() == 0) { + if (src.size() == 0) { // It's allowed to have an empty offsets buffer for a 0-length array // (see Array::Validate) values_range->offset = 0; @@ -153,8 +153,8 @@ Status PutOffsets(const std::shared_ptr& src, Offset first_offset, Offse } // Get the range of offsets to transfer from src - auto src_begin = reinterpret_cast(src->data()); - auto src_end = 
reinterpret_cast(src->data() + src->size()); + auto src_begin = src.data_as(); + auto src_end = reinterpret_cast(src.data() + src.size()); // Compute the range of values which is spanned by this range of offsets values_range->offset = src_begin[0]; From 56352d90aeb1df75cc34325ba7c3d81f6cb723a8 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Mon, 12 Jun 2023 22:35:54 -0300 Subject: [PATCH 20/91] list_util.h: Add RangeOfValuesUsed() function --- cpp/src/arrow/CMakeLists.txt | 1 + .../compute/kernels/scalar_cast_nested.cc | 1 + cpp/src/arrow/util/CMakeLists.txt | 1 + cpp/src/arrow/util/list_util.cc | 183 ++++++++++++++++++ cpp/src/arrow/util/list_util.h | 46 +++++ cpp/src/arrow/util/list_util_test.cc | 158 +++++++++++++++ 6 files changed, 390 insertions(+) create mode 100644 cpp/src/arrow/util/list_util.cc create mode 100644 cpp/src/arrow/util/list_util.h create mode 100644 cpp/src/arrow/util/list_util_test.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 4790b7494d4..9c6b483b6a2 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -228,6 +228,7 @@ set(ARROW_SRCS util/hashing.cc util/int_util.cc util/io_util.cc + util/list_util.cc util/logging.cc util/key_value_metadata.cc util/memory.cc diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 6fd449a9313..db5fda17d22 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -29,6 +29,7 @@ #include "arrow/compute/kernels/scalar_cast_internal.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/int_util.h" +#include "arrow/util/list_util.h" namespace arrow { diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index 3cecab3a633..eb3e51d5e44 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -53,6 +53,7 @@ add_arrow_test(utility-test 
int_util_test.cc ${IO_UTIL_TEST_SOURCES} iterator_test.cc + list_util_test.cc logging_test.cc queue_test.cc range_test.cc diff --git a/cpp/src/arrow/util/list_util.cc b/cpp/src/arrow/util/list_util.cc new file mode 100644 index 00000000000..b12459f8f78 --- /dev/null +++ b/cpp/src/arrow/util/list_util.cc @@ -0,0 +1,183 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "arrow/array/array_nested.h" +#include "arrow/array/data.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit_run_reader.h" +#include "arrow/util/list_util.h" +#include "arrow/util/logging.h" +#include "arrow/util/string.h" + +namespace arrow { +namespace list_util { + +namespace internal { + +namespace { + +/// \pre input.length() > 0 && input.null_count() != input.length() +/// \param input A LIST_VIEW or LARGE_LIST_VIEW array +template +int64_t MinViewOffset(const ArraySpan& input) { + const uint8_t* validity = input.MayHaveNulls() ? 
input.buffers[0].data : nullptr; + const auto* offsets = reinterpret_cast(input.buffers[1].data); + const auto* sizes = reinterpret_cast(input.buffers[2].data); + + // It's very likely that the first non-null non-empty list-view starts at + // offset 0 of the child array. + int64_t i = 0; + while (i < input.length && (input.IsNull(i) || sizes[input.offset + i] == 0)) { + i += 1; + } + if (i >= input.length) { + return 0; + } + auto min_offset = offsets[input.offset + i]; + if (ARROW_PREDICT_TRUE(min_offset == 0)) { + // Early exit: offset 0 found already. + return 0; + } + + // Slow path: scan the rest; run positions are relative to element i + 1. + arrow::internal::VisitSetBitRunsVoid( + validity, /*offset=*/input.offset + i + 1, /*length=*/input.length - i - 1, + [&](int64_t run_start, int64_t run_length) { + for (int64_t j = 0; j < run_length; j++) { + const auto offset = offsets[input.offset + i + 1 + run_start + j]; + if (ARROW_PREDICT_FALSE(offset < min_offset)) { + if (sizes[input.offset + i + 1 + run_start + j] > 0) { + min_offset = offset; + } + } + } + }); + return min_offset; +} + +/// \pre input.length() > 0 && input.null_count() != input.length() +/// \param input A LIST_VIEW or LARGE_LIST_VIEW array +template +int64_t MaxViewEnd(const ArraySpan& input) { + const uint8_t* validity = input.MayHaveNulls() ? input.buffers[0].data : NULLPTR; + const auto* offsets = reinterpret_cast(input.buffers[1].data); + const auto* sizes = reinterpret_cast(input.buffers[2].data); + const auto IsNull = [validity, &input](int64_t i) -> bool { + return validity && !arrow::bit_util::GetBit(validity, input.offset + i); + }; + + int64_t i = input.length - 1; // safe because input.length() > 0 + while (i != 0 && (IsNull(i) || sizes[input.offset + i] == 0)) { + i -= 1; + } + const auto offset = static_cast(offsets[input.offset + i]); + const auto size = sizes[input.offset + i]; + if (i == 0) { + return (IsNull(i) || sizes[input.offset + i] == 0) ?
0 : offset + size; + } + constexpr auto kInt64Max = std::numeric_limits::max(); + if constexpr (sizeof(offset_type) == sizeof(int64_t)) { + if (ARROW_PREDICT_FALSE(offset > kInt64Max - size)) { + // Early-exit: 64-bit overflow detected. This is not possible on a + // valid list-view, but we return the maximum possible value to + // avoid undefined behavior. + return kInt64Max; + } + } + int64_t max_end = + static_cast(offsets[input.offset + i]) + sizes[input.offset + i]; + if (max_end == input.child_data[0].length) { + // Early-exit: maximum possible view-end found already. + return max_end; + } + + // Slow path: scan the buffers entirely. + arrow::internal::VisitSetBitRunsVoid( + validity, input.offset, /*length=*/i + 1, [&](int64_t i, int64_t run_length) { + for (int64_t j = 0; j < run_length; ++j) { + const auto offset = static_cast(offsets[input.offset + i + j]); + const auto size = sizes[input.offset + i + j]; + if (size > 0) { + if constexpr (sizeof(offset_type) == sizeof(int64_t)) { + if (ARROW_PREDICT_FALSE(offset > kInt64Max - size)) { + // 64-bit overflow detected. This is not possible on a valid list-view, + // but we saturate max_end to the maximum possible value to avoid + // undefined behavior. 
+ max_end = kInt64Max; + return; + } + } + max_end = std::max(max_end, offset + size); + } + } + }); + return max_end; +} + +template +std::pair RangeOfValuesUsedByListView(const ArraySpan& input) { + DCHECK(is_list_view(*input.type)); + if (input.length == 0 || input.GetNullCount() == input.length) { + return {0, 0}; + } + const int64_t min_offset = MinViewOffset(input); + const int64_t max_end = MaxViewEnd(input); + return {min_offset, max_end - min_offset}; +} + +template +std::pair RangeOfValuesUsedByList(const ArraySpan& input) { + DCHECK(is_var_length_list(*input.type)); + if (input.length == 0) { + return {0, 0}; + } + const auto* offsets = reinterpret_cast(input.buffers[1].data); + const int64_t min_offset = offsets[input.offset]; + const int64_t max_end = offsets[input.offset + input.length]; + return {min_offset, max_end - min_offset}; +} + +} // namespace + +Result> RangeOfValuesUsed(const ArraySpan& input) { + switch (input.type->id()) { + case Type::LIST: + return RangeOfValuesUsedByList(input); + case Type::MAP: + return RangeOfValuesUsedByList(input); + case Type::LARGE_LIST: + return RangeOfValuesUsedByList(input); + case Type::LIST_VIEW: + return RangeOfValuesUsedByListView(input); + case Type::LARGE_LIST_VIEW: + return RangeOfValuesUsedByListView(input); + default: + break; + } + DCHECK(!is_var_length_list_like(*input.type)); + return Status::TypeError( + "RangeOfValuesUsed: input is not a var-length list-like array"); +} + +} // namespace internal + +} // namespace list_util +} // namespace arrow diff --git a/cpp/src/arrow/util/list_util.h b/cpp/src/arrow/util/list_util.h new file mode 100644 index 00000000000..860d19b29a3 --- /dev/null +++ b/cpp/src/arrow/util/list_util.h @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/array/data.h" +#include "arrow/result.h" + +namespace arrow { +namespace list_util { + +/// \brief Get the child array holding the values from a List or ListView array +inline const ArraySpan& ValuesArray(const ArraySpan& span) { return span.child_data[0]; } + +namespace internal { + +/// \brief Calculate the smallest continuous range of values used by the +/// var-length list-like input (list, map and list-view types). +/// +/// \param input The input array such that is_var_length_list_like(input.type) +/// is true +/// \return A pair of (offset, length) describing the range +ARROW_EXPORT Result> RangeOfValuesUsed( + const ArraySpan& input); + +} // namespace internal + +} // namespace list_util +} // namespace arrow diff --git a/cpp/src/arrow/util/list_util_test.cc b/cpp/src/arrow/util/list_util_test.cc new file mode 100644 index 00000000000..b02cdbb21db --- /dev/null +++ b/cpp/src/arrow/util/list_util_test.cc @@ -0,0 +1,158 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "arrow/array/builder_nested.h" +#include "arrow/util/list_util.h" + +#include "arrow/testing/builder.h" +#include "arrow/testing/gtest_util.h" + +namespace arrow { + +using internal::checked_cast; + +using ListAndListViewTypes = + ::testing::Types; + +template +class TestListUtils : public ::testing::Test { + public: + using TypeClass = T; + using offset_type = typename TypeClass::offset_type; + using BuilderType = typename TypeTraits::BuilderType; + + void SetUp() override { + value_type_ = int16(); + type_ = std::make_shared(value_type_); + + std::unique_ptr tmp; + ASSERT_OK(MakeBuilder(pool_, type_, &tmp)); + builder_.reset(checked_cast(tmp.release())); + } + + void TestRangeOfValuesUsed() { + std::shared_ptr result; + + // These list-views are built manually with the list-view builders instead + // of using something like ArrayFromJSON() because we want to test the + // RangeOfValuesUsed() function's ability to handle arrays containing + // overlapping list-views. 
+ + // Empty list-like array + ASSERT_OK(builder_->FinishInternal(&result)); + builder_->Reset(); + ASSERT_OK_AND_ASSIGN(auto range, list_util::internal::RangeOfValuesUsed(*result)); + ASSERT_EQ(range.first, 0); + ASSERT_EQ(range.second, 0); + + // List-like array with only nulls + ASSERT_OK(builder_->AppendNulls(3)); + ASSERT_OK(builder_->FinishInternal(&result)); + builder_->Reset(); + ASSERT_OK_AND_ASSIGN(range, list_util::internal::RangeOfValuesUsed(*result)); + ASSERT_EQ(range.first, 0); + ASSERT_EQ(range.second, 0); + + // Array with nulls and non-nulls (starting at a non-zero offset) + Int16Builder* vb = checked_cast(builder_->value_builder()); + ASSERT_OK(vb->Append(-2)); + ASSERT_OK(vb->Append(-1)); + ASSERT_OK(builder_->Append(/*is_valid=*/false, 0)); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 2)); + ASSERT_OK(vb->Append(0)); + ASSERT_OK(vb->Append(1)); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 3)); + ASSERT_OK(vb->Append(2)); + ASSERT_OK(vb->Append(3)); + ASSERT_OK(vb->Append(4)); + if constexpr (is_list_view_type::value) { + ASSERT_OK(vb->Append(10)); + ASSERT_OK(vb->Append(11)); + } + std::shared_ptr array; + ASSERT_OK(builder_->Finish(&array)); + builder_->Reset(); + ASSERT_OK(array->ValidateFull()); + ASSERT_OK_AND_ASSIGN(range, list_util::internal::RangeOfValuesUsed(*array->data())); + ASSERT_EQ(range.first, 2); + ASSERT_EQ(range.second, 5); + + // Overlapping list-views + vb = checked_cast(builder_->value_builder()); + ASSERT_OK(vb->Append(-2)); + ASSERT_OK(vb->Append(-1)); + ASSERT_OK(builder_->Append(/*is_valid=*/false, 0)); + if constexpr (is_list_view_type::value) { + ASSERT_OK(builder_->Append(/*is_valid=*/true, 6)); + ASSERT_OK(vb->Append(0)); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 2)); + ASSERT_OK(vb->Append(1)); + ASSERT_OK(vb->Append(2)); + ASSERT_OK(vb->Append(3)); + ASSERT_OK(builder_->Append(/*is_valid=*/false, 0)); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 1)); + ASSERT_OK(vb->Append(4)); + 
ASSERT_OK(vb->Append(5)); + // -- used range ends here -- + ASSERT_OK(vb->Append(10)); + ASSERT_OK(vb->Append(11)); + } else { + ASSERT_OK(builder_->Append(/*is_valid=*/true, 6)); + ASSERT_OK(vb->Append(0)); + ASSERT_OK(vb->Append(1)); + ASSERT_OK(vb->Append(2)); + ASSERT_OK(vb->Append(3)); + ASSERT_OK(vb->Append(4)); + ASSERT_OK(vb->Append(5)); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 2)); + ASSERT_OK(vb->Append(1)); + ASSERT_OK(vb->Append(2)); + ASSERT_OK(builder_->Append(/*is_valid=*/false, 0)); + ASSERT_OK(builder_->Append(/*is_valid=*/true, 1)); + ASSERT_OK(vb->Append(4)); + } + ASSERT_OK(builder_->AppendNulls(2)); + ASSERT_OK(builder_->Finish(&array)); + builder_->Reset(); + ASSERT_OK(array->ValidateFull()); + ASSERT_ARRAYS_EQUAL( + *array, *ArrayFromJSON( + type_, "[null, [0, 1, 2, 3, 4, 5], [1, 2], null, [4], null, null]")); + // Check the range + ASSERT_OK_AND_ASSIGN(range, list_util::internal::RangeOfValuesUsed(*array->data())); + ASSERT_EQ(range.first, 2); + if constexpr (is_list_view_type::value) { + ASSERT_EQ(range.second, 6); + } else { + ASSERT_EQ(range.second, 9); + } + } + + protected: + MemoryPool* pool_ = default_memory_pool(); + std::shared_ptr type_; + std::shared_ptr value_type_; + std::shared_ptr builder_; +}; + +TYPED_TEST_SUITE(TestListUtils, ListAndListViewTypes); + +TYPED_TEST(TestListUtils, RangeOfValuesUsed) { this->TestRangeOfValuesUsed(); } + +} // namespace arrow From 14b1a60aa70449ee340db5190ed088ccc1f82c04 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 5 May 2023 21:14:34 -0300 Subject: [PATCH 21/91] concatenate.cc: Rename variable from adjustment to displacement --- cpp/src/arrow/array/concatenate.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index e8c3d016bfb..4ebf782e062 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -165,12 +165,12 @@ Status PutOffsets(const 
Buffer& src, Offset first_offset, Offset* dst, // Write offsets into dst, ensuring that the first offset written is // first_offset - auto adjustment = first_offset - src_begin[0]; + auto displacement = first_offset - src_begin[0]; // NOTE: Concatenate can be called during IPC reads to append delta dictionaries. // Avoid UB on non-validated input by doing the addition in the unsigned domain. // (the result can later be validated using Array::ValidateFull) - std::transform(src_begin, src_end, dst, [adjustment](Offset offset) { - return SafeSignedAdd(offset, adjustment); + std::transform(src_begin, src_end, dst, [displacement](Offset offset) { + return SafeSignedAdd(offset, displacement); }); return Status::OK(); } From f54ca238f0b2b14313f97eb132022bdd008ec85e Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 5 May 2023 21:32:17 -0300 Subject: [PATCH 22/91] [Large]ListViewArray: Implement concatenation + tests --- cpp/src/arrow/array/concatenate.cc | 86 +++++++++++++++++++++++-- cpp/src/arrow/array/concatenate_test.cc | 54 ++++++++++++++++ 2 files changed, 135 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index 4ebf782e062..2510f2d7733 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -41,6 +41,7 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/int_util.h" #include "arrow/util/int_util_overflow.h" +#include "arrow/util/list_util.h" #include "arrow/util/logging.h" #include "arrow/util/ree_util.h" #include "arrow/visit_data_inline.h" @@ -175,6 +176,57 @@ Status PutOffsets(const Buffer& src, Offset first_offset, Offset* dst, return Status::OK(); } +template +void PutListViewOffsets(const Buffer& src, offset_type displacement, offset_type* dst); + +// Concatenate buffers holding list-view offsets into a single buffer of offsets +// +// value_ranges contains the relevant ranges of values in the child array actually +// referenced to 
by the views. Most commonly, these ranges will start from 0, +// but when that is not the case, we need to adjust the displacement of offsets. +// The concatenated child array does not contain values from the beginning +// if they are not referenced by any view. +template +Status ConcatenateListViewOffsets(const BufferVector& buffers, + const std::vector& value_ranges, + MemoryPool* pool, std::shared_ptr* out) { + const int64_t out_size = SumBufferSizes(buffers); + ARROW_ASSIGN_OR_RAISE(*out, AllocateBuffer(out_size, pool)); + auto* out_data = (*out)->mutable_data_as(); + + int64_t num_child_values = 0; + int64_t elements_length = 0; + for (size_t i = 0; i < buffers.size(); ++i) { + const auto displacement = + static_cast(num_child_values - value_ranges[i].offset); + PutListViewOffsets(/*src=*/*buffers[i], static_cast(displacement), + /*dst=*/out_data + elements_length); + elements_length += buffers[i]->size() / sizeof(offset_type); + num_child_values += value_ranges[i].length; + if (num_child_values > std::numeric_limits::max()) { + return Status::Invalid("offset overflow while concatenating arrays"); + } + } + DCHECK_EQ(elements_length, static_cast(out_size / sizeof(offset_type))); + + return Status::OK(); +} + +template +void PutListViewOffsets(const Buffer& src, offset_type displacement, offset_type* dst) { + if (src.size() == 0) { + return; + } + auto src_begin = src.data_as(); + auto src_end = reinterpret_cast(src.data() + src.size()); + // NOTE: Concatenate can be called during IPC reads to append delta dictionaries. + // Avoid UB on non-validated input by doing the addition in the unsigned domain.
+ // (the result can later be validated using Array::ValidateFull) + std::transform(src_begin, src_end, dst, [displacement](offset_type offset) { + return SafeSignedAdd(offset, displacement); + }); +} + class ConcatenateImpl { public: ConcatenateImpl(const ArrayDataVector& in, MemoryPool* pool) @@ -293,12 +345,36 @@ class ConcatenateImpl { return ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[0]); } - Status Visit(const ListViewType& type) { - return Status::NotImplemented("concatenation of ", type); - } + template + enable_if_list_view Visit(const T& type) { + using offset_type = typename T::offset_type; + out_->buffers.resize(3); + out_->child_data.resize(1); + + // Calculate the ranges of values that each list-view array uses + std::vector value_ranges; + value_ranges.reserve(in_.size()); + for (const auto& input : in_) { + ArraySpan input_span(*input); + Range range; + ARROW_ASSIGN_OR_RAISE(std::tie(range.offset, range.length), + list_util::internal::RangeOfValuesUsed(input_span)); + value_ranges.push_back(range); + } + + // Concatenate the values + ARROW_ASSIGN_OR_RAISE(ArrayDataVector value_data, ChildData(0, value_ranges)); + RETURN_NOT_OK(ConcatenateImpl(value_data, pool_).Concatenate(&out_->child_data[0])); + out_->child_data[0]->type = type.value_type(); + + // Concatenate the offsets + ARROW_ASSIGN_OR_RAISE(auto offset_buffers, Buffers(1, sizeof(offset_type))); + RETURN_NOT_OK(ConcatenateListViewOffsets(offset_buffers, value_ranges, + pool_, &out_->buffers[1])); - Status Visit(const LargeListViewType& type) { - return Status::NotImplemented("concatenation of ", type); + // Concatenate the sizes + ARROW_ASSIGN_OR_RAISE(auto size_buffers, Buffers(2, sizeof(offset_type))); + return ConcatenateBuffers(size_buffers, pool_).Value(&out_->buffers[2]); } Status Visit(const FixedSizeListType& fixed_size_list) { diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index 0ef1136ea78..8afe9d05ae6 100644 --- 
a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -217,6 +217,60 @@ TEST_F(ConcatenateTest, LargeListType) { }); } +TEST_F(ConcatenateTest, ListViewType) { + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + auto values_size = size * 4; + auto values = this->GeneratePrimitive(values_size, null_probability); + + std::shared_ptr offsets; + auto offsets_vector = this->Offsets(values_size, size); + offsets_vector.front() = 0; + offsets_vector.back() = values_size; + ArrayFromVector(offsets_vector, &offsets); + + std::shared_ptr sizes; + std::vector sizes_vector; + sizes_vector.reserve(size); + for (int32_t i = 0; i < size; ++i) { + // Make list-views share values with the next list-view by extending the size to a + // point after the next offset. + int32_t size = offsets_vector[i + 1] - offsets_vector[i]; + size = std::min(2 * size / 3, values_size - offsets_vector[i]); + sizes_vector.push_back(size); + ASSERT_LE(offsets_vector[i] + sizes_vector.back(), values_size); + } + ASSERT_EQ(offsets_vector.size(), sizes_vector.size() + 1); + ArrayFromVector(sizes_vector, &sizes); + + ASSERT_OK_AND_ASSIGN(*out, ListViewArray::FromArrays(*offsets, *sizes, *values)); + ASSERT_OK((**out).ValidateFull()); + }); +} + +TEST_F(ConcatenateTest, LargeListViewType) { + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + auto values_size = size * 4; + auto values = this->GeneratePrimitive(values_size, null_probability); + auto offsets_vector = this->Offsets(values_size, size); + // Ensure first and last offsets encompass the whole values array + offsets_vector.front() = 0; + offsets_vector.back() = static_cast(values_size); + std::vector sizes_vector; + sizes_vector.reserve(size); + for (int64_t i = 0; i < size; ++i) { + int64_t size = offsets_vector[i + 1] - offsets_vector[i]; + size = std::min(2 * size / 3, values_size - offsets_vector[i]); + sizes_vector.push_back(size); + } + 
ASSERT_EQ(offsets_vector.size(), sizes_vector.size() + 1); + std::shared_ptr offsets, sizes; + ArrayFromVector(offsets_vector, &offsets); + ArrayFromVector(sizes_vector, &sizes); + ASSERT_OK_AND_ASSIGN(*out, LargeListViewArray::FromArrays(*offsets, *sizes, *values)); + ASSERT_OK((**out).ValidateFull()); + }); +} + TEST_F(ConcatenateTest, StructType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { auto foo = this->GeneratePrimitive(size, null_probability); From a15bc8cf0a49c0b962331b59252883fe7a4887d4 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 13 Jun 2023 16:24:40 -0300 Subject: [PATCH 23/91] Fix comment formatting --- cpp/src/arrow/compute/kernels/scalar_if_else.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index 6b4b2339e4a..01417665d21 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -82,9 +82,9 @@ std::optional GetConstantValidityWord(const ExecValue& data) { return {}; } -// if the condition is null then output is null otherwise we take validity from the -// selected argument -// ie. cond.valid & (cond.data & left.valid | ~cond.data & right.valid) +/// If the condition is null then output is null otherwise we take validity from the +/// selected argument +/// (i.e. cond.valid & (cond.data & left.valid | ~cond.data & right.valid)). struct IfElseNullPromoter { KernelContext* ctx; const ArraySpan& cond; @@ -368,7 +368,7 @@ void RunIfElseLoopInverted(const ArraySpan& cond, const HandleBlock& handle_bloc } /// Runs if-else when cond is a scalar. Two special functions are required, -/// 1.CopyArrayData, 2. BroadcastScalar +/// 1. CopyArrayData, 2. 
BroadcastScalar template Status RunIfElseScalar(const BooleanScalar& cond, const ExecValue& left, const ExecValue& right, ExecResult* out, From 9398c9882ec19fadc70f8a3cc6680522ea6b7521 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 14 Jul 2023 18:36:26 -0300 Subject: [PATCH 24/91] BaseBuilder: Allow ListView scalars and slices to be added to List builders and vice-versa --- cpp/src/arrow/array/builder_base.cc | 24 ++++++++++++-- cpp/src/arrow/array/builder_nested.h | 47 ++++++++++++++++++++++------ 2 files changed, 60 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index 40e705aa3e4..4ac1b83729f 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -330,10 +330,30 @@ struct DerefConstIterator { pointer operator->() const { return &(**it); } }; +/// If A and B are equivalent types, a builder of type A can receive +/// scalar values of type B and a builder of type B can receive +/// scalar values of type A. +/// +/// \param a Type A. +/// \param b Type B. 
+bool AreScalarTypesEquivalent(const DataType& a, const DataType& b) { + if (a.Equals(b)) { + return true; + } + if ((a.id() == Type::LIST && b.id() == Type::LIST_VIEW) || + (a.id() == Type::LIST_VIEW && b.id() == Type::LIST) || + (a.id() == Type::LARGE_LIST && b.id() == Type::LARGE_LIST_VIEW) || + (a.id() == Type::LARGE_LIST_VIEW && b.id() == Type::LARGE_LIST)) { + return checked_cast(a).value_type()->Equals( + *checked_cast(b).value_type()); + } + return false; +} + } // namespace Status ArrayBuilder::AppendScalar(const Scalar& scalar, int64_t n_repeats) { - if (!scalar.type->Equals(type())) { + if (!AreScalarTypesEquivalent(*scalar.type, *type())) { return Status::Invalid("Cannot append scalar of type ", scalar.type->ToString(), " to builder for type ", type()->ToString()); } @@ -344,7 +364,7 @@ Status ArrayBuilder::AppendScalars(const ScalarVector& scalars) { if (scalars.empty()) return Status::OK(); const auto ty = type(); for (const auto& scalar : scalars) { - if (!scalar->type->Equals(ty)) { + if (ARROW_PREDICT_FALSE(!AreScalarTypesEquivalent(*scalar->type, *ty))) { return Status::Invalid("Cannot append scalar of type ", scalar->type->ToString(), " to builder for type ", type()->ToString()); } diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index 67fb16cebfa..7d5ab0a9bc7 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include #include @@ -137,23 +138,26 @@ class ARROW_EXPORT VarLengthListLikeBuilder : public ArrayBuilder { virtual Status AppendValues(const offset_type* offsets, const offset_type* sizes, int64_t length, const uint8_t* valid_bytes) = 0; - Status AppendArraySlice(const ArraySpan& array, int64_t offset, - int64_t length) override { + private: + /// \tparam T The DataType of array + template + Status AppendArraySliceImpl(const ArraySpan& array, int64_t offset, int64_t length) { + static_assert( + 
std::is_same::value, + "VarLengthListLikeBuilder::AppendArraySlice expects a list or list-view " + "with the same offset bit-width"); const offset_type* offsets = array.GetValues(1); [[maybe_unused]] const offset_type* sizes = NULLPTR; - if constexpr (is_list_view(TYPE::type_id)) { + if constexpr (is_list_view(T::type_id)) { sizes = array.GetValues(2); } - const bool all_valid = !array.MayHaveLogicalNulls(); const uint8_t* validity = array.HasValidityBitmap() ? array.buffers[0].data : NULLPTR; ARROW_RETURN_NOT_OK(Reserve(length)); for (int64_t row = offset; row < offset + length; row++) { - const bool is_valid = - all_valid || (validity && bit_util::GetBit(validity, array.offset + row)) || - array.IsValid(row); + const bool is_valid = !validity || bit_util::GetBit(validity, array.offset + row); int64_t size = 0; if (is_valid) { - if constexpr (is_list_view(TYPE::type_id)) { + if constexpr (is_list_view(T::type_id)) { size = sizes[row]; } else { size = offsets[row + 1] - offsets[row]; @@ -161,7 +165,7 @@ class ARROW_EXPORT VarLengthListLikeBuilder : public ArrayBuilder { } UnsafeAppendToBitmap(is_valid); UnsafeAppendDimensions(/*offset=*/value_builder_->length(), size); - if (is_valid) { + if (is_valid && size > 0) { ARROW_RETURN_NOT_OK( value_builder_->AppendArraySlice(array.child_data[0], offsets[row], size)); } @@ -169,6 +173,31 @@ class ARROW_EXPORT VarLengthListLikeBuilder : public ArrayBuilder { return Status::OK(); } + public: + Status AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) override { + const auto array_type_id = array.type->id(); + if constexpr (TYPE::type_id == Type::LIST) { + if (array_type_id == Type::LIST_VIEW) { + return AppendArraySliceImpl(array, offset, length); + } + } else if constexpr (TYPE::type_id == Type::LIST_VIEW) { + if (array_type_id == Type::LIST) { + return AppendArraySliceImpl(array, offset, length); + } + } else if constexpr (TYPE::type_id == Type::LARGE_LIST) { + if (array_type_id == 
Type::LARGE_LIST_VIEW) { + return AppendArraySliceImpl(array, offset, length); + } + } else if constexpr (TYPE::type_id == Type::LARGE_LIST_VIEW) { + if (array_type_id == Type::LARGE_LIST) { + return AppendArraySliceImpl(array, offset, length); + } + } + assert(TYPE::type_id == array_type_id); + return AppendArraySliceImpl(array, offset, length); + } + Status ValidateOverflow(int64_t new_elements) const { auto new_length = value_builder_->length() + new_elements; if (ARROW_PREDICT_FALSE(new_length > maximum_elements())) { From b5ea57f415a7aae7d219864295f54c05b21dba32 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 20 Jul 2023 13:25:11 -0300 Subject: [PATCH 25/91] if_else: Rename RunLoop to RunLoopOfNestedIfElseExec --- cpp/src/arrow/compute/kernels/scalar_if_else.cc | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index 01417665d21..75bf71675c5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -1028,7 +1028,7 @@ struct NestedIfElseExec { // AAA static Status Call(KernelContext* ctx, const ArraySpan& cond, const ArraySpan& left, const ArraySpan& right, ExecResult* out) { - return RunLoop( + return RunLoopOfNestedIfElseExec( ctx, cond, out, [&](ArrayBuilder* builder, int64_t i, int64_t length) { return builder->AppendArraySlice(left, i, length); @@ -1041,7 +1041,7 @@ struct NestedIfElseExec { // ASA static Status Call(KernelContext* ctx, const ArraySpan& cond, const Scalar& left, const ArraySpan& right, ExecResult* out) { - return RunLoop( + return RunLoopOfNestedIfElseExec( ctx, cond, out, [&](ArrayBuilder* builder, int64_t i, int64_t length) { return builder->AppendScalar(left, length); @@ -1054,7 +1054,7 @@ struct NestedIfElseExec { // AAS static Status Call(KernelContext* ctx, const ArraySpan& cond, const ArraySpan& left, const Scalar& right, 
ExecResult* out) { - return RunLoop( + return RunLoopOfNestedIfElseExec( ctx, cond, out, [&](ArrayBuilder* builder, int64_t i, int64_t length) { return builder->AppendArraySlice(left, i, length); @@ -1067,7 +1067,7 @@ struct NestedIfElseExec { // ASS static Status Call(KernelContext* ctx, const ArraySpan& cond, const Scalar& left, const Scalar& right, ExecResult* out) { - return RunLoop( + return RunLoopOfNestedIfElseExec( ctx, cond, out, [&](ArrayBuilder* builder, int64_t i, int64_t length) { return builder->AppendScalar(left, length); @@ -1078,8 +1078,9 @@ struct NestedIfElseExec { } template - static Status RunLoop(KernelContext* ctx, const ArraySpan& cond, ExecResult* out, - HandleLeft&& handle_left, HandleRight&& handle_right) { + static Status RunLoopOfNestedIfElseExec(KernelContext* ctx, const ArraySpan& cond, + ExecResult* out, HandleLeft&& handle_left, + HandleRight&& handle_right) { std::unique_ptr raw_builder; RETURN_NOT_OK(MakeBuilderExactIndex(ctx->memory_pool(), out->type()->GetSharedPtr(), &raw_builder)); From 4b11630943c9be20037a629d9b03bee2af766df9 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 21 Jul 2023 23:27:49 -0300 Subject: [PATCH 26/91] if_else_benchmarks: Add ListView benchmarks --- .../kernels/scalar_if_else_benchmark.cc | 42 +++++++++++++++---- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc index 34225ce9fe0..b72402bbccd 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc @@ -67,11 +67,15 @@ struct GetBytesProcessedVisitor { } template - enable_if_var_size_list Visit(const ArrowType& type) { + enable_if_var_length_list_like Visit(const ArrowType& type) { using ArrayType = typename TypeTraits::ArrayType; using OffsetType = typename TypeTraits::OffsetType::c_type; - total_bytes += (arr->length() + 1) * 
sizeof(OffsetType); + const auto num_offsets = is_list_view(type) ? arr->length() : arr->length() + 1; + total_bytes += num_offsets * sizeof(OffsetType); + // NOTE: the sizes buffer is not counted when type is a list-view as that + // can make the throughput numbers look better just because the sizes + // increase the number of bytes in the input. auto child_array = internal::checked_cast(arr)->values(); return RecurseInto(child_array.get()); } @@ -126,7 +130,7 @@ static void IfElseBench(benchmark::State& state) { } template -static void IfElseBenchList(benchmark::State& state) { +static void IfElseBenchVarLengthListLike(benchmark::State& state) { auto value_type = TypeTraits::type_singleton(); auto list_type = std::make_shared(value_type); return IfElseBench(state, list_type); @@ -172,7 +176,7 @@ static void IfElseBenchContiguous(benchmark::State& state) { } template -static void IfElseBenchListContiguous(benchmark::State& state) { +static void IfElseBenchVarLengthListLikeContiguous(benchmark::State& state) { auto value_type = TypeTraits::type_singleton(); auto list_type = std::make_shared(value_type); return IfElseBenchContiguous(state, list_type); @@ -187,11 +191,11 @@ static void IfElseBench32(benchmark::State& state) { } static void IfElseBenchListUInt32(benchmark::State& state) { - return IfElseBenchList(state); + return IfElseBenchVarLengthListLike(state); } static void IfElseBenchListString32(benchmark::State& state) { - return IfElseBenchList(state); + return IfElseBenchVarLengthListLike(state); } static void IfElseBenchString32(benchmark::State& state) { @@ -211,11 +215,27 @@ static void IfElseBench32Contiguous(benchmark::State& state) { } static void IfElseBenchListUInt32Contiguous(benchmark::State& state) { - return IfElseBenchListContiguous(state); + return IfElseBenchVarLengthListLikeContiguous(state); } static void IfElseBenchListString32Contiguous(benchmark::State& state) { - return IfElseBenchListContiguous(state); + return 
IfElseBenchVarLengthListLikeContiguous(state); +} + +static void IfElseBenchListViewUInt32(benchmark::State& state) { + return IfElseBenchVarLengthListLike(state); +} + +static void IfElseBenchListViewString32(benchmark::State& state) { + return IfElseBenchVarLengthListLike(state); +} + +static void IfElseBenchListViewUInt32Contiguous(benchmark::State& state) { + return IfElseBenchVarLengthListLikeContiguous(state); +} + +static void IfElseBenchListViewString32Contiguous(benchmark::State& state) { + return IfElseBenchVarLengthListLikeContiguous(state); } static void IfElseBenchString64Contiguous(benchmark::State& state) { @@ -494,6 +514,12 @@ BENCHMARK(IfElseBenchListString32)->Args({kNumItems, 0}); BENCHMARK(IfElseBenchListUInt32Contiguous)->Args({kNumItems, 0}); BENCHMARK(IfElseBenchListString32Contiguous)->Args({kNumItems, 0}); +// IfElse: ListViews +BENCHMARK(IfElseBenchListViewUInt32)->Args({kNumItems, 0}); +BENCHMARK(IfElseBenchListViewString32)->Args({kNumItems, 0}); +BENCHMARK(IfElseBenchListViewUInt32Contiguous)->Args({kNumItems, 0}); +BENCHMARK(IfElseBenchListViewString32Contiguous)->Args({kNumItems, 0}); + // IfElse: Strings BENCHMARK(IfElseBenchString32)->Args({kNumItems, 0}); BENCHMARK(IfElseBenchString64)->Args({kNumItems, 0}); From 1f749d512e8d6ffe1861a6d54b31ceb5b03b861a Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Sat, 22 Jul 2023 01:51:51 -0300 Subject: [PATCH 27/91] if_else: Include LIST_VIEW and LARGE_LIST_VIEW --- cpp/src/arrow/compute/kernels/scalar_if_else.cc | 6 +++--- cpp/src/arrow/compute/kernels/scalar_if_else_test.cc | 11 +++++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index 75bf71675c5..ee181c053c0 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -1309,9 +1309,9 @@ void AddFixedWidthIfElseKernel(const std::shared_ptr& scalar_fun 
} void AddNestedIfElseKernels(const std::shared_ptr& scalar_function) { - for (const auto type_id : - {Type::LIST, Type::LARGE_LIST, Type::FIXED_SIZE_LIST, Type::STRUCT, - Type::DENSE_UNION, Type::SPARSE_UNION, Type::DICTIONARY}) { + for (const auto type_id : {Type::LIST, Type::LARGE_LIST, Type::LIST_VIEW, + Type::LARGE_LIST_VIEW, Type::FIXED_SIZE_LIST, Type::STRUCT, + Type::DENSE_UNION, Type::SPARSE_UNION, Type::DICTIONARY}) { ScalarKernel kernel({boolean(), InputType(type_id), InputType(type_id)}, LastType, NestedIfElseExec::Exec); kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc index a9c5a1fc3c9..a11aab81742 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc @@ -737,12 +737,15 @@ TEST_F(TestIfElseKernel, Decimal) { } } +using ListAndListViewArrowTypes = + ::testing::Types; + template -class TestIfElseList : public ::testing::Test {}; +class TestIfElseVarLengthListLike : public ::testing::Test {}; -TYPED_TEST_SUITE(TestIfElseList, ListArrowTypes); +TYPED_TEST_SUITE(TestIfElseVarLengthListLike, ListAndListViewArrowTypes); -TYPED_TEST(TestIfElseList, ListOfInt) { +TYPED_TEST(TestIfElseVarLengthListLike, ListOfInt) { auto type = std::make_shared(int32()); CheckWithDifferentShapes(ArrayFromJSON(boolean(), "[true, true, false, false]"), ArrayFromJSON(type, "[[], null, [1, null], [2, 3]]"), @@ -755,7 +758,7 @@ TYPED_TEST(TestIfElseList, ListOfInt) { ArrayFromJSON(type, "[null, null, null, null]")); } -TYPED_TEST(TestIfElseList, ListOfString) { +TYPED_TEST(TestIfElseVarLengthListLike, ListOfString) { auto type = std::make_shared(utf8()); CheckWithDifferentShapes( ArrayFromJSON(boolean(), "[true, true, false, false]"), From 24efd00a263b89a680e3094da2fdfd38d4b6b93c Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 8 Aug 2023 23:33:27 +0200 Subject: 
[PATCH 28/91] list_util: Add ListView<->List conversion functions + tests --- cpp/src/arrow/util/list_util.cc | 109 ++++++++++++++++++++++++++- cpp/src/arrow/util/list_util.h | 16 ++++ cpp/src/arrow/util/list_util_test.cc | 75 ++++++++++++++++++ 3 files changed, 196 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/util/list_util.cc b/cpp/src/arrow/util/list_util.cc index b12459f8f78..9b57328c466 100644 --- a/cpp/src/arrow/util/list_util.cc +++ b/cpp/src/arrow/util/list_util.cc @@ -19,21 +19,24 @@ #include #include "arrow/array/array_nested.h" +#include "arrow/array/builder_nested.h" #include "arrow/array/data.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit_run_reader.h" +#include "arrow/util/checked_cast.h" #include "arrow/util/list_util.h" #include "arrow/util/logging.h" #include "arrow/util/string.h" -namespace arrow { -namespace list_util { +namespace arrow::list_util { namespace internal { namespace { +using arrow::internal::checked_cast; + /// \pre input.length() > 0 && input.null_count() != input.length() /// \param input A LIST_VIEW or LARGE_LIST_VIEW array template @@ -155,6 +158,73 @@ std::pair RangeOfValuesUsedByList(const ArraySpan& input) { return {min_offset, max_end - min_offset}; } +template +Result> ListViewFromListImpl( + const std::shared_ptr& list_data, MemoryPool* pool) { + static_assert( + std::is_same::value, + "Offset types between list type and list-view type are expected to match"); + using offset_type = typename SrcListType::offset_type; + const auto& list_type = checked_cast(*list_data->type); + + // To re-use the validity and offsets buffers, a sizes buffer with enough + // padding on the beginning is allocated and filled with the sizes after + // list_data->offset. 
+ const int64_t buffer_length = list_data->offset + list_data->length; + ARROW_ASSIGN_OR_RAISE(auto sizes_buffer, + AllocateBuffer(buffer_length * sizeof(offset_type), pool)); + const auto* offsets = list_data->template GetValues(1, 0); + auto* sizes = reinterpret_cast(sizes_buffer->mutable_data()); + for (int64_t i = list_data->offset; i < buffer_length; i++) { + sizes[i] = offsets[i + 1] - offsets[i]; + } + BufferVector buffers = {list_data->buffers[0], list_data->buffers[1], + std::move(sizes_buffer)}; + + return ArrayData::Make(std::make_shared(list_type.value_type()), + list_data->length, std::move(buffers), + {list_data->child_data[0]}, list_data->null_count, + list_data->offset); +} + +template +Result> ListFromListViewImpl( + const std::shared_ptr& list_view_data, MemoryPool* pool) { + static_assert( + std::is_same::value, + "Offset types between list type and list-view type are expected to match"); + using offset_type = typename DestListType::offset_type; + using ListBuilderType = typename TypeTraits::BuilderType; + + const auto& list_view_type = + checked_cast(*list_view_data->type); + const auto& value_type = list_view_type.value_type(); + const auto list_type = std::make_shared(value_type); + + ARROW_ASSIGN_OR_RAISE(std::shared_ptr value_builder, + MakeBuilder(value_type, pool)); + auto list_builder = std::make_shared(pool, value_builder, list_type); + RETURN_NOT_OK(list_builder->Reserve(list_view_data->length)); + + ArraySpan values{*list_view_data->child_data[0]}; + const auto* in_validity_bitmap = list_view_data->GetValues(0); + const auto* in_offsets = list_view_data->GetValues(1); + const auto* in_sizes = list_view_data->GetValues(2); + for (int64_t i = 0; i < list_view_data->length; ++i) { + const bool is_valid = + !in_validity_bitmap || + bit_util::GetBit(in_validity_bitmap, list_view_data->offset + i); + const int64_t size = is_valid ? 
in_sizes[i] : 0; + RETURN_NOT_OK(list_builder->Append(is_valid, size)); + RETURN_NOT_OK(value_builder->AppendArraySlice(values, in_offsets[i], size)); + } + std::shared_ptr list_array_data; + RETURN_NOT_OK(list_builder->FinishInternal(&list_array_data)); + return list_array_data; +} + } // namespace Result> RangeOfValuesUsed(const ArraySpan& input) { @@ -177,7 +247,38 @@ Result> RangeOfValuesUsed(const ArraySpan& input) { "RangeOfValuesUsed: input is not a var-length list-like array"); } +Result> ListViewFromList(const ListArray& source, + MemoryPool* pool) { + ARROW_ASSIGN_OR_RAISE( + auto data, + (internal::ListViewFromListImpl(source.data(), pool))); + return std::make_shared(std::move(data)); +} + +Result> ListViewFromList(const LargeListArray& source, + MemoryPool* pool) { + ARROW_ASSIGN_OR_RAISE(auto data, + (internal::ListViewFromListImpl( + source.data(), pool))); + return std::make_shared(std::move(data)); +} + +Result> ListFromListView(const ListViewArray& source, + MemoryPool* pool) { + ARROW_ASSIGN_OR_RAISE( + auto data, + (internal::ListFromListViewImpl(source.data(), pool))); + return std::make_shared(std::move(data)); +} + +Result> ListFromListView(const LargeListViewArray& source, + MemoryPool* pool) { + ARROW_ASSIGN_OR_RAISE(auto data, + (internal::ListFromListViewImpl( + source.data(), pool))); + return std::make_shared(std::move(data)); +} + } // namespace internal -} // namespace list_util -} // namespace arrow +} // namespace arrow::list_util diff --git a/cpp/src/arrow/util/list_util.h b/cpp/src/arrow/util/list_util.h index 860d19b29a3..1a883f8fc5e 100644 --- a/cpp/src/arrow/util/list_util.h +++ b/cpp/src/arrow/util/list_util.h @@ -40,6 +40,22 @@ namespace internal { ARROW_EXPORT Result> RangeOfValuesUsed( const ArraySpan& input); +/// \brief Build a ListViewArray from a ListArray +ARROW_EXPORT Result> ListViewFromList( + const ListArray& source, MemoryPool* pool); + +/// \brief Build a LargeListViewArray from a LargeListArray +ARROW_EXPORT 
Result> ListViewFromList( + const LargeListArray& source, MemoryPool* pool); + +/// \brief Build a ListArray from a ListViewArray +ARROW_EXPORT Result> ListFromListView( + const ListViewArray& source, MemoryPool* pool); + +/// \brief Build a LargeListArray from a LargeListViewArray +ARROW_EXPORT Result> ListFromListView( + const LargeListViewArray& source, MemoryPool* pool); + } // namespace internal } // namespace list_util diff --git a/cpp/src/arrow/util/list_util_test.cc b/cpp/src/arrow/util/list_util_test.cc index b02cdbb21db..7339d87f26a 100644 --- a/cpp/src/arrow/util/list_util_test.cc +++ b/cpp/src/arrow/util/list_util_test.cc @@ -26,6 +26,7 @@ namespace arrow { using internal::checked_cast; +using internal::checked_pointer_cast; using ListAndListViewTypes = ::testing::Types; @@ -155,4 +156,78 @@ TYPED_TEST_SUITE(TestListUtils, ListAndListViewTypes); TYPED_TEST(TestListUtils, RangeOfValuesUsed) { this->TestRangeOfValuesUsed(); } +class TestListConversions : public ::testing::Test { + private: + MemoryPool* pool_; + + public: + TestListConversions() : pool_(default_memory_pool()) {} + + template + void DoTestListViewFromList() { + using SrcListArrayClass = typename TypeTraits::ArrayType; + auto list_type = std::make_shared(int32()); + auto list_view_type = std::make_shared(int32()); + + auto expected_list_view_w_nulls = + ArrayFromJSON(list_view_type, "[[1, 2], [3], [], [4], null]"); + auto expected_list_view_wo_nulls = + ArrayFromJSON(list_view_type, "[[1, 2], [], [100000]]"); + + std::shared_ptr list_w_nulls = + ArrayFromJSON(list_type, "[[1, 2], [3], [], [4], null]"); + auto list_wo_nulls = ArrayFromJSON(list_type, "[[1, 2], [], [100000]]"); + + ASSERT_OK_AND_ASSIGN( + auto result, list_util::internal::ListViewFromList( + *checked_pointer_cast(list_w_nulls), pool_)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*expected_list_view_w_nulls, *result, /*verbose=*/true); + + ASSERT_OK_AND_ASSIGN( + result, list_util::internal::ListViewFromList( + 
*checked_pointer_cast(list_wo_nulls), pool_)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*expected_list_view_wo_nulls, *result, /*verbose=*/true); + } + + template + void DoTestListFromListView() { + using SrcListViewArrayClass = typename TypeTraits::ArrayType; + auto list_view_type = std::make_shared(int32()); + auto list_type = std::make_shared(int32()); + + auto list_view_w_nulls = + ArrayFromJSON(list_view_type, "[[1, 2], [3], [], [4], null]"); + auto list_view_wo_nulls = ArrayFromJSON(list_view_type, "[[1, 2], [], [100000]]"); + + auto expected_list_w_nulls = ArrayFromJSON(list_type, "[[1, 2], [3], [], [4], null]"); + auto expected_list_wo_nulls = ArrayFromJSON(list_type, "[[1, 2], [], [100000]]"); + + ASSERT_OK_AND_ASSIGN( + auto result, + list_util::internal::ListFromListView( + *checked_pointer_cast(list_view_w_nulls), pool_)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*expected_list_w_nulls, *result, /*verbose=*/true); + + ASSERT_OK_AND_ASSIGN( + result, + list_util::internal::ListFromListView( + *checked_pointer_cast(list_view_wo_nulls), pool_)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*expected_list_wo_nulls, *result, /*verbose=*/true); + } +}; + +TEST_F(TestListConversions, ListViewFromList) { + this->DoTestListViewFromList(); + this->DoTestListViewFromList(); +} + +TEST_F(TestListConversions, ListFromListView) { + this->DoTestListFromListView(); + this->DoTestListFromListView(); +} + } // namespace arrow From 7da4b11809a198e9478fe4a42ce3f4133a7475da Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 18 Aug 2023 18:36:12 +0200 Subject: [PATCH 29/91] Declare dtor explicitly on base list[view] builders --- cpp/src/arrow/array/builder_nested.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index 7d5ab0a9bc7..d2f9d3c6ba6 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h 
@@ -67,6 +67,8 @@ class ARROW_EXPORT VarLengthListLikeBuilder : public ArrayBuilder { std::make_shared(value_builder->type()), alignment) {} + ~VarLengthListLikeBuilder() override = default; + Status Resize(int64_t capacity) override { if (ARROW_PREDICT_FALSE(capacity > maximum_elements())) { return Status::CapacityError(type_name(), @@ -267,6 +269,8 @@ class ARROW_EXPORT BaseListBuilder : public VarLengthListLikeBuilder { using BASE::Append; + ~BaseListBuilder() override = default; + /// \brief Start a new variable-length list slot /// /// This function should be called before beginning to append elements to the @@ -397,6 +401,8 @@ class ARROW_EXPORT BaseListViewBuilder : public VarLengthListLikeBuilder { using BASE::BASE; + ~BaseListViewBuilder() override = default; + Status Resize(int64_t capacity) override { ARROW_RETURN_NOT_OK(BASE::Resize(capacity)); return sizes_builder_.Resize(capacity); From 0b04fc6589f7ecbdfa0d5711d9c9c2f26263a251 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 7 Sep 2023 11:44:48 -0300 Subject: [PATCH 30/91] Revert "BaseBuilder: Allow ListView scalars and slices to be added to List builders and vice-versa" This reverts commit 9b13c8f8028bdb5d9d1c48391ab9041f03b9ce5d. --- cpp/src/arrow/array/builder_base.cc | 24 ++------------ cpp/src/arrow/array/builder_nested.h | 47 ++++++---------------------- 2 files changed, 11 insertions(+), 60 deletions(-) diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index 4ac1b83729f..40e705aa3e4 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -330,30 +330,10 @@ struct DerefConstIterator { pointer operator->() const { return &(**it); } }; -/// If A and B are equivalent types, a builder of type A can receive -/// scalar values of type B and a builder of type B can receive -/// scalar values of type A. -/// -/// \param a Type A. -/// \param b Type B. 
-bool AreScalarTypesEquivalent(const DataType& a, const DataType& b) { - if (a.Equals(b)) { - return true; - } - if ((a.id() == Type::LIST && b.id() == Type::LIST_VIEW) || - (a.id() == Type::LIST_VIEW && b.id() == Type::LIST) || - (a.id() == Type::LARGE_LIST && b.id() == Type::LARGE_LIST_VIEW) || - (a.id() == Type::LARGE_LIST_VIEW && b.id() == Type::LARGE_LIST)) { - return checked_cast(a).value_type()->Equals( - *checked_cast(b).value_type()); - } - return false; -} - } // namespace Status ArrayBuilder::AppendScalar(const Scalar& scalar, int64_t n_repeats) { - if (!AreScalarTypesEquivalent(*scalar.type, *type())) { + if (!scalar.type->Equals(type())) { return Status::Invalid("Cannot append scalar of type ", scalar.type->ToString(), " to builder for type ", type()->ToString()); } @@ -364,7 +344,7 @@ Status ArrayBuilder::AppendScalars(const ScalarVector& scalars) { if (scalars.empty()) return Status::OK(); const auto ty = type(); for (const auto& scalar : scalars) { - if (ARROW_PREDICT_FALSE(!AreScalarTypesEquivalent(*scalar->type, *ty))) { + if (!scalar->type->Equals(ty)) { return Status::Invalid("Cannot append scalar of type ", scalar->type->ToString(), " to builder for type ", type()->ToString()); } diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index d2f9d3c6ba6..36bfa0d7d72 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -17,7 +17,6 @@ #pragma once -#include #include #include #include @@ -140,26 +139,23 @@ class ARROW_EXPORT VarLengthListLikeBuilder : public ArrayBuilder { virtual Status AppendValues(const offset_type* offsets, const offset_type* sizes, int64_t length, const uint8_t* valid_bytes) = 0; - private: - /// \tparam T The DataType of array - template - Status AppendArraySliceImpl(const ArraySpan& array, int64_t offset, int64_t length) { - static_assert( - std::is_same::value, - "VarLengthListLikeBuilder::AppendArraySlice expects a list or list-view " - "with the 
same offset bit-width"); + Status AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) override { const offset_type* offsets = array.GetValues(1); [[maybe_unused]] const offset_type* sizes = NULLPTR; - if constexpr (is_list_view(T::type_id)) { + if constexpr (is_list_view(TYPE::type_id)) { sizes = array.GetValues(2); } + const bool all_valid = !array.MayHaveLogicalNulls(); const uint8_t* validity = array.HasValidityBitmap() ? array.buffers[0].data : NULLPTR; ARROW_RETURN_NOT_OK(Reserve(length)); for (int64_t row = offset; row < offset + length; row++) { - const bool is_valid = !validity || bit_util::GetBit(validity, array.offset + row); + const bool is_valid = + all_valid || (validity && bit_util::GetBit(validity, array.offset + row)) || + array.IsValid(row); int64_t size = 0; if (is_valid) { - if constexpr (is_list_view(T::type_id)) { + if constexpr (is_list_view(TYPE::type_id)) { size = sizes[row]; } else { size = offsets[row + 1] - offsets[row]; @@ -167,7 +163,7 @@ class ARROW_EXPORT VarLengthListLikeBuilder : public ArrayBuilder { } UnsafeAppendToBitmap(is_valid); UnsafeAppendDimensions(/*offset=*/value_builder_->length(), size); - if (is_valid && size > 0) { + if (is_valid) { ARROW_RETURN_NOT_OK( value_builder_->AppendArraySlice(array.child_data[0], offsets[row], size)); } @@ -175,31 +171,6 @@ class ARROW_EXPORT VarLengthListLikeBuilder : public ArrayBuilder { return Status::OK(); } - public: - Status AppendArraySlice(const ArraySpan& array, int64_t offset, - int64_t length) override { - const auto array_type_id = array.type->id(); - if constexpr (TYPE::type_id == Type::LIST) { - if (array_type_id == Type::LIST_VIEW) { - return AppendArraySliceImpl(array, offset, length); - } - } else if constexpr (TYPE::type_id == Type::LIST_VIEW) { - if (array_type_id == Type::LIST) { - return AppendArraySliceImpl(array, offset, length); - } - } else if constexpr (TYPE::type_id == Type::LARGE_LIST) { - if (array_type_id == Type::LARGE_LIST_VIEW) { - 
return AppendArraySliceImpl(array, offset, length); - } - } else if constexpr (TYPE::type_id == Type::LARGE_LIST_VIEW) { - if (array_type_id == Type::LARGE_LIST) { - return AppendArraySliceImpl(array, offset, length); - } - } - assert(TYPE::type_id == array_type_id); - return AppendArraySliceImpl(array, offset, length); - } - Status ValidateOverflow(int64_t new_elements) const { auto new_length = value_builder_->length() + new_elements; if (ARROW_PREDICT_FALSE(new_length > maximum_elements())) { From 693477c44140b77f31f21b070171870714f9f933 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 7 Sep 2023 22:24:34 -0300 Subject: [PATCH 31/91] concatenate.cc: Add a fast path to Concatenate when only 1 array is passed --- cpp/src/arrow/array/array_nested.cc | 6 ++---- cpp/src/arrow/array/concatenate.cc | 7 +++++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index e9193c9708a..93f159b5510 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -336,10 +336,8 @@ Result> FlattenListViewArray(const ListViewArrayT& list_v SliceArrayWithOffsets(*value_array, offsets[first_i], end_offset)); } - // Final attempt to avoid invoking Concatenate(). - if (non_null_fragments.size() == 1) { - return non_null_fragments[0]; - } else if (non_null_fragments.size() == 0) { + // Concatenate needs at least one fragment to work. 
+ if (non_null_fragments.size() == 0) { return MakeEmptyArray(value_array->type(), memory_pool); } diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index 2510f2d7733..652b4c8ceee 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -731,8 +731,11 @@ class ConcatenateImpl { } // namespace Result> Concatenate(const ArrayVector& arrays, MemoryPool* pool) { - if (arrays.size() == 0) { - return Status::Invalid("Must pass at least one array"); + switch (arrays.size()) { + case 0: + return Status::Invalid("Must pass at least one array"); + case 1: + return arrays[0]; } // gather ArrayData of input arrays From aa0846eb91bba6bd966f1c59d8dd19bd9d4b6f5d Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 8 Sep 2023 10:43:18 -0300 Subject: [PATCH 32/91] [Large]ListViewArray: Re-write Flatten --- cpp/src/arrow/array/array_nested.cc | 34 ++++++++++++++++------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index 93f159b5510..22c2c2965df 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -276,7 +276,7 @@ Result> FlattenListViewArray(const ListViewArrayT& list_v return MakeEmptyArray(value_array->type(), memory_pool); } - const auto* validity = list_view_array.data()->template GetValues(0); + const auto* validity = list_view_array.data()->template GetValues(0, 0); const auto* offsets = list_view_array.data()->template GetValues(1); const auto* sizes = list_view_array.data()->template GetValues(2); @@ -303,35 +303,39 @@ Result> FlattenListViewArray(const ListViewArrayT& list_v } } + auto is_null_or_empty = [&](int64_t i) { + return (validity && !bit_util::GetBit(validity, list_view_array.offset() + i)) || + sizes[i] == 0; + }; + std::vector> non_null_fragments; // Index of first valid, non-empty list-view and last offset // of the current contiguous 
fragment in values. - int64_t first_i = -1; - offset_type end_offset = -1; + constexpr int64_t kUninitialized = -1; + int64_t first_i = kUninitialized; + offset_type end_offset; int64_t i = 0; for (; i < list_view_array_length; i++) { - if ((validity && !bit_util::GetBit(validity, i)) || sizes[i] == 0) { - continue; - } + if (is_null_or_empty(i)) continue; + first_i = i; end_offset = offsets[i] + sizes[i]; break; } i += 1; for (; i < list_view_array_length; i++) { - if ((validity && !bit_util::GetBit(validity, i)) || sizes[i] == 0) { - continue; - } + if (is_null_or_empty(i)) continue; + if (offsets[i] == end_offset) { end_offset += sizes[i]; - } else { - non_null_fragments.push_back( - SliceArrayWithOffsets(*value_array, offsets[first_i], end_offset)); - first_i = i; - end_offset = offsets[i] + sizes[i]; + continue; } + non_null_fragments.push_back( + SliceArrayWithOffsets(*value_array, offsets[first_i], end_offset)); + first_i = i; + end_offset = offsets[i] + sizes[i]; } - if (first_i >= 0) { + if (first_i != kUninitialized) { non_null_fragments.push_back( SliceArrayWithOffsets(*value_array, offsets[first_i], end_offset)); } From f9e93055996d962e61abca645016564c761c5af0 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 26 Sep 2023 14:41:47 -0300 Subject: [PATCH 33/91] scalar_test.cc: Instantiate basic scalar tests with list-view types --- cpp/src/arrow/scalar_test.cc | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index 97260aed91c..cba817f67b1 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -1091,7 +1091,7 @@ void CheckInvalidListCast(const Scalar& scalar, const std::shared_ptr& } template -class TestListScalar : public ::testing::Test { +class TestListLikeScalar : public ::testing::Test { public: using ScalarType = typename TypeTraits::ScalarType; @@ -1185,17 +1185,18 @@ class TestListScalar : public ::testing::Test { 
std::shared_ptr value_; }; -using ListScalarTestTypes = ::testing::Types; +using ListScalarTestTypes = ::testing::Types; -TYPED_TEST_SUITE(TestListScalar, ListScalarTestTypes); +TYPED_TEST_SUITE(TestListLikeScalar, ListScalarTestTypes); -TYPED_TEST(TestListScalar, Basics) { this->TestBasics(); } +TYPED_TEST(TestListLikeScalar, Basics) { this->TestBasics(); } -TYPED_TEST(TestListScalar, ValidateErrors) { this->TestValidateErrors(); } +TYPED_TEST(TestListLikeScalar, ValidateErrors) { this->TestValidateErrors(); } -TYPED_TEST(TestListScalar, Hashing) { this->TestHashing(); } +TYPED_TEST(TestListLikeScalar, Hashing) { this->TestHashing(); } -TYPED_TEST(TestListScalar, Cast) { this->TestCast(); } +TYPED_TEST(TestListLikeScalar, Cast) { this->TestCast(); } TEST(TestFixedSizeListScalar, ValidateErrors) { const auto ty = fixed_size_list(int16(), 3); From 8cacd446f9878bb2f0e0a32a1e6018a9095bccec Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 26 Sep 2023 16:06:18 -0300 Subject: [PATCH 34/91] concatenate_test.cc: De-duplicate code checking concatenation of lists and list-views --- cpp/src/arrow/array/concatenate_test.cc | 97 +++++++++++++------------ 1 file changed, 50 insertions(+), 47 deletions(-) diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index 8afe9d05ae6..f166370ca0c 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -50,6 +50,7 @@ class ConcatenateTest : public ::testing::Test { sizes_({0, 1, 2, 4, 16, 31, 1234}), null_probabilities_({0.0, 0.1, 0.5, 0.9, 1.0}) {} + public: template std::vector Offsets(int32_t length, int32_t slice_count) { std::vector offsets(static_cast(slice_count + 1)); @@ -77,6 +78,7 @@ class ConcatenateTest : public ::testing::Test { return rng_.Numeric(size, 0, 127, null_probability); } + protected: void CheckTrailingBitsAreZeroed(const std::shared_ptr& bitmap, int64_t length) { if (auto preceding_bits = 
bit_util::kPrecedingBitmask[length % 8]) { auto last_byte = bitmap->data()[length / 8]; @@ -187,87 +189,88 @@ TEST_F(ConcatenateTest, FixedSizeListType) { }); } -TEST_F(ConcatenateTest, ListType) { - Check([this](int32_t size, double null_probability, std::shared_ptr* out) { +template +struct ListConcatenationChecker { + using offset_type = typename ListType::offset_type; + using OffsetArrowType = typename CTypeTraits::ArrowType; + using ListArrayType = typename TypeTraits::ArrayType; + + template + static void Check(Self& self, int32_t size, double null_probability, + std::shared_ptr* out) { auto values_size = size * 4; - auto values = this->GeneratePrimitive(values_size, null_probability); - auto offsets_vector = this->Offsets(values_size, size); + auto values = + self.template GeneratePrimitive(values_size, null_probability); + auto offsets_vector = self.template Offsets(values_size, size); // Ensure first and last offsets encompass the whole values array offsets_vector.front() = 0; - offsets_vector.back() = static_cast(values_size); + offsets_vector.back() = static_cast(values_size); std::shared_ptr offsets; - ArrayFromVector(offsets_vector, &offsets); - ASSERT_OK_AND_ASSIGN(*out, ListArray::FromArrays(*offsets, *values)); + ArrayFromVector(offsets_vector, &offsets); + ASSERT_OK_AND_ASSIGN(*out, ListArrayType::FromArrays(*offsets, *values)); ASSERT_OK((**out).ValidateFull()); + } +}; + +TEST_F(ConcatenateTest, ListType) { + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + ListConcatenationChecker::Check(*this, size, null_probability, out); }); } TEST_F(ConcatenateTest, LargeListType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - auto values_size = size * 4; - auto values = this->GeneratePrimitive(values_size, null_probability); - auto offsets_vector = this->Offsets(values_size, size); - // Ensure first and last offsets encompass the whole values array - offsets_vector.front() = 0; - 
offsets_vector.back() = static_cast(values_size); - std::shared_ptr offsets; - ArrayFromVector(offsets_vector, &offsets); - ASSERT_OK_AND_ASSIGN(*out, LargeListArray::FromArrays(*offsets, *values)); - ASSERT_OK((**out).ValidateFull()); + ListConcatenationChecker::Check(*this, size, null_probability, out); }); } -TEST_F(ConcatenateTest, ListViewType) { - Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - auto values_size = size * 4; - auto values = this->GeneratePrimitive(values_size, null_probability); +template +struct ListViewConcatenationChecker { + using offset_type = typename ListViewType::offset_type; + using OffsetArrowType = typename CTypeTraits::ArrowType; + using ListViewArrayType = typename TypeTraits::ArrayType; + + template + static void Check(Self& self, int32_t size, double null_probability, + std::shared_ptr* out) { + auto values_size = 4 * size; + auto values = + self.template GeneratePrimitive(values_size, null_probability); std::shared_ptr offsets; - auto offsets_vector = this->Offsets(values_size, size); + auto offsets_vector = self.template Offsets(values_size, size); offsets_vector.front() = 0; - offsets_vector.back() = values_size; - ArrayFromVector(offsets_vector, &offsets); + ArrayFromVector(offsets_vector, &offsets); std::shared_ptr sizes; - std::vector sizes_vector; + std::vector sizes_vector; sizes_vector.reserve(size); for (int32_t i = 0; i < size; ++i) { // Make list-views share values with the next list-view by extending the size to a // point after the next offset. 
- int32_t size = offsets_vector[i + 1] - offsets_vector[i]; + offset_type size = offsets_vector[i + 1] - offsets_vector[i]; size = std::min(2 * size / 3, values_size - offsets_vector[i]); sizes_vector.push_back(size); ASSERT_LE(offsets_vector[i] + sizes_vector.back(), values_size); } ASSERT_EQ(offsets_vector.size(), sizes_vector.size() + 1); - ArrayFromVector(sizes_vector, &sizes); + ArrayFromVector(sizes_vector, &sizes); - ASSERT_OK_AND_ASSIGN(*out, ListViewArray::FromArrays(*offsets, *sizes, *values)); + ASSERT_OK_AND_ASSIGN(*out, ListViewArrayType::FromArrays(*offsets, *sizes, *values)); ASSERT_OK((**out).ValidateFull()); + } +}; + +TEST_F(ConcatenateTest, ListViewType) { + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + ListViewConcatenationChecker::Check(*this, size, null_probability, out); }); } TEST_F(ConcatenateTest, LargeListViewType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - auto values_size = size * 4; - auto values = this->GeneratePrimitive(values_size, null_probability); - auto offsets_vector = this->Offsets(values_size, size); - // Ensure first and last offsets encompass the whole values array - offsets_vector.front() = 0; - offsets_vector.back() = static_cast(values_size); - std::vector sizes_vector; - sizes_vector.reserve(size); - for (int64_t i = 0; i < size; ++i) { - int64_t size = offsets_vector[i + 1] - offsets_vector[i]; - size = std::min(2 * size / 3, values_size - offsets_vector[i]); - sizes_vector.push_back(size); - } - ASSERT_EQ(offsets_vector.size(), sizes_vector.size() + 1); - std::shared_ptr offsets, sizes; - ArrayFromVector(offsets_vector, &offsets); - ArrayFromVector(sizes_vector, &sizes); - ASSERT_OK_AND_ASSIGN(*out, LargeListViewArray::FromArrays(*offsets, *sizes, *values)); - ASSERT_OK((**out).ValidateFull()); + ListViewConcatenationChecker::Check(*this, size, null_probability, + out); }); } From ac5b94e09fc155561922d827cd267ac5cb319da5 Mon Sep 17 00:00:00 2001 
From: Felipe Oliveira Carvalho Date: Tue, 26 Sep 2023 16:12:03 -0300 Subject: [PATCH 35/91] Revert "concatenate.cc: Add a fast path to Concatenate when only 1 array is passed" This reverts commit 2f4471fdeb8fcf62af63bd664a3a789d535dc620. --- cpp/src/arrow/array/array_nested.cc | 6 ++++-- cpp/src/arrow/array/concatenate.cc | 7 ++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index 22c2c2965df..fd1a2a24a8f 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -340,8 +340,10 @@ Result> FlattenListViewArray(const ListViewArrayT& list_v SliceArrayWithOffsets(*value_array, offsets[first_i], end_offset)); } - // Concatenate needs at least one fragment to work. - if (non_null_fragments.size() == 0) { + // Final attempt to avoid invoking Concatenate(). + if (non_null_fragments.size() == 1) { + return non_null_fragments[0]; + } else if (non_null_fragments.size() == 0) { return MakeEmptyArray(value_array->type(), memory_pool); } diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index 652b4c8ceee..2510f2d7733 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -731,11 +731,8 @@ class ConcatenateImpl { } // namespace Result> Concatenate(const ArrayVector& arrays, MemoryPool* pool) { - switch (arrays.size()) { - case 0: - return Status::Invalid("Must pass at least one array"); - case 1: - return arrays[0]; + if (arrays.size() == 0) { + return Status::Invalid("Must pass at least one array"); } // gather ArrayData of input arrays From 69345b4eac1016120a748c5fa53844f1086cfe7f Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 26 Sep 2023 16:52:26 -0300 Subject: [PATCH 36/91] validate.cc: Be strict about nullability of offsets and sizes buffers on list-views --- cpp/src/arrow/array/array_list_test.cc | 11 ++++++++--- cpp/src/arrow/array/validate.cc | 16 
++++++++++------ 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index 0ca7c5d1f37..03db87ad5c1 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -745,7 +745,10 @@ class TestListArray : public ::testing::Test { std::shared_ptr values, int64_t offset = 0) { if constexpr (kTypeClassIsListView) { std::vector sizes; - sizes.reserve(offsets.empty() ? 0 : offsets.size() - 1); + // Always reserve some space so Buffer::Wrap doesn't create a null buffer + // when length of the sizes buffer is 0. + sizes.reserve( + std::max(static_cast(1), offsets.empty() ? 0 : offsets.size() - 1)); for (size_t i = 1; i < offsets.size(); ++i) { sizes.push_back(offsets[i] - offsets[i - 1]); } @@ -763,8 +766,10 @@ class TestListArray : public ::testing::Test { auto empty_values = ArrayFromJSON(int16(), "[]"); auto values = ArrayFromJSON(int16(), "[1, 2, 3, 4, 5, 6, 7]"); - // An empty list array can have omitted or 0-length offsets - ASSERT_OK(ValidateOffsets(0, {}, empty_values)); + if constexpr (!kTypeClassIsListView) { + // An empty list array can have omitted or 0-length offsets + ASSERT_OK(ValidateOffsets(0, {}, empty_values)); + } ASSERT_OK(ValidateOffsets(0, {0}, empty_values)); ASSERT_OK(ValidateOffsets(1, {0, 7}, values)); diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 35aa9548274..4f26a236d22 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -895,14 +895,18 @@ struct ValidateArrayImpl { constexpr bool is_list_view = is_list_view_type::value; const bool non_empty = data.length > 0; - if (!IsBufferValid(1)) { - // For length 0, an empty offsets buffer is accepted (ARROW-544). - return non_empty ? 
Status::Invalid("Non-empty array but offsets are null") - : Status::OK(); - } if constexpr (is_list_view) { + if (!IsBufferValid(1)) { + // For length 0, an empty offsets buffer is accepted (ARROW-544). + return Status::Invalid("offsets buffer is null"); + } if (!IsBufferValid(2)) { - return non_empty ? Status::Invalid("Non-empty array but sizes are null") + return Status::Invalid("sizes buffer is null"); + } + } else { + if (!IsBufferValid(1)) { + // For length 0, an empty offsets buffer is accepted (ARROW-544). + return non_empty ? Status::Invalid("Non-empty array but offsets are null") : Status::OK(); } } From 540086a67c60c3d701d30aef8530f4b492b9194f Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 26 Sep 2023 20:02:24 -0300 Subject: [PATCH 37/91] list_util.h: Add SumOfLogicalListSizes() utility --- cpp/src/arrow/util/list_util.cc | 50 ++++++++++++++++++++++++++++ cpp/src/arrow/util/list_util.h | 13 ++++++++ cpp/src/arrow/util/list_util_test.cc | 4 +++ 3 files changed, 67 insertions(+) diff --git a/cpp/src/arrow/util/list_util.cc b/cpp/src/arrow/util/list_util.cc index 9b57328c466..53ac4edfcd2 100644 --- a/cpp/src/arrow/util/list_util.cc +++ b/cpp/src/arrow/util/list_util.cc @@ -158,6 +158,36 @@ std::pair RangeOfValuesUsedByList(const ArraySpan& input) { return {min_offset, max_end - min_offset}; } +template +int64_t SumOfListSizes(const ArraySpan& input) { + DCHECK(is_var_length_list(*input.type)); + const uint8_t* validity = input.buffers[0].data; + const auto* offsets = input.GetValues(1); + int64_t sum = 0; + arrow::internal::VisitSetBitRunsVoid( + validity, input.offset, input.length, + [&sum, offsets](int64_t run_start, int64_t run_length) { + sum += offsets[run_start + run_length + 1] - offsets[run_start]; + }); + return sum; +} + +template +int64_t SumOfListViewSizes(const ArraySpan& input) { + DCHECK(is_list_view(*input.type)); + const uint8_t* validity = input.buffers[0].data; + const auto* sizes = input.GetValues(2); + int64_t sum = 
0; + arrow::internal::VisitSetBitRunsVoid( + validity, input.offset, input.length, + [&sum, sizes](int64_t run_start, int64_t run_length) { + for (int64_t i = run_start; i < run_start + run_length; ++i) { + sum += sizes[i]; + } + }); + return sum; +} + template Result> ListViewFromListImpl( const std::shared_ptr& list_data, MemoryPool* pool) { @@ -247,6 +277,26 @@ Result> RangeOfValuesUsed(const ArraySpan& input) { "RangeOfValuesUsed: input is not a var-length list-like array"); } +Result SumOfLogicalListSizes(const ArraySpan& input) { + switch (input.type->id()) { + case Type::LIST: + return SumOfListSizes(input); + case Type::MAP: + return SumOfListSizes(input); + case Type::LARGE_LIST: + return SumOfListSizes(input); + case Type::LIST_VIEW: + return SumOfListViewSizes(input); + case Type::LARGE_LIST_VIEW: + return SumOfListViewSizes(input); + default: + break; + } + DCHECK(!is_var_length_list_like(*input.type)); + return Status::TypeError( + "SumOfLogicalListSizes: input is not a var-length list-like array"); +} + Result> ListViewFromList(const ListArray& source, MemoryPool* pool) { ARROW_ASSIGN_OR_RAISE( diff --git a/cpp/src/arrow/util/list_util.h b/cpp/src/arrow/util/list_util.h index 1a883f8fc5e..b893f3d5d6c 100644 --- a/cpp/src/arrow/util/list_util.h +++ b/cpp/src/arrow/util/list_util.h @@ -40,6 +40,19 @@ namespace internal { ARROW_EXPORT Result> RangeOfValuesUsed( const ArraySpan& input); +/// \brief Calculate the sum of the sizes of all valid lists or list-views +/// +/// This is usually the same as the length of the RangeOfValuesUsed() range, but +/// it can be: +/// - Smaller: when the child array contains many values that are not +/// referenced by the lists or list-views in the parent array +/// - Greater: when the list-views share child array ranges +/// +/// \param input The input array such that is_var_length_list_like(input.type) +/// is true +/// \return The sum of all list or list-view sizes +ARROW_EXPORT Result SumOfLogicalListSizes(const
ArraySpan& input); + /// \brief Build a ListViewArray from a ListArray ARROW_EXPORT Result> ListViewFromList( const ListArray& source, MemoryPool* pool); diff --git a/cpp/src/arrow/util/list_util_test.cc b/cpp/src/arrow/util/list_util_test.cc index 7339d87f26a..425580a7084 100644 --- a/cpp/src/arrow/util/list_util_test.cc +++ b/cpp/src/arrow/util/list_util_test.cc @@ -143,6 +143,10 @@ class TestListUtils : public ::testing::Test { } else { ASSERT_EQ(range.second, 9); } + // Check the sum of logical sizes as well + ASSERT_OK_AND_ASSIGN(int64_t sum_of_logical_sizes, + list_util::internal::SumOfLogicalListSizes(*array->data())); + ASSERT_EQ(sum_of_logical_sizes, 9); } protected: From f3596b6e60779a954241f229ec1a105f4a941250 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 26 Sep 2023 20:12:03 -0300 Subject: [PATCH 38/91] list_util.cc: Use SumOfLogicalListValues when converting between lists and list-views --- cpp/src/arrow/util/list_util.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/arrow/util/list_util.cc b/cpp/src/arrow/util/list_util.cc index 53ac4edfcd2..446f7118b83 100644 --- a/cpp/src/arrow/util/list_util.cc +++ b/cpp/src/arrow/util/list_util.cc @@ -233,8 +233,10 @@ Result> ListFromListViewImpl( const auto& value_type = list_view_type.value_type(); const auto list_type = std::make_shared(value_type); + auto sum_of_list_view_sizes = SumOfListViewSizes(*list_view_data); ARROW_ASSIGN_OR_RAISE(std::shared_ptr value_builder, MakeBuilder(value_type, pool)); + RETURN_NOT_OK(value_builder->Reserve(sum_of_list_view_sizes)); auto list_builder = std::make_shared(pool, value_builder, list_type); RETURN_NOT_OK(list_builder->Reserve(list_view_data->length)); From ef436d70b84da5eaefafa2f1506bacc4eb0d2b4f Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 26 Sep 2023 23:20:35 -0300 Subject: [PATCH 39/91] list_util.cc: Rewrite MinViewOffset and MaxViewEnd --- cpp/src/arrow/util/list_util.cc | 164 +++++++++++++++++--------------- 
1 file changed, 87 insertions(+), 77 deletions(-) diff --git a/cpp/src/arrow/util/list_util.cc b/cpp/src/arrow/util/list_util.cc index 446f7118b83..9fec15162dd 100644 --- a/cpp/src/arrow/util/list_util.cc +++ b/cpp/src/arrow/util/list_util.cc @@ -36,103 +36,113 @@ namespace internal { namespace { using arrow::internal::checked_cast; +using arrow::internal::ReverseSetBitRunReader; +using arrow::internal::SetBitRunReader; /// \pre input.length() > 0 && input.null_count() != input.length() /// \param input A LIST_VIEW or LARGE_LIST_VIEW array template int64_t MinViewOffset(const ArraySpan& input) { - const uint8_t* validity = input.MayHaveNulls() ? input.buffers[0].data : nullptr; - const auto* offsets = reinterpret_cast(input.buffers[1].data); - const auto* sizes = reinterpret_cast(input.buffers[2].data); + const uint8_t* validity = input.buffers[0].data; + const auto* offsets = input.GetValues(1); + const auto* sizes = input.GetValues(2); - // It's very likely that the first non-null non-empty list-view starts at - // offset 0 of the child array. - int64_t i = 0; - while (i < input.length && (input.IsNull(i) || sizes[input.offset + i] == 0)) { - i += 1; + // Make an access to the sizes buffer only when strictly necessary. +#define MINIMIZE_MIN_VIEW_OFFSET(i) \ + auto offset = offsets[i]; \ + if (min_offset.has_value()) { \ + if (offset < *min_offset && sizes[i] > 0) { \ + if (offset == 0) { \ + return 0; \ + } \ + min_offset = offset; \ + } \ + } else { \ + if (sizes[i] > 0) { \ + if (offset == 0) { \ + return 0; \ + } \ + min_offset = offset; \ + } \ } - if (i >= input.length) { - return 0; - } - auto min_offset = offsets[input.offset + i]; - if (ARROW_PREDICT_TRUE(min_offset == 0)) { - // Early exit: offset 0 found already. 
- return 0; + + std::optional min_offset; + if (validity == nullptr) { + for (int64_t i = 0; i < input.length; i++) { + MINIMIZE_MIN_VIEW_OFFSET(i); + } + } else { + SetBitRunReader reader(validity, input.offset, input.length); + while (true) { + const auto run = reader.NextRun(); + if (run.length == 0) { + break; + } + for (int64_t i = run.position; i < run.position + run.length; ++i) { + MINIMIZE_MIN_VIEW_OFFSET(i); + } + } } + return min_offset.value_or(0); - // Slow path: scan the buffers entirely. - arrow::internal::VisitSetBitRunsVoid( - validity, /*offset=*/input.offset + i + 1, /*length=*/input.length - i - 1, - [&](int64_t i, int64_t run_length) { - for (int64_t j = 0; j < run_length; j++) { - const auto offset = offsets[input.offset + i + j]; - if (ARROW_PREDICT_FALSE(offset < min_offset)) { - if (sizes[input.offset + i + j] > 0) { - min_offset = offset; - } - } - } - }); - return min_offset; +#undef MINIMIZE_MIN_VIEW_OFFSET } /// \pre input.length() > 0 && input.null_count() != input.length() /// \param input A LIST_VIEW or LARGE_LIST_VIEW array template int64_t MaxViewEnd(const ArraySpan& input) { - const uint8_t* validity = input.MayHaveNulls() ? input.buffers[0].data : NULLPTR; - const auto* offsets = reinterpret_cast(input.buffers[1].data); - const auto* sizes = reinterpret_cast(input.buffers[2].data); - const auto IsNull = [validity](int64_t i) -> bool { - return validity && !arrow::bit_util::GetBit(validity, i); - }; - - int64_t i = input.length - 1; // safe because input.length() > 0 - while (i != 0 && (IsNull(i) || sizes[input.offset + i] == 0)) { - i -= 1; - } - const auto offset = static_cast(offsets[input.offset + i]); - const auto size = sizes[input.offset + i]; - if (i == 0) { - return (IsNull(i) || sizes[input.offset + i] == 0) ? 
0 : offset + size; - } constexpr auto kInt64Max = std::numeric_limits::max(); - if constexpr (sizeof(offset_type) == sizeof(int64_t)) { - if (ARROW_PREDICT_FALSE(offset > kInt64Max - size)) { - // Early-exit: 64-bit overflow detected. This is not possible on a - // valid list-view, but we return the maximum possible value to - // avoid undefined behavior. - return kInt64Max; - } + const auto values_length = input.child_data[0].length; + + const uint8_t* validity = input.buffers[0].data; + const auto* offsets = input.GetValues(1); + const auto* sizes = input.GetValues(2); + + // Early-exit: 64-bit overflow detected. This is not possible on a valid list-view, + // but we return the maximum possible value to avoid undefined behavior. +#define MAX_VIEW_END_OVERFLOW_CHECK(offset, size) \ + if constexpr (sizeof(offset_type) == sizeof(int64_t)) { \ + if (ARROW_PREDICT_FALSE((offset) > kInt64Max - (size))) { \ + return kInt64Max; \ + } \ } - int64_t max_end = - static_cast(offsets[input.offset + i]) + sizes[input.offset + i]; - if (max_end == input.child_data[0].length) { - // Early-exit: maximum possible view-end found already. - return max_end; + +#define MAXIMIZE_MAX_VIEW_END(i) \ + const auto offset = static_cast(offsets[i]); \ + const offset_type size = sizes[i]; \ + if (size > 0) { \ + MAX_VIEW_END_OVERFLOW_CHECK(offset, size); \ + const int64_t end = offset + size; \ + if (end > max_end) { \ + if (end == values_length) { \ + return values_length; \ + } \ + max_end = end; \ + } \ } - // Slow path: scan the buffers entirely. - arrow::internal::VisitSetBitRunsVoid( - validity, input.offset, /*length=*/i + 1, [&](int64_t i, int64_t run_length) { - for (int64_t j = 0; j < run_length; ++j) { - const auto offset = static_cast(offsets[input.offset + i + j]); - const auto size = sizes[input.offset + i + j]; - if (size > 0) { - if constexpr (sizeof(offset_type) == sizeof(int64_t)) { - if (ARROW_PREDICT_FALSE(offset > kInt64Max - size)) { - // 64-bit overflow detected. 
This is not possible on a valid list-view, - // but we saturate max_end to the maximum possible value to avoid - // undefined behavior. - max_end = kInt64Max; - return; - } - } - max_end = std::max(max_end, offset + size); - } - } - }); + int64_t max_end = 0; + if (validity == nullptr) { + for (int64_t i = input.length - 1; i >= 0; --i) { + MAXIMIZE_MAX_VIEW_END(i); + } + } else { + ReverseSetBitRunReader reader(validity, input.offset, input.length); + while (true) { + const auto run = reader.NextRun(); + if (run.length == 0) { + break; + } + for (int64_t i = run.position + run.length - 1; i >= run.position; --i) { + MAXIMIZE_MAX_VIEW_END(i); + } + } + } return max_end; + +#undef MAX_VIEW_END_OVERFLOW_CHECK +#undef MAXIMIZE_MAX_VIEW_END } template From d61d15d3c5f6756042d488a50af684f6ab879899 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 27 Sep 2023 00:11:30 -0300 Subject: [PATCH 40/91] list_util.c: Zero initial padding area of sizes buffer --- cpp/src/arrow/util/list_util.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/arrow/util/list_util.cc b/cpp/src/arrow/util/list_util.cc index 9fec15162dd..644b40853fe 100644 --- a/cpp/src/arrow/util/list_util.cc +++ b/cpp/src/arrow/util/list_util.cc @@ -216,6 +216,9 @@ Result> ListViewFromListImpl( AllocateBuffer(buffer_length * sizeof(offset_type), pool)); const auto* offsets = list_data->template GetValues(1, 0); auto* sizes = reinterpret_cast(sizes_buffer->mutable_data()); + // Zero the initial padding area to avoid leaking any data when buffers are + // sent over IPC or through the C Data interface.
+ memset(sizes, 0, list_data->offset * sizeof(offset_type)); for (int64_t i = list_data->offset; i < buffer_length; i++) { sizes[i] = offsets[i + 1] - offsets[i]; } From 0c18fddf334dd0e5462d51b16be443b0b2ff4da3 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Sat, 30 Sep 2023 00:03:34 -0300 Subject: [PATCH 41/91] random.cc: Simplify and split the random generator into two algorithms --- cpp/src/arrow/testing/random.cc | 213 ++++++++++++++++++++++---------- cpp/src/arrow/testing/random.h | 24 +++- 2 files changed, 172 insertions(+), 65 deletions(-) diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index 24e9c95f437..33ddd476efc 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -628,14 +628,71 @@ enable_if_parameter_free GetMetadata(const KeyValueMetadata* metad return output; } -/// Try to pass sizes such that every non-null sizes[i] <= values_size. +/// \brief Shuffle a list-view array in place using the Fisher–Yates algorithm [1]. 
+/// +/// [1] https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle#The_modern_algorithm +/// +/// \param[in] seed The seed for the random number generator +/// \param[in,out] data The array to shuffle +template +void ShuffleListViewDataInPlace(SeedType seed, ArrayData& data) { + DCHECK_EQ(data.type->id(), ListViewType::type_id); + using offset_type = typename ListViewType::offset_type; + + auto* validity = data.GetMutableValues(0, 0); + auto* offsets = data.GetMutableValues(1); + auto* sizes = data.GetMutableValues(2); + + pcg32_fast rng(seed); + using UniformDist = std::uniform_int_distribution; + UniformDist dist; + for (int64_t i = data.length - 1; i > 0; --i) { + const auto j = dist(rng, UniformDist::param_type(0, i)); + if (ARROW_PREDICT_TRUE(i != j)) { + // Swap validity bits + if (validity) { + const bool valid_i = bit_util::GetBit(validity, data.offset + i); + const bool valid_j = bit_util::GetBit(validity, data.offset + j); + if (valid_i != valid_j) { + bit_util::SetBitTo(validity, data.offset + i, valid_j); + bit_util::SetBitTo(validity, data.offset + j, valid_i); + } + } + // Swap offsets and sizes + std::swap(offsets[i], offsets[j]); + std::swap(sizes[i], sizes[j]); + } + } +} + +/// \brief Generate the list-view offsets based on a random buffer of sizes. +/// +/// The sizes buffer is an input of this function, but when force_empty_nulls is true, +/// some values on the sizes buffer can be set to 0. +/// +/// When sparsity is 0.0, the list-view spans are perfectly packed one after the +/// other. If sparsity is greater than 0.0, the list-view spans are set apart +/// from each other in proportion to the sparsity value and size of each +/// list-view. A negative sparsity means each list-view shares a fraction of the +/// values used by the previous list-view.
+/// +/// For instance, a sparsity of -1.0 means the values array will only need enough values +/// for the largest list-view with all the other list-views spanning some of these same +/// values. +/// +/// \param[in] seed The seed for the random number generator +/// \param[in,out] mutable_sizes_array The array of sizes to use +/// \param[in] force_empty_nulls Whether to force null list-view sizes to be 0 +/// \param[in] zero_undefined_offsets Whether to zero the offsets of list-views that have +/// 0 set as the size +/// \param[in] sparsity The sparsity of the generated list-view offsets +/// \param[out] out_max_view_end The maximum value of the end of a list-view template std::shared_ptr ViewOffsetsFromLengthsArray( - SeedType seed, offset_type avg_length, offset_type values_length, - OffsetArrayType& mutable_sizes_array, bool force_empty_nulls, - bool zero_undefined_offsets, int64_t alignment, MemoryPool* memory_pool) { + SeedType seed, OffsetArrayType& mutable_sizes_array, bool force_empty_nulls, + bool zero_undefined_offsets, double sparsity, int64_t* out_max_view_end, + int64_t alignment, MemoryPool* memory_pool) { using TypeClass = typename OffsetArrayType::TypeClass; - constexpr offset_type kZero = 0; auto* sizes = mutable_sizes_array.data()->template GetMutableValues(1); @@ -645,39 +702,27 @@ std::shared_ptr ViewOffsetsFromLengthsArray( alignment, memory_pool); auto offsets = buffers[1]->mutable_data_as(); - pcg32_fast rng(seed); - std::uniform_int_distribution offset_delta_dist(-avg_length, avg_length); - offset_type offset_base = 0; + double offset_base = 0.0; + offset_type max_view_end = 0; for (int64_t i = 0; i < mutable_sizes_array.length(); ++i) { - // We want to always sample the offset_delta_dist(rng) to make sure - // different options regarding nulls and empty views don't affect - // the other offsets. 
- offset_type offset = offset_base + offset_delta_dist(rng); + const auto offset = static_cast(std::llround(offset_base)); if (mutable_sizes_array.IsNull(i)) { if (force_empty_nulls) { sizes[i] = 0; } offsets[i] = zero_undefined_offsets ? 0 : offset; - continue; - } - offset_type size = sizes[i]; - if (size == 0) { - offsets[i] = zero_undefined_offsets ? 0 : offset; } else { - // Ensure that the size is not too large. - if (ARROW_PREDICT_FALSE(size > values_length)) { - size = values_length; - sizes[i] = size; // Fix the size. - } - // Ensure the offset is not negative or too large. - offset = std::max(offset, kZero); - if (offset > values_length - size) { - offset = values_length - size; + if (sizes[i] == 0) { + offsets[i] = zero_undefined_offsets ? 0 : offset; + } else { + offsets[i] = offset; + DCHECK_LT(offset, std::numeric_limits::max() - sizes[i]); + offset_base = std::max(0.0, offset_base + (sparsity * sizes[i])); } - offsets[i] = offset; } - offset_base += avg_length; + max_view_end = std::max(max_view_end, offsets[i] + sizes[i]); } + *out_max_view_end = max_view_end; auto array_data = ArrayData::Make(TypeTraits::type_singleton(), @@ -703,26 +748,77 @@ Result> ArrayOfListView(RAG& self, const Field& field, GetMetadata(field.metadata().get(), "force_empty_nulls", false); const auto zero_undefined_offsets = GetMetadata(field.metadata().get(), "zero_undefined_offsets", false); + const auto sparsity = GetMetadata(field.metadata().get(), "sparsity", 0.0); const auto lengths = internal::checked_pointer_cast( self.RAG::template Numeric( length, min_length, max_length, null_probability)); - // List views don't have to be disjoint, so let's make the values_length a - // multiple of the average list-view size. To make sure every list view - // into the values array can fit, it should be at least max_length. 
- const offset_type avg_length = min_length + (max_length - min_length) / 2; - const int64_t values_length = std::max(avg_length * (length - lengths->null_count()), - static_cast(max_length)); - DCHECK_LT(values_length, std::numeric_limits::max()); + int64_t max_view_end = 0; + const auto offsets = ViewOffsetsFromLengthsArray( + self.seed(), *lengths, force_empty_nulls, zero_undefined_offsets, sparsity, + &max_view_end, alignment, memory_pool); + const auto values = self.RAG::ArrayOf( *internal::checked_pointer_cast(field.type())->value_field(), - values_length, alignment, memory_pool); + /*values_length=*/max_view_end, alignment, memory_pool); - const auto offsets = ViewOffsetsFromLengthsArray( - self.seed(), avg_length, static_cast(values_length), *lengths, - force_empty_nulls, zero_undefined_offsets, alignment, memory_pool); + ARROW_ASSIGN_OR_RAISE(auto list_view_array, + ArrayType::FromArrays(field.type(), *offsets, *lengths, *values)); + ShuffleListViewDataInPlace(self.seed(), + const_cast(*list_view_array->data())); + return list_view_array; +} - return ArrayType::FromArrays(field.type(), *offsets, *lengths, *values); +template +Result> RandomListView(RAG& self, const Array& values, + int64_t length, double null_probability, + bool force_empty_nulls, double coverage, + int64_t alignment, + MemoryPool* memory_pool) { + using TypeClass = typename ArrayType::TypeClass; + using offset_type = typename TypeClass::offset_type; + using OffsetArrayType = typename TypeTraits::OffsetArrayType; + using OffsetArrowType = typename OffsetArrayType::TypeClass; + + DCHECK_LE(values.length(), std::numeric_limits::max()); + DCHECK_LE(length, std::numeric_limits::max()); + + auto offsets_array = GenerateOffsets>( + self.seed(), length + 1, 0, static_cast(values.length()), null_probability, + force_empty_nulls, alignment, memory_pool); + auto* offsets = offsets_array->data()->template GetValues(1); + + // The buffers for the sizes array + BufferVector buffers{2}; + buffers[0] = 
NULLPTR; + buffers[1] = *AllocateBuffer(sizeof(offset_type) * length, alignment, memory_pool); + auto sizes = buffers[1]->mutable_data_as(); + + // Derive sizes from offsets taking coverage into account + pcg32_fast rng(self.seed()); + using NormalDist = std::normal_distribution; + NormalDist size_dist; + for (int64_t i = 0; i < length; ++i) { + const double mean_size = coverage * (offsets[i + 1] - offsets[i]); + const double sampled_size = + std::max(0.0, size_dist(rng, NormalDist::param_type{mean_size})); + // This creates a higher probability of offset[i] + size[i] being closer or equal to + // values.length(), but that skew is acceptable for the purposes of testing. + const auto size = std::min(static_cast(std::llround(sampled_size)), + static_cast(values.length() - offsets[i])); + sizes[i] = offsets_array->IsNull(i) && force_empty_nulls ? 0 : size; + } + + auto sizes_array_data = ArrayData::Make(TypeTraits::type_singleton(), + length, std::move(buffers), /*null_count=*/0); + auto sizes_array = std::make_shared(std::move(sizes_array_data)); + + ARROW_ASSIGN_OR_RAISE( + auto list_view_array, + ArrayType::FromArrays(*offsets_array, *sizes_array, values, memory_pool)); + ShuffleListViewDataInPlace(self.seed(), + const_cast(*list_view_array->data())); + return list_view_array; } } // namespace @@ -754,29 +850,22 @@ std::shared_ptr RandomArrayGenerator::List(const Array& values, int64_t s return *::arrow::ListArray::FromArrays(*offsets, values); } -std::shared_ptr RandomArrayGenerator::ListView( - const Array& values, int64_t size, double null_probability, bool force_empty_nulls, - bool zero_undefined_offsets, int64_t alignment, MemoryPool* memory_pool) { - using offset_type = int32_t; - using OffsetArrayType = Int32Array; - using OffsetArrowType = Int32Type; - - DCHECK_LE(values.length(), std::numeric_limits::max()); - DCHECK_LE(size, std::numeric_limits::max()); - const auto values_length = static_cast(values.length()); - - const offset_type avg_length = 
(values_length - 1) / static_cast(size) + 1; - const offset_type min_length = 0; - const offset_type max_length = std::min(std::max(2 * avg_length, 1), values_length); - const auto lengths = internal::checked_pointer_cast( - Numeric(size, min_length, max_length, - null_probability)); - - const auto offsets = ViewOffsetsFromLengthsArray( - seed(), avg_length, values_length, *lengths, force_empty_nulls, - zero_undefined_offsets, alignment, memory_pool); +std::shared_ptr RandomArrayGenerator::ListView(const Array& values, int64_t length, + double null_probability, + bool force_empty_nulls, + double coverage, int64_t alignment, + MemoryPool* memory_pool) { + return *RandomListView(*this, values, length, null_probability, + force_empty_nulls, coverage, alignment, + memory_pool); +} - return *ListViewArray::FromArrays(*offsets, *lengths, values, memory_pool); +std::shared_ptr RandomArrayGenerator::LargeListView( + const Array& values, int64_t length, double null_probability, bool force_empty_nulls, + double coverage, int64_t alignment, MemoryPool* memory_pool) { + return *RandomListView(*this, values, length, null_probability, + force_empty_nulls, coverage, alignment, + memory_pool); } std::shared_ptr RandomArrayGenerator::Map(const std::shared_ptr& keys, diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index 3bb7b5d3603..1d97a3ada72 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -464,19 +464,37 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { /// \param[in] size The size of the generated list array /// \param[in] null_probability the probability of a list value being null /// \param[in] force_empty_nulls if true, null list entries must have 0 length - /// \param[in] zero_undefined_offsets if true, offsets of 0-length lists /// must be set to 0 + /// \param[in] coverage proportion of the values array covered by list-views /// \param[in] alignment alignment for memory allocations (in bytes) /// 
\param[in] memory_pool memory pool to allocate memory from /// /// \return a generated Array std::shared_ptr ListView(const Array& values, int64_t size, double null_probability = 0, - bool force_empty_nulls = false, - bool zero_undefined_offsets = false, + bool force_empty_nulls = false, double coverage = 1.0, int64_t alignment = kDefaultBufferAlignment, MemoryPool* memory_pool = default_memory_pool()); + /// \brief Generate a random LargeListViewArray + /// + /// \param[in] values The underlying values array + /// \param[in] size The size of the generated list array + /// \param[in] null_probability the probability of a list value being null + /// \param[in] force_empty_nulls if true, null list entries must have 0 length + /// must be set to 0 + /// \param[in] coverage proportion of the values array covered by list-views + /// \param[in] alignment alignment for memory allocations (in bytes) + /// \param[in] memory_pool memory pool to allocate memory from + /// + /// \return a generated Array + std::shared_ptr LargeListView(const Array& values, int64_t size, + double null_probability = 0, + bool force_empty_nulls = false, + double coverage = 1.0, + int64_t alignment = kDefaultBufferAlignment, + MemoryPool* memory_pool = default_memory_pool()); + /// \brief Generate a random MapArray /// /// \param[in] keys The underlying keys array From 7e51ae7028c1a85b4004ac6425af16a4fdeeb110 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 3 Oct 2023 16:08:15 -0300 Subject: [PATCH 42/91] concatenate.cc: Respect the new invariants imposed on the spec --- cpp/src/arrow/array/concatenate.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index 2510f2d7733..569ea77c06e 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -223,7 +223,8 @@ void PutListViewOffsets(const Buffer& src, offset_type displacement, offset_type // Avoid UB on 
non-validated input by doing the addition in the unsigned domain. // (the result can later be validated using Array::ValidateFull) std::transform(src_begin, src_end, dst, [displacement](offset_type offset) { - return SafeSignedAdd(offset, displacement); + constexpr offset_type kZero = 0; + return std::max(kZero, SafeSignedAdd(offset, displacement)); }); } From 32865ec0bf7e7f543268b448afd6a577688f5091 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 3 Oct 2023 15:52:07 -0300 Subject: [PATCH 43/91] validate.cc: Apply list-view invariants to null list-views as well --- cpp/src/arrow/array/validate.cc | 36 +++++++++++++++------------------ 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 4f26a236d22..9ba53258dae 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -864,29 +864,25 @@ struct ValidateArrayImpl { /// \pre basic validation has already been performed template Status FullyValidateOffsetsAndSizes(int64_t offset_limit) { - const auto* validity = data.GetValues(0, 0); const auto* offsets = data.GetValues(1); const auto* sizes = data.GetValues(2); - return VisitSetBitRuns( - validity, data.offset, data.length, [&](int64_t run_start, int64_t run_length) { - for (int64_t i = 0; i < run_length; ++i) { - auto slot = run_start + i; - const auto size = sizes[slot]; - if (size > 0) { - const auto offset = offsets[slot]; - if (offset < 0 || offset > offset_limit) { - return OutOfBoundsListViewOffset(slot, offset_limit); - } - if (size > offset_limit - offset) { - return OutOfBoundsListViewSize(slot, offset_limit); - } - } else if (size < 0) { - return OutOfBoundsListViewSize(slot, offset_limit); - } - } - return Status::OK(); - }); + for (int64_t i = 0; i < data.length; ++i) { + const auto size = sizes[i]; + if (size >= 0) { + const auto offset = offsets[i]; + if (offset < 0 || offset > offset_limit) { + return 
OutOfBoundsListViewOffset(i, offset_limit); + } + if (size > offset_limit - offset) { + return OutOfBoundsListViewSize(i, offset_limit); + } + } else { + return OutOfBoundsListViewSize(i, offset_limit); + } + } + + return Status::OK(); } template From ffc8f16ea6ce8bb842d6cb6b02b832c650744543 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 5 Oct 2023 00:11:13 -0300 Subject: [PATCH 44/91] util.cc: Implement Endianness swapping for list-views --- cpp/src/arrow/array/array_test.cc | 2 ++ cpp/src/arrow/array/util.cc | 9 +++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 7b93bd07a82..be54d62fd77 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -3530,6 +3530,8 @@ DataTypeVector SwappableTypes() { large_utf8(), list(int16()), large_list(int16()), + list_view(int16()), + large_list_view(int16()), dictionary(int16(), utf8())}; } diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index cfcdadfa9a4..a7002e04e51 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -288,11 +288,16 @@ class ArrayDataEndianSwapper { RETURN_NOT_OK(SwapOffsets(1)); return Status::OK(); } + Status Visit(const ListViewType& type) { - return Status::NotImplemented("swapping endianness of list-view array"); + RETURN_NOT_OK(SwapOffsets(1)); + RETURN_NOT_OK(SwapOffsets(2)); + return Status::OK(); } Status Visit(const LargeListViewType& type) { - return Status::NotImplemented("swapping endianness of large list-view array"); + RETURN_NOT_OK(SwapOffsets(1)); + RETURN_NOT_OK(SwapOffsets(2)); + return Status::OK(); } Status Visit(const DictionaryType& type) { From 8e001f6b3853904934b397b2a1f43e9a4733c90b Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 4 Oct 2023 18:26:57 -0300 Subject: [PATCH 45/91] IPC: Parameterize JSON simple tests arrow list type --- cpp/src/arrow/ipc/json_simple_test.cc | 
264 +++++++++++++------------- 1 file changed, 136 insertions(+), 128 deletions(-) diff --git a/cpp/src/arrow/ipc/json_simple_test.cc b/cpp/src/arrow/ipc/json_simple_test.cc index b67c2699994..0b87e4a6532 100644 --- a/cpp/src/arrow/ipc/json_simple_test.cc +++ b/cpp/src/arrow/ipc/json_simple_test.cc @@ -59,6 +59,8 @@ using ::arrow::internal::BytesToBits; using ::arrow::internal::checked_cast; using ::arrow::internal::checked_pointer_cast; +using ListTypes = ::testing::Types; + // Avoid undefined behaviour on signed overflow template Signed SafeSignedAdd(Signed u, Signed v) { @@ -591,145 +593,151 @@ TEST(TestDecimal, Dictionary) { } } -TEST(TestList, IntegerList) { - auto pool = default_memory_pool(); - std::shared_ptr type = list(int64()); - std::shared_ptr offsets, values, expected, actual; - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0}, &offsets); - ArrayFromVector({}, &values); - ASSERT_OK_AND_ASSIGN(expected, ListArray::FromArrays(*offsets, *values, pool)); - AssertArraysEqual(*expected, *actual); - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[4, 5], [], [6]]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0, 2, 2, 3}, &offsets); - ArrayFromVector({4, 5, 6}, &values); - ASSERT_OK_AND_ASSIGN(expected, ListArray::FromArrays(*offsets, *values, pool)); - AssertArraysEqual(*expected, *actual); - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[], [null], [6, null]]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0, 0, 1, 3}, &offsets); - auto is_valid = std::vector{false, true, false}; - ArrayFromVector(is_valid, {0, 6, 0}, &values); - ASSERT_OK_AND_ASSIGN(expected, ListArray::FromArrays(*offsets, *values, pool)); - AssertArraysEqual(*expected, *actual); - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[null, [], null]")); - ASSERT_OK(actual->ValidateFull()); - { - std::unique_ptr builder; - ASSERT_OK(MakeBuilder(pool, type, &builder)); - auto& 
list_builder = checked_cast(*builder); - ASSERT_OK(list_builder.AppendNull()); - ASSERT_OK(list_builder.Append()); - ASSERT_OK(list_builder.AppendNull()); - ASSERT_OK(list_builder.Finish(&expected)); +template +class TestVarLengthListArray : public ::testing::Test { + public: + using TypeClass = T; + using offset_type = typename TypeClass::offset_type; + using ArrayType = typename TypeTraits::ArrayType; + using BuilderType = typename TypeTraits::BuilderType; + using OffsetType = typename TypeTraits::OffsetType; + + void TestIntegerList() { + auto pool = default_memory_pool(); + std::shared_ptr type = std::make_shared(int64()); + std::shared_ptr offsets, values, expected, actual; + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0}, &offsets); + ArrayFromVector({}, &values); + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[4, 5], [], [6]]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0, 2, 2, 3}, &offsets); + ArrayFromVector({4, 5, 6}, &values); + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[], [null], [6, null]]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0, 0, 1, 3}, &offsets); + auto is_valid = std::vector{false, true, false}; + ArrayFromVector(is_valid, {0, 6, 0}, &values); + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[null, [], null]")); + ASSERT_OK(actual->ValidateFull()); + { + std::unique_ptr builder; + ASSERT_OK(MakeBuilder(pool, type, &builder)); + auto& list_builder = checked_cast(*builder); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Append()); + 
ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Finish(&expected)); + } + AssertArraysEqual(*expected, *actual); } - AssertArraysEqual(*expected, *actual); -} -TEST(TestList, IntegerListErrors) { - std::shared_ptr type = list(int64()); - std::shared_ptr array; - - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0.0]]")); - ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[9223372036854775808]]")); -} - -TEST(TestList, NullList) { - auto pool = default_memory_pool(); - std::shared_ptr type = list(null()); - std::shared_ptr offsets, values, expected, actual; - - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0}, &offsets); - values = std::make_shared(0); - ASSERT_OK_AND_ASSIGN(expected, ListArray::FromArrays(*offsets, *values, pool)); - AssertArraysEqual(*expected, *actual); + void TestIntegerListErrors() { + std::shared_ptr type = std::make_shared(int64()); + std::shared_ptr array; - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[], [null], [null, null]]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0, 0, 1, 3}, &offsets); - values = std::make_shared(3); - ASSERT_OK_AND_ASSIGN(expected, ListArray::FromArrays(*offsets, *values, pool)); - AssertArraysEqual(*expected, *actual); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]")); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[0.0]]")); + ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[9223372036854775808]]")); + } - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[null, [], null]")); - ASSERT_OK(actual->ValidateFull()); - { - std::unique_ptr builder; - ASSERT_OK(MakeBuilder(pool, type, &builder)); - auto& list_builder = checked_cast(*builder); - ASSERT_OK(list_builder.AppendNull()); - ASSERT_OK(list_builder.Append()); - ASSERT_OK(list_builder.AppendNull()); - ASSERT_OK(list_builder.Finish(&expected)); + void TestNullList() { + auto pool = default_memory_pool(); + 
std::shared_ptr type = std::make_shared(null()); + std::shared_ptr offsets, values, expected, actual; + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0}, &offsets); + values = std::make_shared(0); + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[], [null], [null, null]]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0, 0, 1, 3}, &offsets); + values = std::make_shared(3); + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[null, [], null]")); + ASSERT_OK(actual->ValidateFull()); + { + std::unique_ptr builder; + ASSERT_OK(MakeBuilder(pool, type, &builder)); + auto& list_builder = checked_cast(*builder); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Append()); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Finish(&expected)); + } + AssertArraysEqual(*expected, *actual); } - AssertArraysEqual(*expected, *actual); -} -TEST(TestList, IntegerListList) { - auto pool = default_memory_pool(); - std::shared_ptr type = list(list(uint8())); - std::shared_ptr offsets, values, nested, expected, actual; + void TestIntegerListList() { + auto pool = default_memory_pool(); + std::shared_ptr type = + std::make_shared(std::make_shared(uint8())); + std::shared_ptr offsets, values, nested, expected, actual; + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[[4], [5, 6]], [[7, 8, 9]]]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0, 1, 3, 6}, &offsets); + ArrayFromVector({4, 5, 6, 7, 8, 9}, &values); + ASSERT_OK_AND_ASSIGN(nested, ArrayType::FromArrays(*offsets, *values, pool)); + ArrayFromVector({0, 2, 3}, &offsets); + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *nested, pool)); + 
ASSERT_EQ(actual->length(), 2); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN( + actual, ArrayFromJSON(type, "[[], [[]], [[4], [], [5, 6]], [[7, 8, 9]]]")); + ASSERT_OK(actual->ValidateFull()); + ArrayFromVector({0, 0, 1, 1, 3, 6}, &offsets); + ArrayFromVector({4, 5, 6, 7, 8, 9}, &values); + ASSERT_OK_AND_ASSIGN(nested, ArrayType::FromArrays(*offsets, *values, pool)); + ArrayFromVector({0, 0, 1, 4, 5}, &offsets); + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *nested, pool)); + ASSERT_EQ(actual->length(), 4); + AssertArraysEqual(*expected, *actual); + + ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[null, [null], [[null]]]")); + ASSERT_OK(actual->ValidateFull()); + { + std::unique_ptr builder; + ASSERT_OK(MakeBuilder(pool, type, &builder)); + auto& list_builder = checked_cast(*builder); + auto& child_builder = checked_cast(*list_builder.value_builder()); + ASSERT_OK(list_builder.AppendNull()); + ASSERT_OK(list_builder.Append()); + ASSERT_OK(child_builder.AppendNull()); + ASSERT_OK(list_builder.Append()); + ASSERT_OK(child_builder.Append()); + ASSERT_OK(list_builder.Finish(&expected)); + } + } +}; - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[[4], [5, 6]], [[7, 8, 9]]]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0, 1, 3, 6}, &offsets); - ArrayFromVector({4, 5, 6, 7, 8, 9}, &values); - ASSERT_OK_AND_ASSIGN(nested, ListArray::FromArrays(*offsets, *values, pool)); - ArrayFromVector({0, 2, 3}, &offsets); - ASSERT_OK_AND_ASSIGN(expected, ListArray::FromArrays(*offsets, *nested, pool)); - ASSERT_EQ(actual->length(), 2); - AssertArraysEqual(*expected, *actual); +TYPED_TEST_SUITE(TestVarLengthListArray, ListTypes); - ASSERT_OK_AND_ASSIGN(actual, - ArrayFromJSON(type, "[[], [[]], [[4], [], [5, 6]], [[7, 8, 9]]]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0, 0, 1, 1, 3, 6}, &offsets); - ArrayFromVector({4, 5, 6, 7, 8, 9}, &values); - ASSERT_OK_AND_ASSIGN(nested, ListArray::FromArrays(*offsets, 
*values, pool)); - ArrayFromVector({0, 0, 1, 4, 5}, &offsets); - ASSERT_OK_AND_ASSIGN(expected, ListArray::FromArrays(*offsets, *nested, pool)); - ASSERT_EQ(actual->length(), 4); - AssertArraysEqual(*expected, *actual); +TYPED_TEST(TestVarLengthListArray, IntegerList) { this->TestIntegerList(); } - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[null, [null], [[null]]]")); - ASSERT_OK(actual->ValidateFull()); - { - std::unique_ptr builder; - ASSERT_OK(MakeBuilder(pool, type, &builder)); - auto& list_builder = checked_cast(*builder); - auto& child_builder = checked_cast(*list_builder.value_builder()); - ASSERT_OK(list_builder.AppendNull()); - ASSERT_OK(list_builder.Append()); - ASSERT_OK(child_builder.AppendNull()); - ASSERT_OK(list_builder.Append()); - ASSERT_OK(child_builder.Append()); - ASSERT_OK(list_builder.Finish(&expected)); - } -} +TYPED_TEST(TestVarLengthListArray, IntegerListErrors) { this->TestIntegerListErrors(); } -TEST(TestLargeList, Basics) { - // Similar as TestList above, only testing the basics - auto pool = default_memory_pool(); - std::shared_ptr type = large_list(int16()); - std::shared_ptr offsets, values, expected, actual; +TYPED_TEST(TestVarLengthListArray, NullList) { this->TestNullList(); } - ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[], [null], [6, null]]")); - ASSERT_OK(actual->ValidateFull()); - ArrayFromVector({0, 0, 1, 3}, &offsets); - auto is_valid = std::vector{false, true, false}; - ArrayFromVector(is_valid, {0, 6, 0}, &values); - ASSERT_OK_AND_ASSIGN(expected, LargeListArray::FromArrays(*offsets, *values, pool)); - AssertArraysEqual(*expected, *actual); -} +TYPED_TEST(TestVarLengthListArray, IntegerListList) { this->TestIntegerListList(); } TEST(TestMap, IntegerToInteger) { auto type = map(int16(), int16()); From 7ff434ce68a3c60f2fb151d25dacda3b28e5fcad Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 4 Oct 2023 21:05:10 -0300 Subject: [PATCH 46/91] IPC: Include ListView types in the simple JSON list 
tests --- cpp/src/arrow/ipc/json_simple_test.cc | 95 +++++++++++++++++++++------ 1 file changed, 76 insertions(+), 19 deletions(-) diff --git a/cpp/src/arrow/ipc/json_simple_test.cc b/cpp/src/arrow/ipc/json_simple_test.cc index 0b87e4a6532..ea3a9ae1a14 100644 --- a/cpp/src/arrow/ipc/json_simple_test.cc +++ b/cpp/src/arrow/ipc/json_simple_test.cc @@ -59,7 +59,8 @@ using ::arrow::internal::BytesToBits; using ::arrow::internal::checked_cast; using ::arrow::internal::checked_pointer_cast; -using ListTypes = ::testing::Types; +using ListAndListViewTypes = + ::testing::Types; // Avoid undefined behaviour on signed overflow template @@ -602,23 +603,37 @@ class TestVarLengthListArray : public ::testing::Test { using BuilderType = typename TypeTraits::BuilderType; using OffsetType = typename TypeTraits::OffsetType; + static constexpr bool is_list_view_type = is_list_view(TypeClass::type_id); + void TestIntegerList() { auto pool = default_memory_pool(); std::shared_ptr type = std::make_shared(int64()); - std::shared_ptr offsets, values, expected, actual; + std::shared_ptr offsets, sizes, values, expected, actual; ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[]")); ASSERT_OK(actual->ValidateFull()); ArrayFromVector({0}, &offsets); ArrayFromVector({}, &values); - ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + if constexpr (is_list_view_type) { + ArrayFromVector({}, &sizes); + ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + } AssertArraysEqual(*expected, *actual); ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[4, 5], [], [6]]")); ASSERT_OK(actual->ValidateFull()); ArrayFromVector({0, 2, 2, 3}, &offsets); ArrayFromVector({4, 5, 6}, &values); - ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + if constexpr (is_list_view_type) { + ArrayFromVector({2, 0, 1}, &sizes); + 
ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + } AssertArraysEqual(*expected, *actual); ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[], [null], [6, null]]")); @@ -626,7 +641,13 @@ class TestVarLengthListArray : public ::testing::Test { ArrayFromVector({0, 0, 1, 3}, &offsets); auto is_valid = std::vector{false, true, false}; ArrayFromVector(is_valid, {0, 6, 0}, &values); - ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + if constexpr (is_list_view_type) { + ArrayFromVector({0, 1, 2}, &sizes); + ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + } AssertArraysEqual(*expected, *actual); ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[null, [], null]")); @@ -636,7 +657,7 @@ class TestVarLengthListArray : public ::testing::Test { ASSERT_OK(MakeBuilder(pool, type, &builder)); auto& list_builder = checked_cast(*builder); ASSERT_OK(list_builder.AppendNull()); - ASSERT_OK(list_builder.Append()); + ASSERT_OK(list_builder.Append(true, 0)); ASSERT_OK(list_builder.AppendNull()); ASSERT_OK(list_builder.Finish(&expected)); } @@ -655,20 +676,32 @@ class TestVarLengthListArray : public ::testing::Test { void TestNullList() { auto pool = default_memory_pool(); std::shared_ptr type = std::make_shared(null()); - std::shared_ptr offsets, values, expected, actual; + std::shared_ptr offsets, sizes, values, expected, actual; ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[]")); ASSERT_OK(actual->ValidateFull()); ArrayFromVector({0}, &offsets); values = std::make_shared(0); - ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + if constexpr (is_list_view_type) { + ArrayFromVector({}, &sizes); + ASSERT_OK_AND_ASSIGN(expected, + 
ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + } AssertArraysEqual(*expected, *actual); ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[], [null], [null, null]]")); ASSERT_OK(actual->ValidateFull()); ArrayFromVector({0, 0, 1, 3}, &offsets); values = std::make_shared(3); - ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + if constexpr (is_list_view_type) { + ArrayFromVector({0, 1, 2}, &sizes); + ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *values, pool)); + } AssertArraysEqual(*expected, *actual); ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[null, [], null]")); @@ -678,7 +711,7 @@ class TestVarLengthListArray : public ::testing::Test { ASSERT_OK(MakeBuilder(pool, type, &builder)); auto& list_builder = checked_cast(*builder); ASSERT_OK(list_builder.AppendNull()); - ASSERT_OK(list_builder.Append()); + ASSERT_OK(list_builder.Append(true, 0)); ASSERT_OK(list_builder.AppendNull()); ASSERT_OK(list_builder.Finish(&expected)); } @@ -689,15 +722,27 @@ class TestVarLengthListArray : public ::testing::Test { auto pool = default_memory_pool(); std::shared_ptr type = std::make_shared(std::make_shared(uint8())); - std::shared_ptr offsets, values, nested, expected, actual; + std::shared_ptr offsets, sizes, values, nested, expected, actual; ASSERT_OK_AND_ASSIGN(actual, ArrayFromJSON(type, "[[[4], [5, 6]], [[7, 8, 9]]]")); ASSERT_OK(actual->ValidateFull()); ArrayFromVector({0, 1, 3, 6}, &offsets); ArrayFromVector({4, 5, 6, 7, 8, 9}, &values); - ASSERT_OK_AND_ASSIGN(nested, ArrayType::FromArrays(*offsets, *values, pool)); + if constexpr (is_list_view_type) { + ArrayFromVector({1, 2, 3}, &sizes); + ASSERT_OK_AND_ASSIGN(nested, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + 
ASSERT_OK_AND_ASSIGN(nested, ArrayType::FromArrays(*offsets, *values, pool)); + } ArrayFromVector({0, 2, 3}, &offsets); - ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *nested, pool)); + if constexpr (is_list_view_type) { + ArrayFromVector({2, 1}, &sizes); + ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *nested, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *nested, pool)); + } ASSERT_EQ(actual->length(), 2); AssertArraysEqual(*expected, *actual); @@ -706,9 +751,21 @@ class TestVarLengthListArray : public ::testing::Test { ASSERT_OK(actual->ValidateFull()); ArrayFromVector({0, 0, 1, 1, 3, 6}, &offsets); ArrayFromVector({4, 5, 6, 7, 8, 9}, &values); - ASSERT_OK_AND_ASSIGN(nested, ArrayType::FromArrays(*offsets, *values, pool)); + if constexpr (is_list_view_type) { + ArrayFromVector({0, 1, 0, 2, 3}, &sizes); + ASSERT_OK_AND_ASSIGN(nested, + ArrayType::FromArrays(*offsets, *sizes, *values, pool)); + } else { + ASSERT_OK_AND_ASSIGN(nested, ArrayType::FromArrays(*offsets, *values, pool)); + } ArrayFromVector({0, 0, 1, 4, 5}, &offsets); - ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *nested, pool)); + if constexpr (is_list_view_type) { + ArrayFromVector({0, 1, 3, 1}, &sizes); + ASSERT_OK_AND_ASSIGN(expected, + ArrayType::FromArrays(*offsets, *sizes, *nested, pool)); + } else { + ASSERT_OK_AND_ASSIGN(expected, ArrayType::FromArrays(*offsets, *nested, pool)); + } ASSERT_EQ(actual->length(), 4); AssertArraysEqual(*expected, *actual); @@ -720,16 +777,16 @@ class TestVarLengthListArray : public ::testing::Test { auto& list_builder = checked_cast(*builder); auto& child_builder = checked_cast(*list_builder.value_builder()); ASSERT_OK(list_builder.AppendNull()); - ASSERT_OK(list_builder.Append()); + ASSERT_OK(list_builder.Append(true, 0)); ASSERT_OK(child_builder.AppendNull()); - ASSERT_OK(list_builder.Append()); - ASSERT_OK(child_builder.Append()); + 
ASSERT_OK(list_builder.Append(true, 0)); + ASSERT_OK(child_builder.Append(true, 0)); ASSERT_OK(list_builder.Finish(&expected)); } } }; -TYPED_TEST_SUITE(TestVarLengthListArray, ListTypes); +TYPED_TEST_SUITE(TestVarLengthListArray, ListAndListViewTypes); TYPED_TEST(TestVarLengthListArray, IntegerList) { this->TestIntegerList(); } From fb92b1e7b2bc7e09a909fc0954598d9414f62804 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 5 Oct 2023 00:23:32 -0300 Subject: [PATCH 47/91] IPC and JSON Integration tests for list-view types --- .../integration/json_integration_test.cc | 6 +- cpp/src/arrow/integration/json_internal.cc | 53 ++++++++--- cpp/src/arrow/ipc/feather_test.cc | 2 + cpp/src/arrow/ipc/generate_fuzz_corpus.cc | 2 + cpp/src/arrow/ipc/metadata_internal.cc | 22 ++++- cpp/src/arrow/ipc/read_write_test.cc | 8 ++ cpp/src/arrow/ipc/reader.cc | 25 +++-- cpp/src/arrow/ipc/test_common.cc | 70 ++++++++++++++ cpp/src/arrow/ipc/test_common.h | 6 ++ cpp/src/arrow/ipc/writer.cc | 95 +++++++++++++++++-- 10 files changed, 260 insertions(+), 29 deletions(-) diff --git a/cpp/src/arrow/integration/json_integration_test.cc b/cpp/src/arrow/integration/json_integration_test.cc index e023e6a3a44..9b56928c688 100644 --- a/cpp/src/arrow/integration/json_integration_test.cc +++ b/cpp/src/arrow/integration/json_integration_test.cc @@ -793,8 +793,6 @@ void CheckPrimitive(const std::shared_ptr& type, } TEST(TestJsonSchemaWriter, FlatTypes) { - // TODO - // field("f14", date32()) std::vector> fields = { field("f0", int8()), field("f1", int16(), false), @@ -822,6 +820,8 @@ TEST(TestJsonSchemaWriter, FlatTypes) { field("f21", run_end_encoded(int16(), utf8())), field("f22", run_end_encoded(int32(), utf8())), field("f23", run_end_encoded(int64(), utf8())), + field("f24", list_view(int32())), + field("f25", large_list_view(uint8())), }; auto schema = ::arrow::schema(fields); @@ -1147,10 +1147,12 @@ TEST_P(TestJsonRoundTrip, RoundTrip) { const std::vector kBatchCases = { 
&MakeIntRecordBatch, &MakeListRecordBatch, + &MakeListViewRecordBatch, &MakeFixedSizeListRecordBatch, &MakeNonNullRecordBatch, &MakeZeroLengthRecordBatch, &MakeDeeplyNestedList, + &MakeDeeplyNestedListView, &MakeStringTypesRecordBatchWithNulls, &MakeStruct, &MakeUnion, diff --git a/cpp/src/arrow/integration/json_internal.cc b/cpp/src/arrow/integration/json_internal.cc index 0d05abbd7f9..64eb342d5bd 100644 --- a/cpp/src/arrow/integration/json_internal.cc +++ b/cpp/src/arrow/integration/json_internal.cc @@ -236,7 +236,7 @@ class SchemaWriter { enable_if_t::value || is_primitive_ctype::value || is_base_binary_type::value || is_binary_view_like_type::value || is_var_length_list_type::value || is_struct_type::value || - is_run_end_encoded_type::value> + is_run_end_encoded_type::value || is_list_view_type::value> WriteTypeMetadata(const T& type) {} void WriteTypeMetadata(const MapType& type) { @@ -422,10 +422,14 @@ class SchemaWriter { return Status::OK(); } - Status Visit(const ListViewType& type) { return Status::NotImplemented(type.name()); } + Status Visit(const ListViewType& type) { + WriteName("listview", type); + return Status::OK(); + } Status Visit(const LargeListViewType& type) { - return Status::NotImplemented(type.name()); + WriteName("largelistview", type); + return Status::OK(); } Status Visit(const MapType& type) { @@ -783,12 +787,13 @@ class ArrayWriter { return WriteChildren(array.type()->fields(), {array.values()}); } - Status Visit(const ListViewArray& array) { - return Status::NotImplemented("list-view array in JSON"); - } - - Status Visit(const LargeListViewArray& array) { - return Status::NotImplemented("large list-view array in JSON"); + template + enable_if_list_view Visit( + const ArrayType& array) { + WriteValidityField(array); + WriteIntegerField("OFFSET", array.raw_value_offsets(), array.length()); + WriteIntegerField("SIZE", array.raw_value_sizes(), array.length()); + return WriteChildren(array.type()->fields(), {array.values()}); } Status 
Visit(const FixedSizeListArray& array) { @@ -1146,6 +1151,16 @@ Result> GetType(const RjObject& json_type, return Status::Invalid("Large list must have exactly one child"); } return large_list(children[0]); + } else if (type_name == "listview") { + if (children.size() != 1) { + return Status::Invalid("List-view must have exactly one child"); + } + return list_view(children[0]); + } else if (type_name == "largelistview") { + if (children.size() != 1) { + return Status::Invalid("Large list-view must have exactly one child"); + } + return large_list_view(children[0]); } else if (type_name == "map") { return GetMap(json_type, children); } else if (type_name == "fixedsizelist") { @@ -1665,12 +1680,24 @@ class ArrayReader { return CreateList(type_); } - Status Visit(const ListViewType& type) { - return Status::NotImplemented("list-view in JSON"); + template + Status CreateListView(const std::shared_ptr& type) { + using offset_type = typename T::offset_type; + + RETURN_NOT_OK(InitializeData(3)); + + RETURN_NOT_OK(GetNullBitmap()); + ARROW_ASSIGN_OR_RAISE(const auto json_offsets, GetMemberArray(obj_, "OFFSET")); + RETURN_NOT_OK(GetIntArray(json_offsets, length_, &data_->buffers[1])); + ARROW_ASSIGN_OR_RAISE(const auto json_sizes, GetMemberArray(obj_, "SIZE")); + RETURN_NOT_OK(GetIntArray(json_sizes, length_, &data_->buffers[2])); + RETURN_NOT_OK(GetChildren(obj_, *type)); + return Status::OK(); } - Status Visit(const LargeListViewType& type) { - return Status::NotImplemented("large list-view in JSON"); + template + enable_if_list_view Visit(const T& type) { + return CreateListView(type_); } Status Visit(const MapType& type) { diff --git a/cpp/src/arrow/ipc/feather_test.cc b/cpp/src/arrow/ipc/feather_test.cc index 0b6ae4f6206..80e441fe2b6 100644 --- a/cpp/src/arrow/ipc/feather_test.cc +++ b/cpp/src/arrow/ipc/feather_test.cc @@ -329,9 +329,11 @@ namespace { const std::vector kBatchCases = { &ipc::test::MakeIntRecordBatch, &ipc::test::MakeListRecordBatch, + 
&ipc::test::MakeListViewRecordBatch, &ipc::test::MakeFixedSizeListRecordBatch, &ipc::test::MakeNonNullRecordBatch, &ipc::test::MakeDeeplyNestedList, + &ipc::test::MakeDeeplyNestedListView, &ipc::test::MakeStringTypesRecordBatchWithNulls, &ipc::test::MakeStruct, &ipc::test::MakeUnion, diff --git a/cpp/src/arrow/ipc/generate_fuzz_corpus.cc b/cpp/src/arrow/ipc/generate_fuzz_corpus.cc index 50be10991ff..682c352132a 100644 --- a/cpp/src/arrow/ipc/generate_fuzz_corpus.cc +++ b/cpp/src/arrow/ipc/generate_fuzz_corpus.cc @@ -74,6 +74,8 @@ Result>> Batches() { batches.push_back(batch); RETURN_NOT_OK(test::MakeListRecordBatch(&batch)); batches.push_back(batch); + RETURN_NOT_OK(test::MakeListViewRecordBatch(&batch)); + batches.push_back(batch); RETURN_NOT_OK(test::MakeDictionary(&batch)); batches.push_back(batch); RETURN_NOT_OK(test::MakeTimestamps(&batch)); diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index c89c9e3f0d1..4f41edf8e15 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -361,6 +361,18 @@ Status ConcreteTypeFromFlatbuffer(flatbuf::Type type, const void* type_data, } *out = std::make_shared(children[0]); return Status::OK(); + case flatbuf::Type::ListView: + if (children.size() != 1) { + return Status::Invalid("ListView must have exactly 1 child field"); + } + *out = std::make_shared(children[0]); + return Status::OK(); + case flatbuf::Type::LargeListView: + if (children.size() != 1) { + return Status::Invalid("LargeListView must have exactly 1 child field"); + } + *out = std::make_shared(children[0]); + return Status::OK(); case flatbuf::Type::Map: if (children.size() != 1) { return Status::Invalid("Map must have exactly 1 child field"); @@ -670,11 +682,17 @@ class FieldToFlatbufferVisitor { } Status Visit(const ListViewType& type) { - return Status::NotImplemented("list-view type in IPC"); + fb_type_ = flatbuf::Type::ListView; + RETURN_NOT_OK(VisitChildFields(type)); + 
type_offset_ = flatbuf::CreateListView(fbb_).Union(); + return Status::OK(); } Status Visit(const LargeListViewType& type) { - return Status::NotImplemented("large list-view type in IPC"); + fb_type_ = flatbuf::Type::LargeListView; + RETURN_NOT_OK(VisitChildFields(type)); + type_offset_ = flatbuf::CreateListView(fbb_).Union(); + return Status::OK(); } Status Visit(const MapType& type) { diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index 05a48aec2c7..98127c78388 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -376,10 +376,12 @@ TEST_F(TestSchemaMetadata, MetadataVersionForwardCompatibility) { const std::vector kBatchCases = { &MakeIntRecordBatch, &MakeListRecordBatch, + &MakeListViewRecordBatch, &MakeFixedSizeListRecordBatch, &MakeNonNullRecordBatch, &MakeZeroLengthRecordBatch, &MakeDeeplyNestedList, + &MakeDeeplyNestedListView, &MakeStringTypesRecordBatchWithNulls, &MakeStruct, &MakeUnion, @@ -974,6 +976,9 @@ TEST_F(TestWriteRecordBatch, IntegerGetRecordBatchSize) { ASSERT_OK(MakeListRecordBatch(&batch)); TestGetRecordBatchSize(options_, batch); + ASSERT_OK(MakeListViewRecordBatch(&batch)); + TestGetRecordBatchSize(options_, batch); + ASSERT_OK(MakeZeroLengthRecordBatch(&batch)); TestGetRecordBatchSize(options_, batch); @@ -982,6 +987,9 @@ TEST_F(TestWriteRecordBatch, IntegerGetRecordBatchSize) { ASSERT_OK(MakeDeeplyNestedList(&batch)); TestGetRecordBatchSize(options_, batch); + + ASSERT_OK(MakeDeeplyNestedListView(&batch)); + TestGetRecordBatchSize(options_, batch); } class RecursionLimits : public ::testing::Test, public io::MemoryMapFixture { diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index d195687c955..d8d2d4b41a2 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -330,6 +330,22 @@ class ArrayLoader { return LoadChildren(type.fields()); } + template + Status LoadListView(const TYPE& type) { + out_->buffers.resize(3); 
+ + RETURN_NOT_OK(LoadCommon(type.id())); + RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[1])); + RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[2])); + + const int num_children = type.num_fields(); + if (num_children != 1) { + return Status::Invalid("Wrong number of children: ", num_children); + } + + return LoadChildren(type.fields()); + } + Status LoadChildren(const std::vector>& child_fields) { DCHECK_NE(out_, nullptr); ArrayData* parent = out_; @@ -392,12 +408,9 @@ class ArrayLoader { return LoadList(type); } - Status Visit(const ListViewType& type) { - return Status::NotImplemented("list-view array in IPC"); - } - - Status Visit(const LargeListViewType& type) { - return Status::NotImplemented("large list-view array in IPC"); + template + enable_if_list_view Visit(const T& type) { + return LoadListView(type); } Status Visit(const MapType& type) { diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc index 6faaf96b332..1d1f74379fc 100644 --- a/cpp/src/arrow/ipc/test_common.cc +++ b/cpp/src/arrow/ipc/test_common.cc @@ -189,6 +189,30 @@ Status MakeRandomListArray(const std::shared_ptr& child_array, int num_li return MakeListArray(child_array, num_lists, include_nulls, pool, out); } +Status MakeRandomListViewArray(const std::shared_ptr& child_array, int num_lists, + bool include_nulls, MemoryPool* pool, + std::shared_ptr* out) { + const auto seed = static_cast(child_array->length()); + random::RandomArrayGenerator rand(seed); + + const double null_probability = include_nulls ? 
0.5 : 0.0; + *out = rand.ListView(*child_array, num_lists, null_probability, false, 0.9, + kDefaultBufferAlignment, pool); + return Status::OK(); +} + +Status MakeRandomLargeListViewArray(const std::shared_ptr& child_array, + int num_lists, bool include_nulls, MemoryPool* pool, + std::shared_ptr* out) { + const auto seed = static_cast(child_array->length()); + random::RandomArrayGenerator rand(seed); + + const double null_probability = include_nulls ? 0.5 : 0.0; + *out = rand.LargeListView(*child_array, num_lists, null_probability, false, 0.9, + kDefaultBufferAlignment, pool); + return Status::OK(); +} + Status MakeRandomLargeListArray(const std::shared_ptr& child_array, int num_lists, bool include_nulls, MemoryPool* pool, std::shared_ptr* out) { @@ -418,6 +442,31 @@ Status MakeListRecordBatch(std::shared_ptr* out) { return Status::OK(); } +Status MakeListViewRecordBatch(std::shared_ptr* out) { + // Make the schema + auto f0 = field("f0", list_view(int32())); + auto f1 = field("f1", list_view(list_view(int32()))); + auto f2 = field("f2", large_list_view(int32())); + auto schema = ::arrow::schema({f0, f1, f2}); + + // Example data + + MemoryPool* pool = default_memory_pool(); + const int length = 200; + std::shared_ptr leaf_values, list_array, list_list_array, large_list_array; + const bool include_nulls = true; + RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool, &leaf_values)); + RETURN_NOT_OK( + MakeRandomListViewArray(leaf_values, length, include_nulls, pool, &list_array)); + RETURN_NOT_OK( + MakeRandomListViewArray(list_array, length, include_nulls, pool, &list_list_array)); + RETURN_NOT_OK(MakeRandomLargeListViewArray(leaf_values, length, include_nulls, pool, + &large_list_array)); + *out = + RecordBatch::Make(schema, length, {list_array, list_list_array, large_list_array}); + return Status::OK(); +} + Status MakeFixedSizeListRecordBatch(std::shared_ptr* out) { // Make the schema auto f0 = field("f0", fixed_size_list(int32(), 1)); @@ -505,6 +554,27 
@@ Status MakeDeeplyNestedList(std::shared_ptr* out) { return Status::OK(); } +Status MakeDeeplyNestedListView(std::shared_ptr* out) { + const int batch_length = 5; + auto type = int32(); + + MemoryPool* pool = default_memory_pool(); + std::shared_ptr array; + const bool include_nulls = true; + RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool, &array)); + for (int i = 0; i < 63; ++i) { + type = std::static_pointer_cast(list_view(type)); + RETURN_NOT_OK( + MakeRandomListViewArray(array, batch_length, include_nulls, pool, &array)); + } + + auto f0 = field("f0", type); + auto schema = ::arrow::schema({f0}); + std::vector> arrays = {array}; + *out = RecordBatch::Make(schema, batch_length, arrays); + return Status::OK(); +} + Status MakeStruct(std::shared_ptr* out) { // reuse constructed list columns std::shared_ptr list_batch; diff --git a/cpp/src/arrow/ipc/test_common.h b/cpp/src/arrow/ipc/test_common.h index fc0c8ddbea3..db8613cbb1e 100644 --- a/cpp/src/arrow/ipc/test_common.h +++ b/cpp/src/arrow/ipc/test_common.h @@ -107,6 +107,9 @@ Status MakeNullRecordBatch(std::shared_ptr* out); ARROW_TESTING_EXPORT Status MakeListRecordBatch(std::shared_ptr* out); +ARROW_TESTING_EXPORT +Status MakeListViewRecordBatch(std::shared_ptr* out); + ARROW_TESTING_EXPORT Status MakeFixedSizeListRecordBatch(std::shared_ptr* out); @@ -119,6 +122,9 @@ Status MakeNonNullRecordBatch(std::shared_ptr* out); ARROW_TESTING_EXPORT Status MakeDeeplyNestedList(std::shared_ptr* out); +ARROW_TESTING_EXPORT +Status MakeDeeplyNestedListView(std::shared_ptr* out); + ARROW_TESTING_EXPORT Status MakeStruct(std::shared_ptr* out); diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index 649d9bc4068..93256440f4a 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -350,6 +350,67 @@ class RecordBatchSerializer { return Status::OK(); } + template + Status GetZeroBasedListViewOffsets(const ArrayType& array, + std::shared_ptr* out_value_offsets, + 
offset_type* out_min_offset, + offset_type* out_max_end) { + auto offsets = array.value_offsets(); + auto sizes = array.value_sizes(); + + const int64_t required_bytes = sizeof(offset_type) * array.length(); + if (array.offset() != 0) { + // If we have a non-zero offset, it's likely that the smallest offset is + // not zero. We must a) create a new offsets array with shifted offsets and + // b) slice the values array accordingly. + + ARROW_ASSIGN_OR_RAISE(auto shifted_offsets, + AllocateBuffer(required_bytes, options_.memory_pool)); + offset_type min_offset = 0; + offset_type max_end = 0; + if (array.length() > 0) { + min_offset = std::numeric_limits::max(); + for (int i = 0; i < array.length(); ++i) { + min_offset = std::min(min_offset, array.value_offset(i)); + max_end = std::max(max_end, array.value_offset(i) + array.value_length(i)); + } + } + + auto* dest_offsets = shifted_offsets->mutable_data_as(); + + for (int i = 0; i < array.length(); ++i) { + dest_offsets[i] = array.value_offset(i) - min_offset; + } + *out_min_offset = min_offset; + *out_max_end = max_end; + offsets = std::move(shifted_offsets); + } else { + // ARROW-6046: Slice offsets to used extent, in case we have a truncated + // slice + if (offsets != nullptr && offsets->size() > required_bytes) { + offsets = SliceBuffer(offsets, 0, required_bytes); + } + *out_min_offset = 0; + *out_max_end = static_cast(array.values()->length()); + } + *out_value_offsets = std::move(offsets); + return Status::OK(); + } + + template + Status GetListViewSizes(const ArrayType& array, + std::shared_ptr* out_value_sizes) { + const int64_t required_bytes = sizeof(offset_type) * array.length(); + auto sizes = array.value_sizes(); + if (sizes != nullptr && (array.offset() != 0 || sizes->size() > required_bytes)) { + // ARROW-6046: Slice offsets to used extent, in case we have a truncated slice + auto offset_bytes = array.offset() * sizeof(offset_type); + sizes = SliceBuffer(sizes, offset_bytes, required_bytes); + } + 
*out_value_sizes = std::move(sizes); + return Status::OK(); + } + Status Visit(const BooleanArray& array) { std::shared_ptr data; RETURN_NOT_OK(GetTruncatedBitmap(array.offset(), array.length(), array.values(), @@ -428,7 +489,6 @@ class RecordBatchSerializer { RETURN_NOT_OK(GetZeroBasedValueOffsets(array, &value_offsets)); out_->body_buffers.emplace_back(value_offsets); - --max_recursion_depth_; std::shared_ptr values = array.values(); offset_type values_offset = 0; @@ -442,17 +502,40 @@ class RecordBatchSerializer { // Must also slice the values values = values->Slice(values_offset, values_length); } + --max_recursion_depth_; RETURN_NOT_OK(VisitArray(*values)); ++max_recursion_depth_; return Status::OK(); } - Status Visit(const ListViewArray& array) { - return Status::NotImplemented("list-view array in IPC"); - } + template + enable_if_list_view Visit(const T& array) { + using offset_type = typename T::offset_type; + + offset_type min_offset = 0; + offset_type max_end = 0; + { + std::shared_ptr value_offsets; + RETURN_NOT_OK( + GetZeroBasedListViewOffsets(array, &value_offsets, &min_offset, &max_end)); + out_->body_buffers.push_back(std::move(value_offsets)); + } + { + std::shared_ptr value_sizes; + RETURN_NOT_OK(GetListViewSizes(array, &value_sizes)); + out_->body_buffers.push_back(std::move(value_sizes)); + } - Status Visit(const LargeListViewArray& array) { - return Status::NotImplemented("large list-view array in IPC"); + std::shared_ptr values = array.values(); + + if (min_offset != 0 || max_end < values->length()) { + // Must also slice the values + values = values->Slice(min_offset, max_end); + } + --max_recursion_depth_; + RETURN_NOT_OK(VisitArray(*values)); + ++max_recursion_depth_; + return Status::OK(); } Status Visit(const FixedSizeListArray& array) { From e40b1fac592cadb991040cabffb5e536c883640e Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 5 Oct 2023 01:09:11 -0300 Subject: [PATCH 48/91] Add ListView support to the C data 
interface --- cpp/src/arrow/c/bridge.cc | 52 ++++++++++++- cpp/src/arrow/c/bridge_test.cc | 101 ++++++++++++++++++++++++++ docs/source/format/CDataInterface.rst | 6 ++ 3 files changed, 156 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index 033371d3d67..085c5df076d 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -444,6 +444,10 @@ struct SchemaExporter { Status Visit(const LargeListType& type) { return SetFormat("+L"); } + Status Visit(const ListViewType& type) { return SetFormat("+vl"); } + + Status Visit(const LargeListViewType& type) { return SetFormat("+vL"); } + Status Visit(const FixedSizeListType& type) { return SetFormat("+w:" + ToChars(type.list_size())); } @@ -1100,6 +1104,16 @@ struct SchemaImporter { return ProcessListLike(); case 'L': return ProcessListLike(); + case 'v': { + RETURN_NOT_OK(f_parser_.CheckHasNext()); + switch (f_parser_.Next()) { + case 'l': + return ProcessListView(); + case 'L': + return ProcessListView(); + } + break; + } case 'w': return ProcessFixedSizeList(); case 's': @@ -1204,6 +1218,15 @@ struct SchemaImporter { return Status::OK(); } + template + Status ProcessListView() { + RETURN_NOT_OK(f_parser_.CheckAtEnd()); + RETURN_NOT_OK(CheckNumChildren(1)); + ARROW_ASSIGN_OR_RAISE(auto field, MakeChildField(0)); + type_ = std::make_shared(std::move(field)); + return Status::OK(); + } + Status ProcessMap() { RETURN_NOT_OK(f_parser_.CheckAtEnd()); RETURN_NOT_OK(CheckNumChildren(1)); @@ -1572,6 +1595,10 @@ struct ArrayImporter { Status Visit(const LargeListType& type) { return ImportListLike(type); } + Status Visit(const ListViewType& type) { return ImportListView(type); } + + Status Visit(const LargeListViewType& type) { return ImportListView(type); } + Status Visit(const FixedSizeListType& type) { RETURN_NOT_OK(CheckNumChildren(1)); RETURN_NOT_OK(CheckNumBuffers(1)); @@ -1667,6 +1694,18 @@ struct ArrayImporter { return Status::OK(); } + template + Status 
ImportListView(const ListViewType& type) { + using offset_type = typename ListViewType::offset_type; + RETURN_NOT_OK(CheckNumChildren(1)); + RETURN_NOT_OK(CheckNumBuffers(3)); + RETURN_NOT_OK(AllocateArrayData()); + RETURN_NOT_OK(ImportNullBitmap()); + RETURN_NOT_OK((ImportOffsetsBuffer(1))); + RETURN_NOT_OK(ImportSizesBuffer(2)); + return Status::OK(); + } + Status CheckNoChildren() { return CheckNumChildren(0); } Status CheckNumChildren(int64_t n_children) { @@ -1735,11 +1774,18 @@ struct ArrayImporter { return ImportBuffer(buffer_id, buffer_size); } - template + template Status ImportOffsetsBuffer(int32_t buffer_id) { // Compute visible size of buffer - int64_t buffer_size = - sizeof(OffsetType) * (c_struct_->length + c_struct_->offset + 1); + int64_t buffer_size = sizeof(OffsetType) * (c_struct_->length + c_struct_->offset + + (with_extra_offset ? 1 : 0)); + return ImportBuffer(buffer_id, buffer_size); + } + + template + Status ImportSizesBuffer(int32_t buffer_id) { + // Compute visible size of buffer + int64_t buffer_size = sizeof(OffsetType) * (c_struct_->length + c_struct_->offset); return ImportBuffer(buffer_id, buffer_size); } diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index bd0e498a9f3..122d4a0e7b8 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -397,6 +397,14 @@ TEST_F(TestSchemaExport, List) { TestNested(list(large_list(int32())), {"+l", "+L", "i"}, {"", "item", "item"}); } +TEST_F(TestSchemaExport, ListView) { + TestNested(list_view(int8()), {"+vl", "c"}, {"", "item"}); + TestNested(large_list_view(uint16()), {"+vL", "S"}, {"", "item"}); + + TestNested(list_view(large_list_view(int32())), {"+vl", "+vL", "i"}, + {"", "item", "item"}); +} + TEST_F(TestSchemaExport, Struct) { auto type = struct_({field("a", int8()), field("b", utf8())}); TestNested(type, {"+s", "c", "u"}, {"", "a", "b"}, @@ -945,6 +953,33 @@ TEST_F(TestArrayExport, ListSliced) { } } +TEST_F(TestArrayExport, ListView) { 
+ TestNested(list_view(int8()), "[[1, 2], [3, null], null]"); + TestNested(large_list_view(uint16()), "[[1, 2], [3, null], null]"); + TestNested(fixed_size_list(int64(), 2), "[[1, 2], [3, null], null]"); + + TestNested(list_view(large_list_view(int32())), "[[[1, 2], [3], null], null]"); +} + +TEST_F(TestArrayExport, ListViewSliced) { + { + auto factory = []() { + return ArrayFromJSON(list_view(int8()), "[[1, 2], [3, null], [4, 5, 6], null]") + ->Slice(1, 2); + }; + TestNested(factory); + } + { + auto factory = []() { + auto values = ArrayFromJSON(int16(), "[1, 2, 3, 4, null, 5, 6, 7, 8]")->Slice(1, 6); + auto offsets = ArrayFromJSON(int32(), "[5, 2, 0, 3]")->Slice(1, 2); + auto sizes = ArrayFromJSON(int32(), "[2, 3, 6, 1]")->Slice(1, 2); + return ListViewArray::FromArrays(*offsets, *sizes, *values); + }; + TestNested(factory); + } +} + TEST_F(TestArrayExport, Struct) { const char* data = R"([[1, "foo"], [2, null]])"; auto type = struct_({field("a", int8()), field("b", utf8())}); @@ -1490,6 +1525,45 @@ TEST_F(TestDeviceArrayExport, ListSliced) { } } +TEST_F(TestDeviceArrayExport, ListView) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + TestNested(mm, list_view(int8()), "[[1, 2], [3, null], null]"); + TestNested(mm, large_list_view(uint16()), "[[1, 2], [3, null], null]"); + + TestNested(mm, list_view(large_list_view(int32())), "[[[1, 2], [3], null], null]"); +} + +TEST_F(TestDeviceArrayExport, ListViewSliced) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + + { + auto factory = [=]() { + return (*ToDevice(mm, *ArrayFromJSON(list_view(int8()), + "[[1, 2], [3, null], [4, 5, 6], null]") + ->data())) + ->Slice(1, 2); + }; + TestNested(factory); + } + { + auto factory = [=]() { + auto values = + (*ToDevice(mm, + *ArrayFromJSON(int16(), "[1, 2, 3, 4, null, 5, 6, 7, 8]")->data())) + ->Slice(1, 6); + auto offsets = + (*ToDevice(mm, *ArrayFromJSON(int32(), "[5, 2, 0, 
3]")->data()))->Slice(1, 2); + auto sizes = + (*ToDevice(mm, *ArrayFromJSON(int32(), "[2, 3, 6, 1]")->data()))->Slice(1, 2); + return ListViewArray::FromArrays(*offsets, *sizes, *values); + }; + TestNested(factory); + } +} + TEST_F(TestDeviceArrayExport, Struct) { std::shared_ptr device = std::make_shared(1); auto mm = device->default_memory_manager(); @@ -1930,6 +2004,33 @@ TEST_F(TestSchemaImport, NestedList) { CheckImport(list(fixed_size_list(int8(), 3))); } +TEST_F(TestSchemaImport, ListView) { + FillPrimitive(AddChild(), "c"); + FillListLike("+vl"); + CheckImport(list_view(int8())); + + FillPrimitive(AddChild(), "s", "item", 0); + FillListLike("+vl"); + CheckImport(list_view(field("item", int16(), /*nullable=*/false))); + + // Large list-view + FillPrimitive(AddChild(), "s"); + FillListLike("+vL"); + CheckImport(large_list_view(int16())); +} + +TEST_F(TestSchemaImport, NestedListView) { + FillPrimitive(AddChild(), "c"); + FillListLike(AddChild(), "+vl"); + FillListLike("+vL"); + CheckImport(large_list_view(list_view(int8()))); + + FillPrimitive(AddChild(), "c"); + FillListLike(AddChild(), "+w:3"); + FillListLike("+vl"); + CheckImport(list_view(fixed_size_list(int8(), 3))); +} + TEST_F(TestSchemaImport, Struct) { FillPrimitive(AddChild(), "u", "strs"); FillPrimitive(AddChild(), "S", "ints"); diff --git a/docs/source/format/CDataInterface.rst b/docs/source/format/CDataInterface.rst index e0884686acf..e2022171214 100644 --- a/docs/source/format/CDataInterface.rst +++ b/docs/source/format/CDataInterface.rst @@ -207,6 +207,10 @@ names and types of child fields are read from the child arrays. 
+------------------------+---------------------------------------------------+------------+ | ``+L`` | large list | | +------------------------+---------------------------------------------------+------------+ +| ``+vl`` | list-view | | ++------------------------+---------------------------------------------------+------------+ +| ``+vL`` | large list-view | | ++------------------------+---------------------------------------------------+------------+ | ``+w:123`` | fixed-sized list [123 items] | | +------------------------+---------------------------------------------------+------------+ | ``+s`` | struct | | @@ -243,6 +247,8 @@ Examples array has format string ``d:12,5``. * A ``list<uint64>`` array has format string ``+l``, and its single child has format string ``L``. +* A ``large_list_view<uint64>`` array has format string ``+vL``, and its single + child has format string ``L``. * A ``struct<ints: int32, floats: float32>`` has format string ``+s``; its two children have names ``ints`` and ``floats``, and format strings ``i`` and ``f`` respectively. From 17c0370a78f5dda91462612b94524a59fd329f90 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 11 Oct 2023 14:14:45 -0300 Subject: [PATCH 49/91] validate.cc: Re-do list-view modifications on top of concurrent changes --- cpp/src/arrow/array/validate.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 9ba53258dae..84b7b28d180 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -909,7 +909,7 @@ struct ValidateArrayImpl { const auto offsets_byte_size = data.buffers[1]->size(); const auto required_offsets = ((data.length > 0) || (offsets_byte_size > 0)) - ? data.length + data.offset + 1 + ? data.length + data.offset + (is_list_view ? 
0 : 1) : 0; if (offsets_byte_size / static_cast(sizeof(offset_type)) < required_offsets) { From fd927a8601a7ef6aa8283a6d0dbb515deff2f9db Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 11 Oct 2023 14:35:09 -0300 Subject: [PATCH 50/91] RangeOfValuesUsed: Don't call MaxViewEnd if MinViewOffset is a nullopt Since that implies all non-null list-views are empty. --- cpp/src/arrow/util/list_util.cc | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/util/list_util.cc b/cpp/src/arrow/util/list_util.cc index 644b40853fe..b10cd2459a7 100644 --- a/cpp/src/arrow/util/list_util.cc +++ b/cpp/src/arrow/util/list_util.cc @@ -42,7 +42,7 @@ using arrow::internal::SetBitRunReader; /// \pre input.length() > 0 && input.null_count() != input.length() /// \param input A LIST_VIEW or LARGE_LIST_VIEW array template -int64_t MinViewOffset(const ArraySpan& input) { +std::optional MinViewOffset(const ArraySpan& input) { const uint8_t* validity = input.buffers[0].data; const auto* offsets = input.GetValues(1); const auto* sizes = input.GetValues(2); @@ -83,7 +83,7 @@ int64_t MinViewOffset(const ArraySpan& input) { } } } - return min_offset.value_or(0); + return min_offset; #undef MINIMIZE_MIN_VIEW_OFFSET } @@ -151,9 +151,13 @@ std::pair RangeOfValuesUsedByListView(const ArraySpan& input) if (input.length == 0 || input.GetNullCount() == input.length) { return {0, 0}; } - const int64_t min_offset = MinViewOffset(input); + const auto min_offset = MinViewOffset(input); + // If all list-views are empty, min_offset will be std::nullopt. 
+ if (!min_offset.has_value()) { + return {0, 0}; + } const int64_t max_end = MaxViewEnd(input); - return {min_offset, max_end - min_offset}; + return {*min_offset, max_end - *min_offset}; } template @@ -162,7 +166,7 @@ std::pair RangeOfValuesUsedByList(const ArraySpan& input) { if (input.length == 0) { return {0, 0}; } - const auto* offsets = reinterpret_cast(input.buffers[1].data); + const auto* offsets = input.buffers[1].data_as(); const int64_t min_offset = offsets[input.offset]; const int64_t max_end = offsets[input.offset + input.length]; return {min_offset, max_end - min_offset}; From 627af2ff6c8962e7e81230acf3a4fba75785ca80 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 12 Oct 2023 10:43:54 -0300 Subject: [PATCH 51/91] concatenate.cc: Rename SumBufferSizes{->InBytes} --- cpp/src/arrow/array/concatenate.cc | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index 569ea77c06e..6e56371a018 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -99,7 +99,7 @@ Status ConcatenateBitmaps(const std::vector& bitmaps, MemoryPool* pool, return Status::OK(); } -int64_t SumBufferSizes(const BufferVector& buffers) { +int64_t SumBufferSizesInBytes(const BufferVector& buffers) { int64_t size = 0; for (const auto& buffer : buffers) { size += buffer->size(); @@ -122,8 +122,8 @@ Status ConcatenateOffsets(const BufferVector& buffers, MemoryPool* pool, values_ranges->resize(buffers.size()); // allocate output buffer - const int64_t out_size = SumBufferSizes(buffers); - ARROW_ASSIGN_OR_RAISE(*out, AllocateBuffer(sizeof(Offset) + out_size, pool)); + const int64_t out_size_in_bytes = SumBufferSizesInBytes(buffers); + ARROW_ASSIGN_OR_RAISE(*out, AllocateBuffer(sizeof(Offset) + out_size_in_bytes, pool)); auto* out_data = reinterpret_cast((*out)->mutable_data()); int64_t elements_length = 0; @@ -138,7 +138,7 @@ Status 
ConcatenateOffsets(const BufferVector& buffers, MemoryPool* pool, } // the final element in out_data is the length of all values spanned by the offsets - out_data[out_size / sizeof(Offset)] = values_length; + out_data[out_size_in_bytes / sizeof(Offset)] = values_length; return Status::OK(); } @@ -190,8 +190,8 @@ template Status ConcatenateListViewOffsets(const BufferVector& buffers, const std::vector& value_ranges, MemoryPool* pool, std::shared_ptr* out) { - const int64_t out_size = SumBufferSizes(buffers); - ARROW_ASSIGN_OR_RAISE(*out, AllocateBuffer(out_size, pool)); + const int64_t out_size_in_bytes = SumBufferSizesInBytes(buffers); + ARROW_ASSIGN_OR_RAISE(*out, AllocateBuffer(out_size_in_bytes, pool)); auto* out_data = (*out)->mutable_data_as(); int64_t num_child_values = 0; @@ -207,7 +207,8 @@ Status ConcatenateListViewOffsets(const BufferVector& buffers, return Status::Invalid("offset overflow while concatenating arrays"); } } - DCHECK_EQ(elements_length, static_cast(out_size / sizeof(offset_type))); + DCHECK_EQ(elements_length, + static_cast(out_size_in_bytes / sizeof(offset_type))); return Status::OK(); } From e1cfd963624148084533caabd84045061c3f102b Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 17 Oct 2023 18:06:14 -0300 Subject: [PATCH 52/91] concatenate.cc: Use mutable_data_as --- cpp/src/arrow/array/concatenate.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index 6e56371a018..0f5fc8dbedd 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -124,7 +124,7 @@ Status ConcatenateOffsets(const BufferVector& buffers, MemoryPool* pool, // allocate output buffer const int64_t out_size_in_bytes = SumBufferSizesInBytes(buffers); ARROW_ASSIGN_OR_RAISE(*out, AllocateBuffer(sizeof(Offset) + out_size_in_bytes, pool)); - auto* out_data = reinterpret_cast((*out)->mutable_data()); + auto* out_data = 
(*out)->mutable_data_as(); int64_t elements_length = 0; Offset values_length = 0; From 3ecbcf149fa81b432ccddca7fcb505ed3b17f369 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Mon, 16 Oct 2023 20:35:21 -0300 Subject: [PATCH 53/91] Clarify the validations in List[View]Array::FromArrays() docstrings --- cpp/src/arrow/array/array_nested.h | 43 +++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index e14a374251a..6a46b1719d5 100644 --- a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -135,10 +135,14 @@ class ARROW_EXPORT ListArray : public BaseListArray { /// This function does the bare minimum of validation of the offsets and /// input types, and will allocate a new offsets array if necessary (i.e. if /// the offsets contain any nulls). If the offsets do not have nulls, they - /// are assumed to be well-formed + /// are assumed to be well-formed. + /// + /// If a null_bitmap is not provided, the nulls will be inferred from the offsets' or + /// sizes' null bitmap. Only one of these two is allowed to have a null bitmap. But if a + /// null_bitmap is provided, the offsets array and the sizes array can't have nulls. /// - /// Offsets of an Array's null bitmap can be present or an explicit - /// null_bitmap, but not both. + /// And when a null_bitmap is provided, neither the offsets or sizes array can be a + /// slice (i.e. an array with offset() > 0). /// /// \param[in] offsets Array containing n + 1 offsets encoding length and /// size. Must be of int32 type @@ -196,7 +200,14 @@ class ARROW_EXPORT LargeListArray : public BaseListArray { /// This function does the bare minimum of validation of the offsets and /// input types, and will allocate a new offsets array if necessary (i.e. if /// the offsets contain any nulls). 
If the offsets do not have nulls, they - /// are assumed to be well-formed + /// are assumed to be well-formed. + /// + /// If a null_bitmap is not provided, the nulls will be inferred from the + /// offsets's null bitmap. But if a null_bitmap is provided, the offsets array + /// can't have nulls. + /// + /// If a null_bitmap is provided, the offsets array can't be a slice (i.e. an + /// array with offset() > 0). /// /// \param[in] offsets Array containing n + 1 offsets encoding length and /// size. Must be of int64 type @@ -276,10 +287,16 @@ class ARROW_EXPORT ListViewArray : public BaseListViewArray { /// that project views into the child values array. /// /// This function does the bare minimum of validation of the offsets/sizes and - /// input types. + /// input types. The offset and length of the offsets and sizes arrays must + /// match and that will be checked, but their contents will be assumed to be + /// well-formed. + /// + /// If a null_bitmap is not provided, the nulls will be inferred from the + /// offsets's null bitmap. But if a null_bitmap is provided, the offsets array + /// can't have nulls. /// - /// Offsets of an Array's null bitmap can be present or an explicit - /// null_bitmap, but not both. + /// If a null_bitmap is provided, the offsets array can't be a slice (i.e. an + /// array with offset() > 0). /// /// \param[in] offsets An array of int32 offsets into the values array. NULL values are /// supported if the corresponding values in sizes is NULL or 0. @@ -349,10 +366,16 @@ class ARROW_EXPORT LargeListViewArray : public BaseListViewArray 0). /// /// \param[in] offsets An array of int64 offsets into the values array. NULL values are /// supported if the corresponding values in sizes is NULL or 0. 
From 0cf760c4f7ed59f84f9ded9371e956629af8e564 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Mon, 16 Oct 2023 23:44:11 -0300 Subject: [PATCH 54/91] Rewrite ListView Flatten --- cpp/src/arrow/array/array_list_test.cc | 38 +++++++- cpp/src/arrow/array/array_nested.cc | 130 +++++++++++++------------ 2 files changed, 105 insertions(+), 63 deletions(-) diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index 03db87ad5c1..271fde671f5 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -680,10 +680,16 @@ class TestListArray : public ::testing::Test { void TestFlattenSimple() { auto type = std::make_shared(int32()); auto list_array = std::dynamic_pointer_cast( - ArrayFromJSON(type, "[[1, 2], [3], [4], null, [5], [], [6]]")); + ArrayFromJSON(type, "[[], null, [1, 2], [3], [4], null, [5], [], [6]]")); ASSERT_OK_AND_ASSIGN(auto flattened, list_array->Flatten()); ASSERT_OK(flattened->ValidateFull()); EXPECT_TRUE(flattened->Equals(ArrayFromJSON(int32(), "[1, 2, 3, 4, 5, 6]"))); + + list_array = std::dynamic_pointer_cast( + ArrayFromJSON(type, "[[], [], [1, 2], [3], [4], [], [5], [], [6]]")); + ASSERT_OK_AND_ASSIGN(flattened, list_array->Flatten()); + ASSERT_OK(flattened->ValidateFull()); + EXPECT_TRUE(flattened->Equals(ArrayFromJSON(int32(), "[1, 2, 3, 4, 5, 6]"))); } void TestFlattenNulls() { @@ -695,6 +701,35 @@ class TestListArray : public ::testing::Test { AssertTypeEqual(*flattened->type(), *value_type_); } + void TestFlattenAllEmpty() { + auto type = std::make_shared(int32()); + auto list_array = std::dynamic_pointer_cast( + ArrayFromJSON(type, "[[], [], [], [], [], [], []]")); + ASSERT_OK_AND_ASSIGN(auto flattened, list_array->Flatten()); + ASSERT_OK(flattened->ValidateFull()); + EXPECT_TRUE(flattened->Equals(ArrayFromJSON(int32(), "[]"))); + + if constexpr (kTypeClassIsListView) { + auto list_array = std::dynamic_pointer_cast( + ArrayFromJSON(type, "[[1, 2], [3], 
null, [5, 6], [7, 8], [], [9]]")); + auto array_data = list_array->data(); + + auto offsets = array_data->buffers[1]->template mutable_data_as(); + auto sizes = array_data->buffers[2]->template mutable_data_as(); + + // Set all sizes to 0, except the one for the null entry + memset(sizes, 0, sizeof(offset_type) * array_data->length); + sizes[2] = 4; + // Make the offset of the null entry be non-zero and out of order + offsets[2] = 1; + + ASSERT_OK(list_array->ValidateFull()); + ASSERT_OK_AND_ASSIGN(auto flattened, list_array->Flatten()); + EXPECT_TRUE(flattened->Equals(ArrayFromJSON(int32(), "[]"))) + << flattened->ToString(); + } + } + void TestFlattenSliced() { auto type = std::make_shared(int32()); auto list_array = std::dynamic_pointer_cast( @@ -890,6 +925,7 @@ TYPED_TEST(TestListArray, BuilderPreserveFieldName) { TYPED_TEST(TestListArray, FlattenSimple) { this->TestFlattenSimple(); } TYPED_TEST(TestListArray, FlattenNulls) { this->TestFlattenNulls(); } +TYPED_TEST(TestListArray, FlattenAllEmpty) { this->TestFlattenAllEmpty(); } TYPED_TEST(TestListArray, FlattenZeroLength) { this->TestFlattenZeroLength(); } TYPED_TEST(TestListArray, TestFlattenNonEmptyBackingNulls) { this->TestFlattenNonEmptyBackingNulls(); diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index fd1a2a24a8f..d1044b95100 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -260,10 +260,11 @@ Result> FlattenListArray(const ListArrayT& list_array, return Concatenate(non_null_fragments, memory_pool); } -template +template Result> FlattenListViewArray(const ListViewArrayT& list_view_array, MemoryPool* memory_pool) { using offset_type = typename ListViewArrayT::offset_type; + const int64_t list_view_array_offset = list_view_array.offset(); const int64_t list_view_array_length = list_view_array.length(); std::shared_ptr value_array = list_view_array.values(); @@ -272,82 +273,81 @@ Result> FlattenListViewArray(const 
ListViewArrayT& list_v } // If the list array is *all* nulls, then just return an empty array. - if (list_view_array.null_count() == list_view_array.length()) { - return MakeEmptyArray(value_array->type(), memory_pool); + if constexpr (HasNulls) { + if (list_view_array.null_count() == list_view_array.length()) { + return MakeEmptyArray(value_array->type(), memory_pool); + } } const auto* validity = list_view_array.data()->template GetValues(0, 0); const auto* offsets = list_view_array.data()->template GetValues(1); const auto* sizes = list_view_array.data()->template GetValues(2); - // If a ListViewArray: - // - // 1) does not contain nulls - // 2) has sorted offsets - // 3) has disjoint views which completely cover the values array - // - // then simply slice its value array with the first offset and end of the last list - // view. - if (list_view_array.null_count() == 0) { - bool sorted_and_disjoint = true; - for (int64_t i = 1; sorted_and_disjoint && i < list_view_array_length; ++i) { - sorted_and_disjoint &= - sizes[i - 1] == 0 || offsets[i] - offsets[i - 1] == sizes[i - 1]; + auto is_null_or_empty = [&](int64_t i) { + if constexpr (HasNulls) { + if (!bit_util::GetBit(validity, list_view_array_offset + i)) { + return true; + } } + return sizes[i] == 0; + }; - if (sorted_and_disjoint) { - const auto begin_offset = list_view_array.value_offset(0); - const auto end_offset = list_view_array.value_offset(list_view_array_length - 1) + - list_view_array.value_length(list_view_array_length - 1); - return SliceArrayWithOffsets(*value_array, begin_offset, end_offset); + // Index of the first valid, non-empty list-view. + int64_t first_i = 0; + for (; first_i < list_view_array_length; first_i++) { + if (!is_null_or_empty(first_i)) { + break; } } + // If all list-views are empty, return an empty array. 
+ if (first_i == list_view_array_length) { + return MakeEmptyArray(value_array->type(), memory_pool); + } - auto is_null_or_empty = [&](int64_t i) { - return (validity && !bit_util::GetBit(validity, list_view_array.offset() + i)) || - sizes[i] == 0; - }; - - std::vector> non_null_fragments; - // Index of first valid, non-empty list-view and last offset - // of the current contiguous fragment in values. - constexpr int64_t kUninitialized = -1; - int64_t first_i = kUninitialized; - offset_type end_offset; - int64_t i = 0; - for (; i < list_view_array_length; i++) { - if (is_null_or_empty(i)) continue; - - first_i = i; - end_offset = offsets[i] + sizes[i]; - break; - } - i += 1; - for (; i < list_view_array_length; i++) { - if (is_null_or_empty(i)) continue; - - if (offsets[i] == end_offset) { - end_offset += sizes[i]; - continue; + std::vector> slices; + { + int64_t i = first_i; + auto begin_offset = offsets[i]; + auto end_offset = offsets[i] + sizes[i]; + i += 1; + // Inductive invariant: slices and the always non-empty values slice + // [begin_offset, end_offset) contains all the maximally contiguous slices of the + // values array that are covered by all the list-views before list-view i. + for (; i < list_view_array_length; i++) { + if (is_null_or_empty(i)) { + // The invariant is preserved by simply preserving the current set of slices. + } else { + if (offsets[i] == end_offset) { + end_offset += sizes[i]; + // The invariant is preserved because since the non-empty list-view i + // starts at end_offset, the current range can be extended to end at + // offsets[i] + sizes[i] (the same as end_offset + sizes[i]). + } else { + // The current slice can't be extended because the list-view i either + // shares values with the current slice or starts after the position + // immediately after the end of the current slice. 
+ slices.push_back(SliceArrayWithOffsets(*value_array, begin_offset, end_offset)); + begin_offset = offsets[i]; + end_offset = offsets[i] + sizes[i]; + // The invariant is preserved because a maximally contiguous slice of + // the values array (i.e. one that can't be extended) was added to slices + // and [begin_offset, end_offset) is non-empty and contains the + // current list-view i. + } + } } - non_null_fragments.push_back( - SliceArrayWithOffsets(*value_array, offsets[first_i], end_offset)); - first_i = i; - end_offset = offsets[i] + sizes[i]; - } - if (first_i != kUninitialized) { - non_null_fragments.push_back( - SliceArrayWithOffsets(*value_array, offsets[first_i], end_offset)); + slices.push_back(SliceArrayWithOffsets(*value_array, begin_offset, end_offset)); } // Final attempt to avoid invoking Concatenate(). - if (non_null_fragments.size() == 1) { - return non_null_fragments[0]; - } else if (non_null_fragments.size() == 0) { - return MakeEmptyArray(value_array->type(), memory_pool); + switch (slices.size()) { + case 0: + return MakeEmptyArray(value_array->type(), memory_pool); + case 1: + return slices[0]; } - return Concatenate(non_null_fragments, memory_pool); + return Concatenate(slices, memory_pool); } std::shared_ptr BoxOffsets(const std::shared_ptr& boxed_type, @@ -548,7 +548,10 @@ Result> ListViewArray::FromArrays( } Result> ListViewArray::Flatten(MemoryPool* memory_pool) const { - return FlattenListViewArray(*this, memory_pool); + if (null_count() > 0) { + return FlattenListViewArray(*this, memory_pool); + } + return FlattenListViewArray(*this, memory_pool); } std::shared_ptr ListViewArray::offsets() const { @@ -606,7 +609,10 @@ Result> LargeListViewArray::FromArrays( Result> LargeListViewArray::Flatten( MemoryPool* memory_pool) const { - return FlattenListViewArray(*this, memory_pool); + if (null_count() > 0) { + return FlattenListViewArray(*this, memory_pool); + } + return FlattenListViewArray(*this, memory_pool); } std::shared_ptr 
LargeListViewArray::offsets() const { From 8f8703838048705a0fb54bfdd69302401f532c3f Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 17 Oct 2023 18:17:38 -0300 Subject: [PATCH 55/91] list_util: Use mutable_data_as --- cpp/src/arrow/util/list_util.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/list_util.cc b/cpp/src/arrow/util/list_util.cc index b10cd2459a7..3392e7cb590 100644 --- a/cpp/src/arrow/util/list_util.cc +++ b/cpp/src/arrow/util/list_util.cc @@ -219,7 +219,7 @@ Result> ListViewFromListImpl( ARROW_ASSIGN_OR_RAISE(auto sizes_buffer, AllocateBuffer(buffer_length * sizeof(offset_type), pool)); const auto* offsets = list_data->template GetValues(1, 0); - auto* sizes = reinterpret_cast(sizes_buffer->mutable_data()); + auto* sizes = sizes_buffer->mutable_data_as(); // Zero the initial padding area to avoid leaking any data when buffers are // sent over IPC or throught the C Data interface. memset(sizes, 0, list_data->offset * sizeof(offset_type)); From 15b0904d24e9edd6ba00127c4fb45b79d9cf821c Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 18 Oct 2023 16:18:12 -0300 Subject: [PATCH 56/91] More thoroughly document builder member functions --- cpp/src/arrow/array/builder_nested.h | 39 +++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index 36bfa0d7d72..bca0095d9d1 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -91,9 +91,22 @@ class ARROW_EXPORT VarLengthListLikeBuilder : public ArrayBuilder { /// \brief Start a new variable-length list slot /// - /// This function should be called before beginning to append elements to the - /// value builder. Elements appended to the value builder before this function is - /// called, will not be members of any list value. 
+ /// This function should be called before appending elements to the + /// value builder. Elements appended to the value builder before this function + /// is called for the first time, will not be members of any list value. + /// + /// After this function is called, list_length elements SHOULD be appended to + /// the values builder. If this contract is violated, the behavior is defined by + /// the concrete builder implementation and SHOULD NOT be relied upon unless + /// the caller is specifically building a [Large]List or [Large]ListView array. + /// + /// For [Large]List arrays, the list slot length will be the number of elements + /// appended to the values builder before the next call to Append* or Finish. For + /// [Large]ListView arrays, the list slot length will be exactly list_length, but if + /// Append* is called before at least list_length elements are appended to the values + /// builder, the current list slot will share elements with the next list + /// slots or an invalid [Large]ListView array will be generated because there + /// aren't enough elements in the values builder to fill the list slots. /// /// \pre if is_valid is false, list_length MUST be 0 /// \param is_valid Whether the new list slot is valid @@ -106,7 +119,17 @@ class ARROW_EXPORT VarLengthListLikeBuilder : public ArrayBuilder { return Status::OK(); } - Status AppendNull() final { return Append(false, 0); } + Status AppendNull() final { + // Append() a null list slot with list_length=0. + // + // When building [Large]List arrays, elements being appended to the values builder + // before the next call to Append* or Finish will extend the list slot length, but + // that is totally fine because list arrays admit non-empty null list slots. + // + // In the case of [Large]ListViews that's not a problem either because the + // list slot length remains zero. 
+ return Append(false, 0); + } Status AppendNulls(int64_t length) final { ARROW_RETURN_NOT_OK(Reserve(length)); @@ -115,8 +138,16 @@ class ARROW_EXPORT VarLengthListLikeBuilder : public ArrayBuilder { return Status::OK(); } + /// \brief Append an empty list slot + /// + /// \post Another call to Append* or Finish should be made before appending to + /// the values builder to ensure list slot remains empty Status AppendEmptyValue() final { return Append(true, 0); } + /// \brief Append an empty list slot + /// + /// \post Another call to Append* or Finish should be made before appending to + /// the values builder to ensure the last list slot remains empty Status AppendEmptyValues(int64_t length) final { ARROW_RETURN_NOT_OK(Reserve(length)); UnsafeAppendToBitmap(length, true); From dde0f5a720f7a7dfbdfdce8cc6cce2c2cd874e43 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 26 Oct 2023 20:54:26 -0300 Subject: [PATCH 57/91] fixup! concatenate.cc: Respect the new invariants imposed on the spec --- cpp/src/arrow/array/concatenate_test.cc | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index f166370ca0c..0374516a9ed 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -231,27 +231,28 @@ struct ListViewConcatenationChecker { using ListViewArrayType = typename TypeTraits::ArrayType; template - static void Check(Self& self, int32_t size, double null_probability, + static void Check(Self& self, int32_t num_list_views, double null_probability, std::shared_ptr* out) { - auto values_size = 4 * size; + auto values_size = 4 * num_list_views; auto values = self.template GeneratePrimitive(values_size, null_probability); std::shared_ptr offsets; - auto offsets_vector = self.template Offsets(values_size, size); + auto offsets_vector = self.template Offsets(values_size, num_list_views); 
offsets_vector.front() = 0; ArrayFromVector(offsets_vector, &offsets); std::shared_ptr sizes; std::vector sizes_vector; - sizes_vector.reserve(size); - for (int32_t i = 0; i < size; ++i) { - // Make list-views share values with the next list-view by extending the size to a - // point after the next offset. + sizes_vector.reserve(num_list_views); + for (int32_t i = 0; i < num_list_views; ++i) { + ASSERT_LE(offsets_vector[i], values_size); offset_type size = offsets_vector[i + 1] - offsets_vector[i]; - size = std::min(2 * size / 3, values_size - offsets_vector[i]); + // Make list-views share values with the next list-view by + // extending the list-view size to a point after the next offset. + size = std::min(3 * size / 2, values_size - offsets_vector[i]); sizes_vector.push_back(size); - ASSERT_LE(offsets_vector[i] + sizes_vector.back(), values_size); + ASSERT_LE(offsets_vector[i] + size, values_size); } ASSERT_EQ(offsets_vector.size(), sizes_vector.size() + 1); ArrayFromVector(sizes_vector, &sizes); From 8d04a3885da2b6264c679e1db6de89aceddffe70 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Mon, 6 Nov 2023 16:58:59 -0500 Subject: [PATCH 58/91] fixup! Clarify the validations in List[View]Array::FromArrays() docstrings --- cpp/src/arrow/array/array_nested.h | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index 6a46b1719d5..77d9c974968 100644 --- a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -137,12 +137,11 @@ class ARROW_EXPORT ListArray : public BaseListArray { /// the offsets contain any nulls). If the offsets do not have nulls, they /// are assumed to be well-formed. /// - /// If a null_bitmap is not provided, the nulls will be inferred from the offsets' or - /// sizes' null bitmap. Only one of these two is allowed to have a null bitmap. 
But if a - /// null_bitmap is provided, the offsets array and the sizes array can't have nulls. + /// If a null_bitmap is not provided, the nulls will be inferred from the offsets' + /// null bitmap. But if a null_bitmap is provided, the offsets array can't have nulls. /// - /// And when a null_bitmap is provided, neither the offsets or sizes array can be a - /// slice (i.e. an array with offset() > 0). + /// And when a null_bitmap is provided, the offsets array cannot be a slice (i.e. an + /// array with offset() > 0). /// /// \param[in] offsets Array containing n + 1 offsets encoding length and /// size. Must be of int32 type @@ -202,11 +201,10 @@ class ARROW_EXPORT LargeListArray : public BaseListArray { /// the offsets contain any nulls). If the offsets do not have nulls, they /// are assumed to be well-formed. /// - /// If a null_bitmap is not provided, the nulls will be inferred from the - /// offsets's null bitmap. But if a null_bitmap is provided, the offsets array - /// can't have nulls. + /// If a null_bitmap is not provided, the nulls will be inferred from the offsets' + /// null bitmap. But if a null_bitmap is provided, the offsets array can't have nulls. /// - /// If a null_bitmap is provided, the offsets array can't be a slice (i.e. an + /// And when a null_bitmap is provided, the offsets array cannot be a slice (i.e. an /// array with offset() > 0). /// /// \param[in] offsets Array containing n + 1 offsets encoding length and @@ -295,8 +293,8 @@ class ARROW_EXPORT ListViewArray : public BaseListViewArray { /// offsets's null bitmap. But if a null_bitmap is provided, the offsets array /// can't have nulls. /// - /// If a null_bitmap is provided, the offsets array can't be a slice (i.e. an - /// array with offset() > 0). + /// And when a null_bitmap is provided, neither the offsets or sizes array can be a + /// slice (i.e. an array with offset() > 0). /// /// \param[in] offsets An array of int32 offsets into the values array. 
NULL values are /// supported if the corresponding values in sizes is NULL or 0. From c490d84f539ea0c588fb7a29c80f8413ae6d1aff Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 9 Nov 2023 13:54:11 -0500 Subject: [PATCH 59/91] Make [Large]ListViewArray docstrings consistent with each other --- cpp/src/arrow/array/array_nested.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index 77d9c974968..9f69d0d0c25 100644 --- a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -334,8 +334,8 @@ class ARROW_EXPORT ListViewArray : public BaseListViewArray { /// \brief Return list-view sizes as an Int32Array /// /// The returned array will not have a validity bitmap, so you cannot expect - /// to pass it to ListArray::FromArrays() and get back the same list array - /// if the original one has nulls. + /// to pass it to ListViewArray::FromArrays() and get back the same list + /// array if the original one has nulls. std::shared_ptr sizes() const; protected: @@ -405,9 +405,17 @@ class ARROW_EXPORT LargeListViewArray : public BaseListViewArray offsets() const; /// \brief Return list-view sizes as an Int64Array + /// + /// The returned array will not have a validity bitmap, so you cannot expect + /// to pass it to LargeListViewArray::FromArrays() and get back the same list + /// array if the original one has nulls. std::shared_ptr sizes() const; protected: From 36bb781f18daa7cea145d9774486c954b39cbea9 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 9 Nov 2023 15:07:16 -0500 Subject: [PATCH 60/91] fixup! 
[Large]ListViewScalar: Implement all operations --- cpp/src/arrow/array/util.cc | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index a7002e04e51..38ee30508ea 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -389,13 +389,16 @@ class NullArrayFactory { enable_if_var_size_list Visit(const T& type) { // values array may be empty, but there must be at least one offset of 0 RETURN_NOT_OK(MaxOf(sizeof(typename T::offset_type) * (length_ + 1))); + // XXX(felipec): reviewers, is this correct? RETURN_NOT_OK(MaxOf(GetBufferLength(type.value_type(), length_))); return Status::OK(); } template - enable_if_list_view Visit(const T&) { - buffer_length_ = length_ * sizeof(typename T::offset_type); + enable_if_list_view Visit(const T& type) { + RETURN_NOT_OK(MaxOf(sizeof(typename T::offset_type) * length_)); + // XXX(felipec): reviewers, is this correct? + RETURN_NOT_OK(MaxOf(GetBufferLength(type.value_type(), length_))); return Status::OK(); } @@ -727,10 +730,10 @@ class RepeatedArrayFactory { auto value = checked_cast(scalar_).value; auto size = static_cast(value->length()); - std::shared_ptr offsets_buffer; - std::shared_ptr sizes_buffer; - RETURN_NOT_OK(CreateIntBuffer(0, &offsets_buffer)); - RETURN_NOT_OK(CreateIntBuffer(size, &sizes_buffer)); + ARROW_ASSIGN_OR_RAISE(auto offsets_buffer, + CreateIntBuffer(0)); + ARROW_ASSIGN_OR_RAISE(auto sizes_buffer, + CreateIntBuffer(size)); out_ = std::make_shared(scalar_.type, length_, std::move(offsets_buffer), std::move(sizes_buffer), value); return Status::OK(); @@ -886,10 +889,12 @@ class RepeatedArrayFactory { } template - Status CreateIntBuffer(IntType value, std::shared_ptr* out) { + Result> CreateIntBuffer(IntType value) { + std::shared_ptr buffer; TypedBufferBuilder builder(pool_); RETURN_NOT_OK(builder.Append(/*num_copies=*/length_, value)); - return builder.Finish(out); + 
RETURN_NOT_OK(builder.Finish(&buffer)); + return buffer; } Status CreateBufferOf(const void* data, size_t data_length, From 4b7716340dcee8b0465e32e4e2b252e84d7e60f4 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 9 Nov 2023 15:07:39 -0500 Subject: [PATCH 61/91] fixup! [Large]ListViewArray: Implement Validate --- cpp/src/arrow/array/validate.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 84b7b28d180..8c00f70a2a2 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -23,7 +23,6 @@ #include "arrow/extension_type.h" #include "arrow/type.h" #include "arrow/type_traits.h" -#include "arrow/util/bit_run_reader.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" From f02d5ce1c7574975a9b1f13defb4d577be134376 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 9 Nov 2023 15:09:16 -0500 Subject: [PATCH 62/91] fixup! validate.cc: Be strict about nullability of offsets and sizes buffers on list-views --- cpp/src/arrow/array/validate.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 8c00f70a2a2..717331a4dcb 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -892,7 +892,6 @@ struct ValidateArrayImpl { const bool non_empty = data.length > 0; if constexpr (is_list_view) { if (!IsBufferValid(1)) { - // For length 0, an empty offsets buffer is accepted (ARROW-544). 
return Status::Invalid("offsets buffer is null"); } if (!IsBufferValid(2)) { From 311603c35cf0ba9606a6acb62e105ba78bdea4d6 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 10 Nov 2023 12:04:53 -0500 Subject: [PATCH 63/91] Make the C Bridge test set for list-views complete --- cpp/src/arrow/c/bridge.cc | 2 +- cpp/src/arrow/c/bridge_test.cc | 99 ++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index 085c5df076d..eeec75f2f47 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -1701,7 +1701,7 @@ struct ArrayImporter { RETURN_NOT_OK(CheckNumBuffers(3)); RETURN_NOT_OK(AllocateArrayData()); RETURN_NOT_OK(ImportNullBitmap()); - RETURN_NOT_OK((ImportOffsetsBuffer(1))); + RETURN_NOT_OK((ImportOffsetsBuffer(1))); RETURN_NOT_OK(ImportSizesBuffer(2)); return Status::OK(); } diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index 122d4a0e7b8..608e59e4227 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -2426,6 +2426,18 @@ static const int64_t large_list_offsets_buffer1[] = {0, 2, 2, 5, 6, 8}; static const void* large_list_buffers_no_nulls1[2] = {nullptr, large_list_offsets_buffer1}; +static const int32_t list_view_offsets_buffer1[] = {0, 2, 2, 5, 6}; +static const int32_t list_view_sizes_buffer1[] = {2, 0, 3, 1, 2}; +static const void* list_view_buffers_no_nulls1[3] = {nullptr, list_view_offsets_buffer1, + list_view_sizes_buffer1}; +static const void* list_view_buffers_nulls1[3] = {bits_buffer1, list_view_offsets_buffer1, + list_view_sizes_buffer1}; + +static const int64_t large_list_view_offsets_buffer1[] = {0, 2, 2, 5, 6}; +static const int64_t large_list_view_sizes_buffer1[] = {2, 0, 3, 1, 2}; +static const void* large_list_view_buffers_no_nulls1[3] = { + nullptr, large_list_view_offsets_buffer1, large_list_view_sizes_buffer1}; + static const int8_t type_codes_buffer1[] = 
{42, 42, 43, 43, 42}; static const int32_t union_offsets_buffer1[] = {0, 1, 0, 1, 2}; static const void* sparse_union_buffers1_legacy[2] = {nullptr, type_codes_buffer1}; @@ -2508,6 +2520,17 @@ class TestArrayImport : public ::testing::Test { c->children = NLastChildren(1, c); } + void FillListView(struct ArrowArray* c, int64_t length, int64_t null_count, + int64_t offset, const void** buffers) { + c->length = length; + c->null_count = null_count; + c->offset = offset; + c->n_buffers = 3; + c->buffers = buffers; + c->n_children = 1; + c->children = NLastChildren(1, c); + } + void FillFixedSizeListLike(struct ArrowArray* c, int64_t length, int64_t null_count, int64_t offset, const void** buffers) { c->length = length; @@ -2564,6 +2587,11 @@ class TestArrayImport : public ::testing::Test { FillListLike(&c_struct_, length, null_count, offset, buffers); } + void FillListView(int64_t length, int64_t null_count, int64_t offset, + const void** buffers) { + FillListView(&c_struct_, length, null_count, offset, buffers); + } + void FillFixedSizeListLike(int64_t length, int64_t null_count, int64_t offset, const void** buffers) { FillFixedSizeListLike(&c_struct_, length, null_count, offset, buffers); @@ -2921,6 +2949,53 @@ TEST_F(TestArrayImport, ListWithOffset) { "[[6, 7, 8], [9, 10, 11], [12, 13, 14]]")); } +TEST_F(TestArrayImport, ListView) { + FillPrimitive(AddChild(), 8, 0, 0, primitive_buffers_no_nulls1_8); + FillListView(5, 0, 0, list_view_buffers_no_nulls1); + CheckImport(ArrayFromJSON(list_view(int8()), "[[1, 2], [], [3, 4, 5], [6], [7, 8]]")); + FillPrimitive(AddChild(), 5, 0, 0, primitive_buffers_no_nulls1_16); + FillListView(3, 1, 0, list_view_buffers_nulls1); + CheckImport( + ArrayFromJSON(list_view(int16()), "[[513, 1027], null, [1541, 2055, 2569]]")); + + // Large list-view + FillPrimitive(AddChild(), 5, 0, 0, primitive_buffers_no_nulls1_16); + FillListView(3, 0, 0, large_list_view_buffers_no_nulls1); + CheckImport( + ArrayFromJSON(large_list_view(int16()),
"[[513, 1027], [], [1541, 2055, 2569]]")); +} + +TEST_F(TestArrayImport, NestedListView) { + FillPrimitive(AddChild(), 8, 0, 0, primitive_buffers_no_nulls1_8); + FillListView(AddChild(), 5, 0, 0, list_view_buffers_no_nulls1); + FillListView(3, 0, 0, large_list_view_buffers_no_nulls1); + CheckImport(ArrayFromJSON(large_list_view(list_view(int8())), + "[[[1, 2], []], [], [[3, 4, 5], [6], [7, 8]]]")); + + FillPrimitive(AddChild(), 6, 0, 0, primitive_buffers_no_nulls1_8); + FillFixedSizeListLike(AddChild(), 2, 0, 0, buffers_no_nulls_no_data); + FillListView(2, 0, 0, list_view_buffers_no_nulls1); + CheckImport(ArrayFromJSON(list_view(fixed_size_list(int8(), 3)), + "[[[1, 2, 3], [4, 5, 6]], []]")); +} + +TEST_F(TestArrayImport, ListViewWithOffset) { + // Offset in child + FillPrimitive(AddChild(), 8, 0, 1, primitive_buffers_no_nulls1_8); + FillListView(5, 0, 0, list_view_buffers_no_nulls1); + CheckImport(ArrayFromJSON(list_view(int8()), "[[2, 3], [], [4, 5, 6], [7], [8, 9]]")); + + // Offset in parent + FillPrimitive(AddChild(), 8, 0, 0, primitive_buffers_no_nulls1_8); + FillListView(4, 0, 1, list_view_buffers_no_nulls1); + CheckImport(ArrayFromJSON(list_view(int8()), "[[], [3, 4, 5], [6], [7, 8]]")); + + // Both + FillPrimitive(AddChild(), 8, 0, 2, primitive_buffers_no_nulls1_8); + FillListView(4, 0, 1, list_view_buffers_no_nulls1); + CheckImport(ArrayFromJSON(list_view(int8()), "[[], [5, 6, 7], [8], [9, 10]]")); +} + TEST_F(TestArrayImport, Struct) { FillStringLike(AddChild(), 3, 0, 0, string_buffers_no_nulls1); FillPrimitive(AddChild(), 3, -1, 0, primitive_buffers_nulls1_16); @@ -3218,6 +3293,17 @@ TEST_F(TestArrayImport, ListError) { CheckImportError(list(int8())); } +TEST_F(TestArrayImport, ListViewNoError) { + // Unlike with lists, importing a length-0 list-view with all buffers omitted is + // not an error. List-views don't need an extra offset value, so an empty offsets + // buffer is valid in this case.
+ + // Null offsets pointer + FillPrimitive(AddChild(), 0, 0, 0, primitive_buffers_no_nulls1_8); + FillListView(0, 0, 0, all_buffers_omitted); + CheckImport(ArrayFromJSON(list_view(int8()), "[]")); +} + TEST_F(TestArrayImport, MapError) { // Bad number of (struct) children in map child FillStringLike(AddChild(), 5, 0, 0, string_buffers_no_nulls1); @@ -3471,6 +3557,12 @@ TEST_F(TestSchemaRoundtrip, List) { TestWithTypeFactory([]() { return list(fixed_size_list(utf8(), 5)); }); } +TEST_F(TestSchemaRoundtrip, ListView) { + TestWithTypeFactory([]() { return list_view(utf8()); }); + TestWithTypeFactory([]() { return large_list_view(list_view(utf8())); }); + TestWithTypeFactory([]() { return list_view(fixed_size_list(utf8(), 5)); }); +} + TEST_F(TestSchemaRoundtrip, Struct) { auto f1 = field("f1", utf8(), /*nullable=*/false); auto f2 = field("f2", list(decimal(19, 4))); @@ -3732,6 +3824,13 @@ TEST_F(TestArrayRoundtrip, List) { TestWithJSONSliced(fixed_size_list(int32(), 3), "[[4, 5, 6], null, [7, 8, null]]"); } +TEST_F(TestArrayRoundtrip, ListView) { + TestWithJSON(list_view(int32()), "[]"); + TestWithJSON(list_view(int32()), "[[4, 5], [6, null], null]"); + + TestWithJSONSliced(list_view(int32()), "[[4, 5], [6, null], null]"); +} + TEST_F(TestArrayRoundtrip, Struct) { auto type = struct_({field("ints", int16()), field("bools", boolean())}); TestWithJSON(type, "[]"); From 26e5288969ac5a3ecfaa0b4ff751e066fac3f4a3 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 10 Nov 2023 15:41:24 -0500 Subject: [PATCH 64/91] fixup! 
IPC and JSON Integration tests for list-view types --- cpp/src/arrow/ipc/test_common.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc index 1d1f74379fc..87c02e2d87a 100644 --- a/cpp/src/arrow/ipc/test_common.cc +++ b/cpp/src/arrow/ipc/test_common.cc @@ -196,7 +196,8 @@ Status MakeRandomListViewArray(const std::shared_ptr& child_array, int nu random::RandomArrayGenerator rand(seed); const double null_probability = include_nulls ? 0.5 : 0.0; - *out = rand.ListView(*child_array, num_lists, null_probability, false, 0.9, + *out = rand.ListView(*child_array, /*length=*/num_lists, null_probability, + /*force_empty_nulls=*/false, /*coverage=*/0.9, kDefaultBufferAlignment, pool); return Status::OK(); } @@ -208,8 +209,9 @@ Status MakeRandomLargeListViewArray(const std::shared_ptr& child_array, random::RandomArrayGenerator rand(seed); const double null_probability = include_nulls ? 0.5 : 0.0; - *out = rand.LargeListView(*child_array, num_lists, null_probability, false, 0.9, - kDefaultBufferAlignment, pool); + *out = rand.LargeListView(*child_array, /*length=*/num_lists, null_probability, + /*force_empty_nulls=*/false, + /*coverage=*/0.9, kDefaultBufferAlignment, pool); return Status::OK(); } From 0da6695c3c8159c364f216946425b099130907cd Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 10 Nov 2023 17:05:37 -0500 Subject: [PATCH 65/91] fixup!
random.cc: Simplify and split the random generator into two algorithms --- cpp/src/arrow/testing/random.cc | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index 33ddd476efc..87d606dd9fb 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -635,27 +635,27 @@ enable_if_parameter_free GetMetadata(const KeyValueMetadata* metad /// \param[in] seed The seed for the random number generator /// \param[in,out] data The array to shuffle template -void ShuffleListViewDataInPlace(SeedType seed, ArrayData& data) { - DCHECK_EQ(data.type->id(), ListViewType::type_id); +void ShuffleListViewDataInPlace(SeedType seed, ArrayData* data) { + DCHECK_EQ(data->type->id(), ListViewType::type_id); using offset_type = typename ListViewType::offset_type; - auto* validity = data.GetMutableValues(0, 0); - auto* offsets = data.GetMutableValues(1); - auto* sizes = data.GetMutableValues(2); + auto* validity = data->GetMutableValues(0, 0); + auto* offsets = data->GetMutableValues(1); + auto* sizes = data->GetMutableValues(2); pcg32_fast rng(seed); using UniformDist = std::uniform_int_distribution; UniformDist dist; - for (int64_t i = data.length - 1; i > 0; --i) { + for (int64_t i = data->length - 1; i > 0; --i) { const auto j = dist(rng, UniformDist::param_type(0, i)); if (ARROW_PREDICT_TRUE(i != j)) { // Swap validity bits if (validity) { - const bool valid_i = bit_util::GetBit(validity, data.offset + i); - const bool valid_j = bit_util::GetBit(validity, data.offset + i); + const bool valid_i = bit_util::GetBit(validity, data->offset + i); + const bool valid_j = bit_util::GetBit(validity, data->offset + j); if (valid_i != valid_j) { - bit_util::SetBitTo(validity, data.offset + i, valid_j); - bit_util::SetBitTo(validity, data.offset + j, valid_i); + bit_util::SetBitTo(validity, data->offset + i, valid_j); + bit_util::SetBitTo(validity, data->offset +
j, valid_i); } } // Swap offsets and sizes @@ -764,8 +764,8 @@ Result> ArrayOfListView(RAG& self, const Field& field, ARROW_ASSIGN_OR_RAISE(auto list_view_array, ArrayType::FromArrays(field.type(), *offsets, *lengths, *values)); - ShuffleListViewDataInPlace(self.seed(), - const_cast(*list_view_array->data())); + ShuffleListViewDataInPlace( + self.seed(), const_cast(list_view_array->data().get())); return list_view_array; } @@ -816,8 +816,8 @@ Result> RandomListView(RAG& self, const Array& values, ARROW_ASSIGN_OR_RAISE( auto list_view_array, ArrayType::FromArrays(*offsets_array, *sizes_array, values, memory_pool)); - ShuffleListViewDataInPlace(self.seed(), - const_cast(*list_view_array->data())); + ShuffleListViewDataInPlace( + self.seed(), const_cast(list_view_array->data().get())); return list_view_array; } From 551466a35f0c9e43f8838b362d8c56cb605466dd Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 10 Nov 2023 17:17:18 -0500 Subject: [PATCH 66/91] fixup! [Large]ListViewArray: Implement Compare + most of the unit tests --- cpp/src/arrow/testing/random.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index 87d606dd9fb..d9409e80a94 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -1295,6 +1295,7 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t #undef GENERATE_INTEGRAL_CASE #undef GENERATE_FLOATING_CASE #undef GENERATE_LIST_CASE +#undef GENERATE_LIST_VIEW_CASE #undef VALIDATE_RANGE #undef VALIDATE_MIN_MAX From 6922d270e80e0b682b8660586b12b65d73b6d951 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 10 Nov 2023 17:39:03 -0500 Subject: [PATCH 67/91] fixup! 
list_util.h: Add RangeOfValuesUsed() function --- cpp/src/arrow/util/list_util.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cpp/src/arrow/util/list_util.h b/cpp/src/arrow/util/list_util.h index b893f3d5d6c..cbfd71e0055 100644 --- a/cpp/src/arrow/util/list_util.h +++ b/cpp/src/arrow/util/list_util.h @@ -25,10 +25,6 @@ namespace arrow { namespace list_util { - -/// \brief Get the child array holding the values from a List or ListView array -inline const ArraySpan& ValuesArray(const ArraySpan& span) { return span.child_data[0]; } - namespace internal { /// \brief Calculate the smallest continuous range of values used by the From 8e6af3e621340c95617d50d5e1d4829a8527b05f Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 10 Nov 2023 17:49:09 -0500 Subject: [PATCH 68/91] fixup! list_util.cc: Rewrite MinViewOffset and MaxViewEnd --- cpp/src/arrow/util/list_util.cc | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/cpp/src/arrow/util/list_util.cc b/cpp/src/arrow/util/list_util.cc index 3392e7cb590..5ef7fecb9ab 100644 --- a/cpp/src/arrow/util/list_util.cc +++ b/cpp/src/arrow/util/list_util.cc @@ -92,27 +92,16 @@ std::optional MinViewOffset(const ArraySpan& input) { /// \param input A LIST_VIEW or LARGE_LIST_VIEW array template int64_t MaxViewEnd(const ArraySpan& input) { - constexpr auto kInt64Max = std::numeric_limits::max(); const auto values_length = input.child_data[0].length; const uint8_t* validity = input.buffers[0].data; const auto* offsets = input.GetValues(1); const auto* sizes = input.GetValues(2); - // Early-exit: 64-bit overflow detected. This is not possible on a valid list-view, - // but we return the maximum possible value to avoid undefined behavior. 
-#define MAX_VIEW_END_OVERFLOW_CHECK(offset, size) \ - if constexpr (sizeof(offset_type) == sizeof(int64_t)) { \ - if (ARROW_PREDICT_FALSE((offset) > kInt64Max - (size))) { \ - return kInt64Max; \ - } \ - } - #define MAXIMIZE_MAX_VIEW_END(i) \ const auto offset = static_cast(offsets[i]); \ const offset_type size = sizes[i]; \ if (size > 0) { \ - MAX_VIEW_END_OVERFLOW_CHECK(offset, size); \ const int64_t end = offset + size; \ if (end > max_end) { \ if (end == values_length) { \ @@ -141,7 +130,6 @@ int64_t MaxViewEnd(const ArraySpan& input) { } return max_end; -#undef MAX_VIEW_END_OVERFLOW_CHECK #undef MAXIMIZE_MAX_VIEW_END } From 698b1df017d5f2af9d66a67eeca04f3e22bcc40f Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 15 Nov 2023 21:59:37 -0300 Subject: [PATCH 69/91] fixup! More throroughly document builder member functions --- cpp/src/arrow/array/builder_nested.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index bca0095d9d1..2f73ebc8df5 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -108,6 +108,10 @@ class ARROW_EXPORT VarLengthListLikeBuilder : public ArrayBuilder { /// slots or an invalid [Large]ListView array will be generated because there /// aren't enough elements in the values builder to fill the list slots. /// + /// If you're building a [Large]List and don't need to be compatible + /// with [Large]ListView, then `BaseListBuilder::Append(bool is_valid)` + /// is a simpler API. + /// /// \pre if is_valid is false, list_length MUST be 0 /// \param is_valid Whether the new list slot is valid /// \param list_length The number of elements in the list From e9fc0054dac507dd6f5ad5c727db4247bb2b19ff Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 15 Nov 2023 22:00:26 -0300 Subject: [PATCH 70/91] fixup! 
[Large]ListViewArrayBuilder: Add list-view builder classes --- cpp/src/arrow/array/builder_nested.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index 2f73ebc8df5..c0f5d5be2f7 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -305,6 +305,9 @@ class ARROW_EXPORT BaseListBuilder : public VarLengthListLikeBuilder { // consistent with the offsets to rule out the possibility that the caller // is passing sizes that could work if building a list-view, but don't work // on building a list that requires offsets to be non-decreasing. + // CAUTION: the last size element (`sizes[length - 1]`) is not + // validated and could be inconsistent with the offsets given in a + // subsequent call to AppendValues. if (sizes) { for (int64_t i = 0; i < length - 1; ++i) { if (ARROW_PREDICT_FALSE(offsets[i] != offsets[i + 1] - sizes[i])) { From 308781b51691960a72cfcfadff35c1df65378fc5 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 15 Nov 2023 18:47:21 -0300 Subject: [PATCH 71/91] fixup! [Large]ListViewArray: Implement Compare + most of the unit tests --- cpp/src/arrow/array/array_list_test.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index 271fde671f5..f8150a25226 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -623,7 +623,6 @@ class TestListArray : public ::testing::Test { void TestBulkAppendInvalid() { std::vector values = {0, 1, 2, 3, 4, 5, 6}; std::vector is_valid = {1, 0, 1}; - // Should be {0, 3, 3} given the is_valid array std::vector offsets = {0, 2, 4}; std::vector sizes = {2, 2, 4}; From ec0e10e131643de6ed53d2435468ecc854627fe9 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 15 Nov 2023 19:08:36 -0300 Subject: [PATCH 72/91] fixup! 
Rewrite ListView Flatten --- cpp/src/arrow/array/array_nested.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index 9f69d0d0c25..58ebf47dce9 100644 --- a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -321,6 +321,11 @@ class ARROW_EXPORT ListViewArray : public BaseListViewArray { /// Note that it's different from `values()` in that it takes into /// consideration this array's offsets (which can be in any order) /// and sizes. Nulls are skipped. + /// + /// This function invokes Concatenate() if list-views are non-contiguous. It + /// will try to minimize the number of array slices passed to Concatenate() by + /// maximizing the size of each slice (containing as many contiguous + /// list-views as possible). Result> Flatten( MemoryPool* memory_pool = default_memory_pool()) const; From 8b8c6f0ae29fc205c807b6ef13ad4d526f979f8e Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 15 Nov 2023 19:32:08 -0300 Subject: [PATCH 73/91] fixup! [Large]ListViewArrayBuilder: Add list-view builder classes --- cpp/src/arrow/array/builder_nested.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index c0f5d5be2f7..21c2d4b270e 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -301,13 +301,15 @@ class ARROW_EXPORT BaseListBuilder : public VarLengthListLikeBuilder { Status AppendValues(const offset_type* offsets, const offset_type* sizes, int64_t length, const uint8_t* valid_bytes) final { - // offsets are assumed to be valid, but the first length-1 sizes have to be - // consistent with the offsets to rule out the possibility that the caller - // is passing sizes that could work if building a list-view, but don't work - // on building a list that requires offsets to be non-decreasing. 
+ // Offsets are assumed to be valid, but the first length-1 sizes have to be + // consistent with the offsets to partially rule out the possibility that the + // caller is passing sizes that could work if building a list-view, but don't + // work on building a list that requires offsets to be non-decreasing. + // // CAUTION: the last size element (`sizes[length - 1]`) is not // validated and could be inconsistent with the offsets given in a // subsequent call to AppendValues. +#ifndef NDEBUG if (sizes) { for (int64_t i = 0; i < length - 1; ++i) { if (ARROW_PREDICT_FALSE(offsets[i] != offsets[i + 1] - sizes[i])) { @@ -318,6 +320,7 @@ class ARROW_EXPORT BaseListBuilder : public VarLengthListLikeBuilder { } } } +#endif return AppendValues(offsets, length, valid_bytes); } From 2d8e55f510f4cf99eca7416b95440702f18766ce Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 15 Nov 2023 19:35:37 -0300 Subject: [PATCH 74/91] fixup! fixup! [Large]ListViewScalar: Implement all operations --- cpp/src/arrow/array/util.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 38ee30508ea..099d5d113ea 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -390,7 +390,7 @@ class NullArrayFactory { // values array may be empty, but there must be at least one offset of 0 RETURN_NOT_OK(MaxOf(sizeof(typename T::offset_type) * (length_ + 1))); // XXX(felipec): reviewers, is this correct? - RETURN_NOT_OK(MaxOf(GetBufferLength(type.value_type(), length_))); + RETURN_NOT_OK(MaxOf(GetBufferLength(type.value_type(), /*length=*/0))); return Status::OK(); } @@ -398,7 +398,7 @@ class NullArrayFactory { enable_if_list_view Visit(const T& type) { RETURN_NOT_OK(MaxOf(sizeof(typename T::offset_type) * length_)); // XXX(felipec): reviewers, is this correct? 
- RETURN_NOT_OK(MaxOf(GetBufferLength(type.value_type(), length_))); + RETURN_NOT_OK(MaxOf(GetBufferLength(type.value_type(), /*length=*/0))); return Status::OK(); } From 0972d5cff78f647674ad98a436752ff44416c102 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 15 Nov 2023 22:13:03 -0300 Subject: [PATCH 75/91] fixup! [Large]ListViewArray: Implement Validate --- cpp/src/arrow/array/validate.cc | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 717331a4dcb..89562161d2f 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -584,7 +584,7 @@ struct ValidateArrayImpl { const Buffer& values = *data.buffers[2]; // First validate offsets, to make sure the accesses below are valid - RETURN_NOT_OK(ValidateOffsets(type, values.size())); + RETURN_NOT_OK(ValidateOffsetsAndSizes(type, values.size())); if (data.length > 0 && data.buffers[1]->is_cpu()) { using offset_type = typename BinaryType::offset_type; @@ -704,7 +704,7 @@ struct ValidateArrayImpl { } // First validate offsets, to make sure the accesses below are valid - RETURN_NOT_OK(ValidateOffsets(type, values.offset + values.length)); + RETURN_NOT_OK(ValidateOffsetsAndSizes(type, values.offset + values.length)); // An empty list array can have 0 offsets if (data.length > 0 && data.buffers[1]->is_cpu()) { @@ -884,8 +884,9 @@ struct ValidateArrayImpl { return Status::OK(); } + public: template - Status ValidateOffsetsAndMaybeSizes(const TypeClass&, int64_t offset_limit) { + Status ValidateOffsetsAndSizes(const TypeClass&, int64_t offset_limit) { using offset_type = typename TypeClass::offset_type; constexpr bool is_list_view = is_list_view_type::value; @@ -935,21 +936,6 @@ struct ValidateArrayImpl { return Status::OK(); } - public: - template - enable_if_list_view ValidateOffsetsAndSizes(const TypeClass& type, - int64_t offset_limit) { - return 
ValidateOffsetsAndMaybeSizes(type, offset_limit); - } - - template - std::enable_if_t::value || - is_base_binary_like(TypeClass::type_id), - Status> - ValidateOffsets(const TypeClass& type, int64_t offset_limit) { - return ValidateOffsetsAndMaybeSizes(type, offset_limit); - } - template Status ValidateDecimals(const DecimalType& type) { using CType = typename TypeTraits::CType; From 5f57008b60891fb45cb20b8744510741a6d64cb4 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 15 Nov 2023 22:13:03 -0300 Subject: [PATCH 76/91] Remove sparsity parameter from random list-view generator --- cpp/src/arrow/testing/random.cc | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index d9409e80a94..c317fe7aef4 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -670,28 +670,17 @@ void ShuffleListViewDataInPlace(SeedType seed, ArrayData* data) { /// The sizes buffer is an input of this function, but when force_empty_nulls is true, /// some values on the sizes buffer can be set to 0. /// -/// When sparsity is 0.0, the list-view spans are perfectly packed one after the -/// other. If sparsity is greater than 0.0, the list-view spans are set apart -/// from each other in proportion to the sparsity value and size of each -/// list-view. A negative sparsity means each list-view shares a fraction of the -/// values used by the previous list-view. -/// -/// For instance, a sparsity of -1.0 means the values array will only need enough values -/// for the largest list-view with all the other list-views spanning some of these same -/// values. 
-/// /// \param[in] seed The seed for the random number generator /// \param[in,out] mutable_sizes_array The array of sizes to use /// \param[in] force_empty_nulls Whether to force null list-view sizes to be 0 /// \param[in] zero_undefined_offsets Whether to zero the offsets of list-views that have /// 0 set as the size -/// \param[in] sparsity The sparsity of the generated list-view offsets /// \param[out] out_max_view_end The maximum value of the end of a list-view template std::shared_ptr ViewOffsetsFromLengthsArray( SeedType seed, OffsetArrayType& mutable_sizes_array, bool force_empty_nulls, - bool zero_undefined_offsets, double sparsity, int64_t* out_max_view_end, - int64_t alignment, MemoryPool* memory_pool) { + bool zero_undefined_offsets, int64_t* out_max_view_end, int64_t alignment, + MemoryPool* memory_pool) { using TypeClass = typename OffsetArrayType::TypeClass; auto* sizes = mutable_sizes_array.data()->template GetMutableValues(1); @@ -702,10 +691,9 @@ std::shared_ptr ViewOffsetsFromLengthsArray( alignment, memory_pool); auto offsets = buffers[1]->mutable_data_as(); - double offset_base = 0.0; + offset_type offset = 0; offset_type max_view_end = 0; for (int64_t i = 0; i < mutable_sizes_array.length(); ++i) { - const auto offset = static_cast(std::llround(offset_base)); if (mutable_sizes_array.IsNull(i)) { if (force_empty_nulls) { sizes[i] = 0; @@ -717,7 +705,7 @@ std::shared_ptr ViewOffsetsFromLengthsArray( } else { offsets[i] = offset; DCHECK_LT(offset, std::numeric_limits::max() - sizes[i]); - offset_base = std::max(0.0, offset_base + (sparsity * sizes[i])); + offset += sizes[i]; } } max_view_end = std::max(max_view_end, offsets[i] + sizes[i]); @@ -748,15 +736,14 @@ Result> ArrayOfListView(RAG& self, const Field& field, GetMetadata(field.metadata().get(), "force_empty_nulls", false); const auto zero_undefined_offsets = GetMetadata(field.metadata().get(), "zero_undefined_offsets", false); - const auto sparsity = GetMetadata(field.metadata().get(), 
"sparsity", 0.0); const auto lengths = internal::checked_pointer_cast( self.RAG::template Numeric( length, min_length, max_length, null_probability)); int64_t max_view_end = 0; const auto offsets = ViewOffsetsFromLengthsArray( - self.seed(), *lengths, force_empty_nulls, zero_undefined_offsets, sparsity, - &max_view_end, alignment, memory_pool); + self.seed(), *lengths, force_empty_nulls, zero_undefined_offsets, &max_view_end, + alignment, memory_pool); const auto values = self.RAG::ArrayOf( *internal::checked_pointer_cast(field.type())->value_field(), From d554a131befa11b573c3d588ae3c9ce845d88e95 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 16 Nov 2023 12:41:54 -0300 Subject: [PATCH 77/91] Move list-x-list-view converters from list_util.h/cc to array_nested.h/cc --- cpp/src/arrow/array/array_nested.cc | 106 +++++++++++++++++++++++++++ cpp/src/arrow/array/array_nested.h | 16 ++++ cpp/src/arrow/util/list_util.cc | 104 -------------------------- cpp/src/arrow/util/list_util.h | 16 ---- cpp/src/arrow/util/list_util_test.cc | 10 ++- 5 files changed, 128 insertions(+), 124 deletions(-) diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index d1044b95100..03f3e5af299 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -27,6 +27,8 @@ #include "arrow/array/array_base.h" #include "arrow/array/array_primitive.h" +#include "arrow/array/builder_base.h" +#include "arrow/array/builder_nested.h" #include "arrow/array/concatenate.h" #include "arrow/array/util.h" #include "arrow/buffer.h" @@ -38,6 +40,7 @@ #include "arrow/util/bitmap_generate.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/list_util.h" #include "arrow/util/logging.h" namespace arrow { @@ -371,6 +374,79 @@ std::shared_ptr BoxSizes(const std::shared_ptr& boxed_type, return MakeArray(sizes_data); } +template +Result> ListViewFromListImpl( + const std::shared_ptr& 
list_data, MemoryPool* pool) { + static_assert( + std::is_same::value, + "Offset types between list type and list-view type are expected to match"); + using offset_type = typename SrcListType::offset_type; + const auto& list_type = checked_cast(*list_data->type); + + // To re-use the validity and offsets buffers, a sizes buffer with enough + // padding on the beginning is allocated and filled with the sizes after + // list_data->offset. + const int64_t buffer_length = list_data->offset + list_data->length; + ARROW_ASSIGN_OR_RAISE(auto sizes_buffer, + AllocateBuffer(buffer_length * sizeof(offset_type), pool)); + const auto* offsets = list_data->template GetValues(1, 0); + auto* sizes = sizes_buffer->mutable_data_as(); + // Zero the initial padding area to avoid leaking any data when buffers are + // sent over IPC or through the C Data interface. + memset(sizes, 0, list_data->offset * sizeof(offset_type)); + for (int64_t i = list_data->offset; i < buffer_length; i++) { + sizes[i] = offsets[i + 1] - offsets[i]; + } + BufferVector buffers = {list_data->buffers[0], list_data->buffers[1], + std::move(sizes_buffer)}; + + return ArrayData::Make(std::make_shared(list_type.value_type()), + list_data->length, std::move(buffers), + {list_data->child_data[0]}, list_data->null_count, + list_data->offset); +} + +template +Result> ListFromListViewImpl( + const std::shared_ptr& list_view_data, MemoryPool* pool) { + static_assert( + std::is_same::value, + "Offset types between list type and list-view type are expected to match"); + using offset_type = typename DestListType::offset_type; + using ListBuilderType = typename TypeTraits::BuilderType; + + const auto& list_view_type = + checked_cast(*list_view_data->type); + const auto& value_type = list_view_type.value_type(); + const auto list_type = std::make_shared(value_type); + + ARROW_ASSIGN_OR_RAISE(auto sum_of_list_view_sizes, + list_util::internal::SumOfLogicalListSizes(*list_view_data)); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr
value_builder, + MakeBuilder(value_type, pool)); + RETURN_NOT_OK(value_builder->Reserve(sum_of_list_view_sizes)); + auto list_builder = std::make_shared(pool, value_builder, list_type); + RETURN_NOT_OK(list_builder->Reserve(list_view_data->length)); + + ArraySpan values{*list_view_data->child_data[0]}; + const auto* in_validity_bitmap = list_view_data->GetValues(0); + const auto* in_offsets = list_view_data->GetValues(1); + const auto* in_sizes = list_view_data->GetValues(2); + for (int64_t i = 0; i < list_view_data->length; ++i) { + const bool is_valid = + !in_validity_bitmap || + bit_util::GetBit(in_validity_bitmap, list_view_data->offset + i); + const int64_t size = is_valid ? in_sizes[i] : 0; + RETURN_NOT_OK(list_builder->Append(is_valid, size)); + RETURN_NOT_OK(value_builder->AppendArraySlice(values, in_offsets[i], size)); + } + std::shared_ptr list_array_data; + RETURN_NOT_OK(list_builder->FinishInternal(&list_array_data)); + return list_array_data; +} + } // namespace namespace internal { @@ -427,6 +503,13 @@ Result> ListArray::FromArrays( values, pool, null_bitmap, null_count); } +Result> ListArray::FromListView(const ListViewArray& source, + MemoryPool* pool) { + ARROW_ASSIGN_OR_RAISE( + auto data, (ListFromListViewImpl(source.data(), pool))); + return std::make_shared(std::move(data)); +} + Result> ListArray::FromArrays( std::shared_ptr type, const Array& offsets, const Array& values, MemoryPool* pool, std::shared_ptr null_bitmap, int64_t null_count) { @@ -478,6 +561,14 @@ Result> LargeListArray::FromArrays( null_count); } +Result> LargeListArray::FromListView( + const LargeListViewArray& source, MemoryPool* pool) { + ARROW_ASSIGN_OR_RAISE( + auto data, + (ListFromListViewImpl(source.data(), pool))); + return std::make_shared(std::move(data)); +} + Result> LargeListArray::FromArrays( std::shared_ptr type, const Array& offsets, const Array& values, MemoryPool* pool, std::shared_ptr null_bitmap, int64_t null_count) { @@ -547,6 +638,21 @@ Result> 
ListViewArray::FromArrays( pool, null_bitmap, null_count); } +Result> ListViewArray::FromList(const ListArray& source, + MemoryPool* pool) { + ARROW_ASSIGN_OR_RAISE( + auto data, (ListViewFromListImpl(source.data(), pool))); + return std::make_shared(std::move(data)); +} + +Result> LargeListViewArray::FromList( + const LargeListArray& source, MemoryPool* pool) { + ARROW_ASSIGN_OR_RAISE( + auto data, + (ListViewFromListImpl(source.data(), pool))); + return std::make_shared(std::move(data)); +} + Result> ListViewArray::Flatten(MemoryPool* memory_pool) const { if (null_count() > 0) { return FlattenListViewArray(*this, memory_pool); diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index 58ebf47dce9..8a19dee095f 100644 --- a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -161,6 +161,10 @@ class ARROW_EXPORT ListArray : public BaseListArray { std::shared_ptr null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount); + /// \brief Build a ListArray from a ListViewArray + static Result> FromListView(const ListViewArray& source, + MemoryPool* pool); + /// \brief Return an Array that is a concatenation of the lists in this array. /// /// Note that it's different from `values()` in that it takes into @@ -225,6 +229,10 @@ class ARROW_EXPORT LargeListArray : public BaseListArray { std::shared_ptr null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount); + /// \brief Build a LargeListArray from a LargeListViewArray + static Result> FromListView( + const LargeListViewArray& source, MemoryPool* pool); + /// \brief Return an Array that is a concatenation of the lists in this array. 
/// /// Note that it's different from `values()` in that it takes into @@ -316,6 +324,10 @@ class ARROW_EXPORT ListViewArray : public BaseListViewArray { std::shared_ptr null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount); + /// \brief Build a ListViewArray from a ListArray + static Result> FromList(const ListArray& list_array, + MemoryPool* pool); + /// \brief Return an Array that is a concatenation of the list-views in this array. /// /// Note that it's different from `values()` in that it takes into @@ -400,6 +412,10 @@ class ARROW_EXPORT LargeListViewArray : public BaseListViewArray null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount); + /// \brief Build a LargeListViewArray from a LargeListArray + static Result> FromList( + const LargeListArray& list_array, MemoryPool* pool); + /// \brief Return an Array that is a concatenation of the large list-views in this /// array. /// diff --git a/cpp/src/arrow/util/list_util.cc b/cpp/src/arrow/util/list_util.cc index 5ef7fecb9ab..6d7508f9664 100644 --- a/cpp/src/arrow/util/list_util.cc +++ b/cpp/src/arrow/util/list_util.cc @@ -190,78 +190,6 @@ int64_t SumOfListViewSizes(const ArraySpan& input) { return sum; } -template -Result> ListViewFromListImpl( - const std::shared_ptr& list_data, MemoryPool* pool) { - static_assert( - std::is_same::value, - "Offset types between list type and list-view type are expected to match"); - using offset_type = typename SrcListType::offset_type; - const auto& list_type = checked_cast(*list_data->type); - - // To re-use the validity and offsets buffers, a sizes buffer with enough - // padding on the beginning is allocated and filled with the sizes after - // list_data->offset. 
- const int64_t buffer_length = list_data->offset + list_data->length; - ARROW_ASSIGN_OR_RAISE(auto sizes_buffer, - AllocateBuffer(buffer_length * sizeof(offset_type), pool)); - const auto* offsets = list_data->template GetValues(1, 0); - auto* sizes = sizes_buffer->mutable_data_as(); - // Zero the initial padding area to avoid leaking any data when buffers are - // sent over IPC or throught the C Data interface. - memset(sizes, 0, list_data->offset * sizeof(offset_type)); - for (int64_t i = list_data->offset; i < buffer_length; i++) { - sizes[i] = offsets[i + 1] - offsets[i]; - } - BufferVector buffers = {list_data->buffers[0], list_data->buffers[1], - std::move(sizes_buffer)}; - - return ArrayData::Make(std::make_shared(list_type.value_type()), - list_data->length, std::move(buffers), - {list_data->child_data[0]}, list_data->null_count, - list_data->offset); -} - -template -Result> ListFromListViewImpl( - const std::shared_ptr& list_view_data, MemoryPool* pool) { - static_assert( - std::is_same::value, - "Offset types between list type and list-view type are expected to match"); - using offset_type = typename DestListType::offset_type; - using ListBuilderType = typename TypeTraits::BuilderType; - - const auto& list_view_type = - checked_cast(*list_view_data->type); - const auto& value_type = list_view_type.value_type(); - const auto list_type = std::make_shared(value_type); - - auto sum_of_list_view_sizes = SumOfListViewSizes(*list_view_data); - ARROW_ASSIGN_OR_RAISE(std::shared_ptr value_builder, - MakeBuilder(value_type, pool)); - RETURN_NOT_OK(value_builder->Reserve(sum_of_list_view_sizes)); - auto list_builder = std::make_shared(pool, value_builder, list_type); - RETURN_NOT_OK(list_builder->Reserve(list_view_data->length)); - - ArraySpan values{*list_view_data->child_data[0]}; - const auto* in_validity_bitmap = list_view_data->GetValues(0); - const auto* in_offsets = list_view_data->GetValues(1); - const auto* in_sizes = list_view_data->GetValues(2); - for 
(int64_t i = 0; i < list_view_data->length; ++i) { - const bool is_valid = - !in_validity_bitmap || - bit_util::GetBit(in_validity_bitmap, list_view_data->offset + i); - const int64_t size = is_valid ? in_sizes[i] : 0; - RETURN_NOT_OK(list_builder->Append(is_valid, size)); - RETURN_NOT_OK(value_builder->AppendArraySlice(values, in_offsets[i], size)); - } - std::shared_ptr list_array_data; - RETURN_NOT_OK(list_builder->FinishInternal(&list_array_data)); - return list_array_data; -} - } // namespace Result> RangeOfValuesUsed(const ArraySpan& input) { @@ -304,38 +232,6 @@ Result SumOfLogicalListSizes(const ArraySpan& input) { "SumOfLogicalListSizes: input is not a var-length list-like array"); } -Result> ListViewFromList(const ListArray& source, - MemoryPool* pool) { - ARROW_ASSIGN_OR_RAISE( - auto data, - (internal::ListViewFromListImpl(source.data(), pool))); - return std::make_shared(std::move(data)); -} - -Result> ListViewFromList(const LargeListArray& source, - MemoryPool* pool) { - ARROW_ASSIGN_OR_RAISE(auto data, - (internal::ListViewFromListImpl( - source.data(), pool))); - return std::make_shared(std::move(data)); -} - -Result> ListFromListView(const ListViewArray& source, - MemoryPool* pool) { - ARROW_ASSIGN_OR_RAISE( - auto data, - (internal::ListFromListViewImpl(source.data(), pool))); - return std::make_shared(std::move(data)); -} - -Result> ListFromListView(const LargeListViewArray& source, - MemoryPool* pool) { - ARROW_ASSIGN_OR_RAISE(auto data, - (internal::ListFromListViewImpl( - source.data(), pool))); - return std::make_shared(std::move(data)); -} - } // namespace internal } // namespace arrow::list_util diff --git a/cpp/src/arrow/util/list_util.h b/cpp/src/arrow/util/list_util.h index cbfd71e0055..82c732bd153 100644 --- a/cpp/src/arrow/util/list_util.h +++ b/cpp/src/arrow/util/list_util.h @@ -49,22 +49,6 @@ ARROW_EXPORT Result> RangeOfValuesUsed( /// \return The sum of all list or list-view sizes ARROW_EXPORT Result SumOfLogicalListSizes(const 
ArraySpan& input); -/// \brief Build a ListViewArray from a ListArray -ARROW_EXPORT Result> ListViewFromList( - const ListArray& source, MemoryPool* pool); - -/// \brief Build a LargeListViewArray from a LargeListArray -ARROW_EXPORT Result> ListViewFromList( - const LargeListArray& source, MemoryPool* pool); - -/// \brief Build a ListArray from a ListViewArray -ARROW_EXPORT Result> ListFromListView( - const ListViewArray& source, MemoryPool* pool); - -/// \brief Build a LargeListArray from a LargeListViewArray -ARROW_EXPORT Result> ListFromListView( - const LargeListViewArray& source, MemoryPool* pool); - } // namespace internal } // namespace list_util diff --git a/cpp/src/arrow/util/list_util_test.cc b/cpp/src/arrow/util/list_util_test.cc index 425580a7084..954646e0b3b 100644 --- a/cpp/src/arrow/util/list_util_test.cc +++ b/cpp/src/arrow/util/list_util_test.cc @@ -169,6 +169,7 @@ class TestListConversions : public ::testing::Test { template void DoTestListViewFromList() { + using DestListViewArrayClass = typename TypeTraits::ArrayType; using SrcListArrayClass = typename TypeTraits::ArrayType; auto list_type = std::make_shared(int32()); auto list_view_type = std::make_shared(int32()); @@ -183,13 +184,13 @@ class TestListConversions : public ::testing::Test { auto list_wo_nulls = ArrayFromJSON(list_type, "[[1, 2], [], [100000]]"); ASSERT_OK_AND_ASSIGN( - auto result, list_util::internal::ListViewFromList( + auto result, DestListViewArrayClass::FromList( *checked_pointer_cast(list_w_nulls), pool_)); ASSERT_OK(result->ValidateFull()); AssertArraysEqual(*expected_list_view_w_nulls, *result, /*verbose=*/true); ASSERT_OK_AND_ASSIGN( - result, list_util::internal::ListViewFromList( + result, DestListViewArrayClass::FromList( *checked_pointer_cast(list_wo_nulls), pool_)); ASSERT_OK(result->ValidateFull()); AssertArraysEqual(*expected_list_view_wo_nulls, *result, /*verbose=*/true); @@ -198,6 +199,7 @@ class TestListConversions : public ::testing::Test { template void 
DoTestListFromListView() { using SrcListViewArrayClass = typename TypeTraits::ArrayType; + using DestListArrayClass = typename TypeTraits::ArrayType; auto list_view_type = std::make_shared(int32()); auto list_type = std::make_shared(int32()); @@ -210,14 +212,14 @@ class TestListConversions : public ::testing::Test { ASSERT_OK_AND_ASSIGN( auto result, - list_util::internal::ListFromListView( + DestListArrayClass::FromListView( *checked_pointer_cast(list_view_w_nulls), pool_)); ASSERT_OK(result->ValidateFull()); AssertArraysEqual(*expected_list_w_nulls, *result, /*verbose=*/true); ASSERT_OK_AND_ASSIGN( result, - list_util::internal::ListFromListView( + DestListArrayClass::FromListView( *checked_pointer_cast(list_view_wo_nulls), pool_)); ASSERT_OK(result->ValidateFull()); AssertArraysEqual(*expected_list_wo_nulls, *result, /*verbose=*/true); From aef0cb6e10372db63520a52f86d6715f3f18575e Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 16 Nov 2023 12:44:14 -0300 Subject: [PATCH 78/91] Move list-x-list-view converter tests to list_array_test.cc --- cpp/src/arrow/array/array_list_test.cc | 76 ++++++++++++++++++++++++++ cpp/src/arrow/util/list_util_test.cc | 76 -------------------------- 2 files changed, 76 insertions(+), 76 deletions(-) diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index f8150a25226..0b591d40180 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -940,6 +940,82 @@ TYPED_TEST(TestListArray, DISABLED_TestOverflowCheck) { this->TestOverflowCheck( TYPED_TEST(TestListArray, TestOverflowCheck) { this->TestOverflowCheck(); } #endif +class TestListConversions : public ::testing::Test { + private: + MemoryPool* pool_; + + public: + TestListConversions() : pool_(default_memory_pool()) {} + + template + void DoTestListViewFromList() { + using DestListViewArrayClass = typename TypeTraits::ArrayType; + using SrcListArrayClass = typename 
TypeTraits::ArrayType; + auto list_type = std::make_shared(int32()); + auto list_view_type = std::make_shared(int32()); + + auto expected_list_view_w_nulls = + ArrayFromJSON(list_view_type, "[[1, 2], [3], [], [4], null]"); + auto expected_list_view_wo_nulls = + ArrayFromJSON(list_view_type, "[[1, 2], [], [100000]]"); + + std::shared_ptr list_w_nulls = + ArrayFromJSON(list_type, "[[1, 2], [3], [], [4], null]"); + auto list_wo_nulls = ArrayFromJSON(list_type, "[[1, 2], [], [100000]]"); + + ASSERT_OK_AND_ASSIGN( + auto result, DestListViewArrayClass::FromList( + *checked_pointer_cast(list_w_nulls), pool_)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*expected_list_view_w_nulls, *result, /*verbose=*/true); + + ASSERT_OK_AND_ASSIGN( + result, DestListViewArrayClass::FromList( + *checked_pointer_cast(list_wo_nulls), pool_)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*expected_list_view_wo_nulls, *result, /*verbose=*/true); + } + + template + void DoTestListFromListView() { + using SrcListViewArrayClass = typename TypeTraits::ArrayType; + using DestListArrayClass = typename TypeTraits::ArrayType; + auto list_view_type = std::make_shared(int32()); + auto list_type = std::make_shared(int32()); + + auto list_view_w_nulls = + ArrayFromJSON(list_view_type, "[[1, 2], [3], [], [4], null]"); + auto list_view_wo_nulls = ArrayFromJSON(list_view_type, "[[1, 2], [], [100000]]"); + + auto expected_list_w_nulls = ArrayFromJSON(list_type, "[[1, 2], [3], [], [4], null]"); + auto expected_list_wo_nulls = ArrayFromJSON(list_type, "[[1, 2], [], [100000]]"); + + ASSERT_OK_AND_ASSIGN( + auto result, + DestListArrayClass::FromListView( + *checked_pointer_cast(list_view_w_nulls), pool_)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*expected_list_w_nulls, *result, /*verbose=*/true); + + ASSERT_OK_AND_ASSIGN( + result, + DestListArrayClass::FromListView( + *checked_pointer_cast(list_view_wo_nulls), pool_)); + ASSERT_OK(result->ValidateFull()); + 
AssertArraysEqual(*expected_list_wo_nulls, *result, /*verbose=*/true); + } +}; + +TEST_F(TestListConversions, ListViewFromList) { + this->DoTestListViewFromList(); + this->DoTestListViewFromList(); +} + +TEST_F(TestListConversions, ListFromListView) { + this->DoTestListFromListView(); + this->DoTestListFromListView(); +} + // ---------------------------------------------------------------------- // Map tests diff --git a/cpp/src/arrow/util/list_util_test.cc b/cpp/src/arrow/util/list_util_test.cc index 954646e0b3b..4021180b2be 100644 --- a/cpp/src/arrow/util/list_util_test.cc +++ b/cpp/src/arrow/util/list_util_test.cc @@ -160,80 +160,4 @@ TYPED_TEST_SUITE(TestListUtils, ListAndListViewTypes); TYPED_TEST(TestListUtils, RangeOfValuesUsed) { this->TestRangeOfValuesUsed(); } -class TestListConversions : public ::testing::Test { - private: - MemoryPool* pool_; - - public: - TestListConversions() : pool_(default_memory_pool()) {} - - template - void DoTestListViewFromList() { - using DestListViewArrayClass = typename TypeTraits::ArrayType; - using SrcListArrayClass = typename TypeTraits::ArrayType; - auto list_type = std::make_shared(int32()); - auto list_view_type = std::make_shared(int32()); - - auto expected_list_view_w_nulls = - ArrayFromJSON(list_view_type, "[[1, 2], [3], [], [4], null]"); - auto expected_list_view_wo_nulls = - ArrayFromJSON(list_view_type, "[[1, 2], [], [100000]]"); - - std::shared_ptr list_w_nulls = - ArrayFromJSON(list_type, "[[1, 2], [3], [], [4], null]"); - auto list_wo_nulls = ArrayFromJSON(list_type, "[[1, 2], [], [100000]]"); - - ASSERT_OK_AND_ASSIGN( - auto result, DestListViewArrayClass::FromList( - *checked_pointer_cast(list_w_nulls), pool_)); - ASSERT_OK(result->ValidateFull()); - AssertArraysEqual(*expected_list_view_w_nulls, *result, /*verbose=*/true); - - ASSERT_OK_AND_ASSIGN( - result, DestListViewArrayClass::FromList( - *checked_pointer_cast(list_wo_nulls), pool_)); - ASSERT_OK(result->ValidateFull()); - 
AssertArraysEqual(*expected_list_view_wo_nulls, *result, /*verbose=*/true); - } - - template - void DoTestListFromListView() { - using SrcListViewArrayClass = typename TypeTraits::ArrayType; - using DestListArrayClass = typename TypeTraits::ArrayType; - auto list_view_type = std::make_shared(int32()); - auto list_type = std::make_shared(int32()); - - auto list_view_w_nulls = - ArrayFromJSON(list_view_type, "[[1, 2], [3], [], [4], null]"); - auto list_view_wo_nulls = ArrayFromJSON(list_view_type, "[[1, 2], [], [100000]]"); - - auto expected_list_w_nulls = ArrayFromJSON(list_type, "[[1, 2], [3], [], [4], null]"); - auto expected_list_wo_nulls = ArrayFromJSON(list_type, "[[1, 2], [], [100000]]"); - - ASSERT_OK_AND_ASSIGN( - auto result, - DestListArrayClass::FromListView( - *checked_pointer_cast(list_view_w_nulls), pool_)); - ASSERT_OK(result->ValidateFull()); - AssertArraysEqual(*expected_list_w_nulls, *result, /*verbose=*/true); - - ASSERT_OK_AND_ASSIGN( - result, - DestListArrayClass::FromListView( - *checked_pointer_cast(list_view_wo_nulls), pool_)); - ASSERT_OK(result->ValidateFull()); - AssertArraysEqual(*expected_list_wo_nulls, *result, /*verbose=*/true); - } -}; - -TEST_F(TestListConversions, ListViewFromList) { - this->DoTestListViewFromList(); - this->DoTestListViewFromList(); -} - -TEST_F(TestListConversions, ListFromListView) { - this->DoTestListFromListView(); - this->DoTestListFromListView(); -} - } // namespace arrow From 7d44e76f028ea7e85a07412f55979f9b96fdbcb9 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 16 Nov 2023 18:11:20 -0300 Subject: [PATCH 79/91] array_nested.h: Document the IsValid(i) pre-cond on value_length/value_slice --- cpp/src/arrow/array/array_nested.h | 35 ++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index 8a19dee095f..61606e1592d 100644 --- a/cpp/src/arrow/array/array_nested.h +++ 
b/cpp/src/arrow/array/array_nested.h @@ -85,10 +85,20 @@ class VarLengthListLikeArray : public Array { } // The following functions will not perform boundschecking + offset_type value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; } + + /// \brief Return the size of the value at a particular index + /// + /// Since non-empty null lists and list-views are possible, avoid calling this + /// function when the list at slot i is null. + /// + /// \pre IsValid(i) virtual offset_type value_length(int64_t i) const = 0; + + /// \pre IsValid(i) std::shared_ptr value_slice(int64_t i) const { return values_->Slice(value_offset(i), value_length(i)); } @@ -114,6 +124,12 @@ class BaseListArray : public VarLengthListLikeArray { const TypeClass* list_type() const { return this->var_length_list_like_type(); } + /// \brief Return the size of the value at a particular index + /// + /// Since non-empty null lists are possible, avoid calling this + /// function when the list at slot i is null. + /// + /// \pre IsValid(i) offset_type value_length(int64_t i) const final { i += this->data_->offset; return this->raw_value_offsets_[i + 1] - this->raw_value_offsets_[i]; @@ -259,14 +275,21 @@ class BaseListViewArray : public VarLengthListLikeArray { const TypeClass* list_view_type() const { return this->var_length_list_like_type(); } - /// Note that this buffer does not account for any slice offset or length. + /// \brief Note that this buffer does not account for any slice offset or length. const std::shared_ptr& value_sizes() const { return this->data_->buffers[2]; } - /// Return pointer to raw value offsets accounting for any slice offset + /// \brief Return pointer to raw value sizes accounting for any slice offset const offset_type* raw_value_sizes() const { return raw_value_sizes_ + this->data_->offset; } + /// \brief Return the size of the value at a particular index + /// + /// This should not be called if the list-view at slot i is null.
+ /// The returned size in those cases could be any value from 0 to the + /// length of the child values array. + /// + /// \pre IsValid(i) offset_type value_length(int64_t i) const final { return this->raw_value_sizes_[i + this->data_->offset]; } @@ -549,10 +572,18 @@ class ARROW_EXPORT FixedSizeListArray : public Array { i += data_->offset; return list_size_ * i; } + /// \brief Return the fixed-size of the values + /// + /// No matter the value of the index parameter, the result is the same. + /// So even when the value at slot i is null, this function will return a + /// non-zero size. + /// + /// \pre IsValid(i) int32_t value_length(int64_t i = 0) const { ARROW_UNUSED(i); return list_size_; } + /// \pre IsValid(i) std::shared_ptr value_slice(int64_t i) const { return values_->Slice(value_offset(i), value_length(i)); } From ab5711fe168112244451dc58236f59db0e8e4d1c Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 16 Nov 2023 20:39:54 -0300 Subject: [PATCH 80/91] fixup! list_util.h: Add RangeOfValuesUsed() function --- cpp/src/arrow/util/list_util.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/list_util.cc b/cpp/src/arrow/util/list_util.cc index 6d7508f9664..15196ff8c12 100644 --- a/cpp/src/arrow/util/list_util.cc +++ b/cpp/src/arrow/util/list_util.cc @@ -136,7 +136,7 @@ int64_t MaxViewEnd(const ArraySpan& input) { template std::pair RangeOfValuesUsedByListView(const ArraySpan& input) { DCHECK(is_list_view(*input.type)); - if (input.length == 0 || input.GetNullCount() == input.length) { + if (input.length == 0 || input.null_count == input.length) { return {0, 0}; } const auto min_offset = MinViewOffset(input); From 4fa8e74c79693824f88ed8622d74ad038b2e2017 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 16 Nov 2023 20:55:54 -0300 Subject: [PATCH 81/91] fixup! 
[Large]ListViewArray: Implement Validate --- cpp/src/arrow/array/validate.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 89562161d2f..8dd3eb3f90c 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -742,10 +742,11 @@ struct ValidateArrayImpl { const ArrayData& values = *data.child_data[0]; const Status child_valid = RecurseInto(values); if (!child_valid.ok()) { - return Status::Invalid("List-view child array invalid: ", child_valid.ToString()); + return Status::Invalid("List-view child array is invalid: ", + child_valid.ToString()); } // For list-views, sizes are validated together with offsets. - return ValidateOffsetsAndSizes(type, values.offset + values.length); + return ValidateOffsetsAndSizes(type, /*offset_limit=*/values.length); } template From d42aae8934c74dc9664b691abf316422564f25bb Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 16 Nov 2023 17:51:02 -0300 Subject: [PATCH 82/91] concatenate_test.cc: Isolate all the random array generation code --- cpp/src/arrow/array/concatenate_test.cc | 283 ++++++++++++++---------- 1 file changed, 167 insertions(+), 116 deletions(-) diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index 0374516a9ed..7fee0635225 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -40,27 +41,55 @@ #include "arrow/testing/random.h" #include "arrow/testing/util.h" #include "arrow/type.h" +#include "arrow/util/list_util.h" namespace arrow { -class ConcatenateTest : public ::testing::Test { - protected: - ConcatenateTest() - : rng_(seed_), - sizes_({0, 1, 2, 4, 16, 31, 1234}), - null_probabilities_({0.0, 0.1, 0.5, 0.9, 1.0}) {} +class SimpleRandomArrayGenerator { + private: + random::SeedType seed_ = 0xdeadbeef; + 
std::default_random_engine random_engine_; + random::RandomArrayGenerator rag_; public: - template - std::vector Offsets(int32_t length, int32_t slice_count) { - std::vector offsets(static_cast(slice_count + 1)); - std::default_random_engine gen(seed_); - std::uniform_int_distribution dist(0, length); - std::generate(offsets.begin(), offsets.end(), [&] { return dist(gen); }); + SimpleRandomArrayGenerator() : random_engine_(seed_), rag_(seed_) {} + + template + std::vector RandomOffsetsInRange(offset_type min_offset, + offset_type max_offset, + int64_t num_offsets) { + std::vector offsets(static_cast(num_offsets)); + std::uniform_int_distribution dist(min_offset, max_offset); + std::generate(offsets.begin(), offsets.end(), [&] { return dist(random_engine_); }); + return offsets; + } + + template + std::vector Offsets(int32_t values_length, int32_t slice_count) { + auto offsets = RandomOffsetsInRange(0, values_length, slice_count + 1); std::sort(offsets.begin(), offsets.end()); return offsets; } + /// \param[in] random_offsets Random offsets in [0, values_size] and no particular order + template + std::vector ListViewSizes(const std::vector& random_offsets, + int64_t values_size, double avg_size, + int64_t num_sizes) { + std::normal_distribution normal(/*mean=*/avg_size, /*stddev=*/3.0); + std::vector sizes; + sizes.reserve(num_sizes); + for (int64_t i = 0; i < num_sizes; ++i) { + const auto sampled_size = std::llround(normal(random_engine_)); + auto size = std::max(0, static_cast(sampled_size)); + if (random_offsets[i] > values_size - size) { + size = static_cast(values_size - random_offsets[i]); + } + sizes.push_back(size); + } + return sizes; + } + ArrayVector Slices(const std::shared_ptr& array, const std::vector& offsets) { ArrayVector slices(offsets.size() - 1); @@ -70,34 +99,117 @@ class ConcatenateTest : public ::testing::Test { return slices; } + std::shared_ptr ValidityBitmap(int64_t size, double null_probability) { + return rag_.NullBitmap(size, 
null_probability, kDefaultBufferAlignment, + default_memory_pool()); + } + template - std::shared_ptr GeneratePrimitive(int64_t size, double null_probability) { + std::shared_ptr PrimitiveArray(int64_t size, double null_probability) { if (std::is_same::value) { - return rng_.Boolean(size, 0.5, null_probability); + return rag_.Boolean(size, 0.5, null_probability); } - return rng_.Numeric(size, 0, 127, null_probability); + return rag_.Numeric(size, 0, 127, null_probability); + } + + std::shared_ptr StringArray(int64_t size, double null_probability) { + return rag_.String(size, /*min_length =*/0, /*max_length =*/15, null_probability); + } + + std::shared_ptr LargeStringArray(int64_t size, double null_probability) { + return rag_.LargeString(size, /*min_length =*/0, /*max_length =*/15, + null_probability); + } + + std::shared_ptr StringViewArray(int64_t size, double null_probability) { + return rag_.StringView(size, /*min_length =*/0, /*max_length =*/40, null_probability, + /*max_buffer_length=*/200); + } + + std::shared_ptr ArrayOf(std::shared_ptr type, int64_t size, + double null_probability) { + return rag_.ArrayOf(std::move(type), size, null_probability); + } + + // TODO(GH-38656): Use the random array generators from testing/random.h here + + template ::ArrayType> + Result> ListArray(int32_t length, + double null_probability) { + using offset_type = typename ListType::offset_type; + using OffsetArrowType = typename CTypeTraits::ArrowType; + + auto values_size = length * 4; + auto values = PrimitiveArray(values_size, null_probability); + auto offsets_vector = Offsets(values_size, length); + // Ensure first and last offsets encompass the whole values array + offsets_vector.front() = 0; + offsets_vector.back() = static_cast(values_size); + std::shared_ptr offsets; + ArrayFromVector(offsets_vector, &offsets); + return ListArrayType::FromArrays(*offsets, *values); + } + + template ::ArrayType> + Result> ListViewArray(int32_t length, + double null_probability) { + using 
offset_type = typename ListViewType::offset_type; + using OffsetArrowType = typename CTypeTraits::ArrowType; + + constexpr int kAvgListViewSize = 4; + auto values_size = kAvgListViewSize * length; + + auto values = PrimitiveArray(values_size, null_probability); + + std::shared_ptr offsets; + auto offsets_vector = RandomOffsetsInRange(0, values_size, length); + ArrayFromVector(offsets_vector, &offsets); + + std::shared_ptr sizes; + auto sizes_vector = + ListViewSizes(offsets_vector, values_size, kAvgListViewSize, length); + ArrayFromVector(sizes_vector, &sizes); + + auto validity_bitmap = ValidityBitmap(length, null_probability); + + return ListViewArrayType::FromArrays(*offsets, *sizes, *values, default_memory_pool(), + std::move(validity_bitmap)); } +}; + +class ConcatenateTest : public ::testing::Test { + private: + std::vector sizes_; + std::vector null_probabilities_; protected: + SimpleRandomArrayGenerator rag; + + ConcatenateTest() + : sizes_({0, 1, 2, 4, 16, 31, 1234}), + null_probabilities_({0.0, 0.1, 0.5, 0.9, 1.0}) {} + void CheckTrailingBitsAreZeroed(const std::shared_ptr& bitmap, int64_t length) { if (auto preceding_bits = bit_util::kPrecedingBitmask[length % 8]) { auto last_byte = bitmap->data()[length / 8]; ASSERT_EQ(static_cast(last_byte & preceding_bits), last_byte) - << length << " " << int(preceding_bits); + << length << " " << static_cast(preceding_bits); } } template void Check(ArrayFactory&& factory) { for (auto size : this->sizes_) { - auto offsets = this->Offsets(size, 3); + auto offsets = rag.Offsets(size, 3); for (auto null_probability : this->null_probabilities_) { std::shared_ptr array; factory(size, null_probability, &array); ASSERT_OK(array->ValidateFull()); auto expected = array->Slice(offsets.front(), offsets.back() - offsets.front()); ASSERT_OK(expected->ValidateFull()); - auto slices = this->Slices(array, offsets); + auto slices = rag.Slices(array, offsets); for (auto slice : slices) { ASSERT_OK(slice->ValidateFull()); } @@ -113,11 
+225,6 @@ class ConcatenateTest : public ::testing::Test { } } } - - random::SeedType seed_ = 0xdeadbeef; - random::RandomArrayGenerator rng_; - std::vector sizes_; - std::vector null_probabilities_; }; TEST(ConcatenateEmptyArraysTest, TestValueBuffersNullPtr) { @@ -145,9 +252,10 @@ class PrimitiveConcatenateTest : public ConcatenateTest { TYPED_TEST_SUITE(PrimitiveConcatenateTest, PrimitiveArrowTypes); TYPED_TEST(PrimitiveConcatenateTest, Primitives) { - this->Check([this](int64_t size, double null_probability, std::shared_ptr* out) { - *out = this->template GeneratePrimitive(size, null_probability); - }); + this->template Check( + [this](int64_t size, double null_probability, std::shared_ptr* out) { + *out = this->rag.template PrimitiveArray(size, null_probability); + }); } TEST_F(ConcatenateTest, NullType) { @@ -158,23 +266,21 @@ TEST_F(ConcatenateTest, NullType) { TEST_F(ConcatenateTest, StringType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - *out = rng_.String(size, /*min_length =*/0, /*max_length =*/15, null_probability); + *out = rag.StringArray(size, null_probability); ASSERT_OK((**out).ValidateFull()); }); } TEST_F(ConcatenateTest, StringViewType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - *out = rng_.StringView(size, /*min_length =*/0, /*max_length =*/40, null_probability, - /*max_buffer_length=*/200); + *out = rag.StringViewArray(size, null_probability); ASSERT_OK((**out).ValidateFull()); }); } TEST_F(ConcatenateTest, LargeStringType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - *out = - rng_.LargeString(size, /*min_length =*/0, /*max_length =*/15, null_probability); + *out = rag.LargeStringArray(size, null_probability); ASSERT_OK((**out).ValidateFull()); }); } @@ -183,103 +289,48 @@ TEST_F(ConcatenateTest, FixedSizeListType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { auto list_size = 3; auto values_size = size * 
list_size; - auto values = this->GeneratePrimitive(values_size, null_probability); + auto values = this->rag.PrimitiveArray(values_size, null_probability); ASSERT_OK_AND_ASSIGN(*out, FixedSizeListArray::FromArrays(values, list_size)); ASSERT_OK((**out).ValidateFull()); }); } -template -struct ListConcatenationChecker { - using offset_type = typename ListType::offset_type; - using OffsetArrowType = typename CTypeTraits::ArrowType; - using ListArrayType = typename TypeTraits::ArrayType; - - template - static void Check(Self& self, int32_t size, double null_probability, - std::shared_ptr* out) { - auto values_size = size * 4; - auto values = - self.template GeneratePrimitive(values_size, null_probability); - auto offsets_vector = self.template Offsets(values_size, size); - // Ensure first and last offsets encompass the whole values array - offsets_vector.front() = 0; - offsets_vector.back() = static_cast(values_size); - std::shared_ptr offsets; - ArrayFromVector(offsets_vector, &offsets); - ASSERT_OK_AND_ASSIGN(*out, ListArrayType::FromArrays(*offsets, *values)); - ASSERT_OK((**out).ValidateFull()); - } -}; - TEST_F(ConcatenateTest, ListType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - ListConcatenationChecker::Check(*this, size, null_probability, out); + ASSERT_OK_AND_ASSIGN(*out, this->rag.ListArray(size, null_probability)); + ASSERT_OK((**out).ValidateFull()); }); } TEST_F(ConcatenateTest, LargeListType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - ListConcatenationChecker::Check(*this, size, null_probability, out); + ASSERT_OK_AND_ASSIGN(*out, + this->rag.ListArray(size, null_probability)); + ASSERT_OK((**out).ValidateFull()); }); } -template -struct ListViewConcatenationChecker { - using offset_type = typename ListViewType::offset_type; - using OffsetArrowType = typename CTypeTraits::ArrowType; - using ListViewArrayType = typename TypeTraits::ArrayType; - - template - static void Check(Self& 
self, int32_t num_list_views, double null_probability, - std::shared_ptr* out) { - auto values_size = 4 * num_list_views; - auto values = - self.template GeneratePrimitive(values_size, null_probability); - - std::shared_ptr offsets; - auto offsets_vector = self.template Offsets(values_size, num_list_views); - offsets_vector.front() = 0; - ArrayFromVector(offsets_vector, &offsets); - - std::shared_ptr sizes; - std::vector sizes_vector; - sizes_vector.reserve(num_list_views); - for (int32_t i = 0; i < num_list_views; ++i) { - ASSERT_LE(offsets_vector[i], values_size); - offset_type size = offsets_vector[i + 1] - offsets_vector[i]; - // Make list-views share values with the next list-view by - // extending the list-view size to a point after the next offset. - size = std::min(3 * size / 2, values_size - offsets_vector[i]); - sizes_vector.push_back(size); - ASSERT_LE(offsets_vector[i] + size, values_size); - } - ASSERT_EQ(offsets_vector.size(), sizes_vector.size() + 1); - ArrayFromVector(sizes_vector, &sizes); - - ASSERT_OK_AND_ASSIGN(*out, ListViewArrayType::FromArrays(*offsets, *sizes, *values)); - ASSERT_OK((**out).ValidateFull()); - } -}; - TEST_F(ConcatenateTest, ListViewType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - ListViewConcatenationChecker::Check(*this, size, null_probability, out); + ASSERT_OK_AND_ASSIGN(*out, + this->rag.ListViewArray(size, null_probability)); + ASSERT_OK((**out).ValidateFull()); }); } TEST_F(ConcatenateTest, LargeListViewType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - ListViewConcatenationChecker::Check(*this, size, null_probability, - out); + ASSERT_OK_AND_ASSIGN( + *out, this->rag.ListViewArray(size, null_probability)); + ASSERT_OK((**out).ValidateFull()); }); } TEST_F(ConcatenateTest, StructType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - auto foo = this->GeneratePrimitive(size, null_probability); - auto bar = 
this->GeneratePrimitive(size, null_probability); - auto baz = this->GeneratePrimitive(size, null_probability); + auto foo = this->rag.PrimitiveArray(size, null_probability); + auto bar = this->rag.PrimitiveArray(size, null_probability); + auto baz = this->rag.PrimitiveArray(size, null_probability); *out = std::make_shared( struct_({field("foo", int8()), field("bar", float64()), field("baz", boolean())}), size, ArrayVector{foo, bar, baz}); @@ -288,8 +339,8 @@ TEST_F(ConcatenateTest, StructType) { TEST_F(ConcatenateTest, DictionaryType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - auto indices = this->GeneratePrimitive(size, null_probability); - auto dict = this->GeneratePrimitive(128, 0); + auto indices = rag.PrimitiveArray(size, null_probability); + auto dict = rag.PrimitiveArray(128, 0); auto type = dictionary(int32(), dict->type()); *out = std::make_shared(type, indices, dict); }); @@ -440,20 +491,20 @@ TEST_F(ConcatenateTest, DictionaryTypeNullSlots) { TEST_F(ConcatenateTest, UnionType) { // sparse mode Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - *out = rng_.ArrayOf(sparse_union({ - field("a", float64()), - field("b", boolean()), - }), - size, null_probability); + *out = rag.ArrayOf(sparse_union({ + field("a", float64()), + field("b", boolean()), + }), + size, null_probability); }); // dense mode Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - *out = rng_.ArrayOf(dense_union({ - field("a", uint32()), - field("b", boolean()), - field("c", int8()), - }), - size, null_probability); + *out = rag.ArrayOf(dense_union({ + field("a", uint32()), + field("b", boolean()), + field("c", int8()), + }), + size, null_probability); }); } @@ -471,7 +522,7 @@ TEST_F(ConcatenateTest, DenseUnionTypeOverflow) { auto type_ids_ok = ArrayFromJSON(int8(), "[0]"); auto offsets_ok = ArrayFromJSON(int32(), "[0]"); auto child_array_overflow = - this->rng_.ArrayOf(null(), 
std::numeric_limits::max() - 1, 0.0); + rag.ArrayOf(null(), std::numeric_limits::max() - 1, 0.0); ASSERT_OK_AND_ASSIGN( auto array_overflow, DenseUnionArray::Make(*type_ids_ok, *offsets_ok, {child_array_overflow})); @@ -604,7 +655,7 @@ TEST_F(ConcatenateTest, DenseUnionType) { TEST_F(ConcatenateTest, ExtensionType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - auto storage = this->GeneratePrimitive(size, null_probability); + auto storage = this->rag.PrimitiveArray(size, null_probability); *out = ExtensionType::WrapArray(smallint(), storage); }); } From 252ff8b912228a1569441579d49574f456607697 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 16 Nov 2023 23:11:36 -0300 Subject: [PATCH 83/91] concatenate.cc: Preserve the properties of offsets and sizes required by the final list-view spec --- cpp/src/arrow/array/concatenate.cc | 128 +++++++++++++++++++++++------ 1 file changed, 104 insertions(+), 24 deletions(-) diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index 0f5fc8dbedd..e77821eeb1f 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -35,6 +35,7 @@ #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_fwd.h" +#include "arrow/util/bit_block_counter.h" #include "arrow/util/bit_run_reader.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" @@ -44,6 +45,7 @@ #include "arrow/util/list_util.h" #include "arrow/util/logging.h" #include "arrow/util/ree_util.h" +#include "arrow/util/slice_util_internal.h" #include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" @@ -177,7 +179,8 @@ Status PutOffsets(const Buffer& src, Offset first_offset, Offset* dst, } template -void PutListViewOffsets(const Buffer& src, offset_type displacement, offset_type* dst); +Status PutListViewOffsets(const ArrayData& input, offset_type* sizes, const Buffer& src, + offset_type displacement, offset_type* dst); // 
Concatenate buffers holding list-view offsets into a single buffer of offsets // @@ -186,22 +189,37 @@ void PutListViewOffsets(const Buffer& src, offset_type displacement, offset_type // but when that is not the case, we need to adjust the displacement of offsets. // The concatenated child array does not contain values from the beginning // if they are not referenced to by any view. +// +// The child arrays and the sizes buffer are used to ensure we can trust the offsets in +// offset_buffers to be within the valid range. +// +// This function also mutates sizes so that null list-view entries have size 0. +// +// \param[in] in The child arrays +// \param[in,out] sizes The concatenated sizes buffer template -Status ConcatenateListViewOffsets(const BufferVector& buffers, +Status ConcatenateListViewOffsets(const ArrayDataVector& in, offset_type* sizes, + const BufferVector& offset_buffers, const std::vector& value_ranges, MemoryPool* pool, std::shared_ptr* out) { - const int64_t out_size_in_bytes = SumBufferSizesInBytes(buffers); + DCHECK_EQ(offset_buffers.size(), value_ranges.size()); + + // Allocate resulting offsets buffer and initialize it with zeros + const int64_t out_size_in_bytes = SumBufferSizesInBytes(offset_buffers); ARROW_ASSIGN_OR_RAISE(*out, AllocateBuffer(out_size_in_bytes, pool)); - auto* out_data = (*out)->mutable_data_as(); + memset((*out)->mutable_data(), 0, static_cast((*out)->size())); + + auto* out_offsets = (*out)->mutable_data_as(); int64_t num_child_values = 0; int64_t elements_length = 0; - for (size_t i = 0; i < buffers.size(); ++i) { + for (size_t i = 0; i < offset_buffers.size(); ++i) { const auto displacement = static_cast(num_child_values - value_ranges[i].offset); - PutListViewOffsets(/*src=*/*buffers[i], static_cast(displacement), - /*dst=*/out_data + elements_length); - elements_length += buffers[i]->size() / sizeof(offset_type); + RETURN_NOT_OK(PutListViewOffsets(*in[i], /*sizes=*/sizes + elements_length, + 
/*src=*/*offset_buffers[i], displacement, + /*dst=*/out_offsets + elements_length)); + elements_length += offset_buffers[i]->size() / sizeof(offset_type); num_child_values += value_ranges[i].length; if (num_child_values > std::numeric_limits::max()) { return Status::Invalid("offset overflow while concatenating arrays"); @@ -214,19 +232,78 @@ Status ConcatenateListViewOffsets(const BufferVector& buffers, } template -void PutListViewOffsets(const Buffer& src, offset_type displacement, offset_type* dst) { +Status PutListViewOffsets(const ArrayData& input, offset_type* sizes, const Buffer& src, + offset_type displacement, offset_type* dst) { if (src.size() == 0) { - return; + return Status::OK(); } - auto src_begin = src.data_as(); - auto src_end = reinterpret_cast(src.data() + src.size()); - // NOTE: Concatenate can be called during IPC reads to append delta dictionaries. - // Avoid UB on non-validated input by doing the addition in the unsigned domain. - // (the result can later be validated using Array::ValidateFull) - std::transform(src_begin, src_end, dst, [displacement](offset_type offset) { - constexpr offset_type kZero = 0; - return std::max(kZero, SafeSignedAdd(offset, displacement)); - }); + const auto& validity_buffer = input.buffers[0]; + if (validity_buffer) { + // Ensure that it is safe to access all the bits in the validity bitmap of input. + RETURN_NOT_OK(internal::CheckSliceParams(/*size=*/8 * validity_buffer->size(), + input.offset, input.length, "buffer")); + } + + const auto offsets = src.data_as(); + DCHECK_EQ(src.size() / sizeof(offset_type), input.length); + + auto visit_not_null = [&](int64_t position) { + if (sizes[position] > 0) { + // NOTE: Concatenate can be called during IPC reads to append delta + // dictionaries. Avoid UB on non-validated input by doing the addition in the + // unsigned domain. 
(the result can later be validated using + // Array::ValidateFull) + const auto displaced_offset = SafeSignedAdd(offsets[position], displacement); + // displaced_offset>=0 is guaranteed by RangeOfValuesUsed returning the + // smallest offset of valid and non-empty list-views. + DCHECK_GE(displaced_offset, 0); + dst[position] = displaced_offset; + } else { + // Do nothing to leave the dst[position] as 0. + } + }; + + const auto* validity = validity_buffer->data_as(); + internal::OptionalBitBlockCounter bit_counter(validity, input.offset, input.length); + int64_t position = 0; + while (position < input.length) { + internal::BitBlockCount block = bit_counter.NextBlock(); + if (block.AllSet()) { + for (int64_t i = 0; i < block.length; ++i, ++position) { + if (sizes[position] > 0) { + // NOTE: Concatenate can be called during IPC reads to append delta + // dictionaries. Avoid UB on non-validated input by doing the addition in the + // unsigned domain. (the result can later be validated using + // Array::ValidateFull) + const auto displaced_offset = SafeSignedAdd(offsets[position], displacement); + // displaced_offset>=0 is guaranteed by RangeOfValuesUsed returning the + // smallest offset of valid and non-empty list-views. + DCHECK_GE(displaced_offset, 0); + dst[position] = displaced_offset; + } else { + // Do nothing to leave dst[position] as 0. + } + } + } else if (block.NoneSet()) { + // NOTE: we don't have to do anything for the null entries regarding the + // offsets as the buffer is initialized to 0 when it is allocated. + + // Zero-out the sizes of the null entries to ensure these sizes are not + // greater than the new values length of the concatenated array. + memset(sizes + position, 0, block.length * sizeof(offset_type)); + position += block.length; + } else { + for (int64_t i = 0; i < block.length; ++i, ++position) { + if (bit_util::GetBit(validity, input.offset + position)) { + visit_not_null(position); + } else { + // Zero-out the size at position. 
+ sizes[position] = 0; + } + } + } + } + return Status::OK(); } class ConcatenateImpl { @@ -369,14 +446,17 @@ class ConcatenateImpl { RETURN_NOT_OK(ConcatenateImpl(value_data, pool_).Concatenate(&out_->child_data[0])); out_->child_data[0]->type = type.value_type(); + // Concatenate the sizes first + ARROW_ASSIGN_OR_RAISE(auto size_buffers, Buffers(2, sizeof(offset_type))); + RETURN_NOT_OK(ConcatenateBuffers(size_buffers, pool_).Value(&out_->buffers[2])); + // Concatenate the offsets ARROW_ASSIGN_OR_RAISE(auto offset_buffers, Buffers(1, sizeof(offset_type))); - RETURN_NOT_OK(ConcatenateListViewOffsets(offset_buffers, value_ranges, - pool_, &out_->buffers[1])); + RETURN_NOT_OK(ConcatenateListViewOffsets( + in_, /*sizes=*/out_->buffers[2]->mutable_data_as(), offset_buffers, + value_ranges, pool_, &out_->buffers[1])); - // Concatenate the sizes - ARROW_ASSIGN_OR_RAISE(auto size_buffers, Buffers(2, sizeof(offset_type))); - return ConcatenateBuffers(size_buffers, pool_).Value(&out_->buffers[2]); + return Status::OK(); } Status Visit(const FixedSizeListType& fixed_size_list) { From b30d2e576b7a04feca6ad6fd1282cf3562e81ede Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 16 Nov 2023 23:30:32 -0300 Subject: [PATCH 84/91] fixup! 
concatenate.cc: Preserve the properties of offsets and sizes required by the final list-view spec --- cpp/src/arrow/array/concatenate.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index e77821eeb1f..a982cfdd2a8 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -245,7 +245,7 @@ Status PutListViewOffsets(const ArrayData& input, offset_type* sizes, const Buff } const auto offsets = src.data_as(); - DCHECK_EQ(src.size() / sizeof(offset_type), input.length); + DCHECK_EQ(static_cast(src.size() / sizeof(offset_type)), input.length); auto visit_not_null = [&](int64_t position) { if (sizes[position] > 0) { From c2068bb922444ce31075071219e4a28b0d2a1ff7 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 16 Nov 2023 23:32:06 -0300 Subject: [PATCH 85/91] fixup! fixup! [Large]ListViewScalar: Implement all operations --- cpp/src/arrow/array/util.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 099d5d113ea..86e2ffcae4d 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -389,7 +389,6 @@ class NullArrayFactory { enable_if_var_size_list Visit(const T& type) { // values array may be empty, but there must be at least one offset of 0 RETURN_NOT_OK(MaxOf(sizeof(typename T::offset_type) * (length_ + 1))); - // XXX(felipec): reviewers, is this correct? RETURN_NOT_OK(MaxOf(GetBufferLength(type.value_type(), /*length=*/0))); return Status::OK(); } @@ -397,7 +396,6 @@ class NullArrayFactory { template enable_if_list_view Visit(const T& type) { RETURN_NOT_OK(MaxOf(sizeof(typename T::offset_type) * length_)); - // XXX(felipec): reviewers, is this correct? 
RETURN_NOT_OK(MaxOf(GetBufferLength(type.value_type(), /*length=*/0))); return Status::OK(); } From a41490e11074ecdcec7cea70455f77bd35f1c91d Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 16 Nov 2023 23:52:30 -0300 Subject: [PATCH 86/91] fixup! concatenate_test.cc: Isolate all the random array generation code --- cpp/src/arrow/array/concatenate.cc | 2 +- cpp/src/arrow/array/concatenate_test.cc | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index a982cfdd2a8..72ffadafdd8 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -263,7 +263,7 @@ Status PutListViewOffsets(const ArrayData& input, offset_type* sizes, const Buff } }; - const auto* validity = validity_buffer->data_as(); + const auto* validity = validity_buffer ? validity_buffer->data_as() : nullptr; internal::OptionalBitBlockCounter bit_counter(validity, input.offset, input.length); int64_t position = 0; while (position < input.length) { diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index 7fee0635225..0b05af15501 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -173,9 +173,11 @@ class SimpleRandomArrayGenerator { ArrayFromVector(sizes_vector, &sizes); auto validity_bitmap = ValidityBitmap(length, null_probability); + auto valid_count = internal::CountSetBits(validity_bitmap->data(), 0, length); - return ListViewArrayType::FromArrays(*offsets, *sizes, *values, default_memory_pool(), - std::move(validity_bitmap)); + return ListViewArrayType::FromArrays( + *offsets, *sizes, *values, default_memory_pool(), + valid_count == length ? nullptr : std::move(validity_bitmap)); } }; From 79c6acd8f5f5c553feba9f107c4583a3bb457bbf Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 17 Nov 2023 00:24:59 -0300 Subject: [PATCH 87/91] fixup! 
Make the C Bridge test set for list-views complete --- cpp/src/arrow/c/bridge_test.cc | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index 608e59e4227..362df833781 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -33,6 +33,7 @@ #include "arrow/c/util_internal.h" #include "arrow/ipc/json_simple.h" #include "arrow/memory_pool.h" +#include "arrow/testing/builder.h" #include "arrow/testing/extension_type.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" @@ -3829,6 +3830,24 @@ TEST_F(TestArrayRoundtrip, ListView) { TestWithJSON(list_view(int32()), "[[4, 5], [6, null], null]"); TestWithJSONSliced(list_view(int32()), "[[4, 5], [6, null], null]"); + + // Out-of-order offsets + TestWithArrayFactory([this]() -> Result> { + std::shared_ptr offsets; + ArrayFromVector(int32(), + std::vector{false, true, true, true, false, true}, + std::vector{4, 2, 1, 3, 3, 2}, &offsets); + + std::shared_ptr sizes; + ArrayFromVector(std::vector{2, 2, 3, 1, 2, 0}, &sizes); + + auto values = ArrayFromJSON(int8(), "[4, 5, 6, null, 8, null]"); + auto result = ListViewArray::FromArrays(*offsets, *sizes, *values, pool_); + if (result.ok()) { + RETURN_NOT_OK((*result)->ValidateFull()); + } + return result; + }); } TEST_F(TestArrayRoundtrip, Struct) { From 05f6f38ba67117266b1288c26f0125821b019734 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 17 Nov 2023 00:26:17 -0300 Subject: [PATCH 88/91] fixup! 
list_util.h: Add RangeOfValuesUsed() function --- cpp/src/arrow/compute/kernels/scalar_cast_nested.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index db5fda17d22..6fd449a9313 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -29,7 +29,6 @@ #include "arrow/compute/kernels/scalar_cast_internal.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/int_util.h" -#include "arrow/util/list_util.h" namespace arrow { From 773c800924dce03df23edac7fa04c4b45340a1a4 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 17 Nov 2023 00:46:25 -0300 Subject: [PATCH 89/91] fixup! concatenate_test.cc: Isolate all the random array generation code --- cpp/src/arrow/array/concatenate_test.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index 0b05af15501..af595e897f9 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -254,10 +254,9 @@ class PrimitiveConcatenateTest : public ConcatenateTest { TYPED_TEST_SUITE(PrimitiveConcatenateTest, PrimitiveArrowTypes); TYPED_TEST(PrimitiveConcatenateTest, Primitives) { - this->template Check( - [this](int64_t size, double null_probability, std::shared_ptr* out) { - *out = this->rag.template PrimitiveArray(size, null_probability); - }); + this->Check([this](int64_t size, double null_probability, std::shared_ptr* out) { + *out = this->rag.template PrimitiveArray(size, null_probability); + }); } TEST_F(ConcatenateTest, NullType) { From ab0dcaf6fdf9d64931f5068d612ace2cfdc0132b Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 22 Nov 2023 11:52:32 -0300 Subject: [PATCH 90/91] fixup! 
list_util.h: Add SumOfLogicalListSizes() utility --- cpp/src/arrow/util/list_util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/list_util.h b/cpp/src/arrow/util/list_util.h index 82c732bd153..467f4eb15ed 100644 --- a/cpp/src/arrow/util/list_util.h +++ b/cpp/src/arrow/util/list_util.h @@ -38,7 +38,7 @@ ARROW_EXPORT Result> RangeOfValuesUsed( /// \brief Calculate the sum of the sizes of all valid lists or list-views /// -/// This is usally the same as the length of the RangeOfValuesUsed() range, but +/// This is usually the same as the length of the RangeOfValuesUsed() range, but /// it can be: /// - Smaller: when the child array constains many values that are not /// referenced by the lists or list-views in the parent array From 62ede21adfc09a0290b53a0525815f58196402c3 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 22 Nov 2023 11:55:48 -0300 Subject: [PATCH 91/91] fixup! concatenate.cc: Preserve the properties of offsets and sizes required by the final list-view spec --- cpp/src/arrow/array/concatenate.cc | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index 72ffadafdd8..ff9ed66d114 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -270,19 +270,7 @@ Status PutListViewOffsets(const ArrayData& input, offset_type* sizes, const Buff internal::BitBlockCount block = bit_counter.NextBlock(); if (block.AllSet()) { for (int64_t i = 0; i < block.length; ++i, ++position) { - if (sizes[position] > 0) { - // NOTE: Concatenate can be called during IPC reads to append delta - // dictionaries. Avoid UB on non-validated input by doing the addition in the - // unsigned domain. 
(the result can later be validated using - // Array::ValidateFull) - const auto displaced_offset = SafeSignedAdd(offsets[position], displacement); - // displaced_offset>=0 is guaranteed by RangeOfValuesUsed returning the - // smallest offset of valid and non-empty list-views. - DCHECK_GE(displaced_offset, 0); - dst[position] = displaced_offset; - } else { - // Do nothing to leave dst[position] as 0. - } + visit_not_null(position); } } else if (block.NoneSet()) { // NOTE: we don't have to do anything for the null entries regarding the