From efc34f331c3ae02024e73dd9a506000c16aa6e4e Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 18 Feb 2019 16:01:44 -0500 Subject: [PATCH 01/21] fist pass at concatenate function --- cpp/src/arrow/array-test.cc | 49 +++++++ cpp/src/arrow/array.cc | 280 ++++++++++++++++++++++++++++++++++++ cpp/src/arrow/array.h | 10 ++ cpp/src/arrow/buffer.cc | 15 ++ cpp/src/arrow/buffer.h | 11 ++ 5 files changed, 365 insertions(+) diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index b9786828065..9ab1211e182 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -1785,4 +1785,53 @@ TEST(TestRechunkArraysConsistently, Plain) { } } +// TODO this could be made much better. +// First, if the concatenation of addends equals the expected then +// the concatenation of their JSON representations must equal that of the +// expected array, so we only need to provide the addends as strings. +// More generally, Concatenate is an excellent candidate for property testing: +// generate random addends then concatenate them using their json repr and +// Concatenate, then compare. +struct ConcatenateParam { + std::shared_ptr expected; + std::vector> addends; + + ConcatenateParam(std::shared_ptr expected_type, + const std::string& expected_json, + const std::vector& addends_json) + : expected(ArrayFromJSON(expected_type, expected_json)) { + for (const auto& addend : addends_json) { + addends.push_back(ArrayFromJSON(expected_type, addend)); + } + } +}; + +class ConcatenateTest : public ::testing::TestWithParam { + public: + ConcatenateTest() {} +}; + +TEST_P(ConcatenateTest, Basics) { + auto param = GetParam(); + std::shared_ptr actual; + ASSERT_OK(Concatenate(param.addends, default_memory_pool(), &actual)); + AssertArraysEqual(*param.expected, *actual); +} + +INSTANTIATE_TEST_CASE_P( + ConcatenateTest, ConcatenateTest, + ::testing::Values( + ConcatenateParam(int32(), "[0, 1, 2, 3]", {"[0, 1]", "[2, 3]"}), + ConcatenateParam(float64(), "[0, 1, 2, 3]", {"[0, 1]", "[2, 3]"}), + ConcatenateParam(uint8(), "[0, 1, 2, 3]", {"[0, 1]", "[2, 3]"}), + ConcatenateParam(list(uint8()), "[[], [0, 1], [2], [3]]", + {"[[], [0, 1]]", "[[2], [3]]"}), + ConcatenateParam(utf8(), R"(["a", "b", "c", "d"])", + {R"(["a", "b"])", R"(["c", "d"])"}), + ConcatenateParam( + struct_({field("strings", utf8()), field("ints", int64())}), + R"([{"strings":"a", "ints":0}, {"strings":"b", "ints":1}, {"strings":"c", "ints":2}, {"strings":"d", "ints":3}])", + {R"([{"strings":"a", "ints":0}, {"strings":"b", "ints":1}])", + R"([{"strings":"c", "ints":2}, {"strings":"d", "ints":3}])"}))); + } // namespace arrow diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index bb3b47d32a1..90e988d010e 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -995,4 +995,284 @@ std::vector RechunkArraysConsistently( } // namespace internal +struct ConcatenateImpl { + ConcatenateImpl(const std::vector>& in, MemoryPool* pool) + : in_(in), + in_size_(static_cast(in.size())), + lengths_(in.size()), + offsets_(in.size()), + pool_(pool), + out_(new ArrayData) { + out_->type = in[0]->type; + for (int i = 0; i != in_size_; ++i) { + lengths_[i] = in[i]->length; + offsets_[i] = in[i]->offset; + out_->length += lengths_[i]; + if (out_->null_count == kUnknownNullCount || + in[i]->null_count == kUnknownNullCount) { + out_->null_count = kUnknownNullCount; + continue; + } + out_->null_count += in[i]->null_count; + } + } + + struct range { + range() : offset(-1), length(0) {} + range(int32_t o, int32_t l) : offset(o), length(l) {} + void widen_to_include(int32_t index) { + if (length == 0) { + offset = index; + length = 1; + return; + } + auto end = std::max(offset + length, index + 1); + length = end - offset; + offset = std::min(index, offset); + } + int32_t offset, length; + }; + + Status Visit(const NullType&) { return Status::OK(); } + + Status Visit(const BooleanType&) { + std::shared_ptr values_buffer; + RETURN_NOT_OK(ConcatenateBitmaps(1, &values_buffer)); + out_->buffers.push_back(values_buffer); + return Status::OK(); + } + + // handle numbers, decimal128, fixed_size_binary + Status Visit(const FixedWidthType& fixed) { + DCHECK_EQ(fixed.bit_width() % 8, 0); + const int byte_width = fixed.bit_width() / 8; + std::shared_ptr values_buffer; + std::vector> values_slices(in_size_); + for (int i = 0; i != in_size_; ++i) { + auto byte_length = byte_width * lengths_[i]; + auto byte_offset = byte_width * offsets_[i]; + values_slices[i] = SliceBuffer(in_[i]->buffers[1], byte_offset, byte_length); + } + RETURN_NOT_OK(arrow::Concatenate(values_slices, pool_, &values_buffer)); + out_->buffers.push_back(values_buffer); + return Status::OK(); + } + + Status Visit(const BinaryType&) { + std::shared_ptr values_buffer, offset_buffer; + std::vector value_ranges; + RETURN_NOT_OK(ConcatenateOffsets(1, &offset_buffer, &value_ranges)); + out_->buffers.push_back(offset_buffer); + std::vector> values_slices(in_size_); + for (int i = 0; i != in_size_; ++i) { + values_slices[i] = + SliceBuffer(in_[i]->buffers[2], value_ranges[i].offset, value_ranges[i].length); + } + RETURN_NOT_OK(arrow::Concatenate(values_slices, pool_, &values_buffer)); + out_->buffers.push_back(values_buffer); + return Status::OK(); + } + + Status Visit(const ListType&) { + std::shared_ptr offset_buffer; + std::vector value_ranges; + RETURN_NOT_OK(ConcatenateOffsets(1, &offset_buffer, &value_ranges)); + out_->buffers.push_back(offset_buffer); + std::vector> values_slices(in_size_); + for (int i = 0; i != in_size_; ++i) { + values_slices[i] = SliceData(*in_[i]->child_data[0], value_ranges[i].offset, + value_ranges[i].length); + } + std::shared_ptr values; + RETURN_NOT_OK(ConcatenateImpl(values_slices, pool_).Concatenate(&values)); + out_->child_data = {values}; + return Status::OK(); + } + + Status Visit(const StructType& s) { + std::vector> values_slices(in_size_); + for (int field_index = 0; field_index != s.num_children(); ++field_index) { + for (int i = 0; i != in_size_; ++i) { + values_slices[i] = + SliceData(*in_[i]->child_data[field_index], offsets_[i], lengths_[i]); + } + std::shared_ptr values; + RETURN_NOT_OK(ConcatenateImpl(values_slices, pool_).Concatenate(&values)); + out_->child_data.push_back(values); + } + return Status::OK(); + } + + Status Visit(const DictionaryType& d) { + std::vector> indices_slices(in_size_); + for (int i = 0; i != in_size_; ++i) { + indices_slices[i] = SliceData(*in_[i], offsets_[i], lengths_[i]); + indices_slices[i]->type = d.index_type(); + } + std::shared_ptr indices; + RETURN_NOT_OK(ConcatenateImpl(indices_slices, pool_).Concatenate(&indices)); + auto type = out_->type; + auto null_bitmap = out_->buffers[0]; + out_ = indices; + out_->type = type; + out_->buffers[0] = null_bitmap; + return Status::OK(); + } + + Status Visit(const UnionType& u) { + // type_codes are an index into child_data + std::vector> codes_slices(in_size_); + for (int i = 0; i != in_size_; ++i) { + codes_slices[i] = SliceBuffer(in_[i]->buffers[1], offsets_[i], lengths_[i]); + } + std::shared_ptr codes_buffer; + RETURN_NOT_OK(arrow::Concatenate(codes_slices, pool_, &codes_buffer)); + out_->buffers.push_back(codes_buffer); + if (u.mode() == UnionMode::SPARSE) { + std::vector> values_slices(in_size_); + for (int field_index = 0; field_index != u.num_children(); ++field_index) { + for (int i = 0; i != in_size_; ++i) { + values_slices[i] = + SliceData(*in_[i]->child_data[field_index], offsets_[i], lengths_[i]); + } + std::shared_ptr values; + RETURN_NOT_OK(ConcatenateImpl(values_slices, pool_).Concatenate(&values)); + out_->child_data.push_back(values); + } + out_->buffers.push_back(nullptr); + return Status::OK(); + } + DCHECK_EQ(u.mode(), UnionMode::DENSE); + + // build mapping from (input, type_code)->the range of values actually referenced + auto max_code = *std::max_element(u.type_codes().begin(), u.type_codes().end()); + std::vector values_ranges((static_cast(max_code) + 1) * in_size_); + auto get_values_range = [&](int in_i, UnionArray::type_id_t code) -> range& { + return values_ranges[in_i * in_size_ + code]; + }; + for (int i = 0; i != in_size_; ++i) { + auto codes = in_[i]->buffers[1]->data(); + auto offsets = reinterpret_cast(in_[i]->buffers[2]->data()); + for (auto index = offsets_[i]; index != offsets_[i] + lengths_[i]; ++index) { + get_values_range(i, codes[index]).widen_to_include(offsets[index]); + } + } + + // for each type_code, use the min/max offset as a slice range + // and concatenate sliced data for that type_code + out_->child_data.resize(static_cast(max_code) + 1); + for (auto code : u.type_codes()) { + std::vector> values_slices(in_size_); + for (int i = 0; i != in_size_; ++i) { + auto values_range = get_values_range(i, code); + values_slices[i] = SliceData(*in_[i]->child_data[code], values_range.offset, + values_range.length); + } + std::shared_ptr values; + RETURN_NOT_OK(ConcatenateImpl(values_slices, pool_).Concatenate(&values)); + out_->child_data[code] = values; + } + + // for each input array, adjust the offsets by the length of the slice + // range for a given type code and the minimum offset in that input array, + // so that offsets point into the concatenated values + std::vector total_lengths(static_cast(max_code) + 1, 0); + std::shared_ptr offsets_buffer; + RETURN_NOT_OK(AllocateBuffer(pool_, out_->length * sizeof(int32_t), &offsets_buffer)); + auto raw_offsets = reinterpret_cast(offsets_buffer->mutable_data()); + for (int i = 0; i != in_size_; ++i) { + auto codes = in_[i]->buffers[1]->data(); + auto offsets = reinterpret_cast(in_[i]->buffers[2]->data()); + for (auto index = offsets_[i]; index != offsets_[i] + lengths_[i]; ++index) { + auto min_offset = get_values_range(i, codes[index]).offset; + *raw_offsets++ = offsets[index] - min_offset + total_lengths[codes[index]]; + } + for (auto code : u.type_codes()) { + total_lengths[code] += get_values_range(i, code).length; + } + } + out_->buffers.push_back(offsets_buffer); + return Status::OK(); + } + + Status Concatenate(std::shared_ptr* out) && { + std::shared_ptr null_bitmap; + if (out_->null_count != 0) { + RETURN_NOT_OK(ConcatenateBitmaps(0, &null_bitmap)); + } + out_->buffers = {null_bitmap}; + RETURN_NOT_OK(VisitTypeInline(*out_->type, this)); + *out = std::move(out_); + return Status::OK(); + } + + Status ConcatenateBitmaps(int index, std::shared_ptr* bitmap_buffer) { + RETURN_NOT_OK(AllocateBitmap(pool_, out_->length, bitmap_buffer)); + uint8_t* bitmap_data = (*bitmap_buffer)->mutable_data(); + int64_t bitmap_offset = 0; + for (int i = 0; i != in_size_; ++i) { + if (auto bitmap = in_[i]->buffers[0]) { + internal::CopyBitmap(bitmap->data(), offsets_[i], lengths_[i], bitmap_data, + bitmap_offset); + } else { + BitUtil::SetBitsTo(bitmap_data, bitmap_offset, lengths_[i], true); + } + bitmap_offset += lengths_[i]; + } + if (auto preceding_bits = BitUtil::kPrecedingBitmask[out_->length % 8]) { + bitmap_data[out_->length / 8] &= preceding_bits; + } + return Status::OK(); + } + + // FIXME the below assumes that the first offset in the inputs will always be 0, which + // isn't necessarily correct. Accumulating first and last offsets will be necessary + // because we need to slice the referenced data (child_data for lists, values_buffer for + // strings) + Status ConcatenateOffsets(int index, std::shared_ptr* offset_buffer, + std::vector* ranges) { + RETURN_NOT_OK( + AllocateBuffer(pool_, (out_->length + 1) * sizeof(int32_t), offset_buffer)); + auto dst_offsets_begin = reinterpret_cast((*offset_buffer)->mutable_data()); + int32_t values_length = 0; + ranges->resize(in_size_); + for (int i = 0; i != in_size_; ++i) { + auto src_offsets_begin = in_[i]->GetValues(index) + offsets_[i]; + auto src_offsets_end = src_offsets_begin + lengths_[i]; + auto first_offset = src_offsets_begin[0]; + ranges->at(i).offset = first_offset; + std::transform(src_offsets_begin, src_offsets_end, dst_offsets_begin, + [values_length, first_offset](int32_t offset) { + return offset - first_offset + values_length; + }); + auto last_offset = *src_offsets_end; + ranges->at(i).length = last_offset - first_offset; + values_length += last_offset - first_offset; + dst_offsets_begin += lengths_[i]; + } + *dst_offsets_begin = values_length; + return Status::OK(); + } + + const std::vector>& in_; + int in_size_; + std::vector lengths_, offsets_; + MemoryPool* pool_; + std::shared_ptr out_; +}; + +Status Concatenate(const std::vector>& arrays, MemoryPool* pool, + std::shared_ptr* out) { + DCHECK_GT(arrays.size(), 0); + std::vector> data(arrays.size()); + for (std::size_t i = 0; i != arrays.size(); ++i) { + DCHECK(arrays[i]->type()->Equals(*arrays[0]->type())); + data[i] = arrays[i]->data(); + } + std::shared_ptr out_data; + RETURN_NOT_OK(ConcatenateImpl(data, pool).Concatenate(&out_data)); + *out = MakeArray(std::move(out_data)); + return Status::OK(); +} + } // namespace arrow diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index bee133c017e..4d0dd028da8 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -859,6 +859,16 @@ class ARROW_EXPORT DictionaryArray : public Array { ARROW_EXPORT Status ValidateArray(const Array& array); +/// \brief Concatenate arrays +/// +/// \param[in] arrays a vector of arrays to be concatenated +/// \param[in] pool memory to store the result will be allocated from this memory pool +/// \param[out] out the resulting concatenated array +/// \return Status +ARROW_EXPORT +Status Concatenate(const std::vector>& arrays, MemoryPool* pool, + std::shared_ptr* out); + } // namespace arrow #endif // ARROW_ARRAY_H diff --git a/cpp/src/arrow/buffer.cc b/cpp/src/arrow/buffer.cc index 8f05912b804..e12c4d1822f 100644 --- a/cpp/src/arrow/buffer.cc +++ b/cpp/src/arrow/buffer.cc @@ -227,4 +227,19 @@ Status AllocateEmptyBitmap(int64_t length, std::shared_ptr* out) { return AllocateEmptyBitmap(default_memory_pool(), length, out); } +Status Concatenate(const std::vector>& buffers, MemoryPool* pool, + std::shared_ptr* out) { + int64_t out_length = 0; + for (const auto& buffer : buffers) { + out_length += buffer->size(); + } + RETURN_NOT_OK(AllocateBuffer(pool, out_length, out)); + auto out_data = (*out)->mutable_data(); + for (const auto& buffer : buffers) { + std::copy(buffer->data(), buffer->data() + buffer->size(), out_data); + out_data += buffer->size(); + } + return Status::OK(); +} + } // namespace arrow diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index 306e677619f..5c6ae5e13f7 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -402,6 +402,17 @@ Status AllocateEmptyBitmap(MemoryPool* pool, int64_t length, ARROW_EXPORT Status AllocateEmptyBitmap(int64_t length, std::shared_ptr* out); +/// \brief Concatenate multiple buffers into a single buffer +/// +/// \param[in] buffers to be concatenated +/// \param[in] pool memory pool to allocate the new buffer from +/// \param[out] out the concatenated buffer +/// +/// \return Status +ARROW_EXPORT +Status Concatenate(const std::vector>& buffers, MemoryPool* pool, + std::shared_ptr* out); + /// @} } // namespace arrow From 2573acaebf561d0c984c2d326beee5b3bda288eb Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 26 Feb 2019 12:39:06 -0500 Subject: [PATCH 02/21] move Concatenate to util/concatenate --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/array-test.cc | 1 + cpp/src/arrow/array.cc | 281 ------------------------- cpp/src/arrow/array.h | 10 - cpp/src/arrow/util/concatenate.cc | 326 ++++++++++++++++++++++++++++++ cpp/src/arrow/util/concatenate.h | 39 ++++ 6 files changed, 367 insertions(+), 291 deletions(-) create mode 100644 cpp/src/arrow/util/concatenate.cc create mode 100644 cpp/src/arrow/util/concatenate.h diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 4ef60c9f168..f1cabfb569d 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -118,6 +118,7 @@ set(ARROW_SRCS testing/util.cc util/basic_decimal.cc util/bit-util.cc + util/concatenate.cc util/compression.cc util/cpu-info.cc util/decimal.cc diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 9ab1211e182..a7325e2f772 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -42,6 +42,7 @@ #include "arrow/type.h" #include "arrow/util/bit-util.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/concatenate.h" #include "arrow/util/decimal.h" #include "arrow/util/lazy.h" diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 90e988d010e..b5af31901f4 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -994,285 +994,4 @@ std::vector RechunkArraysConsistently( } } // namespace internal - -struct ConcatenateImpl { - ConcatenateImpl(const std::vector>& in, MemoryPool* pool) - : in_(in), - in_size_(static_cast(in.size())), - lengths_(in.size()), - offsets_(in.size()), - pool_(pool), - out_(new ArrayData) { - out_->type = in[0]->type; - for (int i = 0; i != in_size_; ++i) { - lengths_[i] = in[i]->length; - offsets_[i] = in[i]->offset; - out_->length += lengths_[i]; - if (out_->null_count == kUnknownNullCount || - in[i]->null_count == kUnknownNullCount) { - out_->null_count = kUnknownNullCount; - continue; - } - out_->null_count += in[i]->null_count; - } - } - - struct range { - range() : offset(-1), length(0) {} - range(int32_t o, int32_t l) : offset(o), length(l) {} - void widen_to_include(int32_t index) { - if (length == 0) { - offset = index; - length = 1; - return; - } - auto end = std::max(offset + length, index + 1); - length = end - offset; - offset = std::min(index, offset); - } - int32_t offset, length; - }; - - Status Visit(const NullType&) { return Status::OK(); } - - Status Visit(const BooleanType&) { - std::shared_ptr values_buffer; - RETURN_NOT_OK(ConcatenateBitmaps(1, &values_buffer)); - out_->buffers.push_back(values_buffer); - return Status::OK(); - } - - // handle numbers, decimal128, fixed_size_binary - Status Visit(const FixedWidthType& fixed) { - DCHECK_EQ(fixed.bit_width() % 8, 0); - const int byte_width = fixed.bit_width() / 8; - std::shared_ptr values_buffer; - std::vector> values_slices(in_size_); - for (int i = 0; i != in_size_; ++i) { - auto byte_length = byte_width * lengths_[i]; - auto byte_offset = byte_width * offsets_[i]; - values_slices[i] = SliceBuffer(in_[i]->buffers[1], byte_offset, byte_length); - } - RETURN_NOT_OK(arrow::Concatenate(values_slices, pool_, &values_buffer)); - out_->buffers.push_back(values_buffer); - return Status::OK(); - } - - Status Visit(const BinaryType&) { - std::shared_ptr values_buffer, offset_buffer; - std::vector value_ranges; - RETURN_NOT_OK(ConcatenateOffsets(1, &offset_buffer, &value_ranges)); - out_->buffers.push_back(offset_buffer); - std::vector> values_slices(in_size_); - for (int i = 0; i != in_size_; ++i) { - values_slices[i] = - SliceBuffer(in_[i]->buffers[2], value_ranges[i].offset, value_ranges[i].length); - } - RETURN_NOT_OK(arrow::Concatenate(values_slices, pool_, &values_buffer)); - out_->buffers.push_back(values_buffer); - return Status::OK(); - } - - Status Visit(const ListType&) { - std::shared_ptr offset_buffer; - std::vector value_ranges; - RETURN_NOT_OK(ConcatenateOffsets(1, &offset_buffer, &value_ranges)); - out_->buffers.push_back(offset_buffer); - std::vector> values_slices(in_size_); - for (int i = 0; i != in_size_; ++i) { - values_slices[i] = SliceData(*in_[i]->child_data[0], value_ranges[i].offset, - value_ranges[i].length); - } - std::shared_ptr values; - RETURN_NOT_OK(ConcatenateImpl(values_slices, pool_).Concatenate(&values)); - out_->child_data = {values}; - return Status::OK(); - } - - Status Visit(const StructType& s) { - std::vector> values_slices(in_size_); - for (int field_index = 0; field_index != s.num_children(); ++field_index) { - for (int i = 0; i != in_size_; ++i) { - values_slices[i] = - SliceData(*in_[i]->child_data[field_index], offsets_[i], lengths_[i]); - } - std::shared_ptr values; - RETURN_NOT_OK(ConcatenateImpl(values_slices, pool_).Concatenate(&values)); - out_->child_data.push_back(values); - } - return Status::OK(); - } - - Status Visit(const DictionaryType& d) { - std::vector> indices_slices(in_size_); - for (int i = 0; i != in_size_; ++i) { - indices_slices[i] = SliceData(*in_[i], offsets_[i], lengths_[i]); - indices_slices[i]->type = d.index_type(); - } - std::shared_ptr indices; - RETURN_NOT_OK(ConcatenateImpl(indices_slices, pool_).Concatenate(&indices)); - auto type = out_->type; - auto null_bitmap = out_->buffers[0]; - out_ = indices; - out_->type = type; - out_->buffers[0] = null_bitmap; - return Status::OK(); - } - - Status Visit(const UnionType& u) { - // type_codes are an index into child_data - std::vector> codes_slices(in_size_); - for (int i = 0; i != in_size_; ++i) { - codes_slices[i] = SliceBuffer(in_[i]->buffers[1], offsets_[i], lengths_[i]); - } - std::shared_ptr codes_buffer; - RETURN_NOT_OK(arrow::Concatenate(codes_slices, pool_, &codes_buffer)); - out_->buffers.push_back(codes_buffer); - if (u.mode() == UnionMode::SPARSE) { - std::vector> values_slices(in_size_); - for (int field_index = 0; field_index != u.num_children(); ++field_index) { - for (int i = 0; i != in_size_; ++i) { - values_slices[i] = - SliceData(*in_[i]->child_data[field_index], offsets_[i], lengths_[i]); - } - std::shared_ptr values; - RETURN_NOT_OK(ConcatenateImpl(values_slices, pool_).Concatenate(&values)); - out_->child_data.push_back(values); - } - out_->buffers.push_back(nullptr); - return Status::OK(); - } - DCHECK_EQ(u.mode(), UnionMode::DENSE); - - // build mapping from (input, type_code)->the range of values actually referenced - auto max_code = *std::max_element(u.type_codes().begin(), u.type_codes().end()); - std::vector values_ranges((static_cast(max_code) + 1) * in_size_); - auto get_values_range = [&](int in_i, UnionArray::type_id_t code) -> range& { - return values_ranges[in_i * in_size_ + code]; - }; - for (int i = 0; i != in_size_; ++i) { - auto codes = in_[i]->buffers[1]->data(); - auto offsets = reinterpret_cast(in_[i]->buffers[2]->data()); - for (auto index = offsets_[i]; index != offsets_[i] + lengths_[i]; ++index) { - get_values_range(i, codes[index]).widen_to_include(offsets[index]); - } - } - - // for each type_code, use the min/max offset as a slice range - // and concatenate sliced data for that type_code - out_->child_data.resize(static_cast(max_code) + 1); - for (auto code : u.type_codes()) { - std::vector> values_slices(in_size_); - for (int i = 0; i != in_size_; ++i) { - auto values_range = get_values_range(i, code); - values_slices[i] = SliceData(*in_[i]->child_data[code], values_range.offset, - values_range.length); - } - std::shared_ptr values; - RETURN_NOT_OK(ConcatenateImpl(values_slices, pool_).Concatenate(&values)); - out_->child_data[code] = values; - } - - // for each input array, adjust the offsets by the length of the slice - // range for a given type code and the minimum offset in that input array, - // so that offsets point into the concatenated values - std::vector total_lengths(static_cast(max_code) + 1, 0); - std::shared_ptr offsets_buffer; - RETURN_NOT_OK(AllocateBuffer(pool_, out_->length * sizeof(int32_t), &offsets_buffer)); - auto raw_offsets = reinterpret_cast(offsets_buffer->mutable_data()); - for (int i = 0; i != in_size_; ++i) { - auto codes = in_[i]->buffers[1]->data(); - auto offsets = reinterpret_cast(in_[i]->buffers[2]->data()); - for (auto index = offsets_[i]; index != offsets_[i] + lengths_[i]; ++index) { - auto min_offset = get_values_range(i, codes[index]).offset; - *raw_offsets++ = offsets[index] - min_offset + total_lengths[codes[index]]; - } - for (auto code : u.type_codes()) { - total_lengths[code] += get_values_range(i, code).length; - } - } - out_->buffers.push_back(offsets_buffer); - return Status::OK(); - } - - Status Concatenate(std::shared_ptr* out) && { - std::shared_ptr null_bitmap; - if (out_->null_count != 0) { - RETURN_NOT_OK(ConcatenateBitmaps(0, &null_bitmap)); - } - out_->buffers = {null_bitmap}; - RETURN_NOT_OK(VisitTypeInline(*out_->type, this)); - *out = std::move(out_); - return Status::OK(); - } - - Status ConcatenateBitmaps(int index, std::shared_ptr* bitmap_buffer) { - RETURN_NOT_OK(AllocateBitmap(pool_, out_->length, bitmap_buffer)); - uint8_t* bitmap_data = (*bitmap_buffer)->mutable_data(); - int64_t bitmap_offset = 0; - for (int i = 0; i != in_size_; ++i) { - if (auto bitmap = in_[i]->buffers[0]) { - internal::CopyBitmap(bitmap->data(), offsets_[i], lengths_[i], bitmap_data, - bitmap_offset); - } else { - BitUtil::SetBitsTo(bitmap_data, bitmap_offset, lengths_[i], true); - } - bitmap_offset += lengths_[i]; - } - if (auto preceding_bits = BitUtil::kPrecedingBitmask[out_->length % 8]) { - bitmap_data[out_->length / 8] &= preceding_bits; - } - return Status::OK(); - } - - // FIXME the below assumes that the first offset in the inputs will always be 0, which - // isn't necessarily correct. Accumulating first and last offsets will be necessary - // because we need to slice the referenced data (child_data for lists, values_buffer for - // strings) - Status ConcatenateOffsets(int index, std::shared_ptr* offset_buffer, - std::vector* ranges) { - RETURN_NOT_OK( - AllocateBuffer(pool_, (out_->length + 1) * sizeof(int32_t), offset_buffer)); - auto dst_offsets_begin = reinterpret_cast((*offset_buffer)->mutable_data()); - int32_t values_length = 0; - ranges->resize(in_size_); - for (int i = 0; i != in_size_; ++i) { - auto src_offsets_begin = in_[i]->GetValues(index) + offsets_[i]; - auto src_offsets_end = src_offsets_begin + lengths_[i]; - auto first_offset = src_offsets_begin[0]; - ranges->at(i).offset = first_offset; - std::transform(src_offsets_begin, src_offsets_end, dst_offsets_begin, - [values_length, first_offset](int32_t offset) { - return offset - first_offset + values_length; - }); - auto last_offset = *src_offsets_end; - ranges->at(i).length = last_offset - first_offset; - values_length += last_offset - first_offset; - dst_offsets_begin += lengths_[i]; - } - *dst_offsets_begin = values_length; - return Status::OK(); - } - - const std::vector>& in_; - int in_size_; - std::vector lengths_, offsets_; - MemoryPool* pool_; - std::shared_ptr out_; -}; - -Status Concatenate(const std::vector>& arrays, MemoryPool* pool, - std::shared_ptr* out) { - DCHECK_GT(arrays.size(), 0); - std::vector> data(arrays.size()); - for (std::size_t i = 0; i != arrays.size(); ++i) { - DCHECK(arrays[i]->type()->Equals(*arrays[0]->type())); - data[i] = arrays[i]->data(); - } - std::shared_ptr out_data; - RETURN_NOT_OK(ConcatenateImpl(data, pool).Concatenate(&out_data)); - *out = MakeArray(std::move(out_data)); - return Status::OK(); -} - } // namespace arrow diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 4d0dd028da8..bee133c017e 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -859,16 +859,6 @@ class ARROW_EXPORT DictionaryArray : public Array { ARROW_EXPORT Status ValidateArray(const Array& array); -/// \brief Concatenate arrays -/// -/// \param[in] arrays a vector of arrays to be concatenated -/// \param[in] pool memory to store the result will be allocated from this memory pool -/// \param[out] out the resulting concatenated array -/// \return Status -ARROW_EXPORT -Status Concatenate(const std::vector>& arrays, MemoryPool* pool, - std::shared_ptr* out); - } // namespace arrow #endif // ARROW_ARRAY_H diff --git a/cpp/src/arrow/util/concatenate.cc b/cpp/src/arrow/util/concatenate.cc new file mode 100644 index 00000000000..8a3d9b274c9 --- /dev/null +++ b/cpp/src/arrow/util/concatenate.cc @@ -0,0 +1,326 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/util/concatenate.h" + +#include +#include + +#include "arrow/array.h" +#include "arrow/memory_pool.h" +#include "arrow/util/logging.h" +#include "arrow/util/visibility.h" +#include "arrow/visitor_inline.h" + +namespace arrow { + +static inline std::shared_ptr SliceData(const ArrayData& data, int64_t offset, + int64_t length) { + DCHECK_LE(offset, data.length); + length = std::min(data.length - offset, length); + offset += data.offset; + + auto new_data = data.Copy(); + new_data->length = length; + new_data->offset = offset; + new_data->null_count = data.null_count != 0 ? kUnknownNullCount : 0; + return new_data; +} + +struct ConcatenateImpl { + ConcatenateImpl(const std::vector>& in, MemoryPool* pool) + : in_(in), + in_size_(static_cast(in.size())), + lengths_(in.size()), + offsets_(in.size()), + pool_(pool), + out_(new ArrayData) { + out_->type = in[0]->type; + for (int i = 0; i != in_size_; ++i) { + lengths_[i] = in[i]->length; + offsets_[i] = in[i]->offset; + out_->length += lengths_[i]; + if (out_->null_count == kUnknownNullCount || + in[i]->null_count == kUnknownNullCount) { + out_->null_count = kUnknownNullCount; + continue; + } + out_->null_count += in[i]->null_count; + } + } + + /// offset, length pair for representing a range of a buffer or array + struct range { + int32_t offset, length; + + range() : offset(-1), length(0) {} + range(int32_t o, int32_t l) : offset(o), length(l) {} + void widen_to_include(int32_t index) { + if (length == 0) { + offset = index; + length = 1; + return; + } + auto end = std::max(offset + length, index + 1); + length = end - offset; + offset = std::min(index, offset); + } + }; + + Status Visit(const NullType&) { return Status::OK(); } + + Status Visit(const BooleanType&) { + std::shared_ptr values_buffer; + RETURN_NOT_OK(ConcatenateBitmaps(1, &values_buffer)); + out_->buffers.push_back(values_buffer); + return Status::OK(); + } + + // handle numbers, decimal128, fixed_size_binary + Status Visit(const FixedWidthType& fixed) { + DCHECK_EQ(fixed.bit_width() % 8, 0); + const int byte_width = fixed.bit_width() / 8; + std::shared_ptr values_buffer; + std::vector> values_slices(in_size_); + for (int i = 0; i != in_size_; ++i) { + auto byte_length = byte_width * lengths_[i]; + auto byte_offset = byte_width * offsets_[i]; + values_slices[i] = SliceBuffer(in_[i]->buffers[1], byte_offset, byte_length); + } + RETURN_NOT_OK(arrow::Concatenate(values_slices, pool_, &values_buffer)); + out_->buffers.push_back(values_buffer); + return Status::OK(); + } + + Status Visit(const BinaryType&) { + std::shared_ptr values_buffer, offset_buffer; + std::vector value_ranges; + RETURN_NOT_OK(ConcatenateOffsets(1, &offset_buffer, &value_ranges)); + out_->buffers.push_back(offset_buffer); + std::vector> values_slices(in_size_); + for (int i = 0; i != in_size_; ++i) { + values_slices[i] = + SliceBuffer(in_[i]->buffers[2], value_ranges[i].offset, value_ranges[i].length); + } + RETURN_NOT_OK(arrow::Concatenate(values_slices, pool_, &values_buffer)); + out_->buffers.push_back(values_buffer); + return Status::OK(); + } + + Status Visit(const ListType&) { + std::shared_ptr offset_buffer; + std::vector value_ranges; + RETURN_NOT_OK(ConcatenateOffsets(1, &offset_buffer, &value_ranges)); + out_->buffers.push_back(offset_buffer); + std::vector> values_slices(in_size_); + for (int i = 0; i != in_size_; ++i) { + values_slices[i] = SliceData(*in_[i]->child_data[0], value_ranges[i].offset, + value_ranges[i].length); + } + std::shared_ptr values; + RETURN_NOT_OK(ConcatenateImpl(values_slices, pool_).Concatenate(&values)); + out_->child_data = {values}; + return Status::OK(); + } + + Status Visit(const StructType& s) { + std::vector> values_slices(in_size_); + for (int field_index = 0; field_index != s.num_children(); ++field_index) { + for (int i = 0; i != in_size_; ++i) { + values_slices[i] = + SliceData(*in_[i]->child_data[field_index], offsets_[i], lengths_[i]); + } + std::shared_ptr values; + RETURN_NOT_OK(ConcatenateImpl(values_slices, pool_).Concatenate(&values)); + out_->child_data.push_back(values); + } + return Status::OK(); + } + + Status Visit(const DictionaryType& d) { + std::vector> indices_slices(in_size_); + for (int i = 0; i != in_size_; ++i) { + indices_slices[i] = SliceData(*in_[i], offsets_[i], lengths_[i]); + indices_slices[i]->type = d.index_type(); + } + std::shared_ptr indices; + RETURN_NOT_OK(ConcatenateImpl(indices_slices, pool_).Concatenate(&indices)); + auto type = out_->type; + auto null_bitmap = out_->buffers[0]; + out_ = indices; + out_->type = type; + out_->buffers[0] = null_bitmap; + return Status::OK(); + } + + Status Visit(const UnionType& u) { + // type_codes are an index into child_data + std::vector> codes_slices(in_size_); + for (int i = 0; i != in_size_; ++i) { + codes_slices[i] = SliceBuffer(in_[i]->buffers[1], offsets_[i], lengths_[i]); + } + std::shared_ptr codes_buffer; + RETURN_NOT_OK(arrow::Concatenate(codes_slices, pool_, &codes_buffer)); + out_->buffers.push_back(codes_buffer); + if (u.mode() == UnionMode::SPARSE) { + std::vector> values_slices(in_size_); + for (int field_index = 0; field_index != u.num_children(); ++field_index) { + for (int i = 0; i != in_size_; ++i) { + values_slices[i] = + SliceData(*in_[i]->child_data[field_index], offsets_[i], lengths_[i]); + } + std::shared_ptr values; + RETURN_NOT_OK(ConcatenateImpl(values_slices, pool_).Concatenate(&values)); + out_->child_data.push_back(values); + } + out_->buffers.push_back(nullptr); + return Status::OK(); + } + DCHECK_EQ(u.mode(), UnionMode::DENSE); + + // build mapping from (input, type_code)->the range of values actually referenced + auto max_code = *std::max_element(u.type_codes().begin(), u.type_codes().end()); + std::vector values_ranges((static_cast(max_code) + 1) * in_size_); + auto get_values_range = [&](int in_i, UnionArray::type_id_t code) -> range& { + return values_ranges[in_i * in_size_ + code]; + }; + for (int i = 0; i != in_size_; ++i) { + auto codes = in_[i]->buffers[1]->data(); + auto offsets = reinterpret_cast(in_[i]->buffers[2]->data()); + for (auto index = offsets_[i]; index != offsets_[i] + lengths_[i]; ++index) { + get_values_range(i, codes[index]).widen_to_include(offsets[index]); + } + } + + // for each type_code, use the min/max offset as a slice range + // and concatenate sliced data for that type_code + out_->child_data.resize(static_cast(max_code) + 1); + for (auto code : u.type_codes()) { + std::vector> values_slices(in_size_); + for (int i = 0; i != in_size_; ++i) { + auto values_range = get_values_range(i, code); + values_slices[i] = SliceData(*in_[i]->child_data[code], values_range.offset, + values_range.length); + } + std::shared_ptr values; + RETURN_NOT_OK(ConcatenateImpl(values_slices, pool_).Concatenate(&values)); + out_->child_data[code] = values; + } + + // for each input array, adjust the offsets by the length of the slice + // range for a given type code and the minimum offset in that input array, + // so that offsets point into the concatenated values + std::vector total_lengths(static_cast(max_code) + 1, 0); + std::shared_ptr offsets_buffer; + RETURN_NOT_OK(AllocateBuffer(pool_, out_->length * sizeof(int32_t), &offsets_buffer)); + auto raw_offsets = reinterpret_cast(offsets_buffer->mutable_data()); + for (int i = 0; i != in_size_; ++i) { + auto codes = in_[i]->buffers[1]->data(); + auto offsets = reinterpret_cast(in_[i]->buffers[2]->data()); + for (auto index = offsets_[i]; index != offsets_[i] + lengths_[i]; ++index) { + auto min_offset = get_values_range(i, codes[index]).offset; + *raw_offsets++ = offsets[index] - min_offset + total_lengths[codes[index]]; + } + for (auto code : u.type_codes()) { + total_lengths[code] += get_values_range(i, code).length; + } + } + out_->buffers.push_back(offsets_buffer); + return Status::OK(); + } + + Status Concatenate(std::shared_ptr* out) && { + std::shared_ptr null_bitmap; + if (out_->null_count != 0) { + RETURN_NOT_OK(ConcatenateBitmaps(0, &null_bitmap)); + } + out_->buffers = {null_bitmap}; + RETURN_NOT_OK(VisitTypeInline(*out_->type, this)); + *out = std::move(out_); + return Status::OK(); + } + + Status ConcatenateBitmaps(int index, std::shared_ptr* bitmap_buffer) { + RETURN_NOT_OK(AllocateBitmap(pool_, out_->length, bitmap_buffer)); + uint8_t* bitmap_data = (*bitmap_buffer)->mutable_data(); + int64_t bitmap_offset = 0; + for (int i = 0; i != in_size_; ++i) { + if (auto bitmap = in_[i]->buffers[0]) { + internal::CopyBitmap(bitmap->data(), offsets_[i], lengths_[i], bitmap_data, + bitmap_offset); + } else { + BitUtil::SetBitsTo(bitmap_data, bitmap_offset, lengths_[i], true); + } + bitmap_offset += lengths_[i]; + } + if (auto preceding_bits = BitUtil::kPrecedingBitmask[out_->length % 8]) { + bitmap_data[out_->length / 8] &= preceding_bits; + } + return Status::OK(); + } + + // FIXME the below assumes that the first offset in the inputs will always be 0, which + // isn't necessarily correct. Accumulating first and last offsets will be necessary + // because we need to slice the referenced data (child_data for lists, values_buffer for + // strings) + Status ConcatenateOffsets(int index, std::shared_ptr* offset_buffer, + std::vector* ranges) { + RETURN_NOT_OK( + AllocateBuffer(pool_, (out_->length + 1) * sizeof(int32_t), offset_buffer)); + auto dst_offsets_begin = reinterpret_cast((*offset_buffer)->mutable_data()); + int32_t values_length = 0; + ranges->resize(in_size_); + for (int i = 0; i != in_size_; ++i) { + auto src_offsets_begin = in_[i]->GetValues(index) + offsets_[i]; + auto src_offsets_end = src_offsets_begin + lengths_[i]; + auto first_offset = src_offsets_begin[0]; + ranges->at(i).offset = first_offset; + std::transform(src_offsets_begin, src_offsets_end, dst_offsets_begin, + [values_length, first_offset](int32_t offset) { + return offset - first_offset + values_length; + }); + auto last_offset = *src_offsets_end; + ranges->at(i).length = last_offset - first_offset; + values_length += last_offset - first_offset; + dst_offsets_begin += lengths_[i]; + } + *dst_offsets_begin = values_length; + return Status::OK(); + } + + const std::vector>& in_; + int in_size_; + std::vector lengths_, offsets_; + MemoryPool* pool_; + std::shared_ptr out_; +}; + +Status Concatenate(const std::vector>& arrays, MemoryPool* pool, + std::shared_ptr* out) { + DCHECK_GT(arrays.size(), 0); + std::vector> data(arrays.size()); + for (std::size_t i = 0; i != arrays.size(); ++i) { + DCHECK(arrays[i]->type()->Equals(*arrays[0]->type())); + data[i] = arrays[i]->data(); + } + std::shared_ptr out_data; + RETURN_NOT_OK(ConcatenateImpl(data, pool).Concatenate(&out_data)); + *out = MakeArray(std::move(out_data)); + return Status::OK(); +} + +} // namespace arrow diff --git a/cpp/src/arrow/util/concatenate.h b/cpp/src/arrow/util/concatenate.h new file mode 100644 index 00000000000..0169892bff1 --- /dev/null +++ b/cpp/src/arrow/util/concatenate.h @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/array.h" +#include "arrow/memory_pool.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \brief Concatenate arrays +/// +/// \param[in] arrays a vector of arrays to be concatenated +/// \param[in] pool memory to store the result will be allocated from this memory pool +/// \param[out] out the resulting concatenated array +/// \return Status +ARROW_EXPORT +Status Concatenate(const std::vector>& arrays, MemoryPool* pool, + std::shared_ptr* out); + +} // namespace arrow From 0c524f30a48417f3c5f58fc5658fdb5bb8b84262 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 26 Feb 2019 13:05:47 -0500 Subject: [PATCH 03/21] don't concatenate null bitmaps twice --- cpp/src/arrow/util/concatenate.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cpp/src/arrow/util/concatenate.cc b/cpp/src/arrow/util/concatenate.cc index 8a3d9b274c9..e70e45f726b 100644 --- a/cpp/src/arrow/util/concatenate.cc +++ b/cpp/src/arrow/util/concatenate.cc @@ -156,6 +156,9 @@ struct ConcatenateImpl { for (int i = 0; i != in_size_; ++i) { indices_slices[i] = SliceData(*in_[i], offsets_[i], lengths_[i]); indices_slices[i]->type = d.index_type(); + // don't bother concatenating null bitmaps again + indices_slices[i]->null_count = 0; + indices_slices[i]->buffers[0] = nullptr; } std::shared_ptr indices; RETURN_NOT_OK(ConcatenateImpl(indices_slices, pool_).Concatenate(&indices)); @@ -243,6 +246,11 @@ struct ConcatenateImpl { return Status::OK(); } + Status Visit(const ExtensionType&) { + // XXX can we just concatenate their storage in general? + return Status::NotImplemented("concatenation of extension arrays"); + } + Status Concatenate(std::shared_ptr* out) && { std::shared_ptr null_bitmap; if (out_->null_count != 0) { From cbb89c5245d0635a00580683e18977cde2fb3b5d Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 26 Feb 2019 13:08:30 -0500 Subject: [PATCH 04/21] return Invalid on mismatched types --- cpp/src/arrow/util/concatenate.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/concatenate.cc b/cpp/src/arrow/util/concatenate.cc index e70e45f726b..5ccd7302483 100644 --- a/cpp/src/arrow/util/concatenate.cc +++ b/cpp/src/arrow/util/concatenate.cc @@ -322,7 +322,11 @@ Status Concatenate(const std::vector>& arrays, MemoryPool DCHECK_GT(arrays.size(), 0); std::vector> data(arrays.size()); for (std::size_t i = 0; i != arrays.size(); ++i) { - DCHECK(arrays[i]->type()->Equals(*arrays[0]->type())); + if (!arrays[i]->type()->Equals(*arrays[0]->type())) { + return Status::Invalid("arrays to be concatentated must be identically typed, but ", + *arrays[0]->type(), " and ", *arrays[i]->type(), + " were encountered."); + } data[i] = arrays[i]->data(); } std::shared_ptr out_data; From a9538264d53d2e25dd8399b7f341b6393c5e7354 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 26 Feb 2019 13:27:36 -0500 Subject: [PATCH 05/21] bail on offset overflow --- cpp/src/arrow/array-test.cc | 11 +++++++++++ cpp/src/arrow/util/concatenate.cc | 23 +++++++++++++---------- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index a7325e2f772..5358f2dab45 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -1835,4 +1835,15 @@ INSTANTIATE_TEST_CASE_P( {R"([{"strings":"a", "ints":0}, {"strings":"b", "ints":1}])", R"([{"strings":"c", "ints":2}, {"strings":"d", "ints":3}])"}))); +TEST(Concatenate, OffsetOverflow) { + auto fake_long = ArrayFromJSON(utf8(), "[\"\"]"); + fake_long->data()->GetMutableValues(1)[1] = + std::numeric_limits::max(); + std::shared_ptr concatenated; + // XX since the data fake_long claims to own isn't there, this will segfault if + // Concatenate doesn't detect overflow and raise an error. + ASSERT_RAISES( + Invalid, Concatenate({fake_long, fake_long}, default_memory_pool(), &concatenated)); +} + } // namespace arrow diff --git a/cpp/src/arrow/util/concatenate.cc b/cpp/src/arrow/util/concatenate.cc index 5ccd7302483..4f1e1eea5cd 100644 --- a/cpp/src/arrow/util/concatenate.cc +++ b/cpp/src/arrow/util/concatenate.cc @@ -289,24 +289,27 @@ struct ConcatenateImpl { std::vector* ranges) { RETURN_NOT_OK( AllocateBuffer(pool_, (out_->length + 1) * sizeof(int32_t), offset_buffer)); - auto dst_offsets_begin = reinterpret_cast((*offset_buffer)->mutable_data()); - int32_t values_length = 0; + auto dst_offsets = reinterpret_cast((*offset_buffer)->mutable_data()); + int32_t total_length = 0; ranges->resize(in_size_); for (int i = 0; i != in_size_; ++i) { auto src_offsets_begin = in_[i]->GetValues(index) + offsets_[i]; auto src_offsets_end = src_offsets_begin + lengths_[i]; auto first_offset = src_offsets_begin[0]; + auto length = *src_offsets_end - first_offset; ranges->at(i).offset = first_offset; - std::transform(src_offsets_begin, src_offsets_end, dst_offsets_begin, - [values_length, first_offset](int32_t offset) { - return offset - first_offset + values_length; + ranges->at(i).length = length; + if (total_length > std::numeric_limits::max() - length) { + return Status::Invalid("offset overflow while concatenating arrays"); + } + std::transform(src_offsets_begin, src_offsets_end, dst_offsets, + [total_length, first_offset](int32_t offset) { + return offset - first_offset + total_length; }); - auto last_offset = *src_offsets_end; - ranges->at(i).length = last_offset - first_offset; - values_length += last_offset - first_offset; - dst_offsets_begin += lengths_[i]; + total_length += length; + dst_offsets += lengths_[i]; } - *dst_offsets_begin = values_length; + *dst_offsets = total_length; return Status::OK(); } From 711de4a4fb8e34e2d380fe447faf2d9c5a7d0810 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 26 Feb 2019 14:02:54 -0500 Subject: [PATCH 06/21] remove unnecessary shared_ptr indirection --- cpp/src/arrow/util/concatenate.cc | 160 ++++++++++++++---------------- 1 file changed, 76 insertions(+), 84 deletions(-) diff --git a/cpp/src/arrow/util/concatenate.cc b/cpp/src/arrow/util/concatenate.cc index 4f1e1eea5cd..f77484ee658 100644 --- a/cpp/src/arrow/util/concatenate.cc +++ b/cpp/src/arrow/util/concatenate.cc @@ -28,38 +28,35 @@ namespace arrow { -static inline std::shared_ptr SliceData(const ArrayData& data, int64_t offset, - int64_t length) { +static inline ArrayData SliceData(const ArrayData& data, int64_t offset, int64_t length) { DCHECK_LE(offset, data.length); length = std::min(data.length - offset, length); offset += data.offset; - auto new_data = data.Copy(); - new_data->length = length; - new_data->offset = offset; - new_data->null_count = data.null_count != 0 ? kUnknownNullCount : 0; - return new_data; + auto copy = data; + copy.length = length; + copy.offset = offset; + copy.null_count = data.null_count != 0 ? kUnknownNullCount : 0; + return copy; } struct ConcatenateImpl { - ConcatenateImpl(const std::vector>& in, MemoryPool* pool) + ConcatenateImpl(const std::vector& in, MemoryPool* pool) : in_(in), in_size_(static_cast(in.size())), lengths_(in.size()), offsets_(in.size()), - pool_(pool), - out_(new ArrayData) { - out_->type = in[0]->type; + pool_(pool) { + out_.type = in[0].type; for (int i = 0; i != in_size_; ++i) { - lengths_[i] = in[i]->length; - offsets_[i] = in[i]->offset; - out_->length += lengths_[i]; - if (out_->null_count == kUnknownNullCount || - in[i]->null_count == kUnknownNullCount) { - out_->null_count = kUnknownNullCount; + lengths_[i] = in[i].length; + offsets_[i] = in[i].offset; + out_.length += lengths_[i]; + if (out_.null_count == kUnknownNullCount || in[i].null_count == kUnknownNullCount) { + out_.null_count = kUnknownNullCount; continue; } - out_->null_count += in[i]->null_count; + out_.null_count += in[i].null_count; } } @@ -86,7 +83,7 @@ struct ConcatenateImpl { Status Visit(const BooleanType&) { std::shared_ptr values_buffer; RETURN_NOT_OK(ConcatenateBitmaps(1, &values_buffer)); - out_->buffers.push_back(values_buffer); + out_.buffers.push_back(values_buffer); return Status::OK(); } @@ -99,10 +96,10 @@ struct ConcatenateImpl { for (int i = 0; i != in_size_; ++i) { auto byte_length = byte_width * lengths_[i]; auto byte_offset = byte_width * offsets_[i]; - values_slices[i] = SliceBuffer(in_[i]->buffers[1], byte_offset, byte_length); + values_slices[i] = SliceBuffer(in_[i].buffers[1], byte_offset, byte_length); } RETURN_NOT_OK(arrow::Concatenate(values_slices, pool_, &values_buffer)); - out_->buffers.push_back(values_buffer); + out_.buffers.push_back(values_buffer); return Status::OK(); } @@ -110,14 +107,14 @@ struct ConcatenateImpl { std::shared_ptr values_buffer, offset_buffer; std::vector value_ranges; RETURN_NOT_OK(ConcatenateOffsets(1, &offset_buffer, &value_ranges)); - out_->buffers.push_back(offset_buffer); + out_.buffers.push_back(offset_buffer); std::vector> values_slices(in_size_); for (int i = 0; i != in_size_; ++i) { values_slices[i] = - SliceBuffer(in_[i]->buffers[2], value_ranges[i].offset, value_ranges[i].length); + SliceBuffer(in_[i].buffers[2], value_ranges[i].offset, value_ranges[i].length); } RETURN_NOT_OK(arrow::Concatenate(values_slices, pool_, &values_buffer)); - out_->buffers.push_back(values_buffer); + out_.buffers.push_back(values_buffer); return Status::OK(); } @@ -125,48 +122,44 @@ struct ConcatenateImpl { std::shared_ptr offset_buffer; std::vector value_ranges; RETURN_NOT_OK(ConcatenateOffsets(1, &offset_buffer, &value_ranges)); - out_->buffers.push_back(offset_buffer); - std::vector> values_slices(in_size_); + out_.buffers.push_back(offset_buffer); + std::vector values_slices(in_size_); for (int i = 0; i != in_size_; ++i) { - values_slices[i] = SliceData(*in_[i]->child_data[0], value_ranges[i].offset, + values_slices[i] = SliceData(*in_[i].child_data[0], value_ranges[i].offset, value_ranges[i].length); } - std::shared_ptr values; - RETURN_NOT_OK(ConcatenateImpl(values_slices, pool_).Concatenate(&values)); - out_->child_data = {values}; + auto values = std::make_shared(); + RETURN_NOT_OK(ConcatenateImpl(values_slices, pool_).Concatenate(values.get())); + out_.child_data = {values}; return Status::OK(); } Status Visit(const StructType& s) { - std::vector> values_slices(in_size_); + std::vector values_slices(in_size_); for (int field_index = 0; field_index != s.num_children(); ++field_index) { for (int i = 0; i != in_size_; ++i) { values_slices[i] = - SliceData(*in_[i]->child_data[field_index], offsets_[i], lengths_[i]); + SliceData(*in_[i].child_data[field_index], offsets_[i], lengths_[i]); } - std::shared_ptr values; - RETURN_NOT_OK(ConcatenateImpl(values_slices, pool_).Concatenate(&values)); - out_->child_data.push_back(values); + auto values = std::make_shared(); + RETURN_NOT_OK(ConcatenateImpl(values_slices, pool_).Concatenate(values.get())); + out_.child_data.push_back(values); } return Status::OK(); } Status Visit(const DictionaryType& d) { - std::vector> indices_slices(in_size_); + std::vector indices_slices(in_size_); for (int i = 0; i != in_size_; ++i) { - indices_slices[i] = SliceData(*in_[i], offsets_[i], lengths_[i]); - indices_slices[i]->type = d.index_type(); + indices_slices[i] = SliceData(in_[i], offsets_[i], lengths_[i]); + indices_slices[i].type = d.index_type(); // don't bother concatenating null bitmaps again - indices_slices[i]->null_count = 0; - indices_slices[i]->buffers[0] = nullptr; + indices_slices[i].null_count = 0; + indices_slices[i].buffers[0] = nullptr; } - std::shared_ptr indices; + ArrayData indices; RETURN_NOT_OK(ConcatenateImpl(indices_slices, pool_).Concatenate(&indices)); - auto type = out_->type; - auto null_bitmap = out_->buffers[0]; - out_ = indices; - out_->type = type; - out_->buffers[0] = null_bitmap; + out_.buffers.push_back(indices.buffers[1]); return Status::OK(); } @@ -174,23 +167,22 @@ struct ConcatenateImpl { // type_codes are an index into child_data std::vector> codes_slices(in_size_); for (int i = 0; i != in_size_; ++i) { - codes_slices[i] = SliceBuffer(in_[i]->buffers[1], offsets_[i], lengths_[i]); + codes_slices[i] = SliceBuffer(in_[i].buffers[1], offsets_[i], lengths_[i]); } std::shared_ptr codes_buffer; RETURN_NOT_OK(arrow::Concatenate(codes_slices, pool_, &codes_buffer)); - out_->buffers.push_back(codes_buffer); if (u.mode() == UnionMode::SPARSE) { - std::vector> values_slices(in_size_); + std::vector values_slices(in_size_); for (int field_index = 0; field_index != u.num_children(); ++field_index) { for (int i = 0; i != in_size_; ++i) { values_slices[i] = - SliceData(*in_[i]->child_data[field_index], offsets_[i], lengths_[i]); + SliceData(*in_[i].child_data[field_index], offsets_[i], lengths_[i]); } - std::shared_ptr values; - RETURN_NOT_OK(ConcatenateImpl(values_slices, pool_).Concatenate(&values)); - out_->child_data.push_back(values); + auto values = std::make_shared(); + RETURN_NOT_OK(ConcatenateImpl(values_slices, pool_).Concatenate(values.get())); + out_.child_data.push_back(values); } - out_->buffers.push_back(nullptr); + out_.buffers.push_back(nullptr); return Status::OK(); } DCHECK_EQ(u.mode(), UnionMode::DENSE); @@ -202,8 +194,8 @@ struct ConcatenateImpl { return values_ranges[in_i * in_size_ + code]; }; for (int i = 0; i != in_size_; ++i) { - auto codes = in_[i]->buffers[1]->data(); - auto offsets = reinterpret_cast(in_[i]->buffers[2]->data()); + auto codes = in_[i].buffers[1]->data(); + auto offsets = reinterpret_cast(in_[i].buffers[2]->data()); for (auto index = offsets_[i]; index != offsets_[i] + lengths_[i]; ++index) { get_values_range(i, codes[index]).widen_to_include(offsets[index]); } @@ -211,17 +203,17 @@ struct ConcatenateImpl { // for each type_code, use the min/max offset as a slice range // and concatenate sliced data for that type_code - out_->child_data.resize(static_cast(max_code) + 1); + out_.child_data.resize(static_cast(max_code) + 1); for (auto code : u.type_codes()) { - std::vector> values_slices(in_size_); + std::vector values_slices(in_size_); for (int i = 0; i != in_size_; ++i) { auto values_range = get_values_range(i, code); - values_slices[i] = SliceData(*in_[i]->child_data[code], values_range.offset, - values_range.length); + values_slices[i] = + SliceData(*in_[i].child_data[code], values_range.offset, values_range.length); } - std::shared_ptr values; - RETURN_NOT_OK(ConcatenateImpl(values_slices, pool_).Concatenate(&values)); - out_->child_data[code] = values; + auto values = std::make_shared(); + RETURN_NOT_OK(ConcatenateImpl(values_slices, pool_).Concatenate(values.get())); + out_.child_data[code] = values; } // for each input array, adjust the offsets by the length of the slice @@ -229,11 +221,11 @@ struct ConcatenateImpl { // so that offsets point into the concatenated values std::vector total_lengths(static_cast(max_code) + 1, 0); std::shared_ptr offsets_buffer; - RETURN_NOT_OK(AllocateBuffer(pool_, out_->length * sizeof(int32_t), &offsets_buffer)); + RETURN_NOT_OK(AllocateBuffer(pool_, out_.length * sizeof(int32_t), &offsets_buffer)); auto raw_offsets = reinterpret_cast(offsets_buffer->mutable_data()); for (int i = 0; i != in_size_; ++i) { - auto codes = in_[i]->buffers[1]->data(); - auto offsets = reinterpret_cast(in_[i]->buffers[2]->data()); + auto codes = in_[i].buffers[1]->data(); + auto offsets = reinterpret_cast(in_[i].buffers[2]->data()); for (auto index = offsets_[i]; index != offsets_[i] + lengths_[i]; ++index) { auto min_offset = get_values_range(i, codes[index]).offset; *raw_offsets++ = offsets[index] - min_offset + total_lengths[codes[index]]; @@ -242,32 +234,32 @@ struct ConcatenateImpl { total_lengths[code] += get_values_range(i, code).length; } } - out_->buffers.push_back(offsets_buffer); + out_.buffers.push_back(offsets_buffer); return Status::OK(); } Status Visit(const ExtensionType&) { - // XXX can we just concatenate their storage in general? + // XXX can we just concatenate their storage? return Status::NotImplemented("concatenation of extension arrays"); } - Status Concatenate(std::shared_ptr* out) && { + Status Concatenate(ArrayData* out) && { std::shared_ptr null_bitmap; - if (out_->null_count != 0) { + if (out_.null_count != 0) { RETURN_NOT_OK(ConcatenateBitmaps(0, &null_bitmap)); } - out_->buffers = {null_bitmap}; - RETURN_NOT_OK(VisitTypeInline(*out_->type, this)); + out_.buffers = {null_bitmap}; + RETURN_NOT_OK(VisitTypeInline(*out_.type, this)); *out = std::move(out_); return Status::OK(); } Status ConcatenateBitmaps(int index, std::shared_ptr* bitmap_buffer) { - RETURN_NOT_OK(AllocateBitmap(pool_, out_->length, bitmap_buffer)); + RETURN_NOT_OK(AllocateBitmap(pool_, out_.length, bitmap_buffer)); uint8_t* bitmap_data = (*bitmap_buffer)->mutable_data(); int64_t bitmap_offset = 0; for (int i = 0; i != in_size_; ++i) { - if (auto bitmap = in_[i]->buffers[0]) { + if (auto bitmap = in_[i].buffers[0]) { internal::CopyBitmap(bitmap->data(), offsets_[i], lengths_[i], bitmap_data, bitmap_offset); } else { @@ -275,8 +267,8 @@ struct ConcatenateImpl { } bitmap_offset += lengths_[i]; } - if (auto preceding_bits = BitUtil::kPrecedingBitmask[out_->length % 8]) { - bitmap_data[out_->length / 8] &= preceding_bits; + if (auto preceding_bits = BitUtil::kPrecedingBitmask[out_.length % 8]) { + bitmap_data[out_.length / 8] &= preceding_bits; } return Status::OK(); } @@ -288,12 +280,12 @@ struct ConcatenateImpl { Status ConcatenateOffsets(int index, std::shared_ptr* offset_buffer, std::vector* ranges) { RETURN_NOT_OK( - AllocateBuffer(pool_, (out_->length + 1) * sizeof(int32_t), offset_buffer)); + AllocateBuffer(pool_, (out_.length + 1) * sizeof(int32_t), offset_buffer)); auto dst_offsets = reinterpret_cast((*offset_buffer)->mutable_data()); int32_t total_length = 0; ranges->resize(in_size_); for (int i = 0; i != in_size_; ++i) { - auto src_offsets_begin = in_[i]->GetValues(index) + offsets_[i]; + auto src_offsets_begin = in_[i].GetValues(index) + offsets_[i]; auto src_offsets_end = src_offsets_begin + lengths_[i]; auto first_offset = src_offsets_begin[0]; auto length = *src_offsets_end - first_offset; @@ -313,27 +305,27 @@ struct ConcatenateImpl { return Status::OK(); } - const std::vector>& in_; + const std::vector& in_; int in_size_; std::vector lengths_, offsets_; MemoryPool* pool_; - std::shared_ptr out_; + ArrayData out_; }; Status Concatenate(const std::vector>& arrays, MemoryPool* pool, std::shared_ptr* out) { DCHECK_GT(arrays.size(), 0); - std::vector> data(arrays.size()); + std::vector data(arrays.size()); for (std::size_t i = 0; i != arrays.size(); ++i) { if (!arrays[i]->type()->Equals(*arrays[0]->type())) { return Status::Invalid("arrays to be concatentated must be identically typed, but ", *arrays[0]->type(), " and ", *arrays[i]->type(), " were encountered."); } - data[i] = arrays[i]->data(); + data[i] = ArrayData(*arrays[i]->data()); } - std::shared_ptr out_data; - RETURN_NOT_OK(ConcatenateImpl(data, pool).Concatenate(&out_data)); + auto out_data = std::make_shared(); + RETURN_NOT_OK(ConcatenateImpl(data, pool).Concatenate(out_data.get())); *out = MakeArray(std::move(out_data)); return Status::OK(); } From acc6c08d9d90c91803fd2c1a656e376590205b57 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 27 Feb 2019 12:50:20 -0500 Subject: [PATCH 07/21] Improve Concatenate testing - test concatenation of randomly generated array slices - ensure some of those array slices will have offset != 0 - test all primitive types, StringType, ListType, StructType, and DictionaryType - add disabled test for UnionType --- cpp/src/arrow/array-test.cc | 164 +++++++++++++++++++++++++----- cpp/src/arrow/util/concatenate.cc | 11 +- 2 files changed, 143 insertions(+), 32 deletions(-) diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 5358f2dab45..8a8d32e50d2 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -1807,35 +1807,147 @@ struct ConcatenateParam { } }; -class ConcatenateTest : public ::testing::TestWithParam { +class ConcatenateTest : public ::testing::Test { + protected: + ConcatenateTest() + : rng_(seed_), + sizes_({0, 1, 2, 4, 16, 31, 1234}), + null_probabilities_({0.0, 0.1, 0.5, 0.9, 1.0}) {} + + std::vector Offsets(int32_t length, int32_t slice_count) { + std::vector offsets(static_cast(slice_count + 1)); + std::mt19937_64 gen(seed_); + std::uniform_int_distribution dist(0, length); + std::generate(offsets.begin(), offsets.end(), [&] { return dist(gen); }); + std::sort(offsets.begin(), offsets.end()); + return offsets; + } + + std::vector> Slices(const std::shared_ptr& array, + const std::vector& offsets) { + std::vector> slices(offsets.size() - 1); + for (size_t i = 0; i != slices.size(); ++i) { + slices[i] = array->Slice(offsets[i], offsets[i + 1] - offsets[i]); + } + return slices; + } + + template + std::shared_ptr GeneratePrimitive(int64_t size, double null_probability) { + if (std::is_same::value) { + return rng_.Boolean(size, 0.5, null_probability); + } + return rng_.Numeric(size, 0, 127, null_probability); + } + + template + void Check(ArrayFactory&& factory) { + for (auto size : this->sizes_) { + auto offsets = this->Offsets(size, 3); + for (auto null_probability : this->null_probabilities_) { + std::shared_ptr array; + factory(size, null_probability, &array); + auto expected = array->Slice(offsets.front(), offsets.back() - offsets.front()); + auto slices = this->Slices(array, offsets); + std::shared_ptr actual; + ASSERT_OK(Concatenate(slices, default_memory_pool(), &actual)); + // force computation of null_count for both arrays + ARROW_IGNORE_EXPR(expected->null_count()); + ARROW_IGNORE_EXPR(actual->null_count()); + AssertArraysEqual(*expected, *actual); + } + } + } + + random::SeedType seed_ = 0xdeadbeef; + random::RandomArrayGenerator rng_; + std::vector sizes_; + std::vector null_probabilities_; +}; + +template +class PrimitiveConcatenateTest : public ConcatenateTest { public: - ConcatenateTest() {} + void operator()(int64_t size, double null_probability, std::shared_ptr* out) { + *out = this->GeneratePrimitive(size, null_probability); + } }; -TEST_P(ConcatenateTest, Basics) { - auto param = GetParam(); - std::shared_ptr actual; - ASSERT_OK(Concatenate(param.addends, default_memory_pool(), &actual)); - AssertArraysEqual(*param.expected, *actual); -} - -INSTANTIATE_TEST_CASE_P( - ConcatenateTest, ConcatenateTest, - ::testing::Values( - ConcatenateParam(int32(), "[0, 1, 2, 3]", {"[0, 1]", "[2, 3]"}), - ConcatenateParam(float64(), "[0, 1, 2, 3]", {"[0, 1]", "[2, 3]"}), - ConcatenateParam(uint8(), "[0, 1, 2, 3]", {"[0, 1]", "[2, 3]"}), - ConcatenateParam(list(uint8()), "[[], [0, 1], [2], [3]]", - {"[[], [0, 1]]", "[[2], [3]]"}), - ConcatenateParam(utf8(), R"(["a", "b", "c", "d"])", - {R"(["a", "b"])", R"(["c", "d"])"}), - ConcatenateParam( - struct_({field("strings", utf8()), field("ints", int64())}), - R"([{"strings":"a", "ints":0}, {"strings":"b", "ints":1}, {"strings":"c", "ints":2}, {"strings":"d", "ints":3}])", - {R"([{"strings":"a", "ints":0}, {"strings":"b", "ints":1}])", - R"([{"strings":"c", "ints":2}, {"strings":"d", "ints":3}])"}))); - -TEST(Concatenate, OffsetOverflow) { +using PrimitiveTypes = + ::testing::Types; +TYPED_TEST_CASE(PrimitiveConcatenateTest, PrimitiveTypes); + +TYPED_TEST(PrimitiveConcatenateTest, Primitives) { this->Check(*this); } + +TEST_F(ConcatenateTest, StringType) { + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + auto values_size = size * 4; + auto char_array = this->GeneratePrimitive(values_size, null_probability); + std::shared_ptr offsets; + auto offsets_vector = this->Offsets(values_size, size); + // ensure the first offset is 0, which is expected for StringType + offsets_vector[0] = 0; + ASSERT_OK(CopyBufferFromVector(offsets_vector, default_memory_pool(), &offsets)); + *out = MakeArray(ArrayData::Make( + utf8(), size, + {char_array->data()->buffers[0], offsets, char_array->data()->buffers[1]})); + }); +} + +TEST_F(ConcatenateTest, ListType) { + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + auto values_size = size * 4; + auto values = this->GeneratePrimitive(values_size, null_probability); + auto offsets_vector = this->Offsets(values_size, size); + // ensure the first offset is 0, which is expected for ListType + offsets_vector[0] = 0; + std::shared_ptr offsets; + ArrayFromVector(offsets_vector, &offsets); + ASSERT_OK(ListArray::FromArrays(*offsets, *values, default_memory_pool(), out)); + }); +} + +TEST_F(ConcatenateTest, StructType) { + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + auto foo = this->GeneratePrimitive(size, null_probability); + auto bar = this->GeneratePrimitive(size, null_probability); + auto baz = this->GeneratePrimitive(size, null_probability); + *out = std::make_shared( + struct_({field("foo", int8()), field("bar", float64()), field("baz", boolean())}), + size, ArrayVector{foo, bar, baz}); + }); +} + +TEST_F(ConcatenateTest, DictionaryType) { + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + auto indices = this->GeneratePrimitive(size, null_probability); + auto type = dictionary(int32(), this->GeneratePrimitive(128, 0)); + *out = std::make_shared(type, indices); + }); +} + +TEST_F(ConcatenateTest, DISABLED_UnionType) { + // sparse mode + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + auto foo = this->GeneratePrimitive(size, null_probability); + auto bar = this->GeneratePrimitive(size, null_probability); + auto baz = this->GeneratePrimitive(size, null_probability); + auto type_ids = rng_.Numeric(size, 0, 2, null_probability); + ASSERT_OK(UnionArray::MakeSparse(*type_ids, {foo, bar, baz}, out)); + }); + // dense mode + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + auto foo = this->GeneratePrimitive(size, null_probability); + auto bar = this->GeneratePrimitive(size, null_probability); + auto baz = this->GeneratePrimitive(size, null_probability); + auto type_ids = rng_.Numeric(size, 0, 2, null_probability); + auto value_offsets = rng_.Numeric(size, 0, size, 0); + ASSERT_OK(UnionArray::MakeDense(*type_ids, *value_offsets, {foo, bar, baz}, out)); + }); +} + +TEST_F(ConcatenateTest, OffsetOverflow) { auto fake_long = ArrayFromJSON(utf8(), "[\"\"]"); fake_long->data()->GetMutableValues(1)[1] = std::numeric_limits::max(); diff --git a/cpp/src/arrow/util/concatenate.cc b/cpp/src/arrow/util/concatenate.cc index f77484ee658..d430ad9332a 100644 --- a/cpp/src/arrow/util/concatenate.cc +++ b/cpp/src/arrow/util/concatenate.cc @@ -165,6 +165,8 @@ struct ConcatenateImpl { Status Visit(const UnionType& u) { // type_codes are an index into child_data + return Status::NotImplemented("concatenation of ", u); + /* std::vector> codes_slices(in_size_); for (int i = 0; i != in_size_; ++i) { codes_slices[i] = SliceBuffer(in_[i].buffers[1], offsets_[i], lengths_[i]); @@ -236,6 +238,7 @@ struct ConcatenateImpl { } out_.buffers.push_back(offsets_buffer); return Status::OK(); + */ } Status Visit(const ExtensionType&) { @@ -259,7 +262,7 @@ struct ConcatenateImpl { uint8_t* bitmap_data = (*bitmap_buffer)->mutable_data(); int64_t bitmap_offset = 0; for (int i = 0; i != in_size_; ++i) { - if (auto bitmap = in_[i].buffers[0]) { + if (auto bitmap = in_[i].buffers[index]) { internal::CopyBitmap(bitmap->data(), offsets_[i], lengths_[i], bitmap_data, bitmap_offset); } else { @@ -273,10 +276,6 @@ struct ConcatenateImpl { return Status::OK(); } - // FIXME the below assumes that the first offset in the inputs will always be 0, which - // isn't necessarily correct. Accumulating first and last offsets will be necessary - // because we need to slice the referenced data (child_data for lists, values_buffer for - // strings) Status ConcatenateOffsets(int index, std::shared_ptr* offset_buffer, std::vector* ranges) { RETURN_NOT_OK( @@ -285,7 +284,7 @@ struct ConcatenateImpl { int32_t total_length = 0; ranges->resize(in_size_); for (int i = 0; i != in_size_; ++i) { - auto src_offsets_begin = in_[i].GetValues(index) + offsets_[i]; + auto src_offsets_begin = in_[i].GetValues(index); auto src_offsets_end = src_offsets_begin + lengths_[i]; auto first_offset = src_offsets_begin[0]; auto length = *src_offsets_end - first_offset; From 5803f1d4569bd191a579775142c852117cb7058c Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 27 Feb 2019 12:59:14 -0500 Subject: [PATCH 08/21] remove ConcatenateParam, fix concat(Dictionary) --- cpp/src/arrow/array-test.cc | 21 --------------------- cpp/src/arrow/util/concatenate.cc | 2 +- 2 files changed, 1 insertion(+), 22 deletions(-) diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 8a8d32e50d2..8e14a7a7d64 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -1786,27 +1786,6 @@ TEST(TestRechunkArraysConsistently, Plain) { } } -// TODO this could be made much better. -// First, if the concatenation of addends equals the expected then -// the concatenation of their JSON representations must equal that of the -// expected array, so we only need to provide the addends as strings. -// More generally, Concatenate is an excellent candidate for property testing: -// generate random addends then concatenate them using their json repr and -// Concatenate, then compare. -struct ConcatenateParam { - std::shared_ptr expected; - std::vector> addends; - - ConcatenateParam(std::shared_ptr expected_type, - const std::string& expected_json, - const std::vector& addends_json) - : expected(ArrayFromJSON(expected_type, expected_json)) { - for (const auto& addend : addends_json) { - addends.push_back(ArrayFromJSON(expected_type, addend)); - } - } -}; - class ConcatenateTest : public ::testing::Test { protected: ConcatenateTest() diff --git a/cpp/src/arrow/util/concatenate.cc b/cpp/src/arrow/util/concatenate.cc index d430ad9332a..23eef16d5f6 100644 --- a/cpp/src/arrow/util/concatenate.cc +++ b/cpp/src/arrow/util/concatenate.cc @@ -151,7 +151,7 @@ struct ConcatenateImpl { Status Visit(const DictionaryType& d) { std::vector indices_slices(in_size_); for (int i = 0; i != in_size_; ++i) { - indices_slices[i] = SliceData(in_[i], offsets_[i], lengths_[i]); + indices_slices[i] = ArrayData(in_[i]); indices_slices[i].type = d.index_type(); // don't bother concatenating null bitmaps again indices_slices[i].null_count = 0; From d8ba14d4d20c9ccf8d856eb37268d364511a37f7 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 27 Feb 2019 13:03:28 -0500 Subject: [PATCH 09/21] add lint #includes --- cpp/src/arrow/util/concatenate.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/arrow/util/concatenate.cc b/cpp/src/arrow/util/concatenate.cc index 23eef16d5f6..b633da82ec4 100644 --- a/cpp/src/arrow/util/concatenate.cc +++ b/cpp/src/arrow/util/concatenate.cc @@ -17,7 +17,10 @@ #include "arrow/util/concatenate.h" +#include +#include #include +#include #include #include "arrow/array.h" From 76ac1017d8157e71ade7fd5fa2fb5a70424dad7d Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 28 Feb 2019 11:07:33 -0500 Subject: [PATCH 10/21] assert trailing bits are zeroed in bitmap buffers vagrind was complaining about uninitialized values due to the way CopyBitmap restores trailing bits; extended CopyBitmap to (optionally) elide that step --- cpp/src/arrow/array-test.cc | 14 ++++++++++++++ cpp/src/arrow/util/bit-util.cc | 8 ++++++-- cpp/src/arrow/util/bit-util.h | 3 ++- cpp/src/arrow/util/concatenate.cc | 2 +- 4 files changed, 23 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 8e14a7a7d64..ada9df822b4 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -1819,6 +1819,14 @@ class ConcatenateTest : public ::testing::Test { return rng_.Numeric(size, 0, 127, null_probability); } + void CheckTrailingBitsAreZeroed(const std::shared_ptr& bitmap, int64_t length) { + if (auto preceding_bits = BitUtil::kPrecedingBitmask[length % 8]) { + auto last_byte = bitmap->data()[length / 8]; + ASSERT_EQ(static_cast(last_byte & preceding_bits), last_byte) + << length << " " << int(preceding_bits); + } + } + template void Check(ArrayFactory&& factory) { for (auto size : this->sizes_) { @@ -1834,6 +1842,12 @@ class ConcatenateTest : public ::testing::Test { ARROW_IGNORE_EXPR(expected->null_count()); ARROW_IGNORE_EXPR(actual->null_count()); AssertArraysEqual(*expected, *actual); + if (actual->data()->buffers[0]) { + CheckTrailingBitsAreZeroed(actual->data()->buffers[0], actual->length()); + } + if (actual->type_id() == Type::BOOL) { + CheckTrailingBitsAreZeroed(actual->data()->buffers[1], actual->length()); + } } } } diff --git a/cpp/src/arrow/util/bit-util.cc b/cpp/src/arrow/util/bit-util.cc index 2e7bf2f03d5..033267beb45 100644 --- a/cpp/src/arrow/util/bit-util.cc +++ b/cpp/src/arrow/util/bit-util.cc @@ -210,8 +210,12 @@ Status TransferBitmap(MemoryPool* pool, const uint8_t* data, int64_t offset, } void CopyBitmap(const uint8_t* data, int64_t offset, int64_t length, uint8_t* dest, - int64_t dest_offset) { - TransferBitmap(data, offset, length, dest_offset, dest); + int64_t dest_offset, bool restore_trailing_bits) { + if (restore_trailing_bits) { + TransferBitmap(data, offset, length, dest_offset, dest); + } else { + TransferBitmap(data, offset, length, dest_offset, dest); + } } void InvertBitmap(const uint8_t* data, int64_t offset, int64_t length, uint8_t* dest, diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index 22bf8fc858b..b7de112b85c 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -708,11 +708,12 @@ Status CopyBitmap(MemoryPool* pool, const uint8_t* bitmap, int64_t offset, int64 /// \param[in] offset bit offset into the source data /// \param[in] length number of bits to copy /// \param[in] dest_offset bit offset into the destination +/// \param[in] restore_trailing_bits don't clobber bits outside the destination range /// \param[out] dest the destination buffer, must have at least space for /// (offset + length) bits ARROW_EXPORT void CopyBitmap(const uint8_t* bitmap, int64_t offset, int64_t length, uint8_t* dest, - int64_t dest_offset); + int64_t dest_offset, bool restore_trailing_bits = true); /// Invert a bit range of an existing bitmap into an existing bitmap /// diff --git a/cpp/src/arrow/util/concatenate.cc b/cpp/src/arrow/util/concatenate.cc index b633da82ec4..e3216387983 100644 --- a/cpp/src/arrow/util/concatenate.cc +++ b/cpp/src/arrow/util/concatenate.cc @@ -267,7 +267,7 @@ struct ConcatenateImpl { for (int i = 0; i != in_size_; ++i) { if (auto bitmap = in_[i].buffers[index]) { internal::CopyBitmap(bitmap->data(), offsets_[i], lengths_[i], bitmap_data, - bitmap_offset); + bitmap_offset, false); } else { BitUtil::SetBitsTo(bitmap_data, bitmap_offset, lengths_[i], true); } From 94460a8ffa7a9c737e0689ffbeea3c1671c514e6 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 28 Feb 2019 15:42:31 -0500 Subject: [PATCH 11/21] remove offsets_, lengths_, range->Range --- cpp/src/arrow/util/concatenate.cc | 161 +++++++----------------------- 1 file changed, 37 insertions(+), 124 deletions(-) diff --git a/cpp/src/arrow/util/concatenate.cc b/cpp/src/arrow/util/concatenate.cc index e3216387983..00ce7b3e460 100644 --- a/cpp/src/arrow/util/concatenate.cc +++ b/cpp/src/arrow/util/concatenate.cc @@ -45,16 +45,10 @@ static inline ArrayData SliceData(const ArrayData& data, int64_t offset, int64_t struct ConcatenateImpl { ConcatenateImpl(const std::vector& in, MemoryPool* pool) - : in_(in), - in_size_(static_cast(in.size())), - lengths_(in.size()), - offsets_(in.size()), - pool_(pool) { + : in_(in), pool_(pool) { out_.type = in[0].type; - for (int i = 0; i != in_size_; ++i) { - lengths_[i] = in[i].length; - offsets_[i] = in[i].offset; - out_.length += lengths_[i]; + for (int i = 0; i != in_size(); ++i) { + out_.length += in[i].length; if (out_.null_count == kUnknownNullCount || in[i].null_count == kUnknownNullCount) { out_.null_count = kUnknownNullCount; continue; @@ -63,22 +57,12 @@ struct ConcatenateImpl { } } - /// offset, length pair for representing a range of a buffer or array - struct range { + /// offset, length pair for representing a Range of a buffer or array + struct Range { int32_t offset, length; - range() : offset(-1), length(0) {} - range(int32_t o, int32_t l) : offset(o), length(l) {} - void widen_to_include(int32_t index) { - if (length == 0) { - offset = index; - length = 1; - return; - } - auto end = std::max(offset + length, index + 1); - length = end - offset; - offset = std::min(index, offset); - } + Range() : offset(-1), length(0) {} + Range(int32_t o, int32_t l) : offset(o), length(l) {} }; Status Visit(const NullType&) { return Status::OK(); } @@ -95,10 +79,10 @@ struct ConcatenateImpl { DCHECK_EQ(fixed.bit_width() % 8, 0); const int byte_width = fixed.bit_width() / 8; std::shared_ptr values_buffer; - std::vector> values_slices(in_size_); - for (int i = 0; i != in_size_; ++i) { - auto byte_length = byte_width * lengths_[i]; - auto byte_offset = byte_width * offsets_[i]; + std::vector> values_slices(in_size()); + for (int i = 0; i != in_size(); ++i) { + auto byte_length = byte_width * in_length(i); + auto byte_offset = byte_width * in_offset(i); values_slices[i] = SliceBuffer(in_[i].buffers[1], byte_offset, byte_length); } RETURN_NOT_OK(arrow::Concatenate(values_slices, pool_, &values_buffer)); @@ -108,11 +92,11 @@ struct ConcatenateImpl { Status Visit(const BinaryType&) { std::shared_ptr values_buffer, offset_buffer; - std::vector value_ranges; + std::vector value_ranges; RETURN_NOT_OK(ConcatenateOffsets(1, &offset_buffer, &value_ranges)); out_.buffers.push_back(offset_buffer); - std::vector> values_slices(in_size_); - for (int i = 0; i != in_size_; ++i) { + std::vector> values_slices(in_size()); + for (int i = 0; i != in_size(); ++i) { values_slices[i] = SliceBuffer(in_[i].buffers[2], value_ranges[i].offset, value_ranges[i].length); } @@ -123,11 +107,11 @@ struct ConcatenateImpl { Status Visit(const ListType&) { std::shared_ptr offset_buffer; - std::vector value_ranges; + std::vector value_ranges; RETURN_NOT_OK(ConcatenateOffsets(1, &offset_buffer, &value_ranges)); out_.buffers.push_back(offset_buffer); - std::vector values_slices(in_size_); - for (int i = 0; i != in_size_; ++i) { + std::vector values_slices(in_size()); + for (int i = 0; i != in_size(); ++i) { values_slices[i] = SliceData(*in_[i].child_data[0], value_ranges[i].offset, value_ranges[i].length); } @@ -138,11 +122,11 @@ struct ConcatenateImpl { } Status Visit(const StructType& s) { - std::vector values_slices(in_size_); + std::vector values_slices(in_size()); for (int field_index = 0; field_index != s.num_children(); ++field_index) { - for (int i = 0; i != in_size_; ++i) { + for (int i = 0; i != in_size(); ++i) { values_slices[i] = - SliceData(*in_[i].child_data[field_index], offsets_[i], lengths_[i]); + SliceData(*in_[i].child_data[field_index], in_offset(i), in_length(i)); } auto values = std::make_shared(); RETURN_NOT_OK(ConcatenateImpl(values_slices, pool_).Concatenate(values.get())); @@ -152,8 +136,8 @@ struct ConcatenateImpl { } Status Visit(const DictionaryType& d) { - std::vector indices_slices(in_size_); - for (int i = 0; i != in_size_; ++i) { + std::vector indices_slices(in_size()); + for (int i = 0; i != in_size(); ++i) { indices_slices[i] = ArrayData(in_[i]); indices_slices[i].type = d.index_type(); // don't bother concatenating null bitmaps again @@ -169,84 +153,11 @@ struct ConcatenateImpl { Status Visit(const UnionType& u) { // type_codes are an index into child_data return Status::NotImplemented("concatenation of ", u); - /* - std::vector> codes_slices(in_size_); - for (int i = 0; i != in_size_; ++i) { - codes_slices[i] = SliceBuffer(in_[i].buffers[1], offsets_[i], lengths_[i]); - } - std::shared_ptr codes_buffer; - RETURN_NOT_OK(arrow::Concatenate(codes_slices, pool_, &codes_buffer)); - if (u.mode() == UnionMode::SPARSE) { - std::vector values_slices(in_size_); - for (int field_index = 0; field_index != u.num_children(); ++field_index) { - for (int i = 0; i != in_size_; ++i) { - values_slices[i] = - SliceData(*in_[i].child_data[field_index], offsets_[i], lengths_[i]); - } - auto values = std::make_shared(); - RETURN_NOT_OK(ConcatenateImpl(values_slices, pool_).Concatenate(values.get())); - out_.child_data.push_back(values); - } - out_.buffers.push_back(nullptr); - return Status::OK(); - } - DCHECK_EQ(u.mode(), UnionMode::DENSE); - - // build mapping from (input, type_code)->the range of values actually referenced - auto max_code = *std::max_element(u.type_codes().begin(), u.type_codes().end()); - std::vector values_ranges((static_cast(max_code) + 1) * in_size_); - auto get_values_range = [&](int in_i, UnionArray::type_id_t code) -> range& { - return values_ranges[in_i * in_size_ + code]; - }; - for (int i = 0; i != in_size_; ++i) { - auto codes = in_[i].buffers[1]->data(); - auto offsets = reinterpret_cast(in_[i].buffers[2]->data()); - for (auto index = offsets_[i]; index != offsets_[i] + lengths_[i]; ++index) { - get_values_range(i, codes[index]).widen_to_include(offsets[index]); - } - } - - // for each type_code, use the min/max offset as a slice range - // and concatenate sliced data for that type_code - out_.child_data.resize(static_cast(max_code) + 1); - for (auto code : u.type_codes()) { - std::vector values_slices(in_size_); - for (int i = 0; i != in_size_; ++i) { - auto values_range = get_values_range(i, code); - values_slices[i] = - SliceData(*in_[i].child_data[code], values_range.offset, values_range.length); - } - auto values = std::make_shared(); - RETURN_NOT_OK(ConcatenateImpl(values_slices, pool_).Concatenate(values.get())); - out_.child_data[code] = values; - } - - // for each input array, adjust the offsets by the length of the slice - // range for a given type code and the minimum offset in that input array, - // so that offsets point into the concatenated values - std::vector total_lengths(static_cast(max_code) + 1, 0); - std::shared_ptr offsets_buffer; - RETURN_NOT_OK(AllocateBuffer(pool_, out_.length * sizeof(int32_t), &offsets_buffer)); - auto raw_offsets = reinterpret_cast(offsets_buffer->mutable_data()); - for (int i = 0; i != in_size_; ++i) { - auto codes = in_[i].buffers[1]->data(); - auto offsets = reinterpret_cast(in_[i].buffers[2]->data()); - for (auto index = offsets_[i]; index != offsets_[i] + lengths_[i]; ++index) { - auto min_offset = get_values_range(i, codes[index]).offset; - *raw_offsets++ = offsets[index] - min_offset + total_lengths[codes[index]]; - } - for (auto code : u.type_codes()) { - total_lengths[code] += get_values_range(i, code).length; - } - } - out_.buffers.push_back(offsets_buffer); - return Status::OK(); - */ } - Status Visit(const ExtensionType&) { + Status Visit(const ExtensionType& e) { // XXX can we just concatenate their storage? - return Status::NotImplemented("concatenation of extension arrays"); + return Status::NotImplemented("concatenation of ", e); } Status Concatenate(ArrayData* out) && { @@ -264,14 +175,14 @@ struct ConcatenateImpl { RETURN_NOT_OK(AllocateBitmap(pool_, out_.length, bitmap_buffer)); uint8_t* bitmap_data = (*bitmap_buffer)->mutable_data(); int64_t bitmap_offset = 0; - for (int i = 0; i != in_size_; ++i) { + for (int i = 0; i != in_size(); ++i) { if (auto bitmap = in_[i].buffers[index]) { - internal::CopyBitmap(bitmap->data(), offsets_[i], lengths_[i], bitmap_data, + internal::CopyBitmap(bitmap->data(), in_offset(i), in_length(i), bitmap_data, bitmap_offset, false); } else { - BitUtil::SetBitsTo(bitmap_data, bitmap_offset, lengths_[i], true); + BitUtil::SetBitsTo(bitmap_data, bitmap_offset, in_length(i), true); } - bitmap_offset += lengths_[i]; + bitmap_offset += in_length(i); } if (auto preceding_bits = BitUtil::kPrecedingBitmask[out_.length % 8]) { bitmap_data[out_.length / 8] &= preceding_bits; @@ -280,15 +191,15 @@ struct ConcatenateImpl { } Status ConcatenateOffsets(int index, std::shared_ptr* offset_buffer, - std::vector* ranges) { + std::vector* ranges) { RETURN_NOT_OK( AllocateBuffer(pool_, (out_.length + 1) * sizeof(int32_t), offset_buffer)); auto dst_offsets = reinterpret_cast((*offset_buffer)->mutable_data()); int32_t total_length = 0; - ranges->resize(in_size_); - for (int i = 0; i != in_size_; ++i) { + ranges->resize(in_size()); + for (int i = 0; i != in_size(); ++i) { auto src_offsets_begin = in_[i].GetValues(index); - auto src_offsets_end = src_offsets_begin + lengths_[i]; + auto src_offsets_end = src_offsets_begin + in_length(i); auto first_offset = src_offsets_begin[0]; auto length = *src_offsets_end - first_offset; ranges->at(i).offset = first_offset; @@ -301,15 +212,17 @@ struct ConcatenateImpl { return offset - first_offset + total_length; }); total_length += length; - dst_offsets += lengths_[i]; + dst_offsets += in_length(i); } *dst_offsets = total_length; return Status::OK(); } + int in_size() const { return static_cast(in_.size()); } + int64_t in_offset(int i) const { return in_[i].offset; } + int64_t in_length(int i) const { return in_[i].length; } + const std::vector& in_; - int in_size_; - std::vector lengths_, offsets_; MemoryPool* pool_; ArrayData out_; }; From aa3d3996d63edf93db05af1fdb7fa444b0a818b8 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 28 Feb 2019 16:05:24 -0500 Subject: [PATCH 12/21] get out_ correctly shaped in constructor --- cpp/src/arrow/util/concatenate.cc | 36 +++++++++++++------------------ 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/cpp/src/arrow/util/concatenate.cc b/cpp/src/arrow/util/concatenate.cc index 00ce7b3e460..ee7cd0c1333 100644 --- a/cpp/src/arrow/util/concatenate.cc +++ b/cpp/src/arrow/util/concatenate.cc @@ -55,6 +55,8 @@ struct ConcatenateImpl { } out_.null_count += in[i].null_count; } + out_.buffers.resize(in[0].buffers.size()); + out_.child_data.resize(in[0].child_data.size()); } /// offset, length pair for representing a Range of a buffer or array @@ -69,8 +71,7 @@ struct ConcatenateImpl { Status Visit(const BooleanType&) { std::shared_ptr values_buffer; - RETURN_NOT_OK(ConcatenateBitmaps(1, &values_buffer)); - out_.buffers.push_back(values_buffer); + RETURN_NOT_OK(ConcatenateBitmaps(1, &out_.buffers[1])); return Status::OK(); } @@ -78,46 +79,39 @@ struct ConcatenateImpl { Status Visit(const FixedWidthType& fixed) { DCHECK_EQ(fixed.bit_width() % 8, 0); const int byte_width = fixed.bit_width() / 8; - std::shared_ptr values_buffer; std::vector> values_slices(in_size()); for (int i = 0; i != in_size(); ++i) { auto byte_length = byte_width * in_length(i); auto byte_offset = byte_width * in_offset(i); values_slices[i] = SliceBuffer(in_[i].buffers[1], byte_offset, byte_length); } - RETURN_NOT_OK(arrow::Concatenate(values_slices, pool_, &values_buffer)); - out_.buffers.push_back(values_buffer); + RETURN_NOT_OK(arrow::Concatenate(values_slices, pool_, &out_.buffers[1])); return Status::OK(); } Status Visit(const BinaryType&) { - std::shared_ptr values_buffer, offset_buffer; std::vector value_ranges; - RETURN_NOT_OK(ConcatenateOffsets(1, &offset_buffer, &value_ranges)); - out_.buffers.push_back(offset_buffer); + RETURN_NOT_OK(ConcatenateOffsets(1, &out_.buffers[1], &value_ranges)); std::vector> values_slices(in_size()); for (int i = 0; i != in_size(); ++i) { values_slices[i] = SliceBuffer(in_[i].buffers[2], value_ranges[i].offset, value_ranges[i].length); } - RETURN_NOT_OK(arrow::Concatenate(values_slices, pool_, &values_buffer)); - out_.buffers.push_back(values_buffer); + RETURN_NOT_OK(arrow::Concatenate(values_slices, pool_, &out_.buffers[2])); return Status::OK(); } Status Visit(const ListType&) { - std::shared_ptr offset_buffer; std::vector value_ranges; - RETURN_NOT_OK(ConcatenateOffsets(1, &offset_buffer, &value_ranges)); - out_.buffers.push_back(offset_buffer); + RETURN_NOT_OK(ConcatenateOffsets(1, &out_.buffers[1], &value_ranges)); std::vector values_slices(in_size()); for (int i = 0; i != in_size(); ++i) { values_slices[i] = SliceData(*in_[i].child_data[0], value_ranges[i].offset, value_ranges[i].length); } - auto values = std::make_shared(); - RETURN_NOT_OK(ConcatenateImpl(values_slices, pool_).Concatenate(values.get())); - out_.child_data = {values}; + out_.child_data[0] = std::make_shared(); + RETURN_NOT_OK( + ConcatenateImpl(values_slices, pool_).Concatenate(out_.child_data[0].get())); return Status::OK(); } @@ -128,9 +122,9 @@ struct ConcatenateImpl { values_slices[i] = SliceData(*in_[i].child_data[field_index], in_offset(i), in_length(i)); } - auto values = std::make_shared(); - RETURN_NOT_OK(ConcatenateImpl(values_slices, pool_).Concatenate(values.get())); - out_.child_data.push_back(values); + out_.child_data[field_index] = std::make_shared(); + RETURN_NOT_OK(ConcatenateImpl(values_slices, pool_) + .Concatenate(out_.child_data[field_index].get())); } return Status::OK(); } @@ -146,7 +140,7 @@ struct ConcatenateImpl { } ArrayData indices; RETURN_NOT_OK(ConcatenateImpl(indices_slices, pool_).Concatenate(&indices)); - out_.buffers.push_back(indices.buffers[1]); + out_.buffers[1] = indices.buffers[1]; return Status::OK(); } @@ -165,7 +159,7 @@ struct ConcatenateImpl { if (out_.null_count != 0) { RETURN_NOT_OK(ConcatenateBitmaps(0, &null_bitmap)); } - out_.buffers = {null_bitmap}; + out_.buffers[0] = null_bitmap; RETURN_NOT_OK(VisitTypeInline(*out_.type, this)); *out = std::move(out_); return Status::OK(); From d58d063848650713709b86f24abafb1b9af1e9ae Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 28 Feb 2019 16:06:06 -0500 Subject: [PATCH 13/21] use less_than rather than not_equal for loop conditions --- cpp/src/arrow/util/concatenate.cc | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/util/concatenate.cc b/cpp/src/arrow/util/concatenate.cc index ee7cd0c1333..b9f3fcc511a 100644 --- a/cpp/src/arrow/util/concatenate.cc +++ b/cpp/src/arrow/util/concatenate.cc @@ -47,7 +47,7 @@ struct ConcatenateImpl { ConcatenateImpl(const std::vector& in, MemoryPool* pool) : in_(in), pool_(pool) { out_.type = in[0].type; - for (int i = 0; i != in_size(); ++i) { + for (int i = 0; i < in_size(); ++i) { out_.length += in[i].length; if (out_.null_count == kUnknownNullCount || in[i].null_count == kUnknownNullCount) { out_.null_count = kUnknownNullCount; @@ -80,7 +80,7 @@ struct ConcatenateImpl { DCHECK_EQ(fixed.bit_width() % 8, 0); const int byte_width = fixed.bit_width() / 8; std::vector> values_slices(in_size()); - for (int i = 0; i != in_size(); ++i) { + for (int i = 0; i < in_size(); ++i) { auto byte_length = byte_width * in_length(i); auto byte_offset = byte_width * in_offset(i); values_slices[i] = SliceBuffer(in_[i].buffers[1], byte_offset, byte_length); @@ -93,7 +93,7 @@ struct ConcatenateImpl { std::vector value_ranges; RETURN_NOT_OK(ConcatenateOffsets(1, &out_.buffers[1], &value_ranges)); std::vector> values_slices(in_size()); - for (int i = 0; i != in_size(); ++i) { + for (int i = 0; i < in_size(); ++i) { values_slices[i] = SliceBuffer(in_[i].buffers[2], value_ranges[i].offset, value_ranges[i].length); } @@ -105,7 +105,7 @@ struct ConcatenateImpl { std::vector value_ranges; RETURN_NOT_OK(ConcatenateOffsets(1, &out_.buffers[1], &value_ranges)); std::vector values_slices(in_size()); - for (int i = 0; i != in_size(); ++i) { + for (int i = 0; i < in_size(); ++i) { values_slices[i] = SliceData(*in_[i].child_data[0], value_ranges[i].offset, value_ranges[i].length); } @@ -117,8 +117,8 @@ struct ConcatenateImpl { Status Visit(const StructType& s) { std::vector values_slices(in_size()); - for (int field_index = 0; field_index != s.num_children(); ++field_index) { - for (int i = 0; i != in_size(); ++i) { + for (int field_index = 0; field_index < s.num_children(); ++field_index) { + for (int i = 0; i < in_size(); ++i) { values_slices[i] = SliceData(*in_[i].child_data[field_index], in_offset(i), in_length(i)); } @@ -131,7 +131,7 @@ struct ConcatenateImpl { Status Visit(const DictionaryType& d) { std::vector indices_slices(in_size()); - for (int i = 0; i != in_size(); ++i) { + for (int i = 0; i < in_size(); ++i) { indices_slices[i] = ArrayData(in_[i]); indices_slices[i].type = d.index_type(); // don't bother concatenating null bitmaps again @@ -169,7 +169,7 @@ struct ConcatenateImpl { RETURN_NOT_OK(AllocateBitmap(pool_, out_.length, bitmap_buffer)); uint8_t* bitmap_data = (*bitmap_buffer)->mutable_data(); int64_t bitmap_offset = 0; - for (int i = 0; i != in_size(); ++i) { + for (int i = 0; i < in_size(); ++i) { if (auto bitmap = in_[i].buffers[index]) { internal::CopyBitmap(bitmap->data(), in_offset(i), in_length(i), bitmap_data, bitmap_offset, false); @@ -191,7 +191,7 @@ struct ConcatenateImpl { auto dst_offsets = reinterpret_cast((*offset_buffer)->mutable_data()); int32_t total_length = 0; ranges->resize(in_size()); - for (int i = 0; i != in_size(); ++i) { + for (int i = 0; i < in_size(); ++i) { auto src_offsets_begin = in_[i].GetValues(index); auto src_offsets_end = src_offsets_begin + in_length(i); auto first_offset = src_offsets_begin[0]; @@ -225,7 +225,7 @@ Status Concatenate(const std::vector>& arrays, MemoryPool std::shared_ptr* out) { DCHECK_GT(arrays.size(), 0); std::vector data(arrays.size()); - for (std::size_t i = 0; i != arrays.size(); ++i) { + for (std::size_t i = 0; i < arrays.size(); ++i) { if (!arrays[i]->type()->Equals(*arrays[0]->type())) { return Status::Invalid("arrays to be concatentated must be identically typed, but ", *arrays[0]->type(), " and ", *arrays[i]->type(), From 6d8b76ffc6a8f47addb469ab97ba50b3a1b231d7 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 1 Mar 2019 14:00:10 -0500 Subject: [PATCH 14/21] Refactor SliceData to ArrayData::Slice --- cpp/src/arrow/array.cc | 36 +++++++++++++++---------------- cpp/src/arrow/array.h | 3 +++ cpp/src/arrow/util/concatenate.cc | 18 +++------------- 3 files changed, 24 insertions(+), 33 deletions(-) diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index b5af31901f4..5cb7bf46300 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -77,6 +77,18 @@ std::shared_ptr ArrayData::Make(const std::shared_ptr& type return std::make_shared(type, length, null_count, offset); } +ArrayData ArrayData::Slice(int64_t off, int64_t len) const { + DCHECK_LE(off, length); + len = std::min(length - off, len); + off += offset; + + auto copy = *this; + copy.length = len; + copy.offset = off; + copy.null_count = null_count != 0 ? kUnknownNullCount : 0; + return copy; +} + int64_t ArrayData::GetNullCount() const { if (ARROW_PREDICT_FALSE(this->null_count == kUnknownNullCount)) { if (this->buffers[0]) { @@ -125,21 +137,8 @@ bool Array::RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx, return ArrayRangeEquals(*this, other, start_idx, end_idx, other_start_idx); } -static inline std::shared_ptr SliceData(const ArrayData& data, int64_t offset, - int64_t length) { - DCHECK_LE(offset, data.length); - length = std::min(data.length - offset, length); - offset += data.offset; - - auto new_data = data.Copy(); - new_data->length = length; - new_data->offset = offset; - new_data->null_count = data.null_count != 0 ? kUnknownNullCount : 0; - return new_data; -} - std::shared_ptr Array::Slice(int64_t offset, int64_t length) const { - return MakeArray(SliceData(*data_, offset, length)); + return MakeArray(std::make_shared(data_->Slice(offset, length))); } std::shared_ptr Array::Slice(int64_t offset) const { @@ -385,7 +384,8 @@ std::shared_ptr StructArray::field(int i) const { if (!boxed_fields_[i]) { std::shared_ptr field_data; if (data_->offset != 0 || data_->child_data[i]->length != data_->length) { - field_data = SliceData(*data_->child_data[i].get(), data_->offset, data_->length); + field_data = std::make_shared( + data_->child_data[i]->Slice(data_->offset, data_->length)); } else { field_data = data_->child_data[i]; } @@ -410,7 +410,7 @@ Status StructArray::Flatten(MemoryPool* pool, ArrayVector* out) const { // Need to adjust for parent offset if (data_->offset != 0 || data_->length != child_data->length) { - child_data = SliceData(*child_data, data_->offset, data_->length); + *child_data = child_data->Slice(data_->offset, data_->length); } std::shared_ptr child_null_bitmap = child_data->buffers[0]; const int64_t child_offset = child_data->offset; @@ -540,13 +540,13 @@ Status UnionArray::MakeSparse(const Array& type_ids, std::shared_ptr UnionArray::child(int i) const { if (!boxed_fields_[i]) { - std::shared_ptr child_data = data_->child_data[i]; + std::shared_ptr child_data = data_->child_data[i]->Copy(); if (mode() == UnionMode::SPARSE) { // Sparse union: need to adjust child if union is sliced // (for dense unions, the need to lookup through the offsets // makes this unnecessary) if (data_->offset != 0 || child_data->length > data_->length) { - child_data = SliceData(*child_data.get(), data_->offset, data_->length); + *child_data = child_data->Slice(data_->offset, data_->length); } } boxed_fields_[i] = MakeArray(child_data); diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index bee133c017e..cce63260191 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -197,6 +197,9 @@ struct ARROW_EXPORT ArrayData { return GetMutableValues(i, offset); } + // Construct a zero-copy slice of the data with the indicated offset and length + ArrayData Slice(int64_t offset, int64_t length) const; + /// \brief Return null count, or compute and set it if it's not known int64_t GetNullCount() const; diff --git a/cpp/src/arrow/util/concatenate.cc b/cpp/src/arrow/util/concatenate.cc index b9f3fcc511a..209d80336eb 100644 --- a/cpp/src/arrow/util/concatenate.cc +++ b/cpp/src/arrow/util/concatenate.cc @@ -31,18 +31,6 @@ namespace arrow { -static inline ArrayData SliceData(const ArrayData& data, int64_t offset, int64_t length) { - DCHECK_LE(offset, data.length); - length = std::min(data.length - offset, length); - offset += data.offset; - - auto copy = data; - copy.length = length; - copy.offset = offset; - copy.null_count = data.null_count != 0 ? kUnknownNullCount : 0; - return copy; -} - struct ConcatenateImpl { ConcatenateImpl(const std::vector& in, MemoryPool* pool) : in_(in), pool_(pool) { @@ -106,8 +94,8 @@ struct ConcatenateImpl { RETURN_NOT_OK(ConcatenateOffsets(1, &out_.buffers[1], &value_ranges)); std::vector values_slices(in_size()); for (int i = 0; i < in_size(); ++i) { - values_slices[i] = SliceData(*in_[i].child_data[0], value_ranges[i].offset, - value_ranges[i].length); + values_slices[i] = + in_[i].child_data[0]->Slice(value_ranges[i].offset, value_ranges[i].length); } out_.child_data[0] = std::make_shared(); RETURN_NOT_OK( @@ -120,7 +108,7 @@ struct ConcatenateImpl { for (int field_index = 0; field_index < s.num_children(); ++field_index) { for (int i = 0; i < in_size(); ++i) { values_slices[i] = - SliceData(*in_[i].child_data[field_index], in_offset(i), in_length(i)); + in_[i].child_data[field_index]->Slice(in_offset(i), in_length(i)); } out_.child_data[field_index] = std::make_shared(); RETURN_NOT_OK(ConcatenateImpl(values_slices, pool_) From c6686a4ab4e2d01a4d24904055eb56ac2be13dfb Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 1 Mar 2019 15:53:12 -0500 Subject: [PATCH 15/21] refactor ConcatenateImpl::Visit to be thinner with helpers --- cpp/src/arrow/array.h | 18 ++--- cpp/src/arrow/util/concatenate.cc | 124 +++++++++++++++++------------- 2 files changed, 76 insertions(+), 66 deletions(-) diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index cce63260191..af872dd5374 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -67,7 +67,7 @@ class Status; /// could cast from int64 to float64 like so: /// /// Int64Array arr = GetMyData(); -/// auto new_data = arr.data()->ShallowCopy(); +/// auto new_data = arr.data()->Copy(); /// new_data->type = arrow::float64(); /// DoubleArray double_arr(new_data); /// @@ -75,7 +75,7 @@ class Status; /// reused. For example, if we had a group of operations all returning doubles, /// say: /// -/// Log(Sqrt(Expr(arr)) +/// Log(Sqrt(Expr(arr))) /// /// Then the low-level implementations of each of these functions could have /// the signatures @@ -146,6 +146,7 @@ struct ARROW_EXPORT ArrayData { buffers(std::move(other.buffers)), child_data(std::move(other.child_data)) {} + // Copy constructor ArrayData(const ArrayData& other) noexcept : type(other.type), length(other.length), @@ -155,15 +156,10 @@ struct ARROW_EXPORT ArrayData { child_data(other.child_data) {} // Move assignment - ArrayData& operator=(ArrayData&& other) { - type = std::move(other.type); - length = other.length; - null_count = other.null_count; - offset = other.offset; - buffers = std::move(other.buffers); - child_data = std::move(other.child_data); - return *this; - } + ArrayData& operator=(ArrayData&& other) = default; + + // Copy assignment + ArrayData& operator=(const ArrayData& other) = default; std::shared_ptr Copy() const { return std::make_shared(*this); } diff --git a/cpp/src/arrow/util/concatenate.cc b/cpp/src/arrow/util/concatenate.cc index 209d80336eb..4f80220c55e 100644 --- a/cpp/src/arrow/util/concatenate.cc +++ b/cpp/src/arrow/util/concatenate.cc @@ -45,95 +45,58 @@ struct ConcatenateImpl { } out_.buffers.resize(in[0].buffers.size()); out_.child_data.resize(in[0].child_data.size()); + for (auto& data : out_.child_data) { + data = std::make_shared(); + } } /// offset, length pair for representing a Range of a buffer or array struct Range { - int32_t offset, length; + int64_t offset, length; Range() : offset(-1), length(0) {} - Range(int32_t o, int32_t l) : offset(o), length(l) {} + Range(int64_t o, int64_t l) : offset(o), length(l) {} }; Status Visit(const NullType&) { return Status::OK(); } - Status Visit(const BooleanType&) { - std::shared_ptr values_buffer; - RETURN_NOT_OK(ConcatenateBitmaps(1, &out_.buffers[1])); - return Status::OK(); - } + Status Visit(const BooleanType&) { return ConcatenateBitmaps(1, &out_.buffers[1]); } // handle numbers, decimal128, fixed_size_binary Status Visit(const FixedWidthType& fixed) { - DCHECK_EQ(fixed.bit_width() % 8, 0); - const int byte_width = fixed.bit_width() / 8; - std::vector> values_slices(in_size()); - for (int i = 0; i < in_size(); ++i) { - auto byte_length = byte_width * in_length(i); - auto byte_offset = byte_width * in_offset(i); - values_slices[i] = SliceBuffer(in_[i].buffers[1], byte_offset, byte_length); - } - RETURN_NOT_OK(arrow::Concatenate(values_slices, pool_, &out_.buffers[1])); - return Status::OK(); + return arrow::Concatenate(Slices(Buffers(1), FixedWidthRanges(fixed)), pool_, + &out_.buffers[1]); } Status Visit(const BinaryType&) { std::vector value_ranges; RETURN_NOT_OK(ConcatenateOffsets(1, &out_.buffers[1], &value_ranges)); - std::vector> values_slices(in_size()); - for (int i = 0; i < in_size(); ++i) { - values_slices[i] = - SliceBuffer(in_[i].buffers[2], value_ranges[i].offset, value_ranges[i].length); - } - RETURN_NOT_OK(arrow::Concatenate(values_slices, pool_, &out_.buffers[2])); - return Status::OK(); + return arrow::Concatenate(Slices(Buffers(2), value_ranges), pool_, &out_.buffers[2]); } Status Visit(const ListType&) { std::vector value_ranges; RETURN_NOT_OK(ConcatenateOffsets(1, &out_.buffers[1], &value_ranges)); - std::vector values_slices(in_size()); - for (int i = 0; i < in_size(); ++i) { - values_slices[i] = - in_[i].child_data[0]->Slice(value_ranges[i].offset, value_ranges[i].length); - } - out_.child_data[0] = std::make_shared(); - RETURN_NOT_OK( - ConcatenateImpl(values_slices, pool_).Concatenate(out_.child_data[0].get())); - return Status::OK(); + return ConcatenateImpl(Slices(ChildData(0), value_ranges), pool_) + .Concatenate(out_.child_data[0].get()); } Status Visit(const StructType& s) { - std::vector values_slices(in_size()); - for (int field_index = 0; field_index < s.num_children(); ++field_index) { - for (int i = 0; i < in_size(); ++i) { - values_slices[i] = - in_[i].child_data[field_index]->Slice(in_offset(i), in_length(i)); - } - out_.child_data[field_index] = std::make_shared(); - RETURN_NOT_OK(ConcatenateImpl(values_slices, pool_) - .Concatenate(out_.child_data[field_index].get())); + auto ranges = Ranges(); + for (int i = 0; i < s.num_children(); ++i) { + RETURN_NOT_OK(ConcatenateImpl(Slices(ChildData(i), ranges), pool_) + .Concatenate(out_.child_data[i].get())); } return Status::OK(); } Status Visit(const DictionaryType& d) { - std::vector indices_slices(in_size()); - for (int i = 0; i < in_size(); ++i) { - indices_slices[i] = ArrayData(in_[i]); - indices_slices[i].type = d.index_type(); - // don't bother concatenating null bitmaps again - indices_slices[i].null_count = 0; - indices_slices[i].buffers[0] = nullptr; - } - ArrayData indices; - RETURN_NOT_OK(ConcatenateImpl(indices_slices, pool_).Concatenate(&indices)); - out_.buffers[1] = indices.buffers[1]; - return Status::OK(); + auto fixed = internal::checked_cast(d.index_type().get()); + return arrow::Concatenate(Slices(Buffers(1), FixedWidthRanges(*fixed)), pool_, + &out_.buffers[1]); } Status Visit(const UnionType& u) { - // type_codes are an index into child_data return Status::NotImplemented("concatenation of ", u); } @@ -153,6 +116,57 @@ struct ConcatenateImpl { return Status::OK(); } + std::vector> Buffers(int index) { + std::vector> buffers(in_size()); + for (int i = 0; i != in_size(); ++i) { + buffers[i] = in_[i].buffers[index]; + } + return buffers; + } + + std::vector ChildData(int index) { + std::vector child_data(in_size()); + for (int i = 0; i != in_size(); ++i) { + child_data[i] = *in_[i].child_data[index]; + } + return child_data; + } + + ArrayData Slice(const ArrayData& d, Range r) { return d.Slice(r.offset, r.length); } + + std::shared_ptr Slice(const std::shared_ptr& b, Range r) { + return SliceBuffer(b, r.offset, r.length); + } + + template + std::vector Slices(const std::vector& slicable, + const std::vector& ranges) { + std::vector values_slices(in_size()); + for (int i = 0; i < in_size(); ++i) { + values_slices[i] = Slice(slicable[i], ranges[i]); + } + return values_slices; + } + + std::vector Ranges() { + std::vector ranges(in_size()); + for (int i = 0; i < in_size(); ++i) { + ranges[i] = Range(in_offset(i), in_length(i)); + } + return ranges; + } + + std::vector FixedWidthRanges(const FixedWidthType& fixed) { + DCHECK_EQ(fixed.bit_width() % 8, 0); + auto byte_width = fixed.bit_width() / 8; + auto ranges = Ranges(); + for (Range& range : ranges) { + range.offset *= byte_width; + range.length *= byte_width; + } + return ranges; + } + Status ConcatenateBitmaps(int index, std::shared_ptr* bitmap_buffer) { RETURN_NOT_OK(AllocateBitmap(pool_, out_.length, bitmap_buffer)); uint8_t* bitmap_data = (*bitmap_buffer)->mutable_data(); From 25a040e6abe303cb8bd49dfc02151c0b6c3f1e72 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Sun, 3 Mar 2019 13:11:55 -0500 Subject: [PATCH 16/21] refactor bitmap concatenation --- cpp/src/arrow/util/concatenate.cc | 117 +++++++++++++++++++----------- 1 file changed, 74 insertions(+), 43 deletions(-) diff --git a/cpp/src/arrow/util/concatenate.cc b/cpp/src/arrow/util/concatenate.cc index 4f80220c55e..7d000401b9f 100644 --- a/cpp/src/arrow/util/concatenate.cc +++ b/cpp/src/arrow/util/concatenate.cc @@ -58,9 +58,23 @@ struct ConcatenateImpl { Range(int64_t o, int64_t l) : offset(o), length(l) {} }; + struct Bitmap { + Bitmap() = default; + Bitmap(const uint8_t* d, Range r) : data(d), range(r) {} + explicit Bitmap(const std::shared_ptr& buffer, Range r = Range()) + : Bitmap(buffer ? buffer->data() : nullptr, r) {} + + const uint8_t* data; + Range range; + + bool AllSet() const { return data == nullptr; } + }; + Status Visit(const NullType&) { return Status::OK(); } - Status Visit(const BooleanType&) { return ConcatenateBitmaps(1, &out_.buffers[1]); } + Status Visit(const BooleanType&) { + return ConcatenateBitmaps(Slices(Bitmaps(1), Ranges()), &out_.buffers[1]); + } // handle numbers, decimal128, fixed_size_binary Status Visit(const FixedWidthType& fixed) { @@ -70,13 +84,13 @@ struct ConcatenateImpl { Status Visit(const BinaryType&) { std::vector value_ranges; - RETURN_NOT_OK(ConcatenateOffsets(1, &out_.buffers[1], &value_ranges)); + RETURN_NOT_OK(ConcatenateOffsets(Buffers(1), &out_.buffers[1], &value_ranges)); return arrow::Concatenate(Slices(Buffers(2), value_ranges), pool_, &out_.buffers[2]); } Status Visit(const ListType&) { std::vector value_ranges; - RETURN_NOT_OK(ConcatenateOffsets(1, &out_.buffers[1], &value_ranges)); + RETURN_NOT_OK(ConcatenateOffsets(Buffers(1), &out_.buffers[1], &value_ranges)); return ConcatenateImpl(Slices(ChildData(0), value_ranges), pool_) .Concatenate(out_.child_data[0].get()); } @@ -106,11 +120,9 @@ struct ConcatenateImpl { } Status Concatenate(ArrayData* out) && { - std::shared_ptr null_bitmap; if (out_.null_count != 0) { - RETURN_NOT_OK(ConcatenateBitmaps(0, &null_bitmap)); + RETURN_NOT_OK(ConcatenateBitmaps(Slices(Bitmaps(0), Ranges()), &out_.buffers[0])); } - out_.buffers[0] = null_bitmap; RETURN_NOT_OK(VisitTypeInline(*out_.type, this)); *out = std::move(out_); return Status::OK(); @@ -124,6 +136,14 @@ struct ConcatenateImpl { return buffers; } + std::vector Bitmaps(int index) { + std::vector bitmaps(in_size()); + for (int i = 0; i != in_size(); ++i) { + bitmaps[i] = Bitmap(in_[i].buffers[index]); + } + return bitmaps; + } + std::vector ChildData(int index) { std::vector child_data(in_size()); for (int i = 0; i != in_size(); ++i) { @@ -138,20 +158,22 @@ struct ConcatenateImpl { return SliceBuffer(b, r.offset, r.length); } + Bitmap Slice(Bitmap b, Range r) { return Bitmap(b.data, r); } + template std::vector Slices(const std::vector& slicable, const std::vector& ranges) { - std::vector values_slices(in_size()); + std::vector slices(in_size()); for (int i = 0; i < in_size(); ++i) { - values_slices[i] = Slice(slicable[i], ranges[i]); + slices[i] = Slice(slicable[i], ranges[i]); } - return values_slices; + return slices; } std::vector Ranges() { std::vector ranges(in_size()); for (int i = 0; i < in_size(); ++i) { - ranges[i] = Range(in_offset(i), in_length(i)); + ranges[i] = in_range(i); } return ranges; } @@ -167,18 +189,20 @@ struct ConcatenateImpl { return ranges; } - Status ConcatenateBitmaps(int index, std::shared_ptr* bitmap_buffer) { - RETURN_NOT_OK(AllocateBitmap(pool_, out_.length, bitmap_buffer)); - uint8_t* bitmap_data = (*bitmap_buffer)->mutable_data(); + Status ConcatenateBitmaps(const std::vector& bitmaps, + std::shared_ptr* out) { + RETURN_NOT_OK(AllocateBitmap(pool_, out_.length, out)); + uint8_t* bitmap_data = (*out)->mutable_data(); int64_t bitmap_offset = 0; for (int i = 0; i < in_size(); ++i) { - if (auto bitmap = in_[i].buffers[index]) { - internal::CopyBitmap(bitmap->data(), in_offset(i), in_length(i), bitmap_data, - bitmap_offset, false); + auto bitmap = bitmaps[i]; + if (bitmap.AllSet()) { + BitUtil::SetBitsTo(bitmap_data, bitmap_offset, bitmap.range.length, true); } else { - BitUtil::SetBitsTo(bitmap_data, bitmap_offset, in_length(i), true); + internal::CopyBitmap(bitmap.data, bitmap.range.offset, bitmap.range.length, + bitmap_data, bitmap_offset, false); } - bitmap_offset += in_length(i); + bitmap_offset += bitmap.range.length; } if (auto preceding_bits = BitUtil::kPrecedingBitmask[out_.length % 8]) { bitmap_data[out_.length / 8] &= preceding_bits; @@ -186,37 +210,44 @@ struct ConcatenateImpl { return Status::OK(); } - Status ConcatenateOffsets(int index, std::shared_ptr* offset_buffer, - std::vector* ranges) { - RETURN_NOT_OK( - AllocateBuffer(pool_, (out_.length + 1) * sizeof(int32_t), offset_buffer)); - auto dst_offsets = reinterpret_cast((*offset_buffer)->mutable_data()); - int32_t total_length = 0; + Status PutOffsets(const std::shared_ptr& src, Range range, + int32_t values_length, int32_t* dst_begin, Range* values_range) { + auto src_begin = reinterpret_cast(src->data()) + range.offset; + auto src_end = src_begin + range.length; + auto first_offset = src_begin[0]; + values_range->offset = first_offset; + values_range->length = *src_end - values_range->offset; + if (values_length > std::numeric_limits::max() - values_range->length) { + return Status::Invalid("offset overflow while concatenating arrays"); + } + std::transform(src_begin, src_end, dst_begin, + [values_length, first_offset](int32_t offset) { + return offset - first_offset + values_length; + }); + return Status::OK(); + } + + Status ConcatenateOffsets(const std::vector>& buffers, + std::shared_ptr* out, std::vector* ranges) { + RETURN_NOT_OK(AllocateBuffer(pool_, (out_.length + 1) * sizeof(int32_t), out)); + auto dst = reinterpret_cast((*out)->mutable_data()); + int64_t elements_length = 0; + int32_t values_length = 0; ranges->resize(in_size()); for (int i = 0; i < in_size(); ++i) { - auto src_offsets_begin = in_[i].GetValues(index); - auto src_offsets_end = src_offsets_begin + in_length(i); - auto first_offset = src_offsets_begin[0]; - auto length = *src_offsets_end - first_offset; - ranges->at(i).offset = first_offset; - ranges->at(i).length = length; - if (total_length > std::numeric_limits::max() - length) { - return Status::Invalid("offset overflow while concatenating arrays"); - } - std::transform(src_offsets_begin, src_offsets_end, dst_offsets, - [total_length, first_offset](int32_t offset) { - return offset - first_offset + total_length; - }); - total_length += length; - dst_offsets += in_length(i); + RETURN_NOT_OK(PutOffsets(buffers[i], in_range(i), values_length, + &dst[elements_length], &ranges->at(i))); + elements_length += in_length(i); + values_length += ranges->at(i).length; } - *dst_offsets = total_length; + dst[elements_length] = values_length; return Status::OK(); } int in_size() const { return static_cast(in_.size()); } int64_t in_offset(int i) const { return in_[i].offset; } int64_t in_length(int i) const { return in_[i].length; } + Range in_range(int i) const { return Range(in_offset(i), in_length(i)); } const std::vector& in_; MemoryPool* pool_; @@ -235,9 +266,9 @@ Status Concatenate(const std::vector>& arrays, MemoryPool } data[i] = ArrayData(*arrays[i]->data()); } - auto out_data = std::make_shared(); - RETURN_NOT_OK(ConcatenateImpl(data, pool).Concatenate(out_data.get())); - *out = MakeArray(std::move(out_data)); + ArrayData out_data; + RETURN_NOT_OK(ConcatenateImpl(data, pool).Concatenate(&out_data)); + *out = MakeArray(std::make_shared(std::move(out_data))); return Status::OK(); } From 7fcd9c75ce51381b3a358ea409419d17bb112566 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 4 Mar 2019 13:25:53 -0500 Subject: [PATCH 17/21] fix implicit conversion warning --- cpp/src/arrow/util/concatenate.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/concatenate.cc b/cpp/src/arrow/util/concatenate.cc index 7d000401b9f..855823b24f3 100644 --- a/cpp/src/arrow/util/concatenate.cc +++ b/cpp/src/arrow/util/concatenate.cc @@ -238,7 +238,7 @@ struct ConcatenateImpl { RETURN_NOT_OK(PutOffsets(buffers[i], in_range(i), values_length, &dst[elements_length], &ranges->at(i))); elements_length += in_length(i); - values_length += ranges->at(i).length; + values_length += static_cast(ranges->at(i).length); } dst[elements_length] = values_length; return Status::OK(); From beb3ad29d33c4085a98e86636367ed788057eef2 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 8 Mar 2019 11:13:30 -0500 Subject: [PATCH 18/21] use ArrayVector, default_random_engine --- cpp/src/arrow/array-test.cc | 8 ++++---- cpp/src/arrow/util/concatenate.cc | 10 +++++----- cpp/src/arrow/util/concatenate.h | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index ada9df822b4..705989430fd 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -1795,16 +1795,16 @@ class ConcatenateTest : public ::testing::Test { std::vector Offsets(int32_t length, int32_t slice_count) { std::vector offsets(static_cast(slice_count + 1)); - std::mt19937_64 gen(seed_); + std::default_random_engine gen(seed_); std::uniform_int_distribution dist(0, length); std::generate(offsets.begin(), offsets.end(), [&] { return dist(gen); }); std::sort(offsets.begin(), offsets.end()); return offsets; } - std::vector> Slices(const std::shared_ptr& array, - const std::vector& offsets) { - std::vector> slices(offsets.size() - 1); + ArrayVector Slices(const std::shared_ptr& array, + const std::vector& offsets) { + ArrayVector slices(offsets.size() - 1); for (size_t i = 0; i != slices.size(); ++i) { slices[i] = array->Slice(offsets[i], offsets[i + 1] - offsets[i]); } diff --git a/cpp/src/arrow/util/concatenate.cc b/cpp/src/arrow/util/concatenate.cc index 855823b24f3..a4ebf210004 100644 --- a/cpp/src/arrow/util/concatenate.cc +++ b/cpp/src/arrow/util/concatenate.cc @@ -128,8 +128,8 @@ struct ConcatenateImpl { return Status::OK(); } - std::vector> Buffers(int index) { - std::vector> buffers(in_size()); + BufferVector Buffers(int index) { + BufferVector buffers(in_size()); for (int i = 0; i != in_size(); ++i) { buffers[i] = in_[i].buffers[index]; } @@ -227,8 +227,8 @@ struct ConcatenateImpl { return Status::OK(); } - Status ConcatenateOffsets(const std::vector>& buffers, - std::shared_ptr* out, std::vector* ranges) { + Status ConcatenateOffsets(const BufferVector& buffers, std::shared_ptr* out, + std::vector* ranges) { RETURN_NOT_OK(AllocateBuffer(pool_, (out_.length + 1) * sizeof(int32_t), out)); auto dst = reinterpret_cast((*out)->mutable_data()); int64_t elements_length = 0; @@ -254,7 +254,7 @@ struct ConcatenateImpl { ArrayData out_; }; -Status Concatenate(const std::vector>& arrays, MemoryPool* pool, +Status Concatenate(const ArrayVector& arrays, MemoryPool* pool, std::shared_ptr* out) { DCHECK_GT(arrays.size(), 0); std::vector data(arrays.size()); diff --git a/cpp/src/arrow/util/concatenate.h b/cpp/src/arrow/util/concatenate.h index 0169892bff1..67738d547f4 100644 --- a/cpp/src/arrow/util/concatenate.h +++ b/cpp/src/arrow/util/concatenate.h @@ -33,7 +33,7 @@ namespace arrow { /// \param[out] out the resulting concatenated array /// \return Status ARROW_EXPORT -Status Concatenate(const std::vector>& arrays, MemoryPool* pool, +Status Concatenate(const ArrayVector& arrays, MemoryPool* pool, std::shared_ptr* out); } // namespace arrow From aeb2626a18e7ffa3d2fadbdd16fd9dae064cbf42 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 8 Mar 2019 11:25:07 -0500 Subject: [PATCH 19/21] move BufferVector alias to buffer.h and use for Concatenate --- cpp/src/arrow/array.h | 2 -- cpp/src/arrow/buffer.h | 4 +++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index af872dd5374..f6815426950 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -40,8 +40,6 @@ namespace arrow { class Array; class ArrayVisitor; -using BufferVector = std::vector>; - // When slicing, we do not know the null count of the sliced range without // doing some computation. To avoid doing this eagerly, we set the null count // to -1 (any negative number will do). When Array::null_count is called the diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index 5c6ae5e13f7..e870ef9f77f 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -198,6 +198,8 @@ class ARROW_EXPORT Buffer { ARROW_DISALLOW_COPY_AND_ASSIGN(Buffer); }; +using BufferVector = std::vector>; + /// \defgroup buffer-slicing-functions Functions for slicing buffers /// /// @{ @@ -410,7 +412,7 @@ Status AllocateEmptyBitmap(int64_t length, std::shared_ptr* out); /// /// \return Status ARROW_EXPORT -Status Concatenate(const std::vector>& buffers, MemoryPool* pool, +Status Concatenate(const BufferVector& buffers, MemoryPool* pool, std::shared_ptr* out); /// @} From ffa58ec4af218843742c11943a99797f66998959 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 20 Mar 2019 13:26:58 -0400 Subject: [PATCH 20/21] Refactor with Antoine's recommendations - renaming - use memcpy not std::copy in ConcatenateBuffers - concatenating zero buffers does not terminate - refactor for clarity - add comments --- cpp/src/arrow/array-test.cc | 12 +- cpp/src/arrow/buffer.cc | 6 +- cpp/src/arrow/buffer.h | 4 +- cpp/src/arrow/util/concatenate.cc | 338 +++++++++++++++++------------- 4 files changed, 202 insertions(+), 158 deletions(-) diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 705989430fd..be7ba37d374 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -1838,9 +1838,6 @@ class ConcatenateTest : public ::testing::Test { auto slices = this->Slices(array, offsets); std::shared_ptr actual; ASSERT_OK(Concatenate(slices, default_memory_pool(), &actual)); - // force computation of null_count for both arrays - ARROW_IGNORE_EXPR(expected->null_count()); - ARROW_IGNORE_EXPR(actual->null_count()); AssertArraysEqual(*expected, *actual); if (actual->data()->buffers[0]) { CheckTrailingBitsAreZeroed(actual->data()->buffers[0], actual->length()); @@ -1861,9 +1858,6 @@ class ConcatenateTest : public ::testing::Test { template class PrimitiveConcatenateTest : public ConcatenateTest { public: - void operator()(int64_t size, double null_probability, std::shared_ptr* out) { - *out = this->GeneratePrimitive(size, null_probability); - } }; using PrimitiveTypes = @@ -1871,7 +1865,11 @@ using PrimitiveTypes = UInt32Type, Int64Type, UInt64Type, FloatType, DoubleType>; TYPED_TEST_CASE(PrimitiveConcatenateTest, PrimitiveTypes); -TYPED_TEST(PrimitiveConcatenateTest, Primitives) { this->Check(*this); } +TYPED_TEST(PrimitiveConcatenateTest, Primitives) { + this->Check([this](int64_t size, double null_probability, std::shared_ptr* out) { + *out = this->template GeneratePrimitive(size, null_probability); + }); +} TEST_F(ConcatenateTest, StringType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { diff --git a/cpp/src/arrow/buffer.cc b/cpp/src/arrow/buffer.cc index e12c4d1822f..9e9bd2e33bc 100644 --- a/cpp/src/arrow/buffer.cc +++ b/cpp/src/arrow/buffer.cc @@ -227,8 +227,8 @@ Status AllocateEmptyBitmap(int64_t length, std::shared_ptr* out) { return AllocateEmptyBitmap(default_memory_pool(), length, out); } -Status Concatenate(const std::vector>& buffers, MemoryPool* pool, - std::shared_ptr* out) { +Status ConcatenateBuffers(const std::vector>& buffers, + MemoryPool* pool, std::shared_ptr* out) { int64_t out_length = 0; for (const auto& buffer : buffers) { out_length += buffer->size(); @@ -236,7 +236,7 @@ Status Concatenate(const std::vector>& buffers, MemoryPo RETURN_NOT_OK(AllocateBuffer(pool, out_length, out)); auto out_data = (*out)->mutable_data(); for (const auto& buffer : buffers) { - std::copy(buffer->data(), buffer->data() + buffer->size(), out_data); + std::memcpy(out_data, buffer->data(), buffer->size()); out_data += buffer->size(); } return Status::OK(); diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index e870ef9f77f..1f1cd4bdbb1 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -412,8 +412,8 @@ Status AllocateEmptyBitmap(int64_t length, std::shared_ptr* out); /// /// \return Status ARROW_EXPORT -Status Concatenate(const BufferVector& buffers, MemoryPool* pool, - std::shared_ptr* out); +Status ConcatenateBuffers(const BufferVector& buffers, MemoryPool* pool, + std::shared_ptr* out); /// @} diff --git a/cpp/src/arrow/util/concatenate.cc b/cpp/src/arrow/util/concatenate.cc index a4ebf210004..b982d27153a 100644 --- a/cpp/src/arrow/util/concatenate.cc +++ b/cpp/src/arrow/util/concatenate.cc @@ -31,11 +31,122 @@ namespace arrow { -struct ConcatenateImpl { +/// offset, length pair for representing a Range of a buffer or array +struct Range { + int64_t offset, length; + + Range() : offset(-1), length(0) {} + Range(int64_t o, int64_t l) : offset(o), length(l) {} +}; + +/// non-owning view into a range of bits +struct Bitmap { + Bitmap() = default; + Bitmap(const uint8_t* d, Range r) : data(d), range(r) {} + explicit Bitmap(const std::shared_ptr& buffer, Range r) + : Bitmap(buffer ? buffer->data() : nullptr, r) {} + + const uint8_t* data; + Range range; + + bool AllSet() const { return data == nullptr; } +}; + +// Allocate a buffer and concatenate bitmaps into it. +static Status ConcatenateBitmaps(const std::vector& bitmaps, MemoryPool* pool, + std::shared_ptr* out) { + int64_t out_length = 0; + for (size_t i = 0; i < bitmaps.size(); ++i) { + out_length += bitmaps[i].range.length; + } + RETURN_NOT_OK(AllocateBitmap(pool, out_length, out)); + uint8_t* dst = (*out)->mutable_data(); + + int64_t bitmap_offset = 0; + for (size_t i = 0; i < bitmaps.size(); ++i) { + auto bitmap = bitmaps[i]; + if (bitmap.AllSet()) { + BitUtil::SetBitsTo(dst, bitmap_offset, bitmap.range.length, true); + } else { + internal::CopyBitmap(bitmap.data, bitmap.range.offset, bitmap.range.length, dst, + bitmap_offset, false); + } + bitmap_offset += bitmap.range.length; + } + + // finally (if applicable) zero out any trailing bits + if (auto preceding_bits = BitUtil::kPrecedingBitmask[out_length % 8]) { + dst[out_length / 8] &= preceding_bits; + } + return Status::OK(); +} + +// Write offsets in src into dst, adjusting them such that first_offset +// will be the first offset written. +template +static Status PutOffsets(const std::shared_ptr& src, Offset first_offset, + Offset* dst, Range* values_range); + +// Concatenate buffers holding offsets into a single buffer of offsets, +// also computing the ranges of values spanned by each buffer of offsets. +template +static Status ConcatenateOffsets(const BufferVector& buffers, MemoryPool* pool, + std::shared_ptr* out, + std::vector* values_ranges) { + values_ranges->resize(buffers.size()); + + // allocate output buffer + int64_t out_length = 0; + for (size_t i = 0; i < buffers.size(); ++i) { + out_length += buffers[i]->size() / sizeof(Offset); + } + RETURN_NOT_OK(AllocateBuffer(pool, (out_length + 1) * sizeof(Offset), out)); + auto dst = reinterpret_cast((*out)->mutable_data()); + + int64_t elements_length = 0; + Offset values_length = 0; + for (size_t i = 0; i < buffers.size(); ++i) { + // the first offset from buffers[i] will be adjusted to values_length + // (the cumulative length of values spanned by offsets in previous buffers) + RETURN_NOT_OK(PutOffsets(buffers[i], values_length, &dst[elements_length], + &values_ranges->at(i))); + elements_length += buffers[i]->size() / sizeof(Offset); + values_length += static_cast(values_ranges->at(i).length); + } + + // the final element in dst is the length of all values spanned by the offsets + dst[out_length] = values_length; + return Status::OK(); +} + +template +static Status PutOffsets(const std::shared_ptr& src, Offset first_offset, + Offset* dst, Range* values_range) { + // Get the range of offsets to transfer from src + auto src_begin = reinterpret_cast(src->data()); + auto src_end = reinterpret_cast(src->data() + src->size()); + + // Compute the range of values which is spanned by this range of offsets + values_range->offset = src_begin[0]; + values_range->length = *src_end - values_range->offset; + if (first_offset > std::numeric_limits::max() - values_range->length) { + return Status::Invalid("offset overflow while concatenating arrays"); + } + + // Write offsets into dst, ensuring that the first offset written is + // first_offset + auto adjustment = first_offset - src_begin[0]; + std::transform(src_begin, src_end, dst, + [adjustment](Offset offset) { return offset + adjustment; }); + return Status::OK(); +} + +class ConcatenateImpl { + public: ConcatenateImpl(const std::vector& in, MemoryPool* pool) : in_(in), pool_(pool) { out_.type = in[0].type; - for (int i = 0; i < in_size(); ++i) { + for (size_t i = 0; i < in_.size(); ++i) { out_.length += in[i].length; if (out_.null_count == kUnknownNullCount || in[i].null_count == kUnknownNullCount) { out_.null_count = kUnknownNullCount; @@ -50,64 +161,52 @@ struct ConcatenateImpl { } } - /// offset, length pair for representing a Range of a buffer or array - struct Range { - int64_t offset, length; - - Range() : offset(-1), length(0) {} - Range(int64_t o, int64_t l) : offset(o), length(l) {} - }; - - struct Bitmap { - Bitmap() = default; - Bitmap(const uint8_t* d, Range r) : data(d), range(r) {} - explicit Bitmap(const std::shared_ptr& buffer, Range r = Range()) - : Bitmap(buffer ? buffer->data() : nullptr, r) {} - - const uint8_t* data; - Range range; - - bool AllSet() const { return data == nullptr; } - }; + Status Concatenate(ArrayData* out) && { + if (out_.null_count != 0) { + RETURN_NOT_OK(ConcatenateBitmaps(Bitmaps(0), pool_, &out_.buffers[0])); + } + RETURN_NOT_OK(VisitTypeInline(*out_.type, this)); + *out = std::move(out_); + return Status::OK(); + } Status Visit(const NullType&) { return Status::OK(); } Status Visit(const BooleanType&) { - return ConcatenateBitmaps(Slices(Bitmaps(1), Ranges()), &out_.buffers[1]); + return ConcatenateBitmaps(Bitmaps(1), pool_, &out_.buffers[1]); } - // handle numbers, decimal128, fixed_size_binary Status Visit(const FixedWidthType& fixed) { - return arrow::Concatenate(Slices(Buffers(1), FixedWidthRanges(fixed)), pool_, - &out_.buffers[1]); + // handles numbers, decimal128, fixed_size_binary + return ConcatenateBuffers(Buffers(1, fixed), pool_, &out_.buffers[1]); } Status Visit(const BinaryType&) { std::vector value_ranges; - RETURN_NOT_OK(ConcatenateOffsets(Buffers(1), &out_.buffers[1], &value_ranges)); - return arrow::Concatenate(Slices(Buffers(2), value_ranges), pool_, &out_.buffers[2]); + RETURN_NOT_OK(ConcatenateOffsets(Buffers(1, *offset_type), pool_, + &out_.buffers[1], &value_ranges)); + return ConcatenateBuffers(Buffers(2, value_ranges), pool_, &out_.buffers[2]); } Status Visit(const ListType&) { std::vector value_ranges; - RETURN_NOT_OK(ConcatenateOffsets(Buffers(1), &out_.buffers[1], &value_ranges)); - return ConcatenateImpl(Slices(ChildData(0), value_ranges), pool_) + RETURN_NOT_OK(ConcatenateOffsets(Buffers(1, *offset_type), pool_, + &out_.buffers[1], &value_ranges)); + return ConcatenateImpl(ChildData(0, value_ranges), pool_) .Concatenate(out_.child_data[0].get()); } Status Visit(const StructType& s) { - auto ranges = Ranges(); for (int i = 0; i < s.num_children(); ++i) { - RETURN_NOT_OK(ConcatenateImpl(Slices(ChildData(i), ranges), pool_) - .Concatenate(out_.child_data[i].get())); + RETURN_NOT_OK( + ConcatenateImpl(ChildData(i), pool_).Concatenate(out_.child_data[i].get())); } return Status::OK(); } Status Visit(const DictionaryType& d) { auto fixed = internal::checked_cast(d.index_type().get()); - return arrow::Concatenate(Slices(Buffers(1), FixedWidthRanges(*fixed)), pool_, - &out_.buffers[1]); + return ConcatenateBuffers(Buffers(1, *fixed), pool_, &out_.buffers[1]); } Status Visit(const UnionType& u) { @@ -119,153 +218,100 @@ struct ConcatenateImpl { return Status::NotImplemented("concatenation of ", e); } - Status Concatenate(ArrayData* out) && { - if (out_.null_count != 0) { - RETURN_NOT_OK(ConcatenateBitmaps(Slices(Bitmaps(0), Ranges()), &out_.buffers[0])); - } - RETURN_NOT_OK(VisitTypeInline(*out_.type, this)); - *out = std::move(out_); - return Status::OK(); - } - - BufferVector Buffers(int index) { - BufferVector buffers(in_size()); - for (int i = 0; i != in_size(); ++i) { - buffers[i] = in_[i].buffers[index]; + private: + // Gather the index-th buffer of each input into a vector. + // Bytes are sliced with that input's offset and length. + BufferVector Buffers(size_t index) { + BufferVector buffers(in_.size()); + for (size_t i = 0; i < in_.size(); ++i) { + buffers[i] = SliceBuffer(in_[i].buffers[index], in_[i].offset, in_[i].length); } return buffers; } - std::vector Bitmaps(int index) { - std::vector bitmaps(in_size()); - for (int i = 0; i != in_size(); ++i) { - bitmaps[i] = Bitmap(in_[i].buffers[index]); - } - return bitmaps; - } - - std::vector ChildData(int index) { - std::vector child_data(in_size()); - for (int i = 0; i != in_size(); ++i) { - child_data[i] = *in_[i].child_data[index]; - } - return child_data; - } - - ArrayData Slice(const ArrayData& d, Range r) { return d.Slice(r.offset, r.length); } - - std::shared_ptr Slice(const std::shared_ptr& b, Range r) { - return SliceBuffer(b, r.offset, r.length); - } - - Bitmap Slice(Bitmap b, Range r) { return Bitmap(b.data, r); } - - template - std::vector Slices(const std::vector& slicable, - const std::vector& ranges) { - std::vector slices(in_size()); - for (int i = 0; i < in_size(); ++i) { - slices[i] = Slice(slicable[i], ranges[i]); + // Gather the index-th buffer of each input into a vector. + // Bytes are sliced with the explicitly passed ranges. + BufferVector Buffers(size_t index, const std::vector& ranges) { + DCHECK_EQ(in_.size(), ranges.size()); + BufferVector buffers(in_.size()); + for (size_t i = 0; i < in_.size(); ++i) { + buffers[i] = SliceBuffer(in_[i].buffers[index], ranges[i].offset, ranges[i].length); } - return slices; - } - - std::vector Ranges() { - std::vector ranges(in_size()); - for (int i = 0; i < in_size(); ++i) { - ranges[i] = in_range(i); - } - return ranges; + return buffers; } - std::vector FixedWidthRanges(const FixedWidthType& fixed) { + // Gather the index-th buffer of each input into a vector. + // Buffers are assumed to contain elements of fixed.bit_width(), + // those elements are sliced with that input's offset and length. + BufferVector Buffers(size_t index, const FixedWidthType& fixed) { DCHECK_EQ(fixed.bit_width() % 8, 0); auto byte_width = fixed.bit_width() / 8; - auto ranges = Ranges(); - for (Range& range : ranges) { - range.offset *= byte_width; - range.length *= byte_width; + BufferVector buffers(in_.size()); + for (size_t i = 0; i < in_.size(); ++i) { + buffers[i] = SliceBuffer(in_[i].buffers[index], in_[i].offset * byte_width, + in_[i].length * byte_width); } - return ranges; + return buffers; } - Status ConcatenateBitmaps(const std::vector& bitmaps, - std::shared_ptr* out) { - RETURN_NOT_OK(AllocateBitmap(pool_, out_.length, out)); - uint8_t* bitmap_data = (*out)->mutable_data(); - int64_t bitmap_offset = 0; - for (int i = 0; i < in_size(); ++i) { - auto bitmap = bitmaps[i]; - if (bitmap.AllSet()) { - BitUtil::SetBitsTo(bitmap_data, bitmap_offset, bitmap.range.length, true); - } else { - internal::CopyBitmap(bitmap.data, bitmap.range.offset, bitmap.range.length, - bitmap_data, bitmap_offset, false); - } - bitmap_offset += bitmap.range.length; - } - if (auto preceding_bits = BitUtil::kPrecedingBitmask[out_.length % 8]) { - bitmap_data[out_.length / 8] &= preceding_bits; + // Gather the index-th buffer of each input as a Bitmap + // into a vector of Bitmaps. + std::vector Bitmaps(size_t index) { + std::vector bitmaps(in_.size()); + for (size_t i = 0; i < in_.size(); ++i) { + Range range(in_[i].offset, in_[i].length); + bitmaps[i] = Bitmap(in_[i].buffers[index], range); } - return Status::OK(); + return bitmaps; } - Status PutOffsets(const std::shared_ptr& src, Range range, - int32_t values_length, int32_t* dst_begin, Range* values_range) { - auto src_begin = reinterpret_cast(src->data()) + range.offset; - auto src_end = src_begin + range.length; - auto first_offset = src_begin[0]; - values_range->offset = first_offset; - values_range->length = *src_end - values_range->offset; - if (values_length > std::numeric_limits::max() - values_range->length) { - return Status::Invalid("offset overflow while concatenating arrays"); + // Gather the index-th child_data of each input into a vector. + // Elements are sliced with that input's offset and length. + std::vector ChildData(size_t index) { + std::vector child_data(in_.size()); + for (size_t i = 0; i < in_.size(); ++i) { + child_data[i] = in_[i].child_data[index]->Slice(in_[i].offset, in_[i].length); } - std::transform(src_begin, src_end, dst_begin, - [values_length, first_offset](int32_t offset) { - return offset - first_offset + values_length; - }); - return Status::OK(); + return child_data; } - Status ConcatenateOffsets(const BufferVector& buffers, std::shared_ptr* out, - std::vector* ranges) { - RETURN_NOT_OK(AllocateBuffer(pool_, (out_.length + 1) * sizeof(int32_t), out)); - auto dst = reinterpret_cast((*out)->mutable_data()); - int64_t elements_length = 0; - int32_t values_length = 0; - ranges->resize(in_size()); - for (int i = 0; i < in_size(); ++i) { - RETURN_NOT_OK(PutOffsets(buffers[i], in_range(i), values_length, - &dst[elements_length], &ranges->at(i))); - elements_length += in_length(i); - values_length += static_cast(ranges->at(i).length); + // Gather the index-th child_data of each input into a vector. + // Elements are sliced with the explicitly passed ranges. + std::vector ChildData(size_t index, const std::vector& ranges) { + DCHECK_EQ(in_.size(), ranges.size()); + std::vector child_data(in_.size()); + for (size_t i = 0; i < in_.size(); ++i) { + child_data[i] = in_[i].child_data[index]->Slice(ranges[i].offset, ranges[i].length); } - dst[elements_length] = values_length; - return Status::OK(); + return child_data; } - int in_size() const { return static_cast(in_.size()); } - int64_t in_offset(int i) const { return in_[i].offset; } - int64_t in_length(int i) const { return in_[i].length; } - Range in_range(int i) const { return Range(in_offset(i), in_length(i)); } - + static const std::shared_ptr offset_type; const std::vector& in_; MemoryPool* pool_; ArrayData out_; }; +const std::shared_ptr ConcatenateImpl::offset_type = + std::static_pointer_cast(int32()); + Status Concatenate(const ArrayVector& arrays, MemoryPool* pool, std::shared_ptr* out) { - DCHECK_GT(arrays.size(), 0); + if (arrays.size() == 0) { + return Status::Invalid("Must pass at least one array"); + } + + // gather ArrayData of input arrays std::vector data(arrays.size()); - for (std::size_t i = 0; i < arrays.size(); ++i) { + for (size_t i = 0; i < arrays.size(); ++i) { if (!arrays[i]->type()->Equals(*arrays[0]->type())) { - return Status::Invalid("arrays to be concatentated must be identically typed, but ", + return Status::Invalid("arrays to be concatenated must be identically typed, but ", *arrays[0]->type(), " and ", *arrays[i]->type(), " were encountered."); } data[i] = ArrayData(*arrays[i]->data()); } + ArrayData out_data; RETURN_NOT_OK(ConcatenateImpl(data, pool).Concatenate(&out_data)); *out = MakeArray(std::make_shared(std::move(out_data))); From c1600bc2ed7645f69303672f2b65565915fdb95d Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 20 Mar 2019 13:41:29 -0400 Subject: [PATCH 21/21] move concatenate test to separate file --- cpp/src/arrow/array-test.cc | 164 -------------------- cpp/src/arrow/util/CMakeLists.txt | 1 + cpp/src/arrow/util/concatenate-test.cc | 206 +++++++++++++++++++++++++ 3 files changed, 207 insertions(+), 164 deletions(-) create mode 100644 cpp/src/arrow/util/concatenate-test.cc diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index be7ba37d374..b9786828065 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -42,7 +42,6 @@ #include "arrow/type.h" #include "arrow/util/bit-util.h" #include "arrow/util/checked_cast.h" -#include "arrow/util/concatenate.h" #include "arrow/util/decimal.h" #include "arrow/util/lazy.h" @@ -1786,167 +1785,4 @@ TEST(TestRechunkArraysConsistently, Plain) { } } -class ConcatenateTest : public ::testing::Test { - protected: - ConcatenateTest() - : rng_(seed_), - sizes_({0, 1, 2, 4, 16, 31, 1234}), - null_probabilities_({0.0, 0.1, 0.5, 0.9, 1.0}) {} - - std::vector Offsets(int32_t length, int32_t slice_count) { - std::vector offsets(static_cast(slice_count + 1)); - std::default_random_engine gen(seed_); - std::uniform_int_distribution dist(0, length); - std::generate(offsets.begin(), offsets.end(), [&] { return dist(gen); }); - std::sort(offsets.begin(), offsets.end()); - return offsets; - } - - ArrayVector Slices(const std::shared_ptr& array, - const std::vector& offsets) { - ArrayVector slices(offsets.size() - 1); - for (size_t i = 0; i != slices.size(); ++i) { - slices[i] = array->Slice(offsets[i], offsets[i + 1] - offsets[i]); - } - return slices; - } - - template - std::shared_ptr GeneratePrimitive(int64_t size, double null_probability) { - if (std::is_same::value) { - return rng_.Boolean(size, 0.5, null_probability); - } - return rng_.Numeric(size, 0, 127, null_probability); - } - - void CheckTrailingBitsAreZeroed(const std::shared_ptr& bitmap, int64_t length) { - if (auto preceding_bits = BitUtil::kPrecedingBitmask[length % 8]) { - auto last_byte = bitmap->data()[length / 8]; - ASSERT_EQ(static_cast(last_byte & preceding_bits), last_byte) - << length << " " << int(preceding_bits); - } - } - - template - void Check(ArrayFactory&& factory) { - for (auto size : this->sizes_) { - auto offsets = this->Offsets(size, 3); - for (auto null_probability : this->null_probabilities_) { - std::shared_ptr array; - factory(size, null_probability, &array); - auto expected = array->Slice(offsets.front(), offsets.back() - offsets.front()); - auto slices = this->Slices(array, offsets); - std::shared_ptr actual; - ASSERT_OK(Concatenate(slices, default_memory_pool(), &actual)); - AssertArraysEqual(*expected, *actual); - if (actual->data()->buffers[0]) { - CheckTrailingBitsAreZeroed(actual->data()->buffers[0], actual->length()); - } - if (actual->type_id() == Type::BOOL) { - CheckTrailingBitsAreZeroed(actual->data()->buffers[1], actual->length()); - } - } - } - } - - random::SeedType seed_ = 0xdeadbeef; - random::RandomArrayGenerator rng_; - std::vector sizes_; - std::vector null_probabilities_; -}; - -template -class PrimitiveConcatenateTest : public ConcatenateTest { - public: -}; - -using PrimitiveTypes = - ::testing::Types; -TYPED_TEST_CASE(PrimitiveConcatenateTest, PrimitiveTypes); - -TYPED_TEST(PrimitiveConcatenateTest, Primitives) { - this->Check([this](int64_t size, double null_probability, std::shared_ptr* out) { - *out = this->template GeneratePrimitive(size, null_probability); - }); -} - -TEST_F(ConcatenateTest, StringType) { - Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - auto values_size = size * 4; - auto char_array = this->GeneratePrimitive(values_size, null_probability); - std::shared_ptr offsets; - auto offsets_vector = this->Offsets(values_size, size); - // ensure the first offset is 0, which is expected for StringType - offsets_vector[0] = 0; - ASSERT_OK(CopyBufferFromVector(offsets_vector, default_memory_pool(), &offsets)); - *out = MakeArray(ArrayData::Make( - utf8(), size, - {char_array->data()->buffers[0], offsets, char_array->data()->buffers[1]})); - }); -} - -TEST_F(ConcatenateTest, ListType) { - Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - auto values_size = size * 4; - auto values = this->GeneratePrimitive(values_size, null_probability); - auto offsets_vector = this->Offsets(values_size, size); - // ensure the first offset is 0, which is expected for ListType - offsets_vector[0] = 0; - std::shared_ptr offsets; - ArrayFromVector(offsets_vector, &offsets); - ASSERT_OK(ListArray::FromArrays(*offsets, *values, default_memory_pool(), out)); - }); -} - -TEST_F(ConcatenateTest, StructType) { - Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - auto foo = this->GeneratePrimitive(size, null_probability); - auto bar = this->GeneratePrimitive(size, null_probability); - auto baz = this->GeneratePrimitive(size, null_probability); - *out = std::make_shared( - struct_({field("foo", int8()), field("bar", float64()), field("baz", boolean())}), - size, ArrayVector{foo, bar, baz}); - }); -} - -TEST_F(ConcatenateTest, DictionaryType) { - Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - auto indices = this->GeneratePrimitive(size, null_probability); - auto type = dictionary(int32(), this->GeneratePrimitive(128, 0)); - *out = std::make_shared(type, indices); - }); -} - -TEST_F(ConcatenateTest, DISABLED_UnionType) { - // sparse mode - Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - auto foo = this->GeneratePrimitive(size, null_probability); - auto bar = this->GeneratePrimitive(size, null_probability); - auto baz = this->GeneratePrimitive(size, null_probability); - auto type_ids = rng_.Numeric(size, 0, 2, null_probability); - ASSERT_OK(UnionArray::MakeSparse(*type_ids, {foo, bar, baz}, out)); - }); - // dense mode - Check([this](int32_t size, double null_probability, std::shared_ptr* out) { - auto foo = this->GeneratePrimitive(size, null_probability); - auto bar = this->GeneratePrimitive(size, null_probability); - auto baz = this->GeneratePrimitive(size, null_probability); - auto type_ids = rng_.Numeric(size, 0, 2, null_probability); - auto value_offsets = rng_.Numeric(size, 0, size, 0); - ASSERT_OK(UnionArray::MakeDense(*type_ids, *value_offsets, {foo, bar, baz}, out)); - }); -} - -TEST_F(ConcatenateTest, OffsetOverflow) { - auto fake_long = ArrayFromJSON(utf8(), "[\"\"]"); - fake_long->data()->GetMutableValues(1)[1] = - std::numeric_limits::max(); - std::shared_ptr concatenated; - // XX since the data fake_long claims to own isn't there, this will segfault if - // Concatenate doesn't detect overflow and raise an error. - ASSERT_RAISES( - Invalid, Concatenate({fake_long, fake_long}, default_memory_pool(), &concatenated)); -} - } // namespace arrow diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index ba24f88e28b..feef3b555d9 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -29,6 +29,7 @@ arrow_install_all_headers("arrow/util") add_arrow_test(bit-util-test) add_arrow_test(checked-cast-test) add_arrow_test(compression-test) +add_arrow_test(concatenate-test) add_arrow_test(decimal-test) add_arrow_test(hashing-test) add_arrow_test(int-util-test) diff --git a/cpp/src/arrow/util/concatenate-test.cc b/cpp/src/arrow/util/concatenate-test.cc new file mode 100644 index 00000000000..8d9e9d6d62d --- /dev/null +++ b/cpp/src/arrow/util/concatenate-test.cc @@ -0,0 +1,206 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/status.h" +#include "arrow/testing/gtest_common.h" +#include "arrow/testing/random.h" +#include "arrow/testing/util.h" +#include "arrow/type.h" +#include "arrow/util/concatenate.h" + +namespace arrow { + +class ConcatenateTest : public ::testing::Test { + protected: + ConcatenateTest() + : rng_(seed_), + sizes_({0, 1, 2, 4, 16, 31, 1234}), + null_probabilities_({0.0, 0.1, 0.5, 0.9, 1.0}) {} + + std::vector Offsets(int32_t length, int32_t slice_count) { + std::vector offsets(static_cast(slice_count + 1)); + std::default_random_engine gen(seed_); + std::uniform_int_distribution dist(0, length); + std::generate(offsets.begin(), offsets.end(), [&] { return dist(gen); }); + std::sort(offsets.begin(), offsets.end()); + return offsets; + } + + ArrayVector Slices(const std::shared_ptr& array, + const std::vector& offsets) { + ArrayVector slices(offsets.size() - 1); + for (size_t i = 0; i != slices.size(); ++i) { + slices[i] = array->Slice(offsets[i], offsets[i + 1] - offsets[i]); + } + return slices; + } + + template + std::shared_ptr GeneratePrimitive(int64_t size, double null_probability) { + if (std::is_same::value) { + return rng_.Boolean(size, 0.5, null_probability); + } + return rng_.Numeric(size, 0, 127, null_probability); + } + + void CheckTrailingBitsAreZeroed(const std::shared_ptr& bitmap, int64_t length) { + if (auto preceding_bits = BitUtil::kPrecedingBitmask[length % 8]) { + auto last_byte = bitmap->data()[length / 8]; + ASSERT_EQ(static_cast(last_byte & preceding_bits), last_byte) + << length << " " << int(preceding_bits); + } + } + + template + void Check(ArrayFactory&& factory) { + for (auto size : this->sizes_) { + auto offsets = this->Offsets(size, 3); + for (auto null_probability : this->null_probabilities_) { + std::shared_ptr array; + factory(size, null_probability, &array); + auto expected = array->Slice(offsets.front(), offsets.back() - offsets.front()); + auto slices = this->Slices(array, offsets); + std::shared_ptr actual; + ASSERT_OK(Concatenate(slices, default_memory_pool(), &actual)); + AssertArraysEqual(*expected, *actual); + if (actual->data()->buffers[0]) { + CheckTrailingBitsAreZeroed(actual->data()->buffers[0], actual->length()); + } + if (actual->type_id() == Type::BOOL) { + CheckTrailingBitsAreZeroed(actual->data()->buffers[1], actual->length()); + } + } + } + } + + random::SeedType seed_ = 0xdeadbeef; + random::RandomArrayGenerator rng_; + std::vector sizes_; + std::vector null_probabilities_; +}; + +template +class PrimitiveConcatenateTest : public ConcatenateTest { + public: +}; + +using PrimitiveTypes = + ::testing::Types; +TYPED_TEST_CASE(PrimitiveConcatenateTest, PrimitiveTypes); + +TYPED_TEST(PrimitiveConcatenateTest, Primitives) { + this->Check([this](int64_t size, double null_probability, std::shared_ptr* out) { + *out = this->template GeneratePrimitive(size, null_probability); + }); +} + +TEST_F(ConcatenateTest, StringType) { + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + auto values_size = size * 4; + auto char_array = this->GeneratePrimitive(values_size, null_probability); + std::shared_ptr offsets; + auto offsets_vector = this->Offsets(values_size, size); + // ensure the first offset is 0, which is expected for StringType + offsets_vector[0] = 0; + ASSERT_OK(CopyBufferFromVector(offsets_vector, default_memory_pool(), &offsets)); + *out = MakeArray(ArrayData::Make( + utf8(), size, + {char_array->data()->buffers[0], offsets, char_array->data()->buffers[1]})); + }); +} + +TEST_F(ConcatenateTest, ListType) { + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + auto values_size = size * 4; + auto values = this->GeneratePrimitive(values_size, null_probability); + auto offsets_vector = this->Offsets(values_size, size); + // ensure the first offset is 0, which is expected for ListType + offsets_vector[0] = 0; + std::shared_ptr offsets; + ArrayFromVector(offsets_vector, &offsets); + ASSERT_OK(ListArray::FromArrays(*offsets, *values, default_memory_pool(), out)); + }); +} + +TEST_F(ConcatenateTest, StructType) { + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + auto foo = this->GeneratePrimitive(size, null_probability); + auto bar = this->GeneratePrimitive(size, null_probability); + auto baz = this->GeneratePrimitive(size, null_probability); + *out = std::make_shared( + struct_({field("foo", int8()), field("bar", float64()), field("baz", boolean())}), + size, ArrayVector{foo, bar, baz}); + }); +} + +TEST_F(ConcatenateTest, DictionaryType) { + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + auto indices = this->GeneratePrimitive(size, null_probability); + auto type = dictionary(int32(), this->GeneratePrimitive(128, 0)); + *out = std::make_shared(type, indices); + }); +} + +TEST_F(ConcatenateTest, DISABLED_UnionType) { + // sparse mode + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + auto foo = this->GeneratePrimitive(size, null_probability); + auto bar = this->GeneratePrimitive(size, null_probability); + auto baz = this->GeneratePrimitive(size, null_probability); + auto type_ids = rng_.Numeric(size, 0, 2, null_probability); + ASSERT_OK(UnionArray::MakeSparse(*type_ids, {foo, bar, baz}, out)); + }); + // dense mode + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + auto foo = this->GeneratePrimitive(size, null_probability); + auto bar = this->GeneratePrimitive(size, null_probability); + auto baz = this->GeneratePrimitive(size, null_probability); + auto type_ids = rng_.Numeric(size, 0, 2, null_probability); + auto value_offsets = rng_.Numeric(size, 0, size, 0); + ASSERT_OK(UnionArray::MakeDense(*type_ids, *value_offsets, {foo, bar, baz}, out)); + }); +} + +TEST_F(ConcatenateTest, OffsetOverflow) { + auto fake_long = ArrayFromJSON(utf8(), "[\"\"]"); + fake_long->data()->GetMutableValues(1)[1] = + std::numeric_limits::max(); + std::shared_ptr concatenated; + // XX since the data fake_long claims to own isn't there, this will segfault if + // Concatenate doesn't detect overflow and raise an error. + ASSERT_RAISES( + Invalid, Concatenate({fake_long, fake_long}, default_memory_pool(), &concatenated)); +} + +} // namespace arrow