From 716322377d5da924a506cbe0b6c641242e1b7ae2 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 11 Dec 2018 17:45:03 -0600 Subject: [PATCH 01/12] Split up builder.h, builder.cc into smaller headers, compilation units. add failing test case for ARROW-3762. Add ChunkedBinaryBuilder, make BinaryBuilder Append methods inline --- cpp/src/arrow/CMakeLists.txt | 12 +- cpp/src/arrow/allocator-test.cc | 1 + cpp/src/arrow/array-binary-test.cc | 106 +- cpp/src/arrow/array-dict-test.cc | 8 +- cpp/src/arrow/array-list-test.cc | 4 +- cpp/src/arrow/array-struct-test.cc | 4 +- cpp/src/arrow/array-test.cc | 2 - cpp/src/arrow/array.cc | 1 + cpp/src/arrow/array.h | 1 - cpp/src/arrow/builder-benchmark.cc | 22 + cpp/src/arrow/builder.cc | 503 +------ cpp/src/arrow/builder.h | 1177 +---------------- .../builder_adaptive.cc} | 4 +- cpp/src/arrow/columnar/builder_adaptive.h | 174 +++ cpp/src/arrow/columnar/builder_base.cc | 176 +++ cpp/src/arrow/columnar/builder_base.h | 228 ++++ .../builder_binary.cc} | 68 +- cpp/src/arrow/columnar/builder_binary.h | 298 +++++ cpp/src/arrow/columnar/builder_decimal.cc | 64 + cpp/src/arrow/columnar/builder_decimal.h | 45 + .../builder_dict.cc} | 4 +- cpp/src/arrow/columnar/builder_dict.h | 168 +++ cpp/src/arrow/columnar/builder_nested.cc | 156 +++ cpp/src/arrow/columnar/builder_nested.h | 125 ++ cpp/src/arrow/columnar/builder_primitive.cc | 272 ++++ cpp/src/arrow/columnar/builder_primitive.h | 403 ++++++ cpp/src/arrow/csv/column-builder.h | 21 +- cpp/src/arrow/csv/converter.cc | 1 + cpp/src/arrow/csv/parser.h | 1 + cpp/src/arrow/csv/reader.cc | 2 + cpp/src/arrow/io/buffered.cc | 2 +- cpp/src/arrow/io/buffered.h | 1 + cpp/src/arrow/ipc/feather-test.cc | 1 + cpp/src/arrow/ipc/json-simple-test.cc | 1 + cpp/src/arrow/memory_pool-test.h | 1 + cpp/src/arrow/memory_pool.cc | 12 +- cpp/src/arrow/pretty_print-test.cc | 2 - cpp/src/arrow/pretty_print.cc | 1 - cpp/src/arrow/pretty_print.h | 5 +- cpp/src/arrow/python/numpy_to_arrow.cc | 30 +- cpp/src/arrow/python/python-test.cc | 1 + cpp/src/arrow/record_batch.h | 1 + cpp/src/arrow/tensor.cc | 1 + cpp/src/arrow/test-util.cc | 13 +- cpp/src/arrow/test-util.h | 18 +- cpp/src/arrow/util/compression_lz4.cc | 1 + cpp/src/arrow/util/int-util-test.cc | 2 - cpp/src/arrow/util/string_view.h | 2 +- cpp/src/parquet/arrow/reader.cc | 1 + cpp/src/parquet/arrow/record_reader.cc | 85 +- python/pyarrow/tests/test_parquet.py | 25 + 51 files changed, 2456 insertions(+), 1801 deletions(-) rename cpp/src/arrow/{builder-adaptive.cc => columnar/builder_adaptive.cc} (99%) create mode 100644 cpp/src/arrow/columnar/builder_adaptive.h create mode 100644 cpp/src/arrow/columnar/builder_base.cc create mode 100644 cpp/src/arrow/columnar/builder_base.h rename cpp/src/arrow/{builder-binary.cc => columnar/builder_binary.cc} (89%) create mode 100644 cpp/src/arrow/columnar/builder_binary.h create mode 100644 cpp/src/arrow/columnar/builder_decimal.cc create mode 100644 cpp/src/arrow/columnar/builder_decimal.h rename cpp/src/arrow/{builder-dict.cc => columnar/builder_dict.cc} (99%) create mode 100644 cpp/src/arrow/columnar/builder_dict.h create mode 100644 cpp/src/arrow/columnar/builder_nested.cc create mode 100644 cpp/src/arrow/columnar/builder_nested.h create mode 100644 cpp/src/arrow/columnar/builder_primitive.cc create mode 100644 cpp/src/arrow/columnar/builder_primitive.h diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 2d043a9a276..e381d706c68 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -55,9 +55,6 @@ set(ARROW_SRCS array.cc buffer.cc builder.cc - builder-adaptive.cc - builder-binary.cc - builder-dict.cc compare.cc memory_pool.cc pretty_print.cc @@ -69,6 +66,14 @@ set(ARROW_SRCS type.cc visitor.cc + columnar/builder_adaptive.cc + columnar/builder_base.cc + columnar/builder_binary.cc + columnar/builder_decimal.cc + columnar/builder_dict.cc + columnar/builder_nested.cc + columnar/builder_primitive.cc + csv/converter.cc csv/chunker.cc csv/column-builder.cc @@ -297,6 +302,7 @@ ADD_ARROW_TEST(tensor-test) ADD_ARROW_BENCHMARK(builder-benchmark) ADD_ARROW_BENCHMARK(column-benchmark) +add_subdirectory(columnar) add_subdirectory(csv) add_subdirectory(io) add_subdirectory(util) diff --git a/cpp/src/arrow/allocator-test.cc b/cpp/src/arrow/allocator-test.cc index cdffbd7e849..1a94467281d 100644 --- a/cpp/src/arrow/allocator-test.cc +++ b/cpp/src/arrow/allocator-test.cc @@ -17,6 +17,7 @@ #include #include +#include #include #include diff --git a/cpp/src/arrow/array-binary-test.cc b/cpp/src/arrow/array-binary-test.cc index 4376695c68c..5bfdc993161 100644 --- a/cpp/src/arrow/array-binary-test.cc +++ b/cpp/src/arrow/array-binary-test.cc @@ -15,10 +15,8 @@ // specific language governing permissions and limitations // under the License. -#include #include #include -#include #include #include #include @@ -28,10 +26,14 @@ #include "arrow/array.h" #include "arrow/buffer.h" #include "arrow/builder.h" +#include "arrow/memory_pool.h" #include "arrow/status.h" #include "arrow/test-common.h" #include "arrow/test-util.h" #include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/checked_cast.h" namespace arrow { @@ -676,4 +678,104 @@ TEST_F(TestStringArray, TestSliceEquality) { CheckSliceEquality(); } TEST_F(TestBinaryArray, LengthZeroCtor) { BinaryArray array(0, nullptr, nullptr); } +// ---------------------------------------------------------------------- +// ChunkedBinaryBuilder tests + +class TestChunkedBinaryBuilder : public ::testing::Test { + public: + void SetUp() {} + + void Init(int32_t chunksize) { + builder_.reset(new internal::ChunkedBinaryBuilder(chunksize)); + } + + protected: + std::unique_ptr builder_; +}; + +TEST_F(TestChunkedBinaryBuilder, BasicOperation) { + const int32_t chunksize = 1000; + Init(chunksize); + + const int elem_size = 10; + uint8_t buf[elem_size]; + + BinaryBuilder unchunked_builder; + + const int iterations = 1000; + for (int i = 0; i < iterations; ++i) { + random_bytes(elem_size, i, buf); + + ASSERT_OK(unchunked_builder.Append(buf, elem_size)); + ASSERT_OK(builder_->Append(buf, elem_size)); + } + + std::shared_ptr unchunked; + ASSERT_OK(unchunked_builder.Finish(&unchunked)); + + ArrayVector chunks; + ASSERT_OK(builder_->Finish(&chunks)); + + // This assumes that everything is evenly divisible + ArrayVector expected_chunks; + const int elems_per_chunk = chunksize / elem_size; + for (int i = 0; i < iterations / elems_per_chunk; ++i) { + expected_chunks.emplace_back(unchunked->Slice(i * elems_per_chunk, elems_per_chunk)); + } + + ASSERT_EQ(expected_chunks.size(), chunks.size()); + for (size_t i = 0; i < chunks.size(); ++i) { + AssertArraysEqual(*expected_chunks[i], *chunks[i]); + } +} + +TEST_F(TestChunkedBinaryBuilder, NoData) { + Init(1000); + + ArrayVector chunks; + ASSERT_OK(builder_->Finish(&chunks)); + + ASSERT_EQ(1, chunks.size()); + ASSERT_EQ(0, chunks[0]->length()); +} + +TEST_F(TestChunkedBinaryBuilder, LargeElements) { + Init(100); + + const int bufsize = 101; + uint8_t buf[bufsize]; + + const int iterations = 100; + for (int i = 0; i < iterations; ++i) { + random_bytes(bufsize, i, buf); + ASSERT_OK(builder_->Append(buf, bufsize)); + } + + ArrayVector chunks; + ASSERT_OK(builder_->Finish(&chunks)); + ASSERT_EQ(iterations, static_cast(chunks.size())); +} + +TEST(TestChunkedStringBuilder, BasicOperation) { + const int chunksize = 100; + internal::ChunkedStringBuilder builder(chunksize); + + std::string value = "0123456789"; + + const int iterations = 100; + for (int i = 0; i < iterations; ++i) { + ASSERT_OK(builder.Append(value)); + } + + ArrayVector chunks; + ASSERT_OK(builder.Finish(&chunks)); + + ASSERT_EQ(10, chunks.size()); + + // Type is correct + for (auto chunk : chunks) { + ASSERT_TRUE(chunk->type()->Equals(*::arrow::utf8())); + } +} + } // namespace arrow diff --git a/cpp/src/arrow/array-dict-test.cc b/cpp/src/arrow/array-dict-test.cc index cc471a3e540..87cb2290a7b 100644 --- a/cpp/src/arrow/array-dict-test.cc +++ b/cpp/src/arrow/array-dict-test.cc @@ -15,23 +15,23 @@ // specific language governing permissions and limitations // under the License. -#include +#include #include -#include -#include #include +#include #include #include #include #include "arrow/array.h" -#include "arrow/buffer.h" #include "arrow/builder.h" +#include "arrow/memory_pool.h" #include "arrow/status.h" #include "arrow/test-common.h" #include "arrow/test-util.h" #include "arrow/type.h" +#include "arrow/util/decimal.h" namespace arrow { diff --git a/cpp/src/arrow/array-list-test.cc b/cpp/src/arrow/array-list-test.cc index 207acd4cf65..c49c5e30970 100644 --- a/cpp/src/arrow/array-list-test.cc +++ b/cpp/src/arrow/array-list-test.cc @@ -15,10 +15,8 @@ // specific language governing permissions and limitations // under the License. -#include #include #include -#include #include #include #include @@ -32,6 +30,8 @@ #include "arrow/test-common.h" #include "arrow/test-util.h" #include "arrow/type.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/checked_cast.h" namespace arrow { diff --git a/cpp/src/arrow/array-struct-test.cc b/cpp/src/arrow/array-struct-test.cc index dc8bafd4c00..68c35f57116 100644 --- a/cpp/src/arrow/array-struct-test.cc +++ b/cpp/src/arrow/array-struct-test.cc @@ -15,10 +15,8 @@ // specific language governing permissions and limitations // under the License. -#include #include #include -#include #include #include #include @@ -26,12 +24,12 @@ #include #include "arrow/array.h" -#include "arrow/buffer.h" #include "arrow/builder.h" #include "arrow/status.h" #include "arrow/test-common.h" #include "arrow/test-util.h" #include "arrow/type.h" +#include "arrow/util/checked_cast.h" namespace arrow { diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index de0885e6f5f..bdb7eda118d 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -40,7 +39,6 @@ #include "arrow/test-common.h" #include "arrow/test-util.h" #include "arrow/type.h" -#include "arrow/type_traits.h" #include "arrow/util/bit-util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 05d66d5cffd..ff94aa2a1e6 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -18,6 +18,7 @@ #include "arrow/array.h" #include +#include #include #include #include diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 37fa5aedfc2..52c5207d8dd 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -18,7 +18,6 @@ #ifndef ARROW_ARRAY_H #define ARROW_ARRAY_H -#include #include #include #include diff --git a/cpp/src/arrow/builder-benchmark.cc b/cpp/src/arrow/builder-benchmark.cc index f96728dcd4f..2dd56478c0a 100644 --- a/cpp/src/arrow/builder-benchmark.cc +++ b/cpp/src/arrow/builder-benchmark.cc @@ -177,6 +177,25 @@ static void BM_BuildBinaryArray(benchmark::State& state) { // NOLINT non-const state.SetBytesProcessed(state.iterations() * iterations * value.size()); } +static void BM_BuildChunkedBinaryArray( + benchmark::State& state) { // NOLINT non-const reference + const int64_t iterations = 1 << 20; + + std::string value = "1234567890"; + while (state.KeepRunning()) { + // 1MB chunks + const int32_t chunksize = 1 << 20; + internal::ChunkedBinaryBuilder builder(chunksize); + for (int64_t i = 0; i < iterations; i++) { + ABORT_NOT_OK(builder.Append(reinterpret_cast(value.data()), + static_cast(value.size()))); + } + ArrayVector out; + ABORT_NOT_OK(builder.Finish(&out)); + } + state.SetBytesProcessed(state.iterations() * iterations * value.size()); +} + static void BM_BuildFixedSizeBinaryArray( benchmark::State& state) { // NOLINT non-const reference const int64_t iterations = 1 << 20; @@ -372,6 +391,9 @@ BENCHMARK(BM_BuildAdaptiveUIntNoNullsScalarAppend) ->Unit(benchmark::kMicrosecond); BENCHMARK(BM_BuildBinaryArray)->Repetitions(kRepetitions)->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_BuildChunkedBinaryArray) + ->Repetitions(kRepetitions) + ->Unit(benchmark::kMicrosecond); BENCHMARK(BM_BuildFixedSizeBinaryArray) ->Repetitions(kRepetitions) ->Unit(benchmark::kMicrosecond); diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index aef4df05108..ff2b453bb44 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -15,513 +15,20 @@ // specific language governing permissions and limitations // under the License. -#include -#include -#include -#include +#include "arrow/builder.h" + #include +#include #include #include -#include "arrow/array.h" -#include "arrow/buffer.h" -#include "arrow/builder.h" #include "arrow/status.h" #include "arrow/type.h" -#include "arrow/type_traits.h" -#include "arrow/util/bit-util.h" #include "arrow/util/checked_cast.h" -#include "arrow/util/int-util.h" -#include "arrow/util/logging.h" namespace arrow { -using internal::checked_cast; - -Status ArrayBuilder::TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer) { - if (buffer) { - if (bytes_filled < buffer->size()) { - // Trim buffer - RETURN_NOT_OK(buffer->Resize(bytes_filled)); - } - // zero the padding - buffer->ZeroPadding(); - } else { - // Null buffers are allowed in place of 0-byte buffers - DCHECK_EQ(bytes_filled, 0); - } - return Status::OK(); -} - -Status ArrayBuilder::AppendToBitmap(bool is_valid) { - if (length_ == capacity_) { - // If the capacity was not already a multiple of 2, do so here - // TODO(emkornfield) doubling isn't great default allocation practice - // see https://github.com/facebook/folly/blob/master/folly/docs/FBVector.md - // fo discussion - RETURN_NOT_OK(Resize(BitUtil::NextPower2(capacity_ + 1))); - } - UnsafeAppendToBitmap(is_valid); - return Status::OK(); -} - -Status ArrayBuilder::AppendToBitmap(const uint8_t* valid_bytes, int64_t length) { - RETURN_NOT_OK(Reserve(length)); - - UnsafeAppendToBitmap(valid_bytes, length); - return Status::OK(); -} - -Status ArrayBuilder::Resize(int64_t capacity) { - // Target size of validity (null) bitmap data - const int64_t new_bitmap_size = BitUtil::BytesForBits(capacity); - RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); - - if (capacity_ == 0) { - RETURN_NOT_OK(AllocateResizableBuffer(pool_, new_bitmap_size, &null_bitmap_)); - null_bitmap_data_ = null_bitmap_->mutable_data(); - - // Padding is zeroed by AllocateResizableBuffer - memset(null_bitmap_data_, 0, static_cast(new_bitmap_size)); - } else { - const int64_t old_bitmap_capacity = null_bitmap_->capacity(); - RETURN_NOT_OK(null_bitmap_->Resize(new_bitmap_size)); - - const int64_t new_bitmap_capacity = null_bitmap_->capacity(); - null_bitmap_data_ = null_bitmap_->mutable_data(); - - // Zero the region between the original capacity and the new capacity, - // including padding, which has not been zeroed, unlike - // AllocateResizableBuffer - if (old_bitmap_capacity < new_bitmap_capacity) { - memset(null_bitmap_data_ + old_bitmap_capacity, 0, - static_cast(new_bitmap_capacity - old_bitmap_capacity)); - } - } - capacity_ = capacity; - return Status::OK(); -} - -Status ArrayBuilder::Advance(int64_t elements) { - if (length_ + elements > capacity_) { - return Status::Invalid("Builder must be expanded"); - } - length_ += elements; - return Status::OK(); -} - -Status ArrayBuilder::Finish(std::shared_ptr* out) { - std::shared_ptr internal_data; - RETURN_NOT_OK(FinishInternal(&internal_data)); - *out = MakeArray(internal_data); - return Status::OK(); -} - -Status ArrayBuilder::Reserve(int64_t additional_elements) { - if (length_ + additional_elements > capacity_) { - // TODO(emkornfield) power of 2 growth is potentially suboptimal - int64_t new_size = BitUtil::NextPower2(length_ + additional_elements); - return Resize(new_size); - } - return Status::OK(); -} - -void ArrayBuilder::Reset() { - capacity_ = length_ = null_count_ = 0; - null_bitmap_ = nullptr; -} - -Status ArrayBuilder::SetNotNull(int64_t length) { - RETURN_NOT_OK(Reserve(length)); - UnsafeSetNotNull(length); - return Status::OK(); -} - -void ArrayBuilder::UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length) { - if (valid_bytes == nullptr) { - UnsafeSetNotNull(length); - return; - } - UnsafeAppendToBitmap(valid_bytes, valid_bytes + length); -} - -void ArrayBuilder::UnsafeAppendToBitmap(const std::vector& is_valid) { - UnsafeAppendToBitmap(is_valid.begin(), is_valid.end()); -} - -void ArrayBuilder::UnsafeSetNotNull(int64_t length) { - const int64_t new_length = length + length_; - - // Fill up the bytes until we have a byte alignment - int64_t pad_to_byte = std::min(8 - (length_ % 8), length); - - if (pad_to_byte == 8) { - pad_to_byte = 0; - } - for (int64_t i = length_; i < length_ + pad_to_byte; ++i) { - BitUtil::SetBit(null_bitmap_data_, i); - } - - // Fast bitsetting - int64_t fast_length = (length - pad_to_byte) / 8; - memset(null_bitmap_data_ + ((length_ + pad_to_byte) / 8), 0xFF, - static_cast(fast_length)); - - // Trailing bits - for (int64_t i = length_ + pad_to_byte + (fast_length * 8); i < new_length; ++i) { - BitUtil::SetBit(null_bitmap_data_, i); - } - - length_ = new_length; -} - -// ---------------------------------------------------------------------- -// Null builder - -Status NullBuilder::FinishInternal(std::shared_ptr* out) { - *out = ArrayData::Make(null(), length_, {nullptr}, length_); - length_ = null_count_ = 0; - return Status::OK(); -} - -// ---------------------------------------------------------------------- - -template -void PrimitiveBuilder::Reset() { - data_.reset(); - raw_data_ = nullptr; -} - -template -Status PrimitiveBuilder::Resize(int64_t capacity) { - RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); - capacity = std::max(capacity, kMinBuilderCapacity); - - int64_t nbytes = TypeTraits::bytes_required(capacity); - if (capacity_ == 0) { - RETURN_NOT_OK(AllocateResizableBuffer(pool_, nbytes, &data_)); - } else { - RETURN_NOT_OK(data_->Resize(nbytes)); - } - - raw_data_ = reinterpret_cast(data_->mutable_data()); - return ArrayBuilder::Resize(capacity); -} - -template -Status PrimitiveBuilder::AppendValues(const value_type* values, int64_t length, - const uint8_t* valid_bytes) { - RETURN_NOT_OK(Reserve(length)); - - if (length > 0) { - std::memcpy(raw_data_ + length_, values, - static_cast(TypeTraits::bytes_required(length))); - } - - // length_ is update by these - ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length); - return Status::OK(); -} - -template -Status PrimitiveBuilder::AppendValues(const value_type* values, int64_t length, - const std::vector& is_valid) { - RETURN_NOT_OK(Reserve(length)); - DCHECK_EQ(length, static_cast(is_valid.size())); - - if (length > 0) { - std::memcpy(raw_data_ + length_, values, - static_cast(TypeTraits::bytes_required(length))); - } - - // length_ is update by these - ArrayBuilder::UnsafeAppendToBitmap(is_valid); - return Status::OK(); -} - -template -Status PrimitiveBuilder::AppendValues(const std::vector& values, - const std::vector& is_valid) { - return AppendValues(values.data(), static_cast(values.size()), is_valid); -} - -template -Status PrimitiveBuilder::AppendValues(const std::vector& values) { - return AppendValues(values.data(), static_cast(values.size())); -} - -template -Status PrimitiveBuilder::FinishInternal(std::shared_ptr* out) { - RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get())); - RETURN_NOT_OK(TrimBuffer(TypeTraits::bytes_required(length_), data_.get())); - - *out = ArrayData::Make(type_, length_, {null_bitmap_, data_}, null_count_); - - data_ = null_bitmap_ = nullptr; - capacity_ = length_ = null_count_ = 0; - - return Status::OK(); -} - -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; - -BooleanBuilder::BooleanBuilder(MemoryPool* pool) - : ArrayBuilder(boolean(), pool), data_(nullptr), raw_data_(nullptr) {} - -BooleanBuilder::BooleanBuilder(const std::shared_ptr& type, MemoryPool* pool) - : BooleanBuilder(pool) { - DCHECK_EQ(Type::BOOL, type->id()); -} - -void BooleanBuilder::Reset() { - ArrayBuilder::Reset(); - data_.reset(); - raw_data_ = nullptr; -} - -Status BooleanBuilder::Resize(int64_t capacity) { - RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); - capacity = std::max(capacity, kMinBuilderCapacity); - - const int64_t new_bitmap_size = BitUtil::BytesForBits(capacity); - if (capacity_ == 0) { - RETURN_NOT_OK(AllocateResizableBuffer(pool_, new_bitmap_size, &data_)); - raw_data_ = reinterpret_cast(data_->mutable_data()); - - // We zero the memory for booleans to keep things simple; for some reason if - // we do not, even though we may write every bit (through in-place | or &), - // valgrind will still show a warning. If we do not zero the bytes here, we - // will have to be careful to zero them in AppendNull and AppendNulls. Also, - // zeroing the bits results in deterministic bits when each byte may have a - // mix of nulls and not nulls. - // - // We only zero up to new_bitmap_size because the padding was zeroed by - // AllocateResizableBuffer - memset(raw_data_, 0, static_cast(new_bitmap_size)); - } else { - const int64_t old_bitmap_capacity = data_->capacity(); - RETURN_NOT_OK(data_->Resize(new_bitmap_size)); - const int64_t new_bitmap_capacity = data_->capacity(); - raw_data_ = reinterpret_cast(data_->mutable_data()); - - // See comment above about why we zero memory for booleans - memset(raw_data_ + old_bitmap_capacity, 0, - static_cast(new_bitmap_capacity - old_bitmap_capacity)); - } - - return ArrayBuilder::Resize(capacity); -} - -Status BooleanBuilder::FinishInternal(std::shared_ptr* out) { - int64_t bit_offset = length_ % 8; - if (bit_offset > 0) { - // Adjust last byte - data_->mutable_data()[length_ / 8] &= BitUtil::kPrecedingBitmask[bit_offset]; - } - - RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get())); - RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), data_.get())); - - *out = ArrayData::Make(boolean(), length_, {null_bitmap_, data_}, null_count_); - - data_ = null_bitmap_ = nullptr; - capacity_ = length_ = null_count_ = 0; - return Status::OK(); -} - -Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length, - const uint8_t* valid_bytes) { - RETURN_NOT_OK(Reserve(length)); - - int64_t i = 0; - internal::GenerateBitsUnrolled(raw_data_, length_, length, - [values, &i]() -> bool { return values[i++] != 0; }); - - // this updates length_ - ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length); - return Status::OK(); -} - -Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length, - const std::vector& is_valid) { - RETURN_NOT_OK(Reserve(length)); - DCHECK_EQ(length, static_cast(is_valid.size())); - - int64_t i = 0; - internal::GenerateBitsUnrolled(raw_data_, length_, length, - [values, &i]() -> bool { return values[i++]; }); - - // this updates length_ - ArrayBuilder::UnsafeAppendToBitmap(is_valid); - return Status::OK(); -} - -Status BooleanBuilder::AppendValues(const std::vector& values, - const std::vector& is_valid) { - return AppendValues(values.data(), static_cast(values.size()), is_valid); -} - -Status BooleanBuilder::AppendValues(const std::vector& values) { - return AppendValues(values.data(), static_cast(values.size())); -} - -Status BooleanBuilder::AppendValues(const std::vector& values, - const std::vector& is_valid) { - const int64_t length = static_cast(values.size()); - RETURN_NOT_OK(Reserve(length)); - DCHECK_EQ(length, static_cast(is_valid.size())); - - int64_t i = 0; - internal::GenerateBitsUnrolled(raw_data_, length_, length, - [&values, &i]() -> bool { return values[i++]; }); - - // this updates length_ - ArrayBuilder::UnsafeAppendToBitmap(is_valid); - return Status::OK(); -} - -Status BooleanBuilder::AppendValues(const std::vector& values) { - const int64_t length = static_cast(values.size()); - RETURN_NOT_OK(Reserve(length)); - - int64_t i = 0; - internal::GenerateBitsUnrolled(raw_data_, length_, length, - [&values, &i]() -> bool { return values[i++]; }); - - // this updates length_ - ArrayBuilder::UnsafeSetNotNull(length); - return Status::OK(); -} - -// ---------------------------------------------------------------------- -// ListBuilder - -ListBuilder::ListBuilder(MemoryPool* pool, - std::shared_ptr const& value_builder, - const std::shared_ptr& type) - : ArrayBuilder(type ? type - : std::static_pointer_cast( - std::make_shared(value_builder->type())), - pool), - offsets_builder_(pool), - value_builder_(value_builder) {} - -Status ListBuilder::AppendValues(const int32_t* offsets, int64_t length, - const uint8_t* valid_bytes) { - RETURN_NOT_OK(Reserve(length)); - UnsafeAppendToBitmap(valid_bytes, length); - offsets_builder_.UnsafeAppend(offsets, length); - return Status::OK(); -} - -Status ListBuilder::AppendNextOffset() { - int64_t num_values = value_builder_->length(); - if (ARROW_PREDICT_FALSE(num_values > kListMaximumElements)) { - std::stringstream ss; - ss << "ListArray cannot contain more then INT32_MAX - 1 child elements," - << " have " << num_values; - return Status::CapacityError(ss.str()); - } - return offsets_builder_.Append(static_cast(num_values)); -} - -Status ListBuilder::Append(bool is_valid) { - RETURN_NOT_OK(Reserve(1)); - UnsafeAppendToBitmap(is_valid); - return AppendNextOffset(); -} - -Status ListBuilder::Resize(int64_t capacity) { - DCHECK_LE(capacity, kListMaximumElements); - RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); - - // one more then requested for offsets - RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t))); - return ArrayBuilder::Resize(capacity); -} - -Status ListBuilder::FinishInternal(std::shared_ptr* out) { - RETURN_NOT_OK(AppendNextOffset()); - - // Offset padding zeroed by BufferBuilder - std::shared_ptr offsets; - RETURN_NOT_OK(offsets_builder_.Finish(&offsets)); - - std::shared_ptr items; - if (values_) { - items = values_->data(); - } else { - if (value_builder_->length() == 0) { - // Try to make sure we get a non-null values buffer (ARROW-2744) - RETURN_NOT_OK(value_builder_->Resize(0)); - } - RETURN_NOT_OK(value_builder_->FinishInternal(&items)); - } - - *out = ArrayData::Make(type_, length_, {null_bitmap_, offsets}, null_count_); - (*out)->child_data.emplace_back(std::move(items)); - Reset(); - return Status::OK(); -} - -void ListBuilder::Reset() { - ArrayBuilder::Reset(); - values_.reset(); - offsets_builder_.Reset(); - value_builder_->Reset(); -} - -ArrayBuilder* ListBuilder::value_builder() const { - DCHECK(!values_) << "Using value builder is pointless when values_ is set"; - return value_builder_.get(); -} - -// ---------------------------------------------------------------------- -// Struct - -StructBuilder::StructBuilder(const std::shared_ptr& type, MemoryPool* pool, - std::vector>&& field_builders) - : ArrayBuilder(type, pool), field_builders_(std::move(field_builders)) {} - -void StructBuilder::Reset() { - ArrayBuilder::Reset(); - for (const auto& field_builder : field_builders_) { - field_builder->Reset(); - } -} - -Status StructBuilder::FinishInternal(std::shared_ptr* out) { - RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get())); - *out = ArrayData::Make(type_, length_, {null_bitmap_}, null_count_); - - (*out)->child_data.resize(field_builders_.size()); - for (size_t i = 0; i < field_builders_.size(); ++i) { - if (length_ == 0) { - // Try to make sure the child buffers are initialized - RETURN_NOT_OK(field_builders_[i]->Resize(0)); - } - RETURN_NOT_OK(field_builders_[i]->FinishInternal(&(*out)->child_data[i])); - } - - null_bitmap_ = nullptr; - capacity_ = length_ = null_count_ = 0; - return Status::OK(); -} +class MemoryPool; // ---------------------------------------------------------------------- // Helper functions @@ -566,7 +73,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, case Type::LIST: { std::unique_ptr value_builder; std::shared_ptr value_type = - checked_cast(*type).value_type(); + internal::checked_cast(*type).value_type(); RETURN_NOT_OK(MakeBuilder(pool, value_type, &value_builder)); out->reset(new ListBuilder(pool, std::move(value_builder))); return Status::OK(); diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index d0016674215..6d58b8f2a0d 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -15,1184 +15,27 @@ // specific language governing permissions and limitations // under the License. -#ifndef ARROW_BUILDER_H -#define ARROW_BUILDER_H +#pragma once -#include // IWYU pragma: keep -#include -#include -#include -#include -#include -#include #include -#include -#include -#include -#include "arrow/buffer.h" -#include "arrow/memory_pool.h" +#include "arrow/columnar/builder_adaptive.h" // IWYU pragma: export +#include "arrow/columnar/builder_base.h" // IWYU pragma: export +#include "arrow/columnar/builder_binary.h" // IWYU pragma: export +#include "arrow/columnar/builder_decimal.h" // IWYU pragma: export +#include "arrow/columnar/builder_dict.h" // IWYU pragma: export +#include "arrow/columnar/builder_nested.h" // IWYU pragma: export +#include "arrow/columnar/builder_primitive.h" // IWYU pragma: export #include "arrow/status.h" -#include "arrow/type.h" -#include "arrow/type_traits.h" -#include "arrow/util/bit-util.h" -#include "arrow/util/macros.h" -#include "arrow/util/string_view.h" -#include "arrow/util/type_traits.h" #include "arrow/util/visibility.h" namespace arrow { -class Array; -struct ArrayData; -class Decimal128; - -constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max() - 1; -constexpr int64_t kListMaximumElements = std::numeric_limits::max() - 1; - -constexpr int64_t kMinBuilderCapacity = 1 << 5; - -/// Base class for all data array builders. -/// -/// This class provides a facilities for incrementally building the null bitmap -/// (see Append methods) and as a side effect the current number of slots and -/// the null count. -/// -/// \note Users are expected to use builders as one of the concrete types below. -/// For example, ArrayBuilder* pointing to BinaryBuilder should be downcast before use. -class ARROW_EXPORT ArrayBuilder { - public: - explicit ArrayBuilder(const std::shared_ptr& type, MemoryPool* pool) - : type_(type), - pool_(pool), - null_bitmap_(NULLPTR), - null_count_(0), - null_bitmap_data_(NULLPTR), - length_(0), - capacity_(0) {} - - virtual ~ArrayBuilder() = default; - - /// For nested types. Since the objects are owned by this class instance, we - /// skip shared pointers and just return a raw pointer - ArrayBuilder* child(int i) { return children_[i].get(); } - - int num_children() const { return static_cast(children_.size()); } - - int64_t length() const { return length_; } - int64_t null_count() const { return null_count_; } - int64_t capacity() const { return capacity_; } - - /// \brief Ensure that enough memory has been allocated to fit the indicated - /// number of total elements in the builder, including any that have already - /// been appended. Does not account for reallocations that may be due to - /// variable size data, like binary values. To make space for incremental - /// appends, use Reserve instead. - /// - /// \param[in] capacity the minimum number of total array values to - /// accommodate. Must be greater than the current capacity. - /// \return Status - virtual Status Resize(int64_t capacity); - - /// \brief Ensure that there is enough space allocated to add the indicated - /// number of elements without any further calls to Resize. The memory - /// allocated is rounded up to the next highest power of 2 similar to memory - /// allocations in STL containers like std::vector - /// \param[in] additional_capacity the number of additional array values - /// \return Status - Status Reserve(int64_t additional_capacity); - - /// Reset the builder. - virtual void Reset(); - - /// For cases where raw data was memcpy'd into the internal buffers, allows us - /// to advance the length of the builder. It is your responsibility to use - /// this function responsibly. - Status Advance(int64_t elements); - - /// \brief Return result of builder as an internal generic ArrayData - /// object. Resets builder except for dictionary builder - /// - /// \param[out] out the finalized ArrayData object - /// \return Status - virtual Status FinishInternal(std::shared_ptr* out) = 0; - - /// \brief Return result of builder as an Array object. - /// - /// The builder is reset except for DictionaryBuilder. - /// - /// \param[out] out the finalized Array object - /// \return Status - Status Finish(std::shared_ptr* out); - - std::shared_ptr type() const { return type_; } - - protected: - ArrayBuilder() {} - - /// Append to null bitmap - Status AppendToBitmap(bool is_valid); - - /// Vector append. Treat each zero byte as a null. If valid_bytes is null - /// assume all of length bits are valid. - Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length); - - /// Set the next length bits to not null (i.e. valid). - Status SetNotNull(int64_t length); - - // Unsafe operations (don't check capacity/don't resize) - - void UnsafeAppendNull() { UnsafeAppendToBitmap(false); } - - // Append to null bitmap, update the length - void UnsafeAppendToBitmap(bool is_valid) { - if (is_valid) { - BitUtil::SetBit(null_bitmap_data_, length_); - } else { - ++null_count_; - } - ++length_; - } - - template - void UnsafeAppendToBitmap(const IterType& begin, const IterType& end) { - int64_t byte_offset = length_ / 8; - int64_t bit_offset = length_ % 8; - uint8_t bitset = null_bitmap_data_[byte_offset]; - - for (auto iter = begin; iter != end; ++iter) { - if (bit_offset == 8) { - bit_offset = 0; - null_bitmap_data_[byte_offset] = bitset; - byte_offset++; - // TODO: Except for the last byte, this shouldn't be needed - bitset = null_bitmap_data_[byte_offset]; - } - - if (*iter) { - bitset |= BitUtil::kBitmask[bit_offset]; - } else { - bitset &= BitUtil::kFlippedBitmask[bit_offset]; - ++null_count_; - } - - bit_offset++; - } - - if (bit_offset != 0) { - null_bitmap_data_[byte_offset] = bitset; - } - - length_ += std::distance(begin, end); - } - - // Vector append. Treat each zero byte as a nullzero. If valid_bytes is null - // assume all of length bits are valid. - void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length); - - void UnsafeAppendToBitmap(const std::vector& is_valid); - - // Set the next length bits to not null (i.e. valid). - void UnsafeSetNotNull(int64_t length); - - static Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer); - - static Status CheckCapacity(int64_t new_capacity, int64_t old_capacity) { - if (new_capacity < 0) { - return Status::Invalid("Resize capacity must be positive"); - } - if (new_capacity < old_capacity) { - return Status::Invalid("Resize cannot downsize"); - } - return Status::OK(); - } - - std::shared_ptr type_; - MemoryPool* pool_; - - // When null_bitmap are first appended to the builder, the null bitmap is allocated - std::shared_ptr null_bitmap_; - int64_t null_count_; - uint8_t* null_bitmap_data_; - - // Array length, so far. Also, the index of the next element to be added - int64_t length_; - int64_t capacity_; - - // Child value array builders. These are owned by this class - std::vector> children_; - - private: - ARROW_DISALLOW_COPY_AND_ASSIGN(ArrayBuilder); -}; - -class ARROW_EXPORT NullBuilder : public ArrayBuilder { - public: - explicit NullBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) - : ArrayBuilder(null(), pool) {} - - Status AppendNull() { - ++null_count_; - ++length_; - return Status::OK(); - } - - Status Append(std::nullptr_t value) { return AppendNull(); } - - Status FinishInternal(std::shared_ptr* out) override; -}; - -template -class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { - public: - using value_type = typename Type::c_type; - - explicit PrimitiveBuilder(const std::shared_ptr& type, MemoryPool* pool) - : ArrayBuilder(type, pool), data_(NULLPTR), raw_data_(NULLPTR) {} - - using ArrayBuilder::Advance; - - /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory - /// The memory at the corresponding data slot is set to 0 to prevent uninitialized - /// memory access - Status AppendNulls(const uint8_t* valid_bytes, int64_t length) { - ARROW_RETURN_NOT_OK(Reserve(length)); - memset(raw_data_ + length_, 0, - static_cast(TypeTraits::bytes_required(length))); - UnsafeAppendToBitmap(valid_bytes, length); - return Status::OK(); - } - - /// \brief Append a single null element - Status AppendNull() { - ARROW_RETURN_NOT_OK(Reserve(1)); - memset(raw_data_ + length_, 0, sizeof(value_type)); - UnsafeAppendToBitmap(false); - return Status::OK(); - } - - value_type GetValue(int64_t index) const { - return reinterpret_cast(data_->data())[index]; - } - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a contiguous C array of values - /// \param[in] length the number of values to append - /// \param[in] valid_bytes an optional sequence of bytes where non-zero - /// indicates a valid (non-null) value - /// \return Status - Status AppendValues(const value_type* values, int64_t length, - const uint8_t* valid_bytes = NULLPTR); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a contiguous C array of values - /// \param[in] length the number of values to append - /// \param[in] is_valid an std::vector indicating valid (1) or null - /// (0). Equal in length to values - /// \return Status - Status AppendValues(const value_type* values, int64_t length, - const std::vector& is_valid); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a std::vector of values - /// \param[in] is_valid an std::vector indicating valid (1) or null - /// (0). Equal in length to values - /// \return Status - Status AppendValues(const std::vector& values, - const std::vector& is_valid); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a std::vector of values - /// \return Status - Status AppendValues(const std::vector& values); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values_begin InputIterator to the beginning of the values - /// \param[in] values_end InputIterator pointing to the end of the values - /// \return Status - - template - Status AppendValues(ValuesIter values_begin, ValuesIter values_end) { - int64_t length = static_cast(std::distance(values_begin, values_end)); - ARROW_RETURN_NOT_OK(Reserve(length)); - - std::copy(values_begin, values_end, raw_data_ + length_); - - // this updates the length_ - UnsafeSetNotNull(length); - return Status::OK(); - } - - /// \brief Append a sequence of elements in one shot, with a specified nullmap - /// \param[in] values_begin InputIterator to the beginning of the values - /// \param[in] values_end InputIterator pointing to the end of the values - /// \param[in] valid_begin InputIterator with elements indication valid(1) - /// or null(0) values. - /// \return Status - template - typename std::enable_if::value, Status>::type AppendValues( - ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { - static_assert(!internal::is_null_pointer::value, - "Don't pass a NULLPTR directly as valid_begin, use the 2-argument " - "version instead"); - int64_t length = static_cast(std::distance(values_begin, values_end)); - ARROW_RETURN_NOT_OK(Reserve(length)); - - std::copy(values_begin, values_end, raw_data_ + length_); - - // this updates the length_ - UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length)); - return Status::OK(); - } - - // Same as above, with a pointer type ValidIter - template - typename std::enable_if::value, Status>::type AppendValues( - ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { - int64_t length = static_cast(std::distance(values_begin, values_end)); - ARROW_RETURN_NOT_OK(Reserve(length)); - - std::copy(values_begin, values_end, raw_data_ + length_); - - // this updates the length_ - if (valid_begin == NULLPTR) { - UnsafeSetNotNull(length); - } else { - UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length)); - } - - return Status::OK(); - } - - Status FinishInternal(std::shared_ptr* out) override; - void Reset() override; - - Status Resize(int64_t capacity) override; - - protected: - std::shared_ptr data_; - value_type* raw_data_; -}; - -/// Base class for all Builders that emit an Array of a scalar numerical type. -template -class ARROW_EXPORT NumericBuilder : public PrimitiveBuilder { - public: - using typename PrimitiveBuilder::value_type; - using PrimitiveBuilder::PrimitiveBuilder; - - template - explicit NumericBuilder( - typename std::enable_if::is_parameter_free, MemoryPool*>::type pool - ARROW_MEMORY_POOL_DEFAULT) - : PrimitiveBuilder(TypeTraits::type_singleton(), pool) {} - - using ArrayBuilder::UnsafeAppendNull; - using PrimitiveBuilder::AppendValues; - using PrimitiveBuilder::Resize; - using PrimitiveBuilder::Reserve; - - /// Append a single scalar and increase the size if necessary. - Status Append(const value_type val) { - ARROW_RETURN_NOT_OK(ArrayBuilder::Reserve(1)); - UnsafeAppend(val); - return Status::OK(); - } - - /// Append a single scalar under the assumption that the underlying Buffer is - /// large enough. - /// - /// This method does not capacity-check; make sure to call Reserve - /// beforehand. - void UnsafeAppend(const value_type val) { - BitUtil::SetBit(null_bitmap_data_, length_); - raw_data_[length_++] = val; - } - - protected: - using PrimitiveBuilder::length_; - using PrimitiveBuilder::null_bitmap_data_; - using PrimitiveBuilder::raw_data_; -}; - -// Builders - -using UInt8Builder = NumericBuilder; -using UInt16Builder = NumericBuilder; -using UInt32Builder = NumericBuilder; -using UInt64Builder = NumericBuilder; - -using Int8Builder = NumericBuilder; -using Int16Builder = NumericBuilder; -using Int32Builder = NumericBuilder; -using Int64Builder = NumericBuilder; -using TimestampBuilder = NumericBuilder; -using Time32Builder = NumericBuilder; -using Time64Builder = NumericBuilder; -using Date32Builder = NumericBuilder; -using Date64Builder = NumericBuilder; - -using HalfFloatBuilder = NumericBuilder; -using FloatBuilder = NumericBuilder; -using DoubleBuilder = NumericBuilder; - -namespace internal { - -class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder { - public: - explicit AdaptiveIntBuilderBase(MemoryPool* pool); - - /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory - Status AppendNulls(const uint8_t* valid_bytes, int64_t length) { - ARROW_RETURN_NOT_OK(CommitPendingData()); - ARROW_RETURN_NOT_OK(Reserve(length)); - memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length); - UnsafeAppendToBitmap(valid_bytes, length); - return Status::OK(); - } - - Status AppendNull() { - pending_data_[pending_pos_] = 0; - pending_valid_[pending_pos_] = 0; - pending_has_nulls_ = true; - ++pending_pos_; - - if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { - return CommitPendingData(); - } - return Status::OK(); - } - - void Reset() override; - Status Resize(int64_t capacity) override; - - protected: - virtual Status CommitPendingData() = 0; - - std::shared_ptr data_; - uint8_t* raw_data_; - uint8_t int_size_; - - static constexpr int32_t pending_size_ = 1024; - uint8_t pending_valid_[pending_size_]; - uint64_t pending_data_[pending_size_]; - int32_t pending_pos_; - bool pending_has_nulls_; -}; - -} // namespace internal - -class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase { - public: - explicit AdaptiveUIntBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - using ArrayBuilder::Advance; - using internal::AdaptiveIntBuilderBase::Reset; - - /// Scalar append - Status Append(const uint64_t val) { - pending_data_[pending_pos_] = val; - pending_valid_[pending_pos_] = 1; - ++pending_pos_; - - if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { - return CommitPendingData(); - } - return Status::OK(); - } - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a contiguous C array of values - /// \param[in] length the number of values to append - /// \param[in] valid_bytes an optional sequence of bytes where non-zero - /// indicates a valid (non-null) value - /// \return Status - Status AppendValues(const uint64_t* values, int64_t length, - const uint8_t* valid_bytes = NULLPTR); - - Status FinishInternal(std::shared_ptr* out) override; - - protected: - Status CommitPendingData() override; - Status ExpandIntSize(uint8_t new_int_size); - - Status AppendValuesInternal(const uint64_t* values, int64_t length, - const uint8_t* valid_bytes); - - template - typename std::enable_if= sizeof(new_type), Status>::type - ExpandIntSizeInternal(); -#define __LESS(a, b) (a) < (b) - template - typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type - ExpandIntSizeInternal(); -#undef __LESS - - template - Status ExpandIntSizeN(); -}; - -class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase { - public: - explicit AdaptiveIntBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - using ArrayBuilder::Advance; - using internal::AdaptiveIntBuilderBase::Reset; - - /// Scalar append - Status Append(const int64_t val) { - auto v = static_cast(val); - - pending_data_[pending_pos_] = v; - pending_valid_[pending_pos_] = 1; - ++pending_pos_; - - if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { - return CommitPendingData(); - } - return Status::OK(); - } - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a contiguous C array of values - /// \param[in] length the number of values to append - /// \param[in] valid_bytes an optional sequence of bytes where non-zero - /// indicates a valid (non-null) value - /// \return Status - Status AppendValues(const int64_t* values, int64_t length, - const uint8_t* valid_bytes = NULLPTR); - - Status FinishInternal(std::shared_ptr* out) override; - - protected: - Status CommitPendingData() override; - Status ExpandIntSize(uint8_t new_int_size); - - Status AppendValuesInternal(const int64_t* values, int64_t length, - const uint8_t* valid_bytes); - - template - typename std::enable_if= sizeof(new_type), Status>::type - ExpandIntSizeInternal(); -#define __LESS(a, b) (a) < (b) - template - typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type - ExpandIntSizeInternal(); -#undef __LESS - - template - Status ExpandIntSizeN(); -}; - -class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { - public: - using value_type = bool; - explicit BooleanBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - explicit BooleanBuilder(const std::shared_ptr& type, MemoryPool* pool); - - using ArrayBuilder::Advance; - using ArrayBuilder::UnsafeAppendNull; - - /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory - Status AppendNulls(const uint8_t* valid_bytes, int64_t length) { - ARROW_RETURN_NOT_OK(Reserve(length)); - UnsafeAppendToBitmap(valid_bytes, length); - - return Status::OK(); - } - - Status AppendNull() { - ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppendToBitmap(false); - - return Status::OK(); - } - - /// Scalar append - Status Append(const bool val) { - ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppend(val); - return Status::OK(); - } - - Status Append(const uint8_t val) { return Append(val != 0); } - - /// Scalar append, without checking for capacity - void UnsafeAppend(const bool val) { - BitUtil::SetBit(null_bitmap_data_, length_); - if (val) { - BitUtil::SetBit(raw_data_, length_); - } else { - BitUtil::ClearBit(raw_data_, length_); - } - ++length_; - } - - void UnsafeAppend(const uint8_t val) { UnsafeAppend(val != 0); } - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a contiguous array of bytes (non-zero is 1) - /// \param[in] length the number of values to append - /// \param[in] valid_bytes an optional sequence of bytes where non-zero - /// indicates a valid (non-null) value - /// \return Status - Status AppendValues(const uint8_t* values, int64_t length, - const uint8_t* valid_bytes = NULLPTR); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a contiguous C array of values - /// \param[in] length the number of values to append - /// \param[in] is_valid an std::vector indicating valid (1) or null - /// (0). Equal in length to values - /// \return Status - Status AppendValues(const uint8_t* values, int64_t length, - const std::vector& is_valid); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a std::vector of bytes - /// \param[in] is_valid an std::vector indicating valid (1) or null - /// (0). Equal in length to values - /// \return Status - Status AppendValues(const std::vector& values, - const std::vector& is_valid); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values a std::vector of bytes - /// \return Status - Status AppendValues(const std::vector& values); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values an std::vector indicating true (1) or false - /// \param[in] is_valid an std::vector indicating valid (1) or null - /// (0). Equal in length to values - /// \return Status - Status AppendValues(const std::vector& values, const std::vector& is_valid); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values an std::vector indicating true (1) or false - /// \return Status - Status AppendValues(const std::vector& values); - - /// \brief Append a sequence of elements in one shot - /// \param[in] values_begin InputIterator to the beginning of the values - /// \param[in] values_end InputIterator pointing to the end of the values - /// or null(0) values - /// \return Status - template - Status AppendValues(ValuesIter values_begin, ValuesIter values_end) { - int64_t length = static_cast(std::distance(values_begin, values_end)); - ARROW_RETURN_NOT_OK(Reserve(length)); - auto iter = values_begin; - internal::GenerateBitsUnrolled(raw_data_, length_, length, - [&iter]() -> bool { return *(iter++); }); - - // this updates length_ - UnsafeSetNotNull(length); - return Status::OK(); - } - - /// \brief Append a sequence of elements in one shot, with a specified nullmap - /// \param[in] values_begin InputIterator to the beginning of the values - /// \param[in] values_end InputIterator pointing to the end of the values - /// \param[in] valid_begin InputIterator with elements indication valid(1) - /// or null(0) values - /// \return Status - template - typename std::enable_if::value, Status>::type AppendValues( - ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { - static_assert(!internal::is_null_pointer::value, - "Don't pass a NULLPTR directly as valid_begin, use the 2-argument " - "version instead"); - int64_t length = static_cast(std::distance(values_begin, values_end)); - ARROW_RETURN_NOT_OK(Reserve(length)); - - auto iter = values_begin; - internal::GenerateBitsUnrolled(raw_data_, length_, length, - [&iter]() -> bool { return *(iter++); }); - - // this updates length_ - ArrayBuilder::UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length)); - return Status::OK(); - } - - // Same as above, for a pointer type ValidIter - template - typename std::enable_if::value, Status>::type AppendValues( - ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { - int64_t length = static_cast(std::distance(values_begin, values_end)); - ARROW_RETURN_NOT_OK(Reserve(length)); - - auto iter = values_begin; - internal::GenerateBitsUnrolled(raw_data_, length_, length, - [&iter]() -> bool { return *(iter++); }); - - // this updates the length_ - if (valid_begin == NULLPTR) { - UnsafeSetNotNull(length); - } else { - UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length)); - } - - return Status::OK(); - } - - Status FinishInternal(std::shared_ptr* out) override; - void Reset() override; - Status Resize(int64_t capacity) override; - - protected: - std::shared_ptr data_; - uint8_t* raw_data_; -}; - -// ---------------------------------------------------------------------- -// List builder - -/// \class ListBuilder -/// \brief Builder class for variable-length list array value types -/// -/// To use this class, you must append values to the child array builder and use -/// the Append function to delimit each distinct list value (once the values -/// have been appended to the child array) or use the bulk API to append -/// a sequence of offests and null values. -/// -/// A note on types. Per arrow/type.h all types in the c++ implementation are -/// logical so even though this class always builds list array, this can -/// represent multiple different logical types. If no logical type is provided -/// at construction time, the class defaults to List where t is taken from the -/// value_builder/values that the object is constructed with. -class ARROW_EXPORT ListBuilder : public ArrayBuilder { - public: - /// Use this constructor to incrementally build the value array along with offsets and - /// null bitmap. - ListBuilder(MemoryPool* pool, std::shared_ptr const& value_builder, - const std::shared_ptr& type = NULLPTR); - - Status Resize(int64_t capacity) override; - void Reset() override; - Status FinishInternal(std::shared_ptr* out) override; - - /// \brief Vector append - /// - /// If passed, valid_bytes is of equal length to values, and any zero byte - /// will be considered as a null for that slot - Status AppendValues(const int32_t* offsets, int64_t length, - const uint8_t* valid_bytes = NULLPTR); - - /// \brief Start a new variable-length list slot - /// - /// This function should be called before beginning to append elements to the - /// value builder - Status Append(bool is_valid = true); - - Status AppendNull() { return Append(false); } - - ArrayBuilder* value_builder() const; - - protected: - TypedBufferBuilder offsets_builder_; - std::shared_ptr value_builder_; - std::shared_ptr values_; - - Status AppendNextOffset(); -}; - -// ---------------------------------------------------------------------- -// Binary and String - -/// \class BinaryBuilder -/// \brief Builder class for variable-length binary data -class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { - public: - explicit BinaryBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - BinaryBuilder(const std::shared_ptr& type, MemoryPool* pool); - - Status Append(const uint8_t* value, int32_t length); - - Status Append(const char* value, int32_t length) { - return Append(reinterpret_cast(value), length); - } - - Status Append(util::string_view value) { - return Append(value.data(), static_cast(value.size())); - } - - Status AppendNull(); - - /// \brief Append without checking capacity - /// - /// Offsets and data should have been presized using Reserve() and - /// ReserveData(), respectively. - void UnsafeAppend(const uint8_t* value, int32_t length) { - UnsafeAppendNextOffset(); - value_data_builder_.UnsafeAppend(value, length); - UnsafeAppendToBitmap(true); - } - - void UnsafeAppend(const char* value, int32_t length) { - UnsafeAppend(reinterpret_cast(value), length); - } - - void UnsafeAppend(const std::string& value) { - UnsafeAppend(value.c_str(), static_cast(value.size())); - } - - void UnsafeAppendNull() { - const int64_t num_bytes = value_data_builder_.length(); - offsets_builder_.UnsafeAppend(static_cast(num_bytes)); - UnsafeAppendToBitmap(false); - } - - void Reset() override; - Status Resize(int64_t capacity) override; - - /// \brief Ensures there is enough allocated capacity to append the indicated - /// number of bytes to the value data buffer without additional allocations - Status ReserveData(int64_t elements); - - Status FinishInternal(std::shared_ptr* out) override; - - /// \return size of values buffer so far - int64_t value_data_length() const { return value_data_builder_.length(); } - /// \return capacity of values buffer - int64_t value_data_capacity() const { return value_data_builder_.capacity(); } - - /// Temporary access to a value. - /// - /// This pointer becomes invalid on the next modifying operation. - const uint8_t* GetValue(int64_t i, int32_t* out_length) const; - - /// Temporary access to a value. - /// - /// This view becomes invalid on the next modifying operation. - util::string_view GetView(int64_t i) const; - - protected: - TypedBufferBuilder offsets_builder_; - TypedBufferBuilder value_data_builder_; - - Status AppendNextOffset(); - - void UnsafeAppendNextOffset() { - const int64_t num_bytes = value_data_builder_.length(); - offsets_builder_.UnsafeAppend(static_cast(num_bytes)); - } -}; - -/// \class StringBuilder -/// \brief Builder class for UTF8 strings -class ARROW_EXPORT StringBuilder : public BinaryBuilder { - public: - using BinaryBuilder::BinaryBuilder; - explicit StringBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - using BinaryBuilder::Append; - using BinaryBuilder::Reset; - using BinaryBuilder::UnsafeAppend; - - /// \brief Append a sequence of strings in one shot. - /// - /// \param[in] values a vector of strings - /// \param[in] valid_bytes an optional sequence of bytes where non-zero - /// indicates a valid (non-null) value - /// \return Status - Status AppendValues(const std::vector& values, - const uint8_t* valid_bytes = NULLPTR); - - /// \brief Append a sequence of nul-terminated strings in one shot. - /// If one of the values is NULL, it is processed as a null - /// value even if the corresponding valid_bytes entry is 1. - /// - /// \param[in] values a contiguous C array of nul-terminated char * - /// \param[in] length the number of values to append - /// \param[in] valid_bytes an optional sequence of bytes where non-zero - /// indicates a valid (non-null) value - /// \return Status - Status AppendValues(const char** values, int64_t length, - const uint8_t* valid_bytes = NULLPTR); -}; - -// ---------------------------------------------------------------------- -// FixedSizeBinaryBuilder - -class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { - public: - FixedSizeBinaryBuilder(const std::shared_ptr& type, - MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - Status Append(const uint8_t* value) { - ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppendToBitmap(true); - return byte_builder_.Append(value, byte_width_); - } - - Status Append(const char* value) { - return Append(reinterpret_cast(value)); - } - - Status Append(const util::string_view& view) { -#ifndef NDEBUG - CheckValueSize(static_cast(view.size())); -#endif - return Append(reinterpret_cast(view.data())); - } - - Status Append(const std::string& s) { -#ifndef NDEBUG - CheckValueSize(static_cast(s.size())); -#endif - return Append(reinterpret_cast(s.data())); - } - - template - Status Append(const std::array& value) { - ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppendToBitmap(true); - return byte_builder_.Append(value); - } - - Status AppendValues(const uint8_t* data, int64_t length, - const uint8_t* valid_bytes = NULLPTR); - Status AppendNull(); - - void Reset() override; - Status Resize(int64_t capacity) override; - Status FinishInternal(std::shared_ptr* out) override; - - /// \return size of values buffer so far - int64_t value_data_length() const { return byte_builder_.length(); } - - int32_t byte_width() const { return byte_width_; } - - /// Temporary access to a value. - /// - /// This pointer becomes invalid on the next modifying operation. - const uint8_t* GetValue(int64_t i) const; - - /// Temporary access to a value. - /// - /// This view becomes invalid on the next modifying operation. - util::string_view GetView(int64_t i) const; - - protected: - int32_t byte_width_; - BufferBuilder byte_builder_; - -#ifndef NDEBUG - void CheckValueSize(int64_t size); -#endif -}; - -class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder { - public: - explicit Decimal128Builder(const std::shared_ptr& type, - MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); - - using FixedSizeBinaryBuilder::Append; - using FixedSizeBinaryBuilder::AppendValues; - using FixedSizeBinaryBuilder::Reset; - - Status Append(const Decimal128& val); - - Status FinishInternal(std::shared_ptr* out) override; -}; - -using DecimalBuilder = Decimal128Builder; - -// ---------------------------------------------------------------------- -// Struct - -// --------------------------------------------------------------------------------- -// StructArray builder -/// Append, Resize and Reserve methods are acting on StructBuilder. -/// Please make sure all these methods of all child-builders' are consistently -/// called to maintain data-structure consistency. -class ARROW_EXPORT StructBuilder : public ArrayBuilder { - public: - StructBuilder(const std::shared_ptr& type, MemoryPool* pool, - std::vector>&& field_builders); - - Status FinishInternal(std::shared_ptr* out) override; - - /// Null bitmap is of equal length to every child field, and any zero byte - /// will be considered as a null for that field, but users must using app- - /// end methods or advance methods of the child builders' independently to - /// insert data. - Status AppendValues(int64_t length, const uint8_t* valid_bytes) { - ARROW_RETURN_NOT_OK(Reserve(length)); - UnsafeAppendToBitmap(valid_bytes, length); - return Status::OK(); - } - - /// Append an element to the Struct. All child-builders' Append method must - /// be called independently to maintain data-structure consistency. - Status Append(bool is_valid = true) { - ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppendToBitmap(is_valid); - return Status::OK(); - } - - Status AppendNull() { return Append(false); } - - void Reset() override; - - ArrayBuilder* field_builder(int i) const { return field_builders_[i].get(); } - - int num_fields() const { return static_cast(field_builders_.size()); } - - protected: - std::vector> field_builders_; -}; - -// ---------------------------------------------------------------------- -// Dictionary builder - -namespace internal { - -template -struct DictionaryScalar { - using type = typename T::c_type; -}; - -template <> -struct DictionaryScalar { - using type = util::string_view; -}; - -template <> -struct DictionaryScalar { - using type = util::string_view; -}; - -template <> -struct DictionaryScalar { - using type = util::string_view; -}; - -} // namespace internal - -/// \brief Array builder for created encoded DictionaryArray from dense array -/// -/// Unlike other builders, dictionary builder does not completely reset the state -/// on Finish calls. The arrays built after the initial Finish call will reuse -/// the previously created encoding and build a delta dictionary when new terms -/// occur. -/// -/// data -template -class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder { - public: - using Scalar = typename internal::DictionaryScalar::type; - - // WARNING: the type given below is the value type, not the DictionaryType. - // The DictionaryType is instantiated on the Finish() call. - DictionaryBuilder(const std::shared_ptr& type, MemoryPool* pool); - - template - explicit DictionaryBuilder( - typename std::enable_if::is_parameter_free, MemoryPool*>::type pool) - : DictionaryBuilder(TypeTraits::type_singleton(), pool) {} - - ~DictionaryBuilder() override; - - /// \brief Append a scalar value - Status Append(const Scalar& value); - - /// \brief Append a fixed-width string (only for FixedSizeBinaryType) - template - Status Append(typename std::enable_if::value, - const uint8_t*>::type value) { - return Append(util::string_view(reinterpret_cast(value), byte_width_)); - } - - /// \brief Append a fixed-width string (only for FixedSizeBinaryType) - template - Status Append(typename std::enable_if::value, - const char*>::type value) { - return Append(util::string_view(value, byte_width_)); - } - - /// \brief Append a scalar null value - Status AppendNull(); - - /// \brief Append a whole dense array to the builder - Status AppendArray(const Array& array); - - void Reset() override; - Status Resize(int64_t capacity) override; - Status FinishInternal(std::shared_ptr* out) override; - - /// is the dictionary builder in the delta building mode - bool is_building_delta() { return delta_offset_ > 0; } - - protected: - class MemoTableImpl; - std::unique_ptr memo_table_; - - int32_t delta_offset_; - // Only used for FixedSizeBinaryType - int32_t byte_width_; - - AdaptiveIntBuilder values_builder_; -}; - -template <> -class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder { - public: - DictionaryBuilder(const std::shared_ptr& type, MemoryPool* pool); - explicit DictionaryBuilder(MemoryPool* pool); - - /// \brief Append a scalar null value - Status AppendNull(); - - /// \brief Append a whole dense array to the builder - Status AppendArray(const Array& array); - - Status Resize(int64_t capacity) override; - Status FinishInternal(std::shared_ptr* out) override; - - protected: - AdaptiveIntBuilder values_builder_; -}; - -class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder { - public: - using DictionaryBuilder::Append; - using DictionaryBuilder::DictionaryBuilder; - - Status Append(const uint8_t* value, int32_t length) { - return Append(reinterpret_cast(value), length); - } - - Status Append(const char* value, int32_t length) { - return Append(util::string_view(value, length)); - } -}; - -/// \brief Dictionary array builder with convenience methods for strings -class ARROW_EXPORT StringDictionaryBuilder : public DictionaryBuilder { - public: - using DictionaryBuilder::Append; - using DictionaryBuilder::DictionaryBuilder; - - Status Append(const uint8_t* value, int32_t length) { - return Append(reinterpret_cast(value), length); - } - - Status Append(const char* value, int32_t length) { - return Append(util::string_view(value, length)); - } -}; - -// ---------------------------------------------------------------------- -// Helper functions +class DataType; +class MemoryPool; ARROW_EXPORT Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, std::unique_ptr* out); } // namespace arrow - -#endif // ARROW_BUILDER_H_ diff --git a/cpp/src/arrow/builder-adaptive.cc b/cpp/src/arrow/columnar/builder_adaptive.cc similarity index 99% rename from cpp/src/arrow/builder-adaptive.cc rename to cpp/src/arrow/columnar/builder_adaptive.cc index a715f469c7a..a9f2d341042 100644 --- a/cpp/src/arrow/builder-adaptive.cc +++ b/cpp/src/arrow/columnar/builder_adaptive.cc @@ -15,13 +15,15 @@ // specific language governing permissions and limitations // under the License. +#include "arrow/columnar/builder_adaptive.h" + +#include #include #include #include #include "arrow/array.h" #include "arrow/buffer.h" -#include "arrow/builder.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_traits.h" diff --git a/cpp/src/arrow/columnar/builder_adaptive.h b/cpp/src/arrow/columnar/builder_adaptive.h new file mode 100644 index 00000000000..ff378ed4ed9 --- /dev/null +++ b/cpp/src/arrow/columnar/builder_adaptive.h @@ -0,0 +1,174 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/columnar/builder_base.h" + +namespace arrow { + +namespace internal { + +class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder { + public: + explicit AdaptiveIntBuilderBase(MemoryPool* pool); + + /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory + Status AppendNulls(const uint8_t* valid_bytes, int64_t length) { + ARROW_RETURN_NOT_OK(CommitPendingData()); + ARROW_RETURN_NOT_OK(Reserve(length)); + memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length); + UnsafeAppendToBitmap(valid_bytes, length); + return Status::OK(); + } + + Status AppendNull() { + pending_data_[pending_pos_] = 0; + pending_valid_[pending_pos_] = 0; + pending_has_nulls_ = true; + ++pending_pos_; + + if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { + return CommitPendingData(); + } + return Status::OK(); + } + + void Reset() override; + Status Resize(int64_t capacity) override; + + protected: + virtual Status CommitPendingData() = 0; + + std::shared_ptr data_; + uint8_t* raw_data_; + uint8_t int_size_; + + static constexpr int32_t pending_size_ = 1024; + uint8_t pending_valid_[pending_size_]; + uint64_t pending_data_[pending_size_]; + int32_t pending_pos_; + bool pending_has_nulls_; +}; + +} // namespace internal + +class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase { + public: + explicit AdaptiveUIntBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + using ArrayBuilder::Advance; + using internal::AdaptiveIntBuilderBase::Reset; + + /// Scalar append + Status Append(const uint64_t val) { + pending_data_[pending_pos_] = val; + pending_valid_[pending_pos_] = 1; + ++pending_pos_; + + if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { + return CommitPendingData(); + } + return Status::OK(); + } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const uint64_t* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + Status FinishInternal(std::shared_ptr* out) override; + + protected: + Status CommitPendingData() override; + Status ExpandIntSize(uint8_t new_int_size); + + Status AppendValuesInternal(const uint64_t* values, int64_t length, + const uint8_t* valid_bytes); + + template + typename std::enable_if= sizeof(new_type), Status>::type + ExpandIntSizeInternal(); +#define __LESS(a, b) (a) < (b) + template + typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type + ExpandIntSizeInternal(); +#undef __LESS + + template + Status ExpandIntSizeN(); +}; + +class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase { + public: + explicit AdaptiveIntBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + using ArrayBuilder::Advance; + using internal::AdaptiveIntBuilderBase::Reset; + + /// Scalar append + Status Append(const int64_t val) { + auto v = static_cast(val); + + pending_data_[pending_pos_] = v; + pending_valid_[pending_pos_] = 1; + ++pending_pos_; + + if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) { + return CommitPendingData(); + } + return Status::OK(); + } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const int64_t* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + Status FinishInternal(std::shared_ptr* out) override; + + protected: + Status CommitPendingData() override; + Status ExpandIntSize(uint8_t new_int_size); + + Status AppendValuesInternal(const int64_t* values, int64_t length, + const uint8_t* valid_bytes); + + template + typename std::enable_if= sizeof(new_type), Status>::type + ExpandIntSizeInternal(); +#define __LESS(a, b) (a) < (b) + template + typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type + ExpandIntSizeInternal(); +#undef __LESS + + template + Status ExpandIntSizeN(); +}; + +} // namespace arrow diff --git a/cpp/src/arrow/columnar/builder_base.cc b/cpp/src/arrow/columnar/builder_base.cc new file mode 100644 index 00000000000..be91f649b58 --- /dev/null +++ b/cpp/src/arrow/columnar/builder_base.cc @@ -0,0 +1,176 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/columnar/builder_base.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/int-util.h" +#include "arrow/util/logging.h" + +namespace arrow { + +Status ArrayBuilder::TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer) { + if (buffer) { + if (bytes_filled < buffer->size()) { + // Trim buffer + RETURN_NOT_OK(buffer->Resize(bytes_filled)); + } + // zero the padding + buffer->ZeroPadding(); + } else { + // Null buffers are allowed in place of 0-byte buffers + DCHECK_EQ(bytes_filled, 0); + } + return Status::OK(); +} + +Status ArrayBuilder::AppendToBitmap(bool is_valid) { + if (length_ == capacity_) { + // If the capacity was not already a multiple of 2, do so here + // TODO(emkornfield) doubling isn't great default allocation practice + // see https://github.com/facebook/folly/blob/master/folly/docs/FBVector.md + // fo discussion + RETURN_NOT_OK(Resize(BitUtil::NextPower2(capacity_ + 1))); + } + UnsafeAppendToBitmap(is_valid); + return Status::OK(); +} + +Status ArrayBuilder::AppendToBitmap(const uint8_t* valid_bytes, int64_t length) { + RETURN_NOT_OK(Reserve(length)); + + UnsafeAppendToBitmap(valid_bytes, length); + return Status::OK(); +} + +Status ArrayBuilder::Resize(int64_t capacity) { + // Target size of validity (null) bitmap data + const int64_t new_bitmap_size = BitUtil::BytesForBits(capacity); + RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); + + if (capacity_ == 0) { + RETURN_NOT_OK(AllocateResizableBuffer(pool_, new_bitmap_size, &null_bitmap_)); + null_bitmap_data_ = null_bitmap_->mutable_data(); + + // Padding is zeroed by AllocateResizableBuffer + memset(null_bitmap_data_, 0, static_cast(new_bitmap_size)); + } else { + const int64_t old_bitmap_capacity = null_bitmap_->capacity(); + RETURN_NOT_OK(null_bitmap_->Resize(new_bitmap_size)); + + const int64_t new_bitmap_capacity = null_bitmap_->capacity(); + null_bitmap_data_ = null_bitmap_->mutable_data(); + + // Zero the region between the original capacity and the new capacity, + // including padding, which has not been zeroed, unlike + // AllocateResizableBuffer + if (old_bitmap_capacity < new_bitmap_capacity) { + memset(null_bitmap_data_ + old_bitmap_capacity, 0, + static_cast(new_bitmap_capacity - old_bitmap_capacity)); + } + } + capacity_ = capacity; + return Status::OK(); +} + +Status ArrayBuilder::Advance(int64_t elements) { + if (length_ + elements > capacity_) { + return Status::Invalid("Builder must be expanded"); + } + length_ += elements; + return Status::OK(); +} + +Status ArrayBuilder::Finish(std::shared_ptr* out) { + std::shared_ptr internal_data; + RETURN_NOT_OK(FinishInternal(&internal_data)); + *out = MakeArray(internal_data); + return Status::OK(); +} + +Status ArrayBuilder::Reserve(int64_t additional_elements) { + if (length_ + additional_elements > capacity_) { + // TODO(emkornfield) power of 2 growth is potentially suboptimal + int64_t new_size = BitUtil::NextPower2(length_ + additional_elements); + return Resize(new_size); + } + return Status::OK(); +} + +void ArrayBuilder::Reset() { + capacity_ = length_ = null_count_ = 0; + null_bitmap_ = nullptr; +} + +Status ArrayBuilder::SetNotNull(int64_t length) { + RETURN_NOT_OK(Reserve(length)); + UnsafeSetNotNull(length); + return Status::OK(); +} + +void ArrayBuilder::UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length) { + if (valid_bytes == nullptr) { + UnsafeSetNotNull(length); + return; + } + UnsafeAppendToBitmap(valid_bytes, valid_bytes + length); +} + +void ArrayBuilder::UnsafeAppendToBitmap(const std::vector& is_valid) { + UnsafeAppendToBitmap(is_valid.begin(), is_valid.end()); +} + +void ArrayBuilder::UnsafeSetNotNull(int64_t length) { + const int64_t new_length = length + length_; + + // Fill up the bytes until we have a byte alignment + int64_t pad_to_byte = std::min(8 - (length_ % 8), length); + + if (pad_to_byte == 8) { + pad_to_byte = 0; + } + for (int64_t i = length_; i < length_ + pad_to_byte; ++i) { + BitUtil::SetBit(null_bitmap_data_, i); + } + + // Fast bitsetting + int64_t fast_length = (length - pad_to_byte) / 8; + memset(null_bitmap_data_ + ((length_ + pad_to_byte) / 8), 0xFF, + static_cast(fast_length)); + + // Trailing bits + for (int64_t i = length_ + pad_to_byte + (fast_length * 8); i < new_length; ++i) { + BitUtil::SetBit(null_bitmap_data_, i); + } + + length_ = new_length; +} + +} // namespace arrow diff --git a/cpp/src/arrow/columnar/builder_base.h b/cpp/src/arrow/columnar/builder_base.h new file mode 100644 index 00000000000..ab0dd26a638 --- /dev/null +++ b/cpp/src/arrow/columnar/builder_base.h @@ -0,0 +1,228 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/columnar/builder_base.h" + +#include // IWYU pragma: keep +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/buffer.h" +#include "arrow/memory_pool.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/macros.h" +#include "arrow/util/string_view.h" +#include "arrow/util/type_traits.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +struct ArrayData; + +constexpr int64_t kMinBuilderCapacity = 1 << 5; +constexpr int64_t kListMaximumElements = std::numeric_limits::max() - 1; + + +/// Base class for all data array builders. +/// +/// This class provides a facilities for incrementally building the null bitmap +/// (see Append methods) and as a side effect the current number of slots and +/// the null count. +/// +/// \note Users are expected to use builders as one of the concrete types below. +/// For example, ArrayBuilder* pointing to BinaryBuilder should be downcast before use. +class ARROW_EXPORT ArrayBuilder { + public: + explicit ArrayBuilder(const std::shared_ptr& type, MemoryPool* pool) + : type_(type), + pool_(pool), + null_bitmap_(NULLPTR), + null_count_(0), + null_bitmap_data_(NULLPTR), + length_(0), + capacity_(0) {} + + virtual ~ArrayBuilder() = default; + + /// For nested types. Since the objects are owned by this class instance, we + /// skip shared pointers and just return a raw pointer + ArrayBuilder* child(int i) { return children_[i].get(); } + + int num_children() const { return static_cast(children_.size()); } + + int64_t length() const { return length_; } + int64_t null_count() const { return null_count_; } + int64_t capacity() const { return capacity_; } + + /// \brief Ensure that enough memory has been allocated to fit the indicated + /// number of total elements in the builder, including any that have already + /// been appended. Does not account for reallocations that may be due to + /// variable size data, like binary values. To make space for incremental + /// appends, use Reserve instead. + /// + /// \param[in] capacity the minimum number of total array values to + /// accommodate. Must be greater than the current capacity. + /// \return Status + virtual Status Resize(int64_t capacity); + + /// \brief Ensure that there is enough space allocated to add the indicated + /// number of elements without any further calls to Resize. The memory + /// allocated is rounded up to the next highest power of 2 similar to memory + /// allocations in STL containers like std::vector + /// \param[in] additional_capacity the number of additional array values + /// \return Status + Status Reserve(int64_t additional_capacity); + + /// Reset the builder. + virtual void Reset(); + + /// For cases where raw data was memcpy'd into the internal buffers, allows us + /// to advance the length of the builder. It is your responsibility to use + /// this function responsibly. + Status Advance(int64_t elements); + + /// \brief Return result of builder as an internal generic ArrayData + /// object. Resets builder except for dictionary builder + /// + /// \param[out] out the finalized ArrayData object + /// \return Status + virtual Status FinishInternal(std::shared_ptr* out) = 0; + + /// \brief Return result of builder as an Array object. + /// + /// The builder is reset except for DictionaryBuilder. + /// + /// \param[out] out the finalized Array object + /// \return Status + Status Finish(std::shared_ptr* out); + + std::shared_ptr type() const { return type_; } + + protected: + ArrayBuilder() {} + + /// Append to null bitmap + Status AppendToBitmap(bool is_valid); + + /// Vector append. Treat each zero byte as a null. If valid_bytes is null + /// assume all of length bits are valid. + Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length); + + /// Set the next length bits to not null (i.e. valid). + Status SetNotNull(int64_t length); + + // Unsafe operations (don't check capacity/don't resize) + + void UnsafeAppendNull() { UnsafeAppendToBitmap(false); } + + // Append to null bitmap, update the length + void UnsafeAppendToBitmap(bool is_valid) { + if (is_valid) { + BitUtil::SetBit(null_bitmap_data_, length_); + } else { + ++null_count_; + } + ++length_; + } + + template + void UnsafeAppendToBitmap(const IterType& begin, const IterType& end) { + int64_t byte_offset = length_ / 8; + int64_t bit_offset = length_ % 8; + uint8_t bitset = null_bitmap_data_[byte_offset]; + + for (auto iter = begin; iter != end; ++iter) { + if (bit_offset == 8) { + bit_offset = 0; + null_bitmap_data_[byte_offset] = bitset; + byte_offset++; + // TODO: Except for the last byte, this shouldn't be needed + bitset = null_bitmap_data_[byte_offset]; + } + + if (*iter) { + bitset |= BitUtil::kBitmask[bit_offset]; + } else { + bitset &= BitUtil::kFlippedBitmask[bit_offset]; + ++null_count_; + } + + bit_offset++; + } + + if (bit_offset != 0) { + null_bitmap_data_[byte_offset] = bitset; + } + + length_ += std::distance(begin, end); + } + + // Vector append. Treat each zero byte as a nullzero. If valid_bytes is null + // assume all of length bits are valid. + void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length); + + void UnsafeAppendToBitmap(const std::vector& is_valid); + + // Set the next length bits to not null (i.e. valid). + void UnsafeSetNotNull(int64_t length); + + static Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer); + + static Status CheckCapacity(int64_t new_capacity, int64_t old_capacity) { + if (new_capacity < 0) { + return Status::Invalid("Resize capacity must be positive"); + } + if (new_capacity < old_capacity) { + return Status::Invalid("Resize cannot downsize"); + } + return Status::OK(); + } + + std::shared_ptr type_; + MemoryPool* pool_; + + // When null_bitmap are first appended to the builder, the null bitmap is allocated + std::shared_ptr null_bitmap_; + int64_t null_count_; + uint8_t* null_bitmap_data_; + + // Array length, so far. Also, the index of the next element to be added + int64_t length_; + int64_t capacity_; + + // Child value array builders. These are owned by this class + std::vector> children_; + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(ArrayBuilder); +}; + +} // namespace arrow diff --git a/cpp/src/arrow/builder-binary.cc b/cpp/src/arrow/columnar/builder_binary.cc similarity index 89% rename from cpp/src/arrow/builder-binary.cc rename to cpp/src/arrow/columnar/builder_binary.cc index c250837b4a3..36a78a4d920 100644 --- a/cpp/src/arrow/builder-binary.cc +++ b/cpp/src/arrow/columnar/builder_binary.cc @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +#include "arrow/columnar/builder_binary.h" + #include #include #include @@ -27,7 +29,6 @@ #include "arrow/array.h" #include "arrow/buffer.h" -#include "arrow/builder.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_traits.h" @@ -79,23 +80,6 @@ Status BinaryBuilder::AppendNextOffset() { return offsets_builder_.Append(static_cast(num_bytes)); } -Status BinaryBuilder::Append(const uint8_t* value, int32_t length) { - RETURN_NOT_OK(Reserve(1)); - RETURN_NOT_OK(AppendNextOffset()); - RETURN_NOT_OK(value_data_builder_.Append(value, length)); - - UnsafeAppendToBitmap(true); - return Status::OK(); -} - -Status BinaryBuilder::AppendNull() { - RETURN_NOT_OK(AppendNextOffset()); - RETURN_NOT_OK(Reserve(1)); - - UnsafeAppendToBitmap(false); - return Status::OK(); -} - Status BinaryBuilder::FinishInternal(std::shared_ptr* out) { // Write final offset (values length) RETURN_NOT_OK(AppendNextOffset()); @@ -292,24 +276,50 @@ util::string_view FixedSizeBinaryBuilder::GetView(int64_t i) const { } // ---------------------------------------------------------------------- -// Decimal128Builder +// ChunkedArray builders + +namespace internal { -Decimal128Builder::Decimal128Builder(const std::shared_ptr& type, - MemoryPool* pool) - : FixedSizeBinaryBuilder(type, pool) {} +ChunkedBinaryBuilder::ChunkedBinaryBuilder(int32_t max_chunk_size, MemoryPool* pool) + : max_chunk_size_(max_chunk_size), + chunk_data_size_(0), + builder_(new BinaryBuilder(pool)) {} -Status Decimal128Builder::Append(const Decimal128& value) { - RETURN_NOT_OK(FixedSizeBinaryBuilder::Reserve(1)); - return FixedSizeBinaryBuilder::Append(value.ToBytes()); +Status ChunkedBinaryBuilder::Finish(ArrayVector* out) { + if (builder_->length() > 0 || chunks_.size() == 0) { + std::shared_ptr chunk; + RETURN_NOT_OK(builder_->Finish(&chunk)); + chunks_.emplace_back(std::move(chunk)); + } + *out = std::move(chunks_); + return Status::OK(); } -Status Decimal128Builder::FinishInternal(std::shared_ptr* out) { - std::shared_ptr data; - RETURN_NOT_OK(byte_builder_.Finish(&data)); +Status ChunkedBinaryBuilder::NextChunk() { + std::shared_ptr chunk; + RETURN_NOT_OK(builder_->Finish(&chunk)); + chunks_.emplace_back(std::move(chunk)); - *out = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count_); + chunk_data_size_ = 0; + return Status::OK(); +} + +Status ChunkedStringBuilder::Finish(ArrayVector* out) { + ArrayVector temp; + RETURN_NOT_OK(ChunkedBinaryBuilder::Finish(&temp)); + out->clear(); + out->reserve(temp.size()); + + // XXX: Change data type to string/utf8 + for (const auto& chunk : temp) { + std::shared_ptr data = chunk->data(); + data->type = ::arrow::utf8(); + out->emplace_back(std::make_shared(data)); + } return Status::OK(); } +} // namespace internal + } // namespace arrow diff --git a/cpp/src/arrow/columnar/builder_binary.h b/cpp/src/arrow/columnar/builder_binary.h new file mode 100644 index 00000000000..5c65e001f8a --- /dev/null +++ b/cpp/src/arrow/columnar/builder_binary.h @@ -0,0 +1,298 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/columnar/builder_base.h" +#include "arrow/status.h" +#include "arrow/type_traits.h" +#include "arrow/util/macros.h" +#include "arrow/util/string_view.h" + +namespace arrow { + +constexpr int64_t kBinaryMemoryLimit = std::numeric_limits::max() - 1; + +// ---------------------------------------------------------------------- +// Binary and String + +/// \class BinaryBuilder +/// \brief Builder class for variable-length binary data +class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { + public: + explicit BinaryBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + BinaryBuilder(const std::shared_ptr& type, MemoryPool* pool); + + Status Append(const uint8_t* value, int32_t length) { + ARROW_RETURN_NOT_OK(Reserve(1)); + ARROW_RETURN_NOT_OK(AppendNextOffset()); + ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length)); + + UnsafeAppendToBitmap(true); + return Status::OK(); + } + + Status AppendNull() { + ARROW_RETURN_NOT_OK(AppendNextOffset()); + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(false); + return Status::OK(); + } + + Status Append(const char* value, int32_t length) { + return Append(reinterpret_cast(value), length); + } + + Status Append(util::string_view value) { + return Append(value.data(), static_cast(value.size())); + } + + /// \brief Append without checking capacity + /// + /// Offsets and data should have been presized using Reserve() and + /// ReserveData(), respectively. + void UnsafeAppend(const uint8_t* value, int32_t length) { + UnsafeAppendNextOffset(); + value_data_builder_.UnsafeAppend(value, length); + UnsafeAppendToBitmap(true); + } + + void UnsafeAppend(const char* value, int32_t length) { + UnsafeAppend(reinterpret_cast(value), length); + } + + void UnsafeAppend(const std::string& value) { + UnsafeAppend(value.c_str(), static_cast(value.size())); + } + + void UnsafeAppendNull() { + const int64_t num_bytes = value_data_builder_.length(); + offsets_builder_.UnsafeAppend(static_cast(num_bytes)); + UnsafeAppendToBitmap(false); + } + + void Reset() override; + Status Resize(int64_t capacity) override; + + /// \brief Ensures there is enough allocated capacity to append the indicated + /// number of bytes to the value data buffer without additional allocations + Status ReserveData(int64_t elements); + + Status FinishInternal(std::shared_ptr* out) override; + + /// \return size of values buffer so far + int64_t value_data_length() const { return value_data_builder_.length(); } + /// \return capacity of values buffer + int64_t value_data_capacity() const { return value_data_builder_.capacity(); } + + /// Temporary access to a value. + /// + /// This pointer becomes invalid on the next modifying operation. + const uint8_t* GetValue(int64_t i, int32_t* out_length) const; + + /// Temporary access to a value. + /// + /// This view becomes invalid on the next modifying operation. + util::string_view GetView(int64_t i) const; + + protected: + TypedBufferBuilder offsets_builder_; + TypedBufferBuilder value_data_builder_; + + Status AppendNextOffset(); + + void UnsafeAppendNextOffset() { + const int64_t num_bytes = value_data_builder_.length(); + offsets_builder_.UnsafeAppend(static_cast(num_bytes)); + } +}; + +/// \class StringBuilder +/// \brief Builder class for UTF8 strings +class ARROW_EXPORT StringBuilder : public BinaryBuilder { + public: + using BinaryBuilder::BinaryBuilder; + explicit StringBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + using BinaryBuilder::Append; + using BinaryBuilder::Reset; + using BinaryBuilder::UnsafeAppend; + + /// \brief Append a sequence of strings in one shot. + /// + /// \param[in] values a vector of strings + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const std::vector& values, + const uint8_t* valid_bytes = NULLPTR); + + /// \brief Append a sequence of nul-terminated strings in one shot. + /// If one of the values is NULL, it is processed as a null + /// value even if the corresponding valid_bytes entry is 1. + /// + /// \param[in] values a contiguous C array of nul-terminated char * + /// \param[in] length the number of values to append + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const char** values, int64_t length, + const uint8_t* valid_bytes = NULLPTR); +}; + +// ---------------------------------------------------------------------- +// FixedSizeBinaryBuilder + +class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { + public: + FixedSizeBinaryBuilder(const std::shared_ptr& type, + MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + Status Append(const uint8_t* value) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(true); + return byte_builder_.Append(value, byte_width_); + } + + Status Append(const char* value) { + return Append(reinterpret_cast(value)); + } + + Status Append(const util::string_view& view) { +#ifndef NDEBUG + CheckValueSize(static_cast(view.size())); +#endif + return Append(reinterpret_cast(view.data())); + } + + Status Append(const std::string& s) { +#ifndef NDEBUG + CheckValueSize(static_cast(s.size())); +#endif + return Append(reinterpret_cast(s.data())); + } + + template + Status Append(const std::array& value) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(true); + return byte_builder_.Append(value); + } + + Status AppendValues(const uint8_t* data, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + Status AppendNull(); + + void Reset() override; + Status Resize(int64_t capacity) override; + Status FinishInternal(std::shared_ptr* out) override; + + /// \return size of values buffer so far + int64_t value_data_length() const { return byte_builder_.length(); } + + int32_t byte_width() const { return byte_width_; } + + /// Temporary access to a value. + /// + /// This pointer becomes invalid on the next modifying operation. + const uint8_t* GetValue(int64_t i) const; + + /// Temporary access to a value. + /// + /// This view becomes invalid on the next modifying operation. + util::string_view GetView(int64_t i) const; + + protected: + int32_t byte_width_; + BufferBuilder byte_builder_; + +#ifndef NDEBUG + void CheckValueSize(int64_t size); +#endif +}; + +// ---------------------------------------------------------------------- +// Chunked builders: build a sequence of BinaryArray or StringArray that are +// limited to a particular size (to the upper limit of 2GB) + +namespace internal { + +class ARROW_EXPORT ChunkedBinaryBuilder { + public: + ChunkedBinaryBuilder(int32_t max_chunk_size, + MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + virtual ~ChunkedBinaryBuilder() = default; + + Status Append(const uint8_t* value, int32_t length) { + if (ARROW_PREDICT_FALSE(length + chunk_data_size_ > max_chunk_size_)) { + // Move onto next chunk, unless the builder length is currently 0, which + // means that max_chunk_size_ is less than the item length + if (builder_->length() > 0) { + ARROW_RETURN_NOT_OK(NextChunk()); + } + // else fall through + } + + chunk_data_size_ += length; + return builder_->Append(value, length); + } + + Status Append(const util::string_view& value) { + return Append(reinterpret_cast(value.data()), + static_cast(value.size())); + } + + Status AppendNull() { + if (ARROW_PREDICT_FALSE(builder_->length() == std::numeric_limits::max())) { + ARROW_RETURN_NOT_OK(NextChunk()); + } + return builder_->AppendNull(); + } + + virtual Status Finish(ArrayVector* out); + + protected: + ChunkedBinaryBuilder(const std::shared_ptr& type, int32_t max_chunk_size, + MemoryPool* pool); + + Status NextChunk(); + + int32_t max_chunk_size_; + int32_t chunk_data_size_; + + std::unique_ptr builder_; + std::vector> chunks_; +}; + +class ARROW_EXPORT ChunkedStringBuilder : public ChunkedBinaryBuilder { + public: + using ChunkedBinaryBuilder::ChunkedBinaryBuilder; + + Status Finish(ArrayVector* out) override; +}; + +} // namespace internal + +} // namespace arrow diff --git a/cpp/src/arrow/columnar/builder_decimal.cc b/cpp/src/arrow/columnar/builder_decimal.cc new file mode 100644 index 00000000000..b8b23a09638 --- /dev/null +++ b/cpp/src/arrow/columnar/builder_decimal.cc @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/columnar/builder_decimal.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/decimal.h" +#include "arrow/util/logging.h" + +namespace arrow { + +// ---------------------------------------------------------------------- +// Decimal128Builder + +Decimal128Builder::Decimal128Builder(const std::shared_ptr& type, + MemoryPool* pool) + : FixedSizeBinaryBuilder(type, pool) {} + +Status Decimal128Builder::Append(const Decimal128& value) { + RETURN_NOT_OK(FixedSizeBinaryBuilder::Reserve(1)); + return FixedSizeBinaryBuilder::Append(value.ToBytes()); +} + +Status Decimal128Builder::FinishInternal(std::shared_ptr* out) { + std::shared_ptr data; + RETURN_NOT_OK(byte_builder_.Finish(&data)); + + *out = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count_); + + return Status::OK(); +} + +} // namespace arrow diff --git a/cpp/src/arrow/columnar/builder_decimal.h b/cpp/src/arrow/columnar/builder_decimal.h new file mode 100644 index 00000000000..6c9d31eea28 --- /dev/null +++ b/cpp/src/arrow/columnar/builder_decimal.h @@ -0,0 +1,45 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/columnar/builder_base.h" +#include "arrow/columnar/builder_binary.h" + +namespace arrow { + +class Decimal128; + +class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder { + public: + explicit Decimal128Builder(const std::shared_ptr& type, + MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + using FixedSizeBinaryBuilder::Append; + using FixedSizeBinaryBuilder::AppendValues; + using FixedSizeBinaryBuilder::Reset; + + Status Append(const Decimal128& val); + + Status FinishInternal(std::shared_ptr* out) override; +}; + +using DecimalBuilder = Decimal128Builder; + +} // namespace arrow diff --git a/cpp/src/arrow/builder-dict.cc b/cpp/src/arrow/columnar/builder_dict.cc similarity index 99% rename from cpp/src/arrow/builder-dict.cc rename to cpp/src/arrow/columnar/builder_dict.cc index b021c3a9d37..f7eff6deb0c 100644 --- a/cpp/src/arrow/builder-dict.cc +++ b/cpp/src/arrow/columnar/builder_dict.cc @@ -15,13 +15,15 @@ // specific language governing permissions and limitations // under the License. +#include "arrow/columnar/builder_dict.h" + +#include #include #include #include #include "arrow/array.h" #include "arrow/buffer.h" -#include "arrow/builder.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_traits.h" diff --git a/cpp/src/arrow/columnar/builder_dict.h b/cpp/src/arrow/columnar/builder_dict.h new file mode 100644 index 00000000000..23a7a09de4e --- /dev/null +++ b/cpp/src/arrow/columnar/builder_dict.h @@ -0,0 +1,168 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/columnar/builder_adaptive.h" // IWYU pragma: export +#include "arrow/columnar/builder_base.h" // IWYU pragma: export + +namespace arrow { + + +// ---------------------------------------------------------------------- +// Dictionary builder + +namespace internal { + +template +struct DictionaryScalar { + using type = typename T::c_type; +}; + +template <> +struct DictionaryScalar { + using type = util::string_view; +}; + +template <> +struct DictionaryScalar { + using type = util::string_view; +}; + +template <> +struct DictionaryScalar { + using type = util::string_view; +}; + +} // namespace internal + +/// \brief Array builder for created encoded DictionaryArray from dense array +/// +/// Unlike other builders, dictionary builder does not completely reset the state +/// on Finish calls. The arrays built after the initial Finish call will reuse +/// the previously created encoding and build a delta dictionary when new terms +/// occur. +/// +/// data +template +class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder { + public: + using Scalar = typename internal::DictionaryScalar::type; + + // WARNING: the type given below is the value type, not the DictionaryType. + // The DictionaryType is instantiated on the Finish() call. + DictionaryBuilder(const std::shared_ptr& type, MemoryPool* pool); + + template + explicit DictionaryBuilder( + typename std::enable_if::is_parameter_free, MemoryPool*>::type pool) + : DictionaryBuilder(TypeTraits::type_singleton(), pool) {} + + ~DictionaryBuilder() override; + + /// \brief Append a scalar value + Status Append(const Scalar& value); + + /// \brief Append a fixed-width string (only for FixedSizeBinaryType) + template + Status Append(typename std::enable_if::value, + const uint8_t*>::type value) { + return Append(util::string_view(reinterpret_cast(value), byte_width_)); + } + + /// \brief Append a fixed-width string (only for FixedSizeBinaryType) + template + Status Append(typename std::enable_if::value, + const char*>::type value) { + return Append(util::string_view(value, byte_width_)); + } + + /// \brief Append a scalar null value + Status AppendNull(); + + /// \brief Append a whole dense array to the builder + Status AppendArray(const Array& array); + + void Reset() override; + Status Resize(int64_t capacity) override; + Status FinishInternal(std::shared_ptr* out) override; + + /// is the dictionary builder in the delta building mode + bool is_building_delta() { return delta_offset_ > 0; } + + protected: + class MemoTableImpl; + std::unique_ptr memo_table_; + + int32_t delta_offset_; + // Only used for FixedSizeBinaryType + int32_t byte_width_; + + AdaptiveIntBuilder values_builder_; +}; + +template <> +class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder { + public: + DictionaryBuilder(const std::shared_ptr& type, MemoryPool* pool); + explicit DictionaryBuilder(MemoryPool* pool); + + /// \brief Append a scalar null value + Status AppendNull(); + + /// \brief Append a whole dense array to the builder + Status AppendArray(const Array& array); + + Status Resize(int64_t capacity) override; + Status FinishInternal(std::shared_ptr* out) override; + + protected: + AdaptiveIntBuilder values_builder_; +}; + +class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder { + public: + using DictionaryBuilder::Append; + using DictionaryBuilder::DictionaryBuilder; + + Status Append(const uint8_t* value, int32_t length) { + return Append(reinterpret_cast(value), length); + } + + Status Append(const char* value, int32_t length) { + return Append(util::string_view(value, length)); + } +}; + +/// \brief Dictionary array builder with convenience methods for strings +class ARROW_EXPORT StringDictionaryBuilder : public DictionaryBuilder { + public: + using DictionaryBuilder::Append; + using DictionaryBuilder::DictionaryBuilder; + + Status Append(const uint8_t* value, int32_t length) { + return Append(reinterpret_cast(value), length); + } + + Status Append(const char* value, int32_t length) { + return Append(util::string_view(value, length)); + } +}; + +} // namespace arrow diff --git a/cpp/src/arrow/columnar/builder_nested.cc b/cpp/src/arrow/columnar/builder_nested.cc new file mode 100644 index 00000000000..af48a1f5252 --- /dev/null +++ b/cpp/src/arrow/columnar/builder_nested.cc @@ -0,0 +1,156 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/columnar/builder_nested.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/int-util.h" +#include "arrow/util/logging.h" + +namespace arrow { + +// ---------------------------------------------------------------------- +// ListBuilder + +ListBuilder::ListBuilder(MemoryPool* pool, + std::shared_ptr const& value_builder, + const std::shared_ptr& type) + : ArrayBuilder(type ? type + : std::static_pointer_cast( + std::make_shared(value_builder->type())), + pool), + offsets_builder_(pool), + value_builder_(value_builder) {} + +Status ListBuilder::AppendValues(const int32_t* offsets, int64_t length, + const uint8_t* valid_bytes) { + RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(valid_bytes, length); + offsets_builder_.UnsafeAppend(offsets, length); + return Status::OK(); +} + +Status ListBuilder::AppendNextOffset() { + int64_t num_values = value_builder_->length(); + if (ARROW_PREDICT_FALSE(num_values > kListMaximumElements)) { + std::stringstream ss; + ss << "ListArray cannot contain more then INT32_MAX - 1 child elements," + << " have " << num_values; + return Status::CapacityError(ss.str()); + } + return offsets_builder_.Append(static_cast(num_values)); +} + +Status ListBuilder::Append(bool is_valid) { + RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(is_valid); + return AppendNextOffset(); +} + +Status ListBuilder::Resize(int64_t capacity) { + DCHECK_LE(capacity, kListMaximumElements); + RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); + + // one more then requested for offsets + RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t))); + return ArrayBuilder::Resize(capacity); +} + +Status ListBuilder::FinishInternal(std::shared_ptr* out) { + RETURN_NOT_OK(AppendNextOffset()); + + // Offset padding zeroed by BufferBuilder + std::shared_ptr offsets; + RETURN_NOT_OK(offsets_builder_.Finish(&offsets)); + + std::shared_ptr items; + if (values_) { + items = values_->data(); + } else { + if (value_builder_->length() == 0) { + // Try to make sure we get a non-null values buffer (ARROW-2744) + RETURN_NOT_OK(value_builder_->Resize(0)); + } + RETURN_NOT_OK(value_builder_->FinishInternal(&items)); + } + + *out = ArrayData::Make(type_, length_, {null_bitmap_, offsets}, null_count_); + (*out)->child_data.emplace_back(std::move(items)); + Reset(); + return Status::OK(); +} + +void ListBuilder::Reset() { + ArrayBuilder::Reset(); + values_.reset(); + offsets_builder_.Reset(); + value_builder_->Reset(); +} + +ArrayBuilder* ListBuilder::value_builder() const { + DCHECK(!values_) << "Using value builder is pointless when values_ is set"; + return value_builder_.get(); +} + +// ---------------------------------------------------------------------- +// Struct + +StructBuilder::StructBuilder(const std::shared_ptr& type, MemoryPool* pool, + std::vector>&& field_builders) + : ArrayBuilder(type, pool) { + children_ = std::move(field_builders); +} + +void StructBuilder::Reset() { + ArrayBuilder::Reset(); + for (const auto& field_builder : children_) { + field_builder->Reset(); + } +} + +Status StructBuilder::FinishInternal(std::shared_ptr* out) { + RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get())); + *out = ArrayData::Make(type_, length_, {null_bitmap_}, null_count_); + + (*out)->child_data.resize(children_.size()); + for (size_t i = 0; i < children_.size(); ++i) { + if (length_ == 0) { + // Try to make sure the child buffers are initialized + RETURN_NOT_OK(children_[i]->Resize(0)); + } + RETURN_NOT_OK(children_[i]->FinishInternal(&(*out)->child_data[i])); + } + + null_bitmap_ = nullptr; + capacity_ = length_ = null_count_ = 0; + return Status::OK(); +} + +} // namespace arrow diff --git a/cpp/src/arrow/columnar/builder_nested.h b/cpp/src/arrow/columnar/builder_nested.h new file mode 100644 index 00000000000..0e2a2ff92a7 --- /dev/null +++ b/cpp/src/arrow/columnar/builder_nested.h @@ -0,0 +1,125 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/columnar/builder_base.h" + +namespace arrow { + + +// ---------------------------------------------------------------------- +// List builder + +/// \class ListBuilder +/// \brief Builder class for variable-length list array value types +/// +/// To use this class, you must append values to the child array builder and use +/// the Append function to delimit each distinct list value (once the values +/// have been appended to the child array) or use the bulk API to append +/// a sequence of offests and null values. +/// +/// A note on types. Per arrow/type.h all types in the c++ implementation are +/// logical so even though this class always builds list array, this can +/// represent multiple different logical types. If no logical type is provided +/// at construction time, the class defaults to List where t is taken from the +/// value_builder/values that the object is constructed with. +class ARROW_EXPORT ListBuilder : public ArrayBuilder { + public: + /// Use this constructor to incrementally build the value array along with offsets and + /// null bitmap. + ListBuilder(MemoryPool* pool, std::shared_ptr const& value_builder, + const std::shared_ptr& type = NULLPTR); + + Status Resize(int64_t capacity) override; + void Reset() override; + Status FinishInternal(std::shared_ptr* out) override; + + /// \brief Vector append + /// + /// If passed, valid_bytes is of equal length to values, and any zero byte + /// will be considered as a null for that slot + Status AppendValues(const int32_t* offsets, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + /// \brief Start a new variable-length list slot + /// + /// This function should be called before beginning to append elements to the + /// value builder + Status Append(bool is_valid = true); + + Status AppendNull() { return Append(false); } + + ArrayBuilder* value_builder() const; + + protected: + TypedBufferBuilder offsets_builder_; + std::shared_ptr value_builder_; + std::shared_ptr values_; + + Status AppendNextOffset(); +}; + +// ---------------------------------------------------------------------- +// Struct + +// --------------------------------------------------------------------------------- +// StructArray builder +/// Append, Resize and Reserve methods are acting on StructBuilder. +/// Please make sure all these methods of all child-builders' are consistently +/// called to maintain data-structure consistency. +class ARROW_EXPORT StructBuilder : public ArrayBuilder { + public: + StructBuilder(const std::shared_ptr& type, MemoryPool* pool, + std::vector>&& field_builders); + + Status FinishInternal(std::shared_ptr* out) override; + + /// Null bitmap is of equal length to every child field, and any zero byte + /// will be considered as a null for that field, but users must using app- + /// end methods or advance methods of the child builders' independently to + /// insert data. + Status AppendValues(int64_t length, const uint8_t* valid_bytes) { + ARROW_RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(valid_bytes, length); + return Status::OK(); + } + + /// Append an element to the Struct. All child-builders' Append method must + /// be called independently to maintain data-structure consistency. + Status Append(bool is_valid = true) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(is_valid); + return Status::OK(); + } + + Status AppendNull() { return Append(false); } + + void Reset() override; + + ArrayBuilder* field_builder(int i) const { return field_builders_[i].get(); } + + int num_fields() const { return static_cast(field_builders_.size()); } + + protected: + std::vector> field_builders_; +}; + +} // namespace arrow diff --git a/cpp/src/arrow/columnar/builder_primitive.cc b/cpp/src/arrow/columnar/builder_primitive.cc new file mode 100644 index 00000000000..ded7af0b091 --- /dev/null +++ b/cpp/src/arrow/columnar/builder_primitive.cc @@ -0,0 +1,272 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/columnar/builder_primitive.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/int-util.h" +#include "arrow/util/logging.h" + +namespace arrow { + +// ---------------------------------------------------------------------- +// Null builder + +Status NullBuilder::FinishInternal(std::shared_ptr* out) { + *out = ArrayData::Make(null(), length_, {nullptr}, length_); + length_ = null_count_ = 0; + return Status::OK(); +} + +// ---------------------------------------------------------------------- + +template +void PrimitiveBuilder::Reset() { + data_.reset(); + raw_data_ = nullptr; +} + +template +Status PrimitiveBuilder::Resize(int64_t capacity) { + RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); + capacity = std::max(capacity, kMinBuilderCapacity); + + int64_t nbytes = TypeTraits::bytes_required(capacity); + if (capacity_ == 0) { + RETURN_NOT_OK(AllocateResizableBuffer(pool_, nbytes, &data_)); + } else { + RETURN_NOT_OK(data_->Resize(nbytes)); + } + + raw_data_ = reinterpret_cast(data_->mutable_data()); + return ArrayBuilder::Resize(capacity); +} + +template +Status PrimitiveBuilder::AppendValues(const value_type* values, int64_t length, + const uint8_t* valid_bytes) { + RETURN_NOT_OK(Reserve(length)); + + if (length > 0) { + std::memcpy(raw_data_ + length_, values, + static_cast(TypeTraits::bytes_required(length))); + } + + // length_ is update by these + ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length); + return Status::OK(); +} + +template +Status PrimitiveBuilder::AppendValues(const value_type* values, int64_t length, + const std::vector& is_valid) { + RETURN_NOT_OK(Reserve(length)); + DCHECK_EQ(length, static_cast(is_valid.size())); + + if (length > 0) { + std::memcpy(raw_data_ + length_, values, + static_cast(TypeTraits::bytes_required(length))); + } + + // length_ is update by these + ArrayBuilder::UnsafeAppendToBitmap(is_valid); + return Status::OK(); +} + +template +Status PrimitiveBuilder::AppendValues(const std::vector& values, + const std::vector& is_valid) { + return AppendValues(values.data(), static_cast(values.size()), is_valid); +} + +template +Status PrimitiveBuilder::AppendValues(const std::vector& values) { + return AppendValues(values.data(), static_cast(values.size())); +} + +template +Status PrimitiveBuilder::FinishInternal(std::shared_ptr* out) { + RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get())); + RETURN_NOT_OK(TrimBuffer(TypeTraits::bytes_required(length_), data_.get())); + + *out = ArrayData::Make(type_, length_, {null_bitmap_, data_}, null_count_); + + data_ = null_bitmap_ = nullptr; + capacity_ = length_ = null_count_ = 0; + + return Status::OK(); +} + +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; + +BooleanBuilder::BooleanBuilder(MemoryPool* pool) + : ArrayBuilder(boolean(), pool), data_(nullptr), raw_data_(nullptr) {} + +BooleanBuilder::BooleanBuilder(const std::shared_ptr& type, MemoryPool* pool) + : BooleanBuilder(pool) { + DCHECK_EQ(Type::BOOL, type->id()); +} + +void BooleanBuilder::Reset() { + ArrayBuilder::Reset(); + data_.reset(); + raw_data_ = nullptr; +} + +Status BooleanBuilder::Resize(int64_t capacity) { + RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); + capacity = std::max(capacity, kMinBuilderCapacity); + + const int64_t new_bitmap_size = BitUtil::BytesForBits(capacity); + if (capacity_ == 0) { + RETURN_NOT_OK(AllocateResizableBuffer(pool_, new_bitmap_size, &data_)); + raw_data_ = reinterpret_cast(data_->mutable_data()); + + // We zero the memory for booleans to keep things simple; for some reason if + // we do not, even though we may write every bit (through in-place | or &), + // valgrind will still show a warning. If we do not zero the bytes here, we + // will have to be careful to zero them in AppendNull and AppendNulls. Also, + // zeroing the bits results in deterministic bits when each byte may have a + // mix of nulls and not nulls. + // + // We only zero up to new_bitmap_size because the padding was zeroed by + // AllocateResizableBuffer + memset(raw_data_, 0, static_cast(new_bitmap_size)); + } else { + const int64_t old_bitmap_capacity = data_->capacity(); + RETURN_NOT_OK(data_->Resize(new_bitmap_size)); + const int64_t new_bitmap_capacity = data_->capacity(); + raw_data_ = reinterpret_cast(data_->mutable_data()); + + // See comment above about why we zero memory for booleans + memset(raw_data_ + old_bitmap_capacity, 0, + static_cast(new_bitmap_capacity - old_bitmap_capacity)); + } + + return ArrayBuilder::Resize(capacity); +} + +Status BooleanBuilder::FinishInternal(std::shared_ptr* out) { + int64_t bit_offset = length_ % 8; + if (bit_offset > 0) { + // Adjust last byte + data_->mutable_data()[length_ / 8] &= BitUtil::kPrecedingBitmask[bit_offset]; + } + + RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get())); + RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), data_.get())); + + *out = ArrayData::Make(boolean(), length_, {null_bitmap_, data_}, null_count_); + + data_ = null_bitmap_ = nullptr; + capacity_ = length_ = null_count_ = 0; + return Status::OK(); +} + +Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length, + const uint8_t* valid_bytes) { + RETURN_NOT_OK(Reserve(length)); + + int64_t i = 0; + internal::GenerateBitsUnrolled(raw_data_, length_, length, + [values, &i]() -> bool { return values[i++] != 0; }); + + // this updates length_ + ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length); + return Status::OK(); +} + +Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length, + const std::vector& is_valid) { + RETURN_NOT_OK(Reserve(length)); + DCHECK_EQ(length, static_cast(is_valid.size())); + + int64_t i = 0; + internal::GenerateBitsUnrolled(raw_data_, length_, length, + [values, &i]() -> bool { return values[i++]; }); + + // this updates length_ + ArrayBuilder::UnsafeAppendToBitmap(is_valid); + return Status::OK(); +} + +Status BooleanBuilder::AppendValues(const std::vector& values, + const std::vector& is_valid) { + return AppendValues(values.data(), static_cast(values.size()), is_valid); +} + +Status BooleanBuilder::AppendValues(const std::vector& values) { + return AppendValues(values.data(), static_cast(values.size())); +} + +Status BooleanBuilder::AppendValues(const std::vector& values, + const std::vector& is_valid) { + const int64_t length = static_cast(values.size()); + RETURN_NOT_OK(Reserve(length)); + DCHECK_EQ(length, static_cast(is_valid.size())); + + int64_t i = 0; + internal::GenerateBitsUnrolled(raw_data_, length_, length, + [&values, &i]() -> bool { return values[i++]; }); + + // this updates length_ + ArrayBuilder::UnsafeAppendToBitmap(is_valid); + return Status::OK(); +} + +Status BooleanBuilder::AppendValues(const std::vector& values) { + const int64_t length = static_cast(values.size()); + RETURN_NOT_OK(Reserve(length)); + + int64_t i = 0; + internal::GenerateBitsUnrolled(raw_data_, length_, length, + [&values, &i]() -> bool { return values[i++]; }); + + // this updates length_ + ArrayBuilder::UnsafeSetNotNull(length); + return Status::OK(); +} + +} // namespace arrow diff --git a/cpp/src/arrow/columnar/builder_primitive.h b/cpp/src/arrow/columnar/builder_primitive.h new file mode 100644 index 00000000000..5a0c111a013 --- /dev/null +++ b/cpp/src/arrow/columnar/builder_primitive.h @@ -0,0 +1,403 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/columnar/builder_base.h" +#include "arrow/type.h" + +namespace arrow { + +class ARROW_EXPORT NullBuilder : public ArrayBuilder { + public: + explicit NullBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) + : ArrayBuilder(null(), pool) {} + + Status AppendNull() { + ++null_count_; + ++length_; + return Status::OK(); + } + + Status Append(std::nullptr_t value) { return AppendNull(); } + + Status FinishInternal(std::shared_ptr* out) override; +}; + + +template +class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { + public: + using value_type = typename Type::c_type; + + explicit PrimitiveBuilder(const std::shared_ptr& type, MemoryPool* pool) + : ArrayBuilder(type, pool), data_(NULLPTR), raw_data_(NULLPTR) {} + + using ArrayBuilder::Advance; + + /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory + /// The memory at the corresponding data slot is set to 0 to prevent uninitialized + /// memory access + Status AppendNulls(const uint8_t* valid_bytes, int64_t length) { + ARROW_RETURN_NOT_OK(Reserve(length)); + memset(raw_data_ + length_, 0, + static_cast(TypeTraits::bytes_required(length))); + UnsafeAppendToBitmap(valid_bytes, length); + return Status::OK(); + } + + /// \brief Append a single null element + Status AppendNull() { + ARROW_RETURN_NOT_OK(Reserve(1)); + memset(raw_data_ + length_, 0, sizeof(value_type)); + UnsafeAppendToBitmap(false); + return Status::OK(); + } + + value_type GetValue(int64_t index) const { + return reinterpret_cast(data_->data())[index]; + } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const value_type* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0). Equal in length to values + /// \return Status + Status AppendValues(const value_type* values, int64_t length, + const std::vector& is_valid); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a std::vector of values + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0). Equal in length to values + /// \return Status + Status AppendValues(const std::vector& values, + const std::vector& is_valid); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a std::vector of values + /// \return Status + Status AppendValues(const std::vector& values); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values_begin InputIterator to the beginning of the values + /// \param[in] values_end InputIterator pointing to the end of the values + /// \return Status + + template + Status AppendValues(ValuesIter values_begin, ValuesIter values_end) { + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + + std::copy(values_begin, values_end, raw_data_ + length_); + + // this updates the length_ + UnsafeSetNotNull(length); + return Status::OK(); + } + + /// \brief Append a sequence of elements in one shot, with a specified nullmap + /// \param[in] values_begin InputIterator to the beginning of the values + /// \param[in] values_end InputIterator pointing to the end of the values + /// \param[in] valid_begin InputIterator with elements indication valid(1) + /// or null(0) values. + /// \return Status + template + typename std::enable_if::value, Status>::type AppendValues( + ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { + static_assert(!internal::is_null_pointer::value, + "Don't pass a NULLPTR directly as valid_begin, use the 2-argument " + "version instead"); + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + + std::copy(values_begin, values_end, raw_data_ + length_); + + // this updates the length_ + UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length)); + return Status::OK(); + } + + // Same as above, with a pointer type ValidIter + template + typename std::enable_if::value, Status>::type AppendValues( + ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + + std::copy(values_begin, values_end, raw_data_ + length_); + + // this updates the length_ + if (valid_begin == NULLPTR) { + UnsafeSetNotNull(length); + } else { + UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length)); + } + + return Status::OK(); + } + + Status FinishInternal(std::shared_ptr* out) override; + void Reset() override; + + Status Resize(int64_t capacity) override; + + protected: + std::shared_ptr data_; + value_type* raw_data_; +}; + +/// Base class for all Builders that emit an Array of a scalar numerical type. +template +class ARROW_EXPORT NumericBuilder : public PrimitiveBuilder { + public: + using typename PrimitiveBuilder::value_type; + using PrimitiveBuilder::PrimitiveBuilder; + + template + explicit NumericBuilder( + typename std::enable_if::is_parameter_free, MemoryPool*>::type pool + ARROW_MEMORY_POOL_DEFAULT) + : PrimitiveBuilder(TypeTraits::type_singleton(), pool) {} + + using ArrayBuilder::UnsafeAppendNull; + using PrimitiveBuilder::AppendValues; + using PrimitiveBuilder::Resize; + using PrimitiveBuilder::Reserve; + + /// Append a single scalar and increase the size if necessary. + Status Append(const value_type val) { + ARROW_RETURN_NOT_OK(ArrayBuilder::Reserve(1)); + UnsafeAppend(val); + return Status::OK(); + } + + /// Append a single scalar under the assumption that the underlying Buffer is + /// large enough. + /// + /// This method does not capacity-check; make sure to call Reserve + /// beforehand. + void UnsafeAppend(const value_type val) { + BitUtil::SetBit(null_bitmap_data_, length_); + raw_data_[length_++] = val; + } + + protected: + using PrimitiveBuilder::length_; + using PrimitiveBuilder::null_bitmap_data_; + using PrimitiveBuilder::raw_data_; +}; + +// Builders + +using UInt8Builder = NumericBuilder; +using UInt16Builder = NumericBuilder; +using UInt32Builder = NumericBuilder; +using UInt64Builder = NumericBuilder; + +using Int8Builder = NumericBuilder; +using Int16Builder = NumericBuilder; +using Int32Builder = NumericBuilder; +using Int64Builder = NumericBuilder; +using TimestampBuilder = NumericBuilder; +using Time32Builder = NumericBuilder; +using Time64Builder = NumericBuilder; +using Date32Builder = NumericBuilder; +using Date64Builder = NumericBuilder; + +using HalfFloatBuilder = NumericBuilder; +using FloatBuilder = NumericBuilder; +using DoubleBuilder = NumericBuilder; + + +class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { + public: + using value_type = bool; + explicit BooleanBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT); + + explicit BooleanBuilder(const std::shared_ptr& type, MemoryPool* pool); + + using ArrayBuilder::Advance; + using ArrayBuilder::UnsafeAppendNull; + + /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory + Status AppendNulls(const uint8_t* valid_bytes, int64_t length) { + ARROW_RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(valid_bytes, length); + + return Status::OK(); + } + + Status AppendNull() { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(false); + + return Status::OK(); + } + + /// Scalar append + Status Append(const bool val) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppend(val); + return Status::OK(); + } + + Status Append(const uint8_t val) { return Append(val != 0); } + + /// Scalar append, without checking for capacity + void UnsafeAppend(const bool val) { + BitUtil::SetBit(null_bitmap_data_, length_); + if (val) { + BitUtil::SetBit(raw_data_, length_); + } else { + BitUtil::ClearBit(raw_data_, length_); + } + ++length_; + } + + void UnsafeAppend(const uint8_t val) { UnsafeAppend(val != 0); } + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous array of bytes (non-zero is 1) + /// \param[in] length the number of values to append + /// \param[in] valid_bytes an optional sequence of bytes where non-zero + /// indicates a valid (non-null) value + /// \return Status + Status AppendValues(const uint8_t* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0). Equal in length to values + /// \return Status + Status AppendValues(const uint8_t* values, int64_t length, + const std::vector& is_valid); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a std::vector of bytes + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0). Equal in length to values + /// \return Status + Status AppendValues(const std::vector& values, + const std::vector& is_valid); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values a std::vector of bytes + /// \return Status + Status AppendValues(const std::vector& values); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values an std::vector indicating true (1) or false + /// \param[in] is_valid an std::vector indicating valid (1) or null + /// (0). Equal in length to values + /// \return Status + Status AppendValues(const std::vector& values, const std::vector& is_valid); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values an std::vector indicating true (1) or false + /// \return Status + Status AppendValues(const std::vector& values); + + /// \brief Append a sequence of elements in one shot + /// \param[in] values_begin InputIterator to the beginning of the values + /// \param[in] values_end InputIterator pointing to the end of the values + /// or null(0) values + /// \return Status + template + Status AppendValues(ValuesIter values_begin, ValuesIter values_end) { + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + auto iter = values_begin; + internal::GenerateBitsUnrolled(raw_data_, length_, length, + [&iter]() -> bool { return *(iter++); }); + + // this updates length_ + UnsafeSetNotNull(length); + return Status::OK(); + } + + /// \brief Append a sequence of elements in one shot, with a specified nullmap + /// \param[in] values_begin InputIterator to the beginning of the values + /// \param[in] values_end InputIterator pointing to the end of the values + /// \param[in] valid_begin InputIterator with elements indication valid(1) + /// or null(0) values + /// \return Status + template + typename std::enable_if::value, Status>::type AppendValues( + ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { + static_assert(!internal::is_null_pointer::value, + "Don't pass a NULLPTR directly as valid_begin, use the 2-argument " + "version instead"); + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + + auto iter = values_begin; + internal::GenerateBitsUnrolled(raw_data_, length_, length, + [&iter]() -> bool { return *(iter++); }); + + // this updates length_ + ArrayBuilder::UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length)); + return Status::OK(); + } + + // Same as above, for a pointer type ValidIter + template + typename std::enable_if::value, Status>::type AppendValues( + ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) { + int64_t length = static_cast(std::distance(values_begin, values_end)); + ARROW_RETURN_NOT_OK(Reserve(length)); + + auto iter = values_begin; + internal::GenerateBitsUnrolled(raw_data_, length_, length, + [&iter]() -> bool { return *(iter++); }); + + // this updates the length_ + if (valid_begin == NULLPTR) { + UnsafeSetNotNull(length); + } else { + UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length)); + } + + return Status::OK(); + } + + Status FinishInternal(std::shared_ptr* out) override; + void Reset() override; + Status Resize(int64_t capacity) override; + + protected: + std::shared_ptr data_; + uint8_t* raw_data_; +}; + +} // namespace arrow diff --git a/cpp/src/arrow/csv/column-builder.h b/cpp/src/arrow/csv/column-builder.h index b21cff76be5..054a642295c 100644 --- a/cpp/src/arrow/csv/column-builder.h +++ b/cpp/src/arrow/csv/column-builder.h @@ -18,22 +18,29 @@ #ifndef ARROW_CSV_COLUMN_BUILDER_H #define ARROW_CSV_COLUMN_BUILDER_H +#include #include -#include #include "arrow/array.h" -#include "arrow/csv/converter.h" -#include "arrow/csv/options.h" -#include "arrow/memory_pool.h" #include "arrow/status.h" -#include "arrow/table.h" -#include "arrow/type.h" -#include "arrow/util/task-group.h" #include "arrow/util/visibility.h" namespace arrow { + +class ChunkedArray; +class DataType; + +namespace internal { + +class TaskGroup; + +} // namespace internal + namespace csv { +class BlockParser; +struct ConvertOptions; + class ARROW_EXPORT ColumnBuilder { public: virtual ~ColumnBuilder() = default; diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc index 7d8bff870ba..8a249a68c07 100644 --- a/cpp/src/arrow/csv/converter.cc +++ b/cpp/src/arrow/csv/converter.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include "arrow/builder.h" #include "arrow/csv/parser.h" diff --git a/cpp/src/arrow/csv/parser.h b/cpp/src/arrow/csv/parser.h index 8a515744ee2..fdddc37a2c0 100644 --- a/cpp/src/arrow/csv/parser.h +++ b/cpp/src/arrow/csv/parser.h @@ -18,6 +18,7 @@ #ifndef ARROW_CSV_PARSER_H #define ARROW_CSV_PARSER_H +#include #include #include #include diff --git a/cpp/src/arrow/csv/reader.cc b/cpp/src/arrow/csv/reader.cc index 8cf74d6b999..b2a6b7b430a 100644 --- a/cpp/src/arrow/csv/reader.cc +++ b/cpp/src/arrow/csv/reader.cc @@ -23,6 +23,8 @@ #include #include #include +#include +#include #include #include "arrow/buffer.h" diff --git a/cpp/src/arrow/io/buffered.cc b/cpp/src/arrow/io/buffered.cc index 0c04ac21c20..f3eae39c8e6 100644 --- a/cpp/src/arrow/io/buffered.cc +++ b/cpp/src/arrow/io/buffered.cc @@ -21,10 +21,10 @@ #include #include #include -#include #include #include "arrow/buffer.h" +#include "arrow/memory_pool.h" #include "arrow/status.h" #include "arrow/util/logging.h" #include "arrow/util/string_view.h" diff --git a/cpp/src/arrow/io/buffered.h b/cpp/src/arrow/io/buffered.h index e4374ba8079..d5079556c7c 100644 --- a/cpp/src/arrow/io/buffered.h +++ b/cpp/src/arrow/io/buffered.h @@ -29,6 +29,7 @@ namespace arrow { +class Buffer; class MemoryPool; class Status; diff --git a/cpp/src/arrow/ipc/feather-test.cc b/cpp/src/arrow/ipc/feather-test.cc index b0be28925cf..8139c47e09f 100644 --- a/cpp/src/arrow/ipc/feather-test.cc +++ b/cpp/src/arrow/ipc/feather-test.cc @@ -30,6 +30,7 @@ #include "arrow/pretty_print.h" #include "arrow/record_batch.h" #include "arrow/status.h" +#include "arrow/table.h" #include "arrow/test-util.h" #include "arrow/type.h" #include "arrow/util/checked_cast.h" diff --git a/cpp/src/arrow/ipc/json-simple-test.cc b/cpp/src/arrow/ipc/json-simple-test.cc index 45525212d2f..84a2210157f 100644 --- a/cpp/src/arrow/ipc/json-simple-test.cc +++ b/cpp/src/arrow/ipc/json-simple-test.cc @@ -34,6 +34,7 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/decimal.h" #if defined(_MSC_VER) // "warning C4307: '+': integral constant overflow" diff --git a/cpp/src/arrow/memory_pool-test.h b/cpp/src/arrow/memory_pool-test.h index 34523a181ba..fc86d943ec1 100644 --- a/cpp/src/arrow/memory_pool-test.h +++ b/cpp/src/arrow/memory_pool-test.h @@ -16,6 +16,7 @@ // under the License. #include +#include #include #include diff --git a/cpp/src/arrow/memory_pool.cc b/cpp/src/arrow/memory_pool.cc index 0a27141b447..d62db32b062 100644 --- a/cpp/src/arrow/memory_pool.cc +++ b/cpp/src/arrow/memory_pool.cc @@ -17,18 +17,16 @@ #include "arrow/memory_pool.h" -#include -#include -#include -#include -#include -#include +#include // IWYU pragma: keep +#include // IWYU pragma: keep +#include // IWYU pragma: keep +#include // IWYU pragma: keep #include #include #include // IWYU pragma: keep #include "arrow/status.h" -#include "arrow/util/logging.h" +#include "arrow/util/logging.h" // IWYU pragma: keep #ifdef ARROW_JEMALLOC // Needed to support jemalloc 3 and 4 diff --git a/cpp/src/arrow/pretty_print-test.cc b/cpp/src/arrow/pretty_print-test.cc index 8434e59b0ce..052a1803920 100644 --- a/cpp/src/arrow/pretty_print-test.cc +++ b/cpp/src/arrow/pretty_print-test.cc @@ -26,12 +26,10 @@ #include "arrow/array.h" #include "arrow/builder.h" -#include "arrow/memory_pool.h" #include "arrow/pretty_print.h" #include "arrow/table.h" #include "arrow/test-util.h" #include "arrow/type.h" -#include "arrow/util/decimal.h" namespace arrow { diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index ec23bfb00fc..4568e172a03 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/src/arrow/pretty_print.h b/cpp/src/arrow/pretty_print.h index fde6c293f9b..ca50bc0bc99 100644 --- a/cpp/src/arrow/pretty_print.h +++ b/cpp/src/arrow/pretty_print.h @@ -21,14 +21,17 @@ #include #include -#include "arrow/type_fwd.h" #include "arrow/util/visibility.h" namespace arrow { class Array; +class Column; class ChunkedArray; +class RecordBatch; +class Schema; class Status; +class Table; struct PrettyPrintOptions { PrettyPrintOptions(int indent_arg, int window_arg = 10, int indent_size_arg = 2, diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index f9a5ea1b0d6..4920796c15b 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -539,33 +539,31 @@ Status NumPyConverter::Visit(const BinaryType& type) { auto data = reinterpret_cast(PyArray_DATA(arr_)); - int item_length = 0; + auto AppendNotNull = [&builder, this](const uint8_t* data) { + // This is annoying. NumPy allows strings to have nul-terminators, so + // we must check for them here + int item_length = 0; + for (; item_length < itemsize_; ++item_length) { + if (data[item_length] == 0) { + break; + } + } + return builder.Append(data, item_length); + }; + if (mask_ != nullptr) { Ndarray1DIndexer mask_values(mask_); for (int64_t i = 0; i < length_; ++i) { if (mask_values[i]) { RETURN_NOT_OK(builder.AppendNull()); } else { - // This is annoying. NumPy allows strings to have nul-terminators, so - // we must check for them here - for (item_length = 0; item_length < itemsize_; ++item_length) { - if (data[item_length] == 0) { - break; - } - } - RETURN_NOT_OK(builder.Append(data, item_length)); + RETURN_NOT_OK(AppendNotNull(data)); } data += stride_; } } else { for (int64_t i = 0; i < length_; ++i) { - for (item_length = 0; item_length < itemsize_; ++item_length) { - // Look for nul-terminator - if (data[item_length] == 0) { - break; - } - } - RETURN_NOT_OK(builder.Append(data, item_length)); + RETURN_NOT_OK(AppendNotNull(data)); data += stride_; } } diff --git a/cpp/src/arrow/python/python-test.cc b/cpp/src/arrow/python/python-test.cc index 2d15ce45b3b..7443c548456 100644 --- a/cpp/src/arrow/python/python-test.cc +++ b/cpp/src/arrow/python/python-test.cc @@ -25,6 +25,7 @@ #include "arrow/builder.h" #include "arrow/table.h" #include "arrow/test-util.h" +#include "arrow/util/decimal.h" #include "arrow/python/arrow_to_pandas.h" #include "arrow/python/decimal.h" diff --git a/cpp/src/arrow/record_batch.h b/cpp/src/arrow/record_batch.h index 674b68b40fa..ceb6885da62 100644 --- a/cpp/src/arrow/record_batch.h +++ b/cpp/src/arrow/record_batch.h @@ -32,6 +32,7 @@ namespace arrow { class Array; struct ArrayData; class Status; +class Table; /// \class RecordBatch /// \brief Collection of equal-length arrays matching a particular Schema diff --git a/cpp/src/arrow/tensor.cc b/cpp/src/arrow/tensor.cc index 589ee995e21..792945b1740 100644 --- a/cpp/src/arrow/tensor.cc +++ b/cpp/src/arrow/tensor.cc @@ -17,6 +17,7 @@ #include "arrow/tensor.h" +#include #include #include #include diff --git a/cpp/src/arrow/test-util.cc b/cpp/src/arrow/test-util.cc index 38e07dd060a..8c5f36417f8 100644 --- a/cpp/src/arrow/test-util.cc +++ b/cpp/src/arrow/test-util.cc @@ -18,13 +18,12 @@ #include "arrow/test-util.h" #ifndef _WIN32 -#include -#include -#include +#include // IWYU pragma: keep +#include // IWYU pragma: keep +#include // IWYU pragma: keep #endif #include -#include #include #include #include @@ -33,23 +32,17 @@ #include #include #include -#include #include #include #include "arrow/array.h" #include "arrow/buffer.h" -#include "arrow/builder.h" #include "arrow/ipc/json-simple.h" -#include "arrow/memory_pool.h" #include "arrow/pretty_print.h" #include "arrow/status.h" #include "arrow/table.h" #include "arrow/type.h" -#include "arrow/type_traits.h" -#include "arrow/util/bit-util.h" -#include "arrow/util/decimal.h" #include "arrow/util/logging.h" namespace arrow { diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index 7829ac25678..7fe7685f5a3 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -17,23 +17,17 @@ #pragma once -#ifndef _WIN32 -#include -#include -#include -#endif - #include -#include #include #include +#include #include #include #include #include #include #include -#include +#include #include #include @@ -43,13 +37,13 @@ #include "arrow/builder.h" #include "arrow/memory_pool.h" #include "arrow/pretty_print.h" +#include "arrow/record_batch.h" #include "arrow/status.h" -#include "arrow/table.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit-util.h" -#include "arrow/util/decimal.h" #include "arrow/util/logging.h" +#include "arrow/util/macros.h" #include "arrow/util/visibility.h" #define STRINGIFY(x) #x @@ -102,6 +96,10 @@ namespace arrow { +class ChunkedArray; +class Column; +class Table; + using ArrayVector = std::vector>; #define ASSERT_ARRAYS_EQUAL(LEFT, RIGHT) \ diff --git a/cpp/src/arrow/util/compression_lz4.cc b/cpp/src/arrow/util/compression_lz4.cc index 0acd54d0572..97fd46ab6c5 100644 --- a/cpp/src/arrow/util/compression_lz4.cc +++ b/cpp/src/arrow/util/compression_lz4.cc @@ -18,6 +18,7 @@ #include "arrow/util/compression_lz4.h" #include +#include #include #include diff --git a/cpp/src/arrow/util/int-util-test.cc b/cpp/src/arrow/util/int-util-test.cc index 51fd96e4ea2..018eeda7248 100644 --- a/cpp/src/arrow/util/int-util-test.cc +++ b/cpp/src/arrow/util/int-util-test.cc @@ -17,14 +17,12 @@ #include #include -#include #include #include #include #include -#include "arrow/test-util.h" #include "arrow/util/int-util.h" namespace arrow { diff --git a/cpp/src/arrow/util/string_view.h b/cpp/src/arrow/util/string_view.h index 2ee594a9e9a..0f35483e373 100644 --- a/cpp/src/arrow/util/string_view.h +++ b/cpp/src/arrow/util/string_view.h @@ -18,7 +18,7 @@ #ifndef ARROW_UTIL_STRING_VIEW_H #define ARROW_UTIL_STRING_VIEW_H -#include "arrow/util/string_view/string_view.hpp" +#include "arrow/util/string_view/string_view.hpp" // IWYU pragma: export namespace arrow { namespace util { diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 6273fda4640..e8e1118ec6f 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -1387,6 +1387,7 @@ Status PrimitiveImpl::NextBatch(int64_t records_to_read, std::shared_ptr* return ::arrow::Status::IOError(e.what()); } + Datum result; switch (field_->type()->id()) { TRANSFER_CASE(BOOL, ::arrow::BooleanType, BooleanType) TRANSFER_CASE(UINT8, ::arrow::UInt8Type, Int32Type) diff --git a/cpp/src/parquet/arrow/record_reader.cc b/cpp/src/parquet/arrow/record_reader.cc index 4a3cd526b11..fdbb7095077 100644 --- a/cpp/src/parquet/arrow/record_reader.cc +++ b/cpp/src/parquet/arrow/record_reader.cc @@ -86,14 +86,6 @@ class RecordReader::RecordReaderImpl { valid_bits_ = AllocateBuffer(pool); def_levels_ = AllocateBuffer(pool); rep_levels_ = AllocateBuffer(pool); - - if (descr->physical_type() == Type::BYTE_ARRAY) { - builder_.reset(new ::arrow::BinaryBuilder(pool)); - } else if (descr->physical_type() == Type::FIXED_LEN_BYTE_ARRAY) { - int byte_width = descr->type_length(); - std::shared_ptr<::arrow::DataType> type = ::arrow::fixed_size_binary(byte_width); - builder_.reset(new ::arrow::FixedSizeBinaryBuilder(type, pool)); - } Reset(); } @@ -229,8 +221,6 @@ class RecordReader::RecordReaderImpl { return result; } - ::arrow::ArrayBuilder* builder() { return builder_.get(); } - // Process written repetition/definition levels to reach the end of // records. Process no more levels than necessary to delimit the indicated // number of logical records. Updates internal state of RecordReader @@ -375,7 +365,7 @@ class RecordReader::RecordReaderImpl { records_read_ = 0; - // Calling Finish on the builders also resets them + // Call Finish on the binary builders to reset them } void ResetValues() { @@ -391,6 +381,8 @@ class RecordReader::RecordReaderImpl { virtual void DebugPrintState() = 0; + virtual std::vector> GetChunks() = 0; + protected: virtual bool ReadNewPage() = 0; @@ -434,9 +426,6 @@ class RecordReader::RecordReaderImpl { int64_t levels_position_; int64_t levels_capacity_; - // TODO(wesm): ByteArray / FixedLenByteArray types - std::unique_ptr<::arrow::ArrayBuilder> builder_; - std::shared_ptr<::arrow::ResizableBuffer> values_; template @@ -449,13 +438,36 @@ class RecordReader::RecordReaderImpl { std::shared_ptr<::arrow::ResizableBuffer> rep_levels_; }; +template +struct RecordReaderTraits { + using BuilderType = void; +}; + +template <> +struct RecordReaderTraits { + using BuilderType = ::arrow::ChunkedBinaryBuilder; +}; + template class TypedRecordReader : public RecordReader::RecordReaderImpl { public: - typedef typename DType::c_type T; + using T = typename DType::c_type; + + using BuilderType = typename RecordReaderTraits::BuilderType; + + // Maximum of 16MB chunks + static constexpr int32_t kBinaryChunksize = 1 << 24; - TypedRecordReader(const ColumnDescriptor* schema, ::arrow::MemoryPool* pool) - : RecordReader::RecordReaderImpl(schema, pool), current_decoder_(nullptr) {} + TypedRecordReader(const ColumnDescriptor* descr, ::arrow::MemoryPool* pool) + : RecordReader::RecordReaderImpl(descr, pool), current_decoder_(nullptr) { + if (descr->physical_type() == Type::BYTE_ARRAY) { + builder_.reset(new ::arrow::internal::ChunkedBinaryBuilder(kBinaryChunksize, pool)); + } else if (descr->physical_type() == Type::FIXED_LEN_BYTE_ARRAY) { + int byte_width = descr->type_length(); + std::shared_ptr<::arrow::DataType> type = ::arrow::fixed_size_binary(byte_width); + builder_.reset(new ::arrow::FixedSizeBinaryBuilder(type, pool)); + } + } void ResetDecoders() override { decoders_.clear(); } @@ -546,6 +558,10 @@ class TypedRecordReader : public RecordReader::RecordReaderImpl { std::cout << std::endl; } + virtual std::vector> GetChunks() override { + throw ParquetException("GetChunks only implemented for binary types"); + } + private: typedef Decoder DecoderType; @@ -554,6 +570,8 @@ class TypedRecordReader : public RecordReader::RecordReaderImpl { // plain-encoded data. std::unordered_map> decoders_; + std::unique_ptr builder_; + DecoderType* current_decoder_; // Advance to the next data page @@ -572,6 +590,20 @@ void TypedRecordReader::DebugPrintState() {} template <> void TypedRecordReader::DebugPrintState() {} +template <> +::arrow::ArrayVector TypedRecordReader::GetChunks() { + ::arrow::ArrayVector chunks; + PARQUET_THROW_NOT_OK(builder_->Finish(&chunks)); + return chunks; +} + +template <> +::arrow::ArrayVector TypedRecordReader::GetChunks() { + std::shared_ptr chunk; + PARQUET_THROW_NOT_OK(builder_->Finish(&chunk)); + return ::arrow::ArrayVector({chunk}); +} + template <> inline void TypedRecordReader::ReadValuesDense(int64_t values_to_read) { auto values = ValuesHead(); @@ -579,10 +611,9 @@ inline void TypedRecordReader::ReadValuesDense(int64_t values_to_ current_decoder_->Decode(values, static_cast(values_to_read)); DCHECK_EQ(num_decoded, values_to_read); - auto builder = static_cast<::arrow::BinaryBuilder*>(builder_.get()); for (int64_t i = 0; i < num_decoded; i++) { PARQUET_THROW_NOT_OK( - builder->Append(values[i].ptr, static_cast(values[i].len))); + builder_->Append(values[i].ptr, static_cast(values[i].len))); } ResetValues(); } @@ -594,9 +625,8 @@ inline void TypedRecordReader::ReadValuesDense(int64_t values_to_read) current_decoder_->Decode(values, static_cast(values_to_read)); DCHECK_EQ(num_decoded, values_to_read); - auto builder = static_cast<::arrow::FixedSizeBinaryBuilder*>(builder_.get()); for (int64_t i = 0; i < num_decoded; i++) { - PARQUET_THROW_NOT_OK(builder->Append(values[i].ptr)); + PARQUET_THROW_NOT_OK(builder_->Append(values[i].ptr)); } ResetValues(); } @@ -613,14 +643,12 @@ inline void TypedRecordReader::ReadValuesSpaced(int64_t values_to valid_bits_offset); DCHECK_EQ(num_decoded, values_to_read); - auto builder = static_cast<::arrow::BinaryBuilder*>(builder_.get()); - for (int64_t i = 0; i < num_decoded; i++) { if (::arrow::BitUtil::GetBit(valid_bits, valid_bits_offset + i)) { PARQUET_THROW_NOT_OK( - builder->Append(values[i].ptr, static_cast(values[i].len))); + builder_->Append(values[i].ptr, static_cast(values[i].len))); } else { - PARQUET_THROW_NOT_OK(builder->AppendNull()); + PARQUET_THROW_NOT_OK(builder_->AppendNull()); } } ResetValues(); @@ -638,12 +666,11 @@ inline void TypedRecordReader::ReadValuesSpaced(int64_t values_to_read valid_bits_offset); DCHECK_EQ(num_decoded, values_to_read); - auto builder = static_cast<::arrow::FixedSizeBinaryBuilder*>(builder_.get()); for (int64_t i = 0; i < num_decoded; i++) { if (::arrow::BitUtil::GetBit(valid_bits, valid_bits_offset + i)) { - PARQUET_THROW_NOT_OK(builder->Append(values[i].ptr)); + PARQUET_THROW_NOT_OK(builder_->Append(values[i].ptr)); } else { - PARQUET_THROW_NOT_OK(builder->AppendNull()); + PARQUET_THROW_NOT_OK(builder_->AppendNull()); } } ResetValues(); @@ -845,8 +872,6 @@ std::shared_ptr RecordReader::ReleaseIsValid() { return impl_->ReleaseIsValid(); } -::arrow::ArrayBuilder* RecordReader::builder() { return impl_->builder(); } - int64_t RecordReader::values_written() const { return impl_->values_written(); } int64_t RecordReader::levels_position() const { return impl_->levels_position(); } diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 89d32245804..77a101a1f42 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -1959,6 +1959,31 @@ def test_large_table_int32_overflow(): _write_table(table, f) +@pytest.mark.large_memory +def test_binary_array_overflow_to_chunked(): + # ARROW-3762 + + # 2^31 + 1 bytes + values = [b'x'] + [ + b'x' * (1 << 20) + ] * 2 * (1 << 10) + df = pd.DataFrame({'byte_col': values}) + + tbl = pa.Table.from_pandas(df, preserve_index=False) + + buf = io.BytesIO() + _write_table(tbl, buf) + buf.seek(0) + read_tbl = _read_table(buf) + buf = None + + col0_data = read_tbl[0].data + assert isinstance(col0_data, pa.ChunkedArray) + assert col0_data.num_chunks == 2 + + assert tbl.equals(read_tbl) + + def test_index_column_name_duplicate(tempdir): data = { 'close': { From 9228112781e61cf30a0f9de3feb9d48fbd927506 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 13 Dec 2018 08:55:56 -0600 Subject: [PATCH 02/12] More refactoring Change-Id: I0eced60a1f8e16096a1b441b622ba750d1d59ca6 --- cpp/src/arrow/columnar/builder_base.h | 2 +- cpp/src/parquet/arrow/CMakeLists.txt | 3 +- cpp/src/parquet/arrow/reader.cc | 104 +++++++++++++++----------- cpp/src/parquet/arrow/reader.h | 2 + 4 files changed, 67 insertions(+), 44 deletions(-) diff --git a/cpp/src/arrow/columnar/builder_base.h b/cpp/src/arrow/columnar/builder_base.h index ab0dd26a638..4be8cf7e940 100644 --- a/cpp/src/arrow/columnar/builder_base.h +++ b/cpp/src/arrow/columnar/builder_base.h @@ -219,7 +219,7 @@ class ARROW_EXPORT ArrayBuilder { int64_t capacity_; // Child value array builders. These are owned by this class - std::vector> children_; + std::vector> children_; private: ARROW_DISALLOW_COPY_AND_ASSIGN(ArrayBuilder); diff --git a/cpp/src/parquet/arrow/CMakeLists.txt b/cpp/src/parquet/arrow/CMakeLists.txt index 429dadcd37e..d7e308e2ed7 100644 --- a/cpp/src/parquet/arrow/CMakeLists.txt +++ b/cpp/src/parquet/arrow/CMakeLists.txt @@ -18,9 +18,10 @@ ADD_PARQUET_TEST(arrow-schema-test) ADD_PARQUET_TEST(arrow-reader-writer-test) -ADD_ARROW_BENCHMARK(reader-writer-benchmark +ADD_BENCHMARK(reader-writer-benchmark PREFIX "parquet-arrow" EXTRA_LINK_LIBS ${PARQUET_BENCHMARK_LINK_LIBRARIES}) +add_dependencies(parquet parquet-arrow-reader-writer-benchmark) # Headers: top level install(FILES diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index e8e1118ec6f..54ca376e557 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -27,11 +27,15 @@ #include #include +#include "arrow/api.h" #include "arrow/api.h" #include "arrow/util/bit-util.h" #include "arrow/util/logging.h" #include "arrow/util/thread-pool.h" +// XXX(wesm): For arrow::compute::Datum. This should perhaps be promoted +#include "arrow/compute/kernel.h" + #include "parquet/arrow/record_reader.h" #include "parquet/arrow/schema.h" #include "parquet/column_reader.h" @@ -46,6 +50,7 @@ using arrow::Array; using arrow::BooleanArray; +using arrow::ChunkedArray; using arrow::Column; using arrow::Field; using arrow::Int32Array; @@ -57,6 +62,9 @@ using arrow::StructArray; using arrow::Table; using arrow::TimestampArray; +// For Array/ChunkedArray variant +using arrow::compute::Datum; + using parquet::schema::Node; // Help reduce verbosity @@ -223,15 +231,18 @@ class FileReader::Impl { virtual ~Impl() {} Status GetColumn(int i, std::unique_ptr* out); - Status ReadSchemaField(int i, std::shared_ptr* out); + + Status ReadSchemaField(int i, std::shared_ptr* out); Status ReadSchemaField(int i, const std::vector& indices, - std::shared_ptr* out); + std::shared_ptr* out); + Status ReadColumn(int i, std::shared_ptr* out); + Status ReadColumnChunk(int column_index, int row_group_index, + std::shared_ptr* out); + Status GetReaderForNode(int index, const Node* node, const std::vector& indices, int16_t def_level, std::unique_ptr* out); - Status ReadColumn(int i, std::shared_ptr* out); - Status ReadColumnChunk(int column_index, int row_group_index, - std::shared_ptr* out); + Status GetSchema(std::shared_ptr<::arrow::Schema>* out); Status GetSchema(const std::vector& indices, std::shared_ptr<::arrow::Schema>* out); @@ -267,7 +278,8 @@ class FileReader::Impl { class ColumnReader::ColumnReaderImpl { public: virtual ~ColumnReaderImpl() {} - virtual Status NextBatch(int64_t records_to_read, std::shared_ptr* out) = 0; + virtual Status NextBatch(int64_t records_to_read, + std::shared_ptr* out) = 0; virtual Status GetDefLevels(const int16_t** data, size_t* length) = 0; virtual Status GetRepLevels(const int16_t** data, size_t* length) = 0; virtual const std::shared_ptr field() = 0; @@ -283,10 +295,11 @@ class PARQUET_NO_EXPORT PrimitiveImpl : public ColumnReader::ColumnReaderImpl { NextRowGroup(); } - Status NextBatch(int64_t records_to_read, std::shared_ptr* out) override; + Status NextBatch(int64_t records_to_read, + std::shared_ptr* out) override; template - Status WrapIntoListArray(std::shared_ptr* array); + Status WrapIntoListArray(Datum* inout_array); Status GetDefLevels(const int16_t** data, size_t* length) override; Status GetRepLevels(const int16_t** data, size_t* length) override; @@ -314,7 +327,8 @@ class PARQUET_NO_EXPORT StructImpl : public ColumnReader::ColumnReaderImpl { InitField(node, children); } - Status NextBatch(int64_t records_to_read, std::shared_ptr* out) override; + Status NextBatch(int64_t records_to_read, + std::shared_ptr* out) override; Status GetDefLevels(const int16_t** data, size_t* length) override; Status GetRepLevels(const int16_t** data, size_t* length) override; const std::shared_ptr field() override { return field_; } @@ -395,7 +409,7 @@ Status FileReader::Impl::GetReaderForNode( return Status::OK(); } -Status FileReader::Impl::ReadSchemaField(int i, std::shared_ptr* out) { +Status FileReader::Impl::ReadSchemaField(int i, std::shared_ptr* out) { std::vector indices(reader_->metadata()->num_columns()); for (size_t j = 0; j < indices.size(); ++j) { @@ -406,7 +420,7 @@ Status FileReader::Impl::ReadSchemaField(int i, std::shared_ptr* out) { } Status FileReader::Impl::ReadSchemaField(int i, const std::vector& indices, - std::shared_ptr* out) { + std::shared_ptr* out) { auto parquet_schema = reader_->metadata()->schema(); auto node = parquet_schema->group_node()->field(i).get(); @@ -432,7 +446,7 @@ Status FileReader::Impl::ReadSchemaField(int i, const std::vector& indices, return reader->NextBatch(records_to_read, out); } -Status FileReader::Impl::ReadColumn(int i, std::shared_ptr* out) { +Status FileReader::Impl::ReadColumn(int i, std::shared_ptr* out) { std::unique_ptr flat_column_reader; RETURN_NOT_OK(GetColumn(i, &flat_column_reader)); @@ -452,7 +466,7 @@ Status FileReader::Impl::GetSchema(const std::vector& indices, } Status FileReader::Impl::ReadColumnChunk(int column_index, int row_group_index, - std::shared_ptr* out) { + std::shared_ptr* out) { auto rg_metadata = reader_->metadata()->RowGroup(row_group_index); int64_t records_to_read = rg_metadata->ColumnChunk(column_index)->num_values(); @@ -463,10 +477,7 @@ Status FileReader::Impl::ReadColumnChunk(int column_index, int row_group_index, new PrimitiveImpl(pool_, std::move(input))); ColumnReader flat_column_reader(std::move(impl)); - std::shared_ptr array; - RETURN_NOT_OK(flat_column_reader.NextBatch(records_to_read, &array)); - *out = array; - return Status::OK(); + return flat_column_reader.NextBatch(records_to_read, out); } Status FileReader::Impl::ReadRowGroup(int row_group_index, @@ -485,7 +496,7 @@ Status FileReader::Impl::ReadRowGroup(int row_group_index, auto ReadColumnFunc = [&indices, &row_group_index, &schema, &columns, this](int i) { int column_index = indices[i]; - std::shared_ptr array; + std::shared_ptr array; RETURN_NOT_OK(ReadColumnChunk(column_index, row_group_index, &array)); columns[i] = std::make_shared(schema->field(i), array); return Status::OK(); @@ -532,7 +543,7 @@ Status FileReader::Impl::ReadTable(const std::vector& indices, std::vector> columns(num_fields); auto ReadColumnFunc = [&indices, &field_indices, &schema, &columns, this](int i) { - std::shared_ptr array; + std::shared_ptr array; RETURN_NOT_OK(ReadSchemaField(field_indices[i], indices, &array)); columns[i] = std::make_shared(schema->field(i), array); return Status::OK(); @@ -576,8 +587,6 @@ Status FileReader::Impl::ReadTable(std::shared_ptr* table) { Status FileReader::Impl::ReadRowGroups(const std::vector& row_groups, const std::vector& indices, std::shared_ptr
* table) { - // TODO(PARQUET-1393): Modify the record readers to already read this into a single, - // continuous array. std::vector> tables(row_groups.size(), nullptr); for (size_t i = 0; i < row_groups.size(); ++i) { @@ -633,7 +642,7 @@ Status FileReader::GetSchema(const std::vector& indices, return impl_->GetSchema(indices, out); } -Status FileReader::ReadColumn(int i, std::shared_ptr* out) { +Status FileReader::ReadColumn(int i, std::shared_ptr* out) { try { return impl_->ReadColumn(i, out); } catch (const ::parquet::ParquetException& e) { @@ -641,7 +650,7 @@ Status FileReader::ReadColumn(int i, std::shared_ptr* out) { } } -Status FileReader::ReadSchemaField(int i, std::shared_ptr* out) { +Status FileReader::ReadSchemaField(int i, std::shared_ptr* out) { try { return impl_->ReadSchemaField(i, out); } catch (const ::parquet::ParquetException& e) { @@ -764,7 +773,7 @@ const ParquetFileReader* FileReader::parquet_reader() const { } template -Status PrimitiveImpl::WrapIntoListArray(std::shared_ptr* array) { +Status PrimitiveImpl::WrapIntoListArray(Datum* inout_array) { const int16_t* def_levels = record_reader_->def_levels(); const int16_t* rep_levels = record_reader_->rep_levels(); const int64_t total_levels_read = record_reader_->levels_position(); @@ -910,7 +919,7 @@ struct TransferFunctor { Status operator()(RecordReader* reader, MemoryPool* pool, const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + Datum* out) { static_assert(!std::is_same::value, "The fast path transfer functor should be used " "for primitive values"); @@ -939,7 +948,7 @@ struct TransferFunctor> { Status operator()(RecordReader* reader, MemoryPool* pool, const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + Datum* out) { int64_t length = reader->values_written(); std::shared_ptr values = reader->ReleaseValues(); @@ -958,7 +967,7 @@ template <> struct TransferFunctor<::arrow::BooleanType, BooleanType> { Status operator()(RecordReader* reader, MemoryPool* pool, const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + Datum* out) { int64_t length = reader->values_written(); std::shared_ptr data; @@ -992,7 +1001,7 @@ template <> struct TransferFunctor<::arrow::TimestampType, Int96Type> { Status operator()(RecordReader* reader, MemoryPool* pool, const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + Datum* out) { int64_t length = reader->values_written(); auto values = reinterpret_cast(reader->values()); @@ -1020,7 +1029,7 @@ template <> struct TransferFunctor<::arrow::Date64Type, Int32Type> { Status operator()(RecordReader* reader, MemoryPool* pool, const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + Datum* out) { int64_t length = reader->values_written(); auto values = reinterpret_cast(reader->values()); @@ -1050,7 +1059,7 @@ struct TransferFunctor< std::is_same::value>::type> { Status operator()(RecordReader* reader, MemoryPool* pool, const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + Datum* out) { RETURN_NOT_OK(reader->builder()->Finish(out)); if (type->id() == ::arrow::Type::STRING) { @@ -1176,7 +1185,7 @@ template <> struct TransferFunctor<::arrow::Decimal128Type, FLBAType> { Status operator()(RecordReader* reader, MemoryPool* pool, const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + Datum* out) { DCHECK_EQ(type->id(), ::arrow::Type::DECIMAL); // Finish the built data into a temporary array @@ -1237,7 +1246,7 @@ template <> struct TransferFunctor<::arrow::Decimal128Type, ByteArrayType> { Status operator()(RecordReader* reader, MemoryPool* pool, const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + Datum* out) { DCHECK_EQ(type->id(), ::arrow::Type::DECIMAL); // Finish the built data into a temporary array @@ -1295,7 +1304,7 @@ template ::value>::type> static Status DecimalIntegerTransfer(RecordReader* reader, MemoryPool* pool, const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + Datum* out) { DCHECK_EQ(type->id(), ::arrow::Type::DECIMAL); const int64_t length = reader->values_written(); @@ -1343,7 +1352,7 @@ template <> struct TransferFunctor<::arrow::Decimal128Type, Int32Type> { Status operator()(RecordReader* reader, MemoryPool* pool, const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + Datum* out) { return DecimalIntegerTransfer(reader, pool, type, out); } }; @@ -1352,14 +1361,14 @@ template <> struct TransferFunctor<::arrow::Decimal128Type, Int64Type> { Status operator()(RecordReader* reader, MemoryPool* pool, const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { + Datum* out) { return DecimalIntegerTransfer(reader, pool, type, out); } }; -#define TRANSFER_DATA(ArrowType, ParquetType) \ - TransferFunctor func; \ - RETURN_NOT_OK(func(record_reader_.get(), pool_, field_->type(), out)); \ +#define TRANSFER_DATA(ArrowType, ParquetType) \ + TransferFunctor func; \ + RETURN_NOT_OK(func(record_reader_.get(), pool_, field_->type(), &result)); \ RETURN_NOT_OK(WrapIntoListArray(out)) #define TRANSFER_CASE(ENUM, ArrowType, ParquetType) \ @@ -1367,7 +1376,8 @@ struct TransferFunctor<::arrow::Decimal128Type, Int64Type> { TRANSFER_DATA(ArrowType, ParquetType); \ } break; -Status PrimitiveImpl::NextBatch(int64_t records_to_read, std::shared_ptr* out) { +Status PrimitiveImpl::NextBatch(int64_t records_to_read, + std::shared_ptr* out) { try { // Pre-allocation gives much better performance for flat columns record_reader_->Reserve(records_to_read); @@ -1406,8 +1416,8 @@ Status PrimitiveImpl::NextBatch(int64_t records_to_read, std::shared_ptr* TRANSFER_CASE(DATE64, ::arrow::Date64Type, Int32Type) TRANSFER_CASE(FIXED_SIZE_BINARY, ::arrow::FixedSizeBinaryType, FLBAType) case ::arrow::Type::NA: { - *out = std::make_shared<::arrow::NullArray>(record_reader_->values_written()); - RETURN_NOT_OK(WrapIntoListArray(out)); + result = std::make_shared<::arrow::NullArray>(record_reader_->values_written()); + RETURN_NOT_OK(WrapIntoListArray(&result)); break; } case ::arrow::Type::DECIMAL: { @@ -1614,10 +1624,20 @@ RowGroupReader::~RowGroupReader() {} RowGroupReader::RowGroupReader(FileReader::Impl* impl, int row_group_index) : impl_(impl), row_group_index_(row_group_index) {} -Status ColumnChunkReader::Read(std::shared_ptr<::arrow::Array>* out) { +Status ColumnChunkReader::Read(std::shared_ptr<::arrow::ChunkedArray>* out) { return impl_->ReadColumnChunk(column_index_, row_group_index_, out); } +Status ColumnChunkReader::Read(std::shared_ptr<::arrow::Array>* out) { + std::make_shared chunked_out; + RETURN_NOT_OK(impl_->ReadColumnChunk(column_index_, row_group_index_, &chunked_out)); + DCHECK_GT(chunked_out->num_chunks(), 0); + if (chunked_out->num_chunks() > 1) { + return Status::Invalid("ColumnChunk result is a chunked array"); + } + return chunked_out->chunk(0); +} + ColumnChunkReader::~ColumnChunkReader() {} ColumnChunkReader::ColumnChunkReader(FileReader::Impl* impl, int row_group_index, diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index 2cd94ca28fd..1b1bc3bb013 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -248,6 +248,8 @@ class PARQUET_EXPORT RowGroupReader { class PARQUET_EXPORT ColumnChunkReader { public: + ::arrow::Status Read(std::shared_ptr<::arrow::ChunkedArray>* out); + ::arrow::Status Read(std::shared_ptr<::arrow::Array>* out); virtual ~ColumnChunkReader(); From 3d075e4aa93904c2a3dff8123cd76d81a24407e3 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 13 Dec 2018 16:27:08 -0600 Subject: [PATCH 03/12] Additional refactoring to make things chunked. Allow implicit construction of arrow::compute::Datum Change-Id: I483059a545c69a9b25d543faad641785da6bea29 --- cpp/src/arrow/columnar/builder_base.h | 1 - cpp/src/arrow/columnar/builder_dict.h | 1 - cpp/src/arrow/columnar/builder_nested.h | 1 - cpp/src/arrow/columnar/builder_primitive.h | 2 - cpp/src/arrow/compute/compute-test.cc | 57 ++- cpp/src/arrow/compute/kernel.h | 27 +- cpp/src/arrow/table.h | 4 + cpp/src/parquet/arrow/reader.cc | 490 ++++++++++++--------- cpp/src/parquet/arrow/reader.h | 32 +- cpp/src/parquet/arrow/record_reader.cc | 8 +- cpp/src/parquet/arrow/record_reader.h | 7 +- 11 files changed, 349 insertions(+), 281 deletions(-) diff --git a/cpp/src/arrow/columnar/builder_base.h b/cpp/src/arrow/columnar/builder_base.h index 4be8cf7e940..d5c3fcf781c 100644 --- a/cpp/src/arrow/columnar/builder_base.h +++ b/cpp/src/arrow/columnar/builder_base.h @@ -50,7 +50,6 @@ struct ArrayData; constexpr int64_t kMinBuilderCapacity = 1 << 5; constexpr int64_t kListMaximumElements = std::numeric_limits::max() - 1; - /// Base class for all data array builders. /// /// This class provides a facilities for incrementally building the null bitmap diff --git a/cpp/src/arrow/columnar/builder_dict.h b/cpp/src/arrow/columnar/builder_dict.h index 23a7a09de4e..87b7feff8f9 100644 --- a/cpp/src/arrow/columnar/builder_dict.h +++ b/cpp/src/arrow/columnar/builder_dict.h @@ -24,7 +24,6 @@ namespace arrow { - // ---------------------------------------------------------------------- // Dictionary builder diff --git a/cpp/src/arrow/columnar/builder_nested.h b/cpp/src/arrow/columnar/builder_nested.h index 0e2a2ff92a7..aa6c439dffd 100644 --- a/cpp/src/arrow/columnar/builder_nested.h +++ b/cpp/src/arrow/columnar/builder_nested.h @@ -24,7 +24,6 @@ namespace arrow { - // ---------------------------------------------------------------------- // List builder diff --git a/cpp/src/arrow/columnar/builder_primitive.h b/cpp/src/arrow/columnar/builder_primitive.h index 5a0c111a013..0d243ae29e1 100644 --- a/cpp/src/arrow/columnar/builder_primitive.h +++ b/cpp/src/arrow/columnar/builder_primitive.h @@ -42,7 +42,6 @@ class ARROW_EXPORT NullBuilder : public ArrayBuilder { Status FinishInternal(std::shared_ptr* out) override; }; - template class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { public: @@ -237,7 +236,6 @@ using HalfFloatBuilder = NumericBuilder; using FloatBuilder = NumericBuilder; using DoubleBuilder = NumericBuilder; - class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { public: using value_type = bool; diff --git a/cpp/src/arrow/compute/compute-test.cc b/cpp/src/arrow/compute/compute-test.cc index 52fc5880960..2d23eebebe2 100644 --- a/cpp/src/arrow/compute/compute-test.cc +++ b/cpp/src/arrow/compute/compute-test.cc @@ -70,6 +70,23 @@ shared_ptr _MakeArray(const shared_ptr& type, const vector& return result; } +// ---------------------------------------------------------------------- +// Datum + +template +void CheckImplicitConstructor(enum Datum::type expected_kind) { + std::shared_ptr value; + Datum datum = value; + ASSERT_EQ(expected_kind, datum.kind()); +} + +TEST(TestDatum, ImplicitConstructors) { + CheckImplicitConstructor(Datum::ARRAY); + CheckImplicitConstructor(Datum::CHUNKED_ARRAY); + CheckImplicitConstructor(Datum::RECORD_BATCH); + CheckImplicitConstructor
(Datum::TABLE); +} + // ---------------------------------------------------------------------- // Cast @@ -781,7 +798,7 @@ TEST_F(TestCast, ChunkedArray) { CastOptions options; Datum out; - ASSERT_OK(Cast(&this->ctx_, Datum(carr), out_type, options, &out)); + ASSERT_OK(Cast(&this->ctx_, carr, out_type, options, &out)); ASSERT_EQ(Datum::CHUNKED_ARRAY, out.kind()); auto out_carr = out.chunked_array(); @@ -869,7 +886,7 @@ TEST_F(TestCast, PreallocatedMemory) { out_data->buffers.push_back(out_values); Datum out(out_data); - ASSERT_OK(kernel->Call(&this->ctx_, Datum(arr), &out)); + ASSERT_OK(kernel->Call(&this->ctx_, arr, &out)); // Buffer address unchanged ASSERT_EQ(out_values.get(), out_data->buffers[1].get()); @@ -912,8 +929,8 @@ void CheckOffsetOutputCase(FunctionContext* ctx, const std::shared_ptr Datum out_second(out_second_data); // Cast each bit - ASSERT_OK(kernel->Call(ctx, Datum(arr->Slice(0, first_half)), &out_first)); - ASSERT_OK(kernel->Call(ctx, Datum(arr->Slice(first_half)), &out_second)); + ASSERT_OK(kernel->Call(ctx, arr->Slice(0, first_half), &out_first)); + ASSERT_OK(kernel->Call(ctx, arr->Slice(first_half), &out_second)); shared_ptr result = MakeArray(out_data); @@ -1105,7 +1122,7 @@ TYPED_TEST(TestDictionaryCast, Basic) { TestBase::MakeRandomArray::ArrayType>(10, 2); Datum out; - ASSERT_OK(DictionaryEncode(&this->ctx_, Datum(plain_array->data()), &out)); + ASSERT_OK(DictionaryEncode(&this->ctx_, plain_array->data(), &out)); this->CheckPass(*MakeArray(out.array()), *plain_array, plain_array->type(), options); } @@ -1201,7 +1218,7 @@ void CheckUnique(FunctionContext* ctx, const shared_ptr& type, shared_ptr expected = _MakeArray(type, out_values, out_is_valid); shared_ptr result; - ASSERT_OK(Unique(ctx, Datum(input), &result)); + ASSERT_OK(Unique(ctx, input, &result)); ASSERT_ARRAYS_EQUAL(*expected, *result); } @@ -1218,7 +1235,7 @@ void CheckDictEncode(FunctionContext* ctx, const shared_ptr& type, DictionaryArray expected(dictionary(int32(), ex_dict), ex_indices); Datum datum_out; - ASSERT_OK(DictionaryEncode(ctx, Datum(input), &datum_out)); + ASSERT_OK(DictionaryEncode(ctx, input, &datum_out)); shared_ptr result = MakeArray(datum_out.array()); ASSERT_ARRAYS_EQUAL(expected, *result); @@ -1461,7 +1478,7 @@ TEST_F(TestHashKernel, ChunkedArrayInvoke) { // Unique shared_ptr result; - ASSERT_OK(Unique(&this->ctx_, Datum(carr), &result)); + ASSERT_OK(Unique(&this->ctx_, carr, &result)); ASSERT_ARRAYS_EQUAL(*ex_dict, *result); // Dictionary encode @@ -1475,7 +1492,7 @@ TEST_F(TestHashKernel, ChunkedArrayInvoke) { auto dict_carr = std::make_shared(dict_arrays); Datum encoded_out; - ASSERT_OK(DictionaryEncode(&this->ctx_, Datum(carr), &encoded_out)); + ASSERT_OK(DictionaryEncode(&this->ctx_, carr, &encoded_out)); ASSERT_EQ(Datum::CHUNKED_ARRAY, encoded_out.kind()); AssertChunkedEqual(*dict_carr, *encoded_out.chunked_array()); @@ -1490,7 +1507,7 @@ class TestBooleanKernel : public ComputeFixture, public TestBase { const std::shared_ptr& right, const std::shared_ptr& expected) { Datum result; - ASSERT_OK(kernel(&this->ctx_, Datum(left), Datum(right), &result)); + ASSERT_OK(kernel(&this->ctx_, left, right, &result)); ASSERT_EQ(Datum::ARRAY, result.kind()); std::shared_ptr result_array = result.make_array(); ASSERT_TRUE(result_array->Equals(expected)); @@ -1502,7 +1519,7 @@ class TestBooleanKernel : public ComputeFixture, public TestBase { const std::shared_ptr& expected) { Datum result; std::shared_ptr result_array; - ASSERT_OK(kernel(&this->ctx_, Datum(left), Datum(right), &result)); + ASSERT_OK(kernel(&this->ctx_, left, right, &result)); ASSERT_EQ(Datum::CHUNKED_ARRAY, result.kind()); std::shared_ptr result_ca = result.chunked_array(); ASSERT_TRUE(result_ca->Equals(expected)); @@ -1552,13 +1569,13 @@ TEST_F(TestBooleanKernel, Invert) { // Plain array Datum result; - ASSERT_OK(Invert(&this->ctx_, Datum(a1), &result)); + ASSERT_OK(Invert(&this->ctx_, a1, &result)); ASSERT_EQ(Datum::ARRAY, result.kind()); std::shared_ptr result_array = result.make_array(); ASSERT_TRUE(result_array->Equals(a2)); // Array with offset - ASSERT_OK(Invert(&this->ctx_, Datum(a1->Slice(1)), &result)); + ASSERT_OK(Invert(&this->ctx_, a1->Slice(1), &result)); ASSERT_EQ(Datum::ARRAY, result.kind()); result_array = result.make_array(); ASSERT_TRUE(result_array->Equals(a2->Slice(1))); @@ -1568,7 +1585,7 @@ TEST_F(TestBooleanKernel, Invert) { auto ca1 = std::make_shared(ca1_arrs); std::vector> ca2_arrs = {a2, a2->Slice(1)}; auto ca2 = std::make_shared(ca2_arrs); - ASSERT_OK(Invert(&this->ctx_, Datum(ca1), &result)); + ASSERT_OK(Invert(&this->ctx_, ca1, &result)); ASSERT_EQ(Datum::CHUNKED_ARRAY, result.kind()); std::shared_ptr result_ca = result.chunked_array(); ASSERT_TRUE(result_ca->Equals(ca2)); @@ -1618,14 +1635,14 @@ TEST_F(TestInvokeBinaryKernel, Exceptions) { auto a2 = _MakeArray(type, values2, {}); // Left is not an array-like - ASSERT_RAISES(Invalid, detail::InvokeBinaryArrayKernel( - &this->ctx_, &kernel, Datum(table), Datum(a2), &outputs)); + ASSERT_RAISES(Invalid, detail::InvokeBinaryArrayKernel(&this->ctx_, &kernel, table, a2, + &outputs)); // Right is not an array-like - ASSERT_RAISES(Invalid, detail::InvokeBinaryArrayKernel(&this->ctx_, &kernel, Datum(a1), - Datum(table), &outputs)); + ASSERT_RAISES(Invalid, detail::InvokeBinaryArrayKernel(&this->ctx_, &kernel, a1, table, + &outputs)); // Different sized inputs - ASSERT_RAISES(Invalid, detail::InvokeBinaryArrayKernel(&this->ctx_, &kernel, Datum(a1), - Datum(a1->Slice(1)), &outputs)); + ASSERT_RAISES(Invalid, detail::InvokeBinaryArrayKernel(&this->ctx_, &kernel, a1, + a1->Slice(1), &outputs)); } } // namespace compute diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h index bef2b9af21c..bdd0a7cdb71 100644 --- a/cpp/src/arrow/compute/kernel.h +++ b/cpp/src/arrow/compute/kernel.h @@ -61,19 +61,20 @@ struct ARROW_EXPORT Datum { /// \brief Empty datum, to be populated elsewhere Datum() : value(NULLPTR) {} - explicit Datum(const std::shared_ptr& value) : value(value) {} - - explicit Datum(const std::shared_ptr& value) : value(value) {} - - explicit Datum(const std::shared_ptr& value) : Datum(value->data()) {} - - explicit Datum(const std::shared_ptr& value) : value(value) {} - - explicit Datum(const std::shared_ptr& value) : value(value) {} - - explicit Datum(const std::shared_ptr
& value) : value(value) {} - - explicit Datum(const std::vector& value) : value(value) {} + Datum(const std::shared_ptr& value) // NOLINT implicit conversion + : value(value) {} + Datum(const std::shared_ptr& value) // NOLINT implicit conversion + : value(value) {} + Datum(const std::shared_ptr& value) // NOLINT implicit conversion + : Datum(value ? value->data() : NULLPTR) {} + Datum(const std::shared_ptr& value) // NOLINT implicit conversion + : value(value) {} + Datum(const std::shared_ptr& value) // NOLINT implicit conversion + : value(value) {} + Datum(const std::shared_ptr
& value) // NOLINT implicit conversion + : value(value) {} + Datum(const std::vector& value) // NOLINT implicit conversion + : value(value) {} ~Datum() {} diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index 6b573325287..9299ca6c1ba 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -44,6 +44,10 @@ class ARROW_EXPORT ChunkedArray { /// The vector should be non-empty and all its elements should have the same /// data type. explicit ChunkedArray(const ArrayVector& chunks); + + /// \brief Construct a chunked array from a single Array + explicit ChunkedArray(const std::shared_ptr& chunk) : ChunkedArray({chunk}) {} + /// \brief Construct a chunked array from a vector of arrays and a data type /// /// As the data type is passed explicitly, the vector may be empty. diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 54ca376e557..a327c44476e 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -27,7 +27,6 @@ #include #include -#include "arrow/api.h" #include "arrow/api.h" #include "arrow/util/bit-util.h" #include "arrow/util/logging.h" @@ -93,6 +92,19 @@ static inline int64_t impala_timestamp_to_nanoseconds(const Int96& impala_timest template using ArrayType = typename ::arrow::TypeTraits::ArrayType; +namespace internal { + +Status GetSingleChunk(const ChunkedArray& chunked, std::shared_ptr* out) { + DCHECK_GT(chunked.num_chunks(), 0); + if (chunked.num_chunks() > 1) { + return Status::Invalid("Function call returned a chunked array"); + } + *out = chunked.chunk(0); + return Status::OK(); +} + +} // namespace internal + // ---------------------------------------------------------------------- // Iteration utilities @@ -295,8 +307,7 @@ class PARQUET_NO_EXPORT PrimitiveImpl : public ColumnReader::ColumnReaderImpl { NextRowGroup(); } - Status NextBatch(int64_t records_to_read, - std::shared_ptr* out) override; + Status NextBatch(int64_t records_to_read, std::shared_ptr* out) override; template Status WrapIntoListArray(Datum* inout_array); @@ -327,8 +338,7 @@ class PARQUET_NO_EXPORT StructImpl : public ColumnReader::ColumnReaderImpl { InitField(node, children); } - Status NextBatch(int64_t records_to_read, - std::shared_ptr* out) override; + Status NextBatch(int64_t records_to_read, std::shared_ptr* out) override; Status GetDefLevels(const int16_t** data, size_t* length) override; Status GetRepLevels(const int16_t** data, size_t* length) override; const std::shared_ptr field() override { return field_; } @@ -658,6 +668,18 @@ Status FileReader::ReadSchemaField(int i, std::shared_ptr* out) { } } +Status FileReader::ReadColumn(int i, std::shared_ptr* out) { + std::shared_ptr chunked_out; + RETURN_NOT_OK(ReadColumn(i, &chunked_out)); + return GetSingleChunk(*chunked_out, out); +} + +Status FileReader::ReadSchemaField(int i, std::shared_ptr* out) { + std::shared_ptr chunked_out; + RETURN_NOT_OK(ReadSchemaField(i, &chunked_out)); + return GetSingleChunk(*chunked_out, out); +} + Status FileReader::GetRecordBatchReader(const std::vector& row_group_indices, std::shared_ptr* out) { std::vector indices(impl_->num_columns()); @@ -774,6 +796,27 @@ const ParquetFileReader* FileReader::parquet_reader() const { template Status PrimitiveImpl::WrapIntoListArray(Datum* inout_array) { + if (descr_->max_repetition_level() == 0) { + // Flat, no action + return Status::OK(); + } + + std::shared_ptr flat_array; + + // ARROW-3762(wesm): If inout_array is a chunked array, we reject as this is + // not yet implemented + if (inout_array->kind() == Datum::CHUNKED_ARRAY) { + if (inout_array->chunked_array()->num_chunks() > 1) { + return Status::NotImplemented( + "Nested data conversions not implemented for " + "chunked array outputs"); + } + flat_array = inout_array->chunked_array()->chunk(0); + } else { + DCHECK_EQ(Datum::ARRAY, inout_array->kind()); + flat_array = inout_array->array(); + } + const int16_t* def_levels = record_reader_->def_levels(); const int16_t* rep_levels = record_reader_->rep_levels(); const int64_t total_levels_read = record_reader_->levels_position(); @@ -784,110 +827,106 @@ Status PrimitiveImpl::WrapIntoListArray(Datum* inout_array) { &arrow_schema)); std::shared_ptr current_field = arrow_schema->field(0); - if (descr_->max_repetition_level() > 0) { - // Walk downwards to extract nullability - std::vector nullable; - std::vector> offset_builders; - std::vector> valid_bits_builders; - nullable.push_back(current_field->nullable()); - while (current_field->type()->num_children() > 0) { - if (current_field->type()->num_children() > 1) { - return Status::NotImplemented( - "Fields with more than one child are not supported."); - } else { - if (current_field->type()->id() != ::arrow::Type::LIST) { - return Status::NotImplemented( - "Currently only nesting with Lists is supported."); - } - current_field = current_field->type()->child(0); + // Walk downwards to extract nullability + std::vector nullable; + std::vector> offset_builders; + std::vector> valid_bits_builders; + nullable.push_back(current_field->nullable()); + while (current_field->type()->num_children() > 0) { + if (current_field->type()->num_children() > 1) { + return Status::NotImplemented("Fields with more than one child are not supported."); + } else { + if (current_field->type()->id() != ::arrow::Type::LIST) { + return Status::NotImplemented("Currently only nesting with Lists is supported."); } - offset_builders.emplace_back( - std::make_shared<::arrow::Int32Builder>(::arrow::int32(), pool_)); - valid_bits_builders.emplace_back( - std::make_shared<::arrow::BooleanBuilder>(::arrow::boolean(), pool_)); - nullable.push_back(current_field->nullable()); + current_field = current_field->type()->child(0); } + offset_builders.emplace_back( + std::make_shared<::arrow::Int32Builder>(::arrow::int32(), pool_)); + valid_bits_builders.emplace_back( + std::make_shared<::arrow::BooleanBuilder>(::arrow::boolean(), pool_)); + nullable.push_back(current_field->nullable()); + } - int64_t list_depth = offset_builders.size(); - // This describes the minimal definition that describes a level that - // reflects a value in the primitive values array. - int16_t values_def_level = descr_->max_definition_level(); - if (nullable[nullable.size() - 1]) { - values_def_level--; - } + int64_t list_depth = offset_builders.size(); + // This describes the minimal definition that describes a level that + // reflects a value in the primitive values array. + int16_t values_def_level = descr_->max_definition_level(); + if (nullable[nullable.size() - 1]) { + values_def_level--; + } - // The definition levels that are needed so that a list is declared - // as empty and not null. - std::vector empty_def_level(list_depth); - int def_level = 0; - for (int i = 0; i < list_depth; i++) { - if (nullable[i]) { - def_level++; - } - empty_def_level[i] = static_cast(def_level); + // The definition levels that are needed so that a list is declared + // as empty and not null. + std::vector empty_def_level(list_depth); + int def_level = 0; + for (int i = 0; i < list_depth; i++) { + if (nullable[i]) { def_level++; } + empty_def_level[i] = static_cast(def_level); + def_level++; + } - int32_t values_offset = 0; - std::vector null_counts(list_depth, 0); - for (int64_t i = 0; i < total_levels_read; i++) { - int16_t rep_level = rep_levels[i]; - if (rep_level < descr_->max_repetition_level()) { - for (int64_t j = rep_level; j < list_depth; j++) { - if (j == (list_depth - 1)) { - RETURN_NOT_OK(offset_builders[j]->Append(values_offset)); - } else { - RETURN_NOT_OK(offset_builders[j]->Append( - static_cast(offset_builders[j + 1]->length()))); - } + int32_t values_offset = 0; + std::vector null_counts(list_depth, 0); + for (int64_t i = 0; i < total_levels_read; i++) { + int16_t rep_level = rep_levels[i]; + if (rep_level < descr_->max_repetition_level()) { + for (int64_t j = rep_level; j < list_depth; j++) { + if (j == (list_depth - 1)) { + RETURN_NOT_OK(offset_builders[j]->Append(values_offset)); + } else { + RETURN_NOT_OK(offset_builders[j]->Append( + static_cast(offset_builders[j + 1]->length()))); + } - if (((empty_def_level[j] - 1) == def_levels[i]) && (nullable[j])) { - RETURN_NOT_OK(valid_bits_builders[j]->Append(false)); - null_counts[j]++; + if (((empty_def_level[j] - 1) == def_levels[i]) && (nullable[j])) { + RETURN_NOT_OK(valid_bits_builders[j]->Append(false)); + null_counts[j]++; + break; + } else { + RETURN_NOT_OK(valid_bits_builders[j]->Append(true)); + if (empty_def_level[j] == def_levels[i]) { break; - } else { - RETURN_NOT_OK(valid_bits_builders[j]->Append(true)); - if (empty_def_level[j] == def_levels[i]) { - break; - } } } } - if (def_levels[i] >= values_def_level) { - values_offset++; - } } - // Add the final offset to all lists - for (int64_t j = 0; j < list_depth; j++) { - if (j == (list_depth - 1)) { - RETURN_NOT_OK(offset_builders[j]->Append(values_offset)); - } else { - RETURN_NOT_OK(offset_builders[j]->Append( - static_cast(offset_builders[j + 1]->length()))); - } + if (def_levels[i] >= values_def_level) { + values_offset++; } - - std::vector> offsets; - std::vector> valid_bits; - std::vector list_lengths; - for (int64_t j = 0; j < list_depth; j++) { - list_lengths.push_back(offset_builders[j]->length() - 1); - std::shared_ptr array; - RETURN_NOT_OK(offset_builders[j]->Finish(&array)); - offsets.emplace_back(std::static_pointer_cast(array)->values()); - RETURN_NOT_OK(valid_bits_builders[j]->Finish(&array)); - valid_bits.emplace_back(std::static_pointer_cast(array)->values()); + } + // Add the final offset to all lists + for (int64_t j = 0; j < list_depth; j++) { + if (j == (list_depth - 1)) { + RETURN_NOT_OK(offset_builders[j]->Append(values_offset)); + } else { + RETURN_NOT_OK(offset_builders[j]->Append( + static_cast(offset_builders[j + 1]->length()))); } + } - std::shared_ptr output(*array); - for (int64_t j = list_depth - 1; j >= 0; j--) { - auto list_type = - ::arrow::list(::arrow::field("item", output->type(), nullable[j + 1])); - output = std::make_shared<::arrow::ListArray>( - list_type, list_lengths[j], offsets[j], output, valid_bits[j], null_counts[j]); - } - *array = output; + std::vector> offsets; + std::vector> valid_bits; + std::vector list_lengths; + for (int64_t j = 0; j < list_depth; j++) { + list_lengths.push_back(offset_builders[j]->length() - 1); + std::shared_ptr array; + RETURN_NOT_OK(offset_builders[j]->Finish(&array)); + offsets.emplace_back(std::static_pointer_cast(array)->values()); + RETURN_NOT_OK(valid_bits_builders[j]->Finish(&array)); + valid_bits.emplace_back(std::static_pointer_cast(array)->values()); + } + + std::shared_ptr output(*flat_array); + for (int64_t j = list_depth - 1; j >= 0; j--) { + auto list_type = + ::arrow::list(::arrow::field("item", output->type(), nullable[j + 1])); + output = std::make_shared<::arrow::ListArray>(list_type, list_lengths[j], offsets[j], + output, valid_bits[j], null_counts[j]); } + *inout_array = output; return Status::OK(); } @@ -918,8 +957,7 @@ struct TransferFunctor { using ParquetCType = typename ParquetType::c_type; Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - Datum* out) { + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { static_assert(!std::is_same::value, "The fast path transfer functor should be used " "for primitive values"); @@ -947,8 +985,7 @@ template struct TransferFunctor> { Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - Datum* out) { + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { int64_t length = reader->values_written(); std::shared_ptr values = reader->ReleaseValues(); @@ -966,8 +1003,7 @@ struct TransferFunctor struct TransferFunctor<::arrow::BooleanType, BooleanType> { Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - Datum* out) { + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { int64_t length = reader->values_written(); std::shared_ptr data; @@ -1000,8 +1036,7 @@ struct TransferFunctor<::arrow::BooleanType, BooleanType> { template <> struct TransferFunctor<::arrow::TimestampType, Int96Type> { Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - Datum* out) { + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { int64_t length = reader->values_written(); auto values = reinterpret_cast(reader->values()); @@ -1028,8 +1063,7 @@ struct TransferFunctor<::arrow::TimestampType, Int96Type> { template <> struct TransferFunctor<::arrow::Date64Type, Int32Type> { Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - Datum* out) { + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { int64_t length = reader->values_written(); auto values = reinterpret_cast(reader->values()); @@ -1058,16 +1092,18 @@ struct TransferFunctor< typename std::enable_if::value || std::is_same::value>::type> { Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - Datum* out) { - RETURN_NOT_OK(reader->builder()->Finish(out)); + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { + std::vector> chunks = reader->GetBuilderChunks(); if (type->id() == ::arrow::Type::STRING) { // Convert from BINARY type to STRING - auto new_data = (*out)->data()->Copy(); - new_data->type = type; - *out = ::arrow::MakeArray(new_data); + for (size_t i = 0; i < chunks.size(); ++i) { + auto new_data = chunks[i]->data()->Copy(); + new_data->type = type; + chunks[i] = ::arrow::MakeArray(new_data); + } } + *out = std::make_shared(chunks); return Status::OK(); } }; @@ -1175,121 +1211,126 @@ static inline void RawBytesToDecimalBytes(const uint8_t* value, int32_t byte_wid BytesToIntegerPair(value, byte_width, high, low); } -/// \brief Convert an array of FixedLenByteArrays to an arrow::Decimal128Array -/// We do this by: -/// 1. Creating a arrow::FixedSizeBinaryArray from the RecordReader's builder -/// 2. Allocating a buffer for the arrow::Decimal128Array -/// 3. Converting the big-endian bytes in the FixedSizeBinaryArray to two integers -/// representing the high and low bits of each decimal value. +// ---------------------------------------------------------------------- +// BYTE_ARRAY / FIXED_LEN_BYTE_ARRAY -> Decimal128 + template <> -struct TransferFunctor<::arrow::Decimal128Type, FLBAType> { - Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - Datum* out) { - DCHECK_EQ(type->id(), ::arrow::Type::DECIMAL); +Status ConvertToDecimal128(const Array& array, + const std::shared_ptr<::arrow::DataType>& type, + MemoryPool* pool, std::shared_ptr* out) { + const auto& fixed_size_binary_array = + static_cast(array); - // Finish the built data into a temporary array - std::shared_ptr array; - RETURN_NOT_OK(reader->builder()->Finish(&array)); - const auto& fixed_size_binary_array = - static_cast(*array); + // The byte width of each decimal value + const int32_t type_length = + static_cast(*type).byte_width(); - // Get the byte width of the values in the FixedSizeBinaryArray. Most of the time - // this will be different from the decimal array width because we write the minimum - // number of bytes necessary to represent a given precision - const int32_t byte_width = - static_cast(*fixed_size_binary_array.type()) - .byte_width(); + // number of elements in the entire array + const int64_t length = fixed_size_binary_array.length(); - // The byte width of each decimal value - const int32_t type_length = - static_cast(*type).byte_width(); + // Get the byte width of the values in the FixedSizeBinaryArray. Most of the time + // this will be different from the decimal array width because we write the minimum + // number of bytes necessary to represent a given precision + const int32_t byte_width = + static_cast(*fixed_size_binary_array.type()) + .byte_width(); - // number of elements in the entire array - const int64_t length = fixed_size_binary_array.length(); + // allocate memory for the decimal array + std::shared_ptr data; + RETURN_NOT_OK(::arrow::AllocateBuffer(pool, length * type_length, &data)); - // allocate memory for the decimal array - std::shared_ptr data; - RETURN_NOT_OK(::arrow::AllocateBuffer(pool, length * type_length, &data)); - - // raw bytes that we can write to - uint8_t* out_ptr = data->mutable_data(); - - // convert each FixedSizeBinary value to valid decimal bytes - const int64_t null_count = fixed_size_binary_array.null_count(); - if (null_count > 0) { - for (int64_t i = 0; i < length; ++i, out_ptr += type_length) { - if (!fixed_size_binary_array.IsNull(i)) { - RawBytesToDecimalBytes(fixed_size_binary_array.GetValue(i), byte_width, - out_ptr); - } - } - } else { - for (int64_t i = 0; i < length; ++i, out_ptr += type_length) { + // raw bytes that we can write to + uint8_t* out_ptr = data->mutable_data(); + + // convert each FixedSizeBinary value to valid decimal bytes + const int64_t null_count = fixed_size_binary_array.null_count(); + if (null_count > 0) { + for (int64_t i = 0; i < length; ++i, out_ptr += type_length) { + if (!fixed_size_binary_array.IsNull(i)) { RawBytesToDecimalBytes(fixed_size_binary_array.GetValue(i), byte_width, out_ptr); } } - - *out = std::make_shared<::arrow::Decimal128Array>( - type, length, data, fixed_size_binary_array.null_bitmap(), null_count); - return Status::OK(); + } else { + for (int64_t i = 0; i < length; ++i, out_ptr += type_length) { + RawBytesToDecimalBytes(fixed_size_binary_array.GetValue(i), byte_width, out_ptr); + } } -}; -/// \brief Convert an arrow::BinaryArray to an arrow::Decimal128Array -/// We do this by: -/// 1. Creating an arrow::BinaryArray from the RecordReader's builder -/// 2. Allocating a buffer for the arrow::Decimal128Array -/// 3. Converting the big-endian bytes in each BinaryArray entry to two integers -/// representing the high and low bits of each decimal value. -template <> -struct TransferFunctor<::arrow::Decimal128Type, ByteArrayType> { - Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - Datum* out) { - DCHECK_EQ(type->id(), ::arrow::Type::DECIMAL); + *out = std::make_shared<::arrow::Decimal128Array>( + type, length, data, fixed_size_binary_array.null_bitmap(), null_count); - // Finish the built data into a temporary array - std::shared_ptr array; - RETURN_NOT_OK(reader->builder()->Finish(&array)); - const auto& binary_array = static_cast(*array); + return Status::OK(); +} - const int64_t length = binary_array.length(); +template <> + static Status ConvertToDecimal128 < + ByteArrayType(const Array& array, const std::shared_ptr<::arrow::DataType>& type, + std::shared_ptr* out) { + const auto& binary_array = static_cast(*array); + const int64_t length = binary_array.length(); - const auto& decimal_type = static_cast(*type); - const int64_t type_length = decimal_type.byte_width(); + const auto& decimal_type = static_cast(*type); + const int64_t type_length = decimal_type.byte_width(); - std::shared_ptr data; - RETURN_NOT_OK(::arrow::AllocateBuffer(pool, length * type_length, &data)); + std::shared_ptr data; + RETURN_NOT_OK(::arrow::AllocateBuffer(pool, length * type_length, &data)); - // raw bytes that we can write to - uint8_t* out_ptr = data->mutable_data(); + // raw bytes that we can write to + uint8_t* out_ptr = data->mutable_data(); - const int64_t null_count = binary_array.null_count(); + const int64_t null_count = binary_array.null_count(); - // convert each BinaryArray value to valid decimal bytes - for (int64_t i = 0; i < length; i++, out_ptr += type_length) { - int32_t record_len = 0; - const uint8_t* record_loc = binary_array.GetValue(i, &record_len); + // convert each BinaryArray value to valid decimal bytes + for (int64_t i = 0; i < length; i++, out_ptr += type_length) { + int32_t record_len = 0; + const uint8_t* record_loc = binary_array.GetValue(i, &record_len); - if ((record_len < 0) || (record_len > type_length)) { - return Status::Invalid("Invalid BYTE_ARRAY size"); - } + if ((record_len < 0) || (record_len > type_length)) { + return Status::Invalid("Invalid BYTE_ARRAY size"); + } - auto out_ptr_view = reinterpret_cast(out_ptr); - out_ptr_view[0] = 0; - out_ptr_view[1] = 0; + auto out_ptr_view = reinterpret_cast(out_ptr); + out_ptr_view[0] = 0; + out_ptr_view[1] = 0; - // only convert rows that are not null if there are nulls, or - // all rows, if there are not - if (((null_count > 0) && !binary_array.IsNull(i)) || (null_count <= 0)) { - RawBytesToDecimalBytes(record_loc, record_len, out_ptr); - } + // only convert rows that are not null if there are nulls, or + // all rows, if there are not + if (((null_count > 0) && !binary_array.IsNull(i)) || (null_count <= 0)) { + RawBytesToDecimalBytes(record_loc, record_len, out_ptr); } + } + + *out = std::make_shared<::arrow::Decimal128Array>( + type, length, data, binary_array.null_bitmap(), null_count); + return Status::OK(); +} + +/// \brief Convert an arrow::BinaryArray to an arrow::Decimal128Array +/// We do this by: +/// 1. Creating an arrow::BinaryArray from the RecordReader's builder +/// 2. Allocating a buffer for the arrow::Decimal128Array +/// 3. Converting the big-endian bytes in each BinaryArray entry to two integers +/// representing the high and low bits of each decimal value. +struct TransferFunctor< + ArrowType, ParquetType, + typename std::enable_if && + (std::is_same::value || + std::is_same::value)>::type> { + Status operator()(RecordReader* reader, MemoryPool* pool, + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { + DCHECK_EQ(type->id(), ::arrow::Type::DECIMAL); - *out = std::make_shared<::arrow::Decimal128Array>( - type, length, data, binary_array.null_bitmap(), null_count); + ::arrow::ArrayVector chunks = reader->GetBuilderChunks(); + for (size_t i = 0; i < chunks.size(); ++i) { + std::shared_ptr chunk_as_decimal; + RETURN_NOT_OK( + ConvertToDecimal128(*chunks[i], type, pool, &chunk_as_decimal)); + + // Replace the chunk, which will hopefully also free memory as we go + chunks[i] = chunk_as_decimal; + } + *out = std::make_shared(chunks); return Status::OK(); } }; @@ -1351,8 +1392,7 @@ static Status DecimalIntegerTransfer(RecordReader* reader, MemoryPool* pool, template <> struct TransferFunctor<::arrow::Decimal128Type, Int32Type> { Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - Datum* out) { + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { return DecimalIntegerTransfer(reader, pool, type, out); } }; @@ -1360,14 +1400,13 @@ struct TransferFunctor<::arrow::Decimal128Type, Int32Type> { template <> struct TransferFunctor<::arrow::Decimal128Type, Int64Type> { Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - Datum* out) { + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { return DecimalIntegerTransfer(reader, pool, type, out); } }; -#define TRANSFER_DATA(ArrowType, ParquetType) \ - TransferFunctor func; \ +#define TRANSFER_DATA(ArrowType, ParquetType) \ + TransferFunctor func; \ RETURN_NOT_OK(func(record_reader_.get(), pool_, field_->type(), &result)); \ RETURN_NOT_OK(WrapIntoListArray(out)) @@ -1463,6 +1502,15 @@ Status PrimitiveImpl::NextBatch(int64_t records_to_read, return Status::NotImplemented(ss.str()); } + DCHECK_NE(result.kind(), Datum::NONE); + + if (result.kind() == Datum::ARRAY) { + *out = std::make_shared(result.array()); + } else if (result.kind() == Datum::CHUNKED_ARRAY) { + *out = result.chunked_array(); + } else { + DCHECK(false) << "Should be impossible"; + } return Status::OK(); } @@ -1488,10 +1536,17 @@ ColumnReader::ColumnReader(std::unique_ptr impl) ColumnReader::~ColumnReader() {} -Status ColumnReader::NextBatch(int64_t records_to_read, std::shared_ptr* out) { +Status ColumnReader::NextBatch(int64_t records_to_read, + std::shared_ptr* out) { return impl_->NextBatch(records_to_read, out); } +Status ColumnReader::NextBatch(int64_t records_to_read, std::shared_ptr* out) { + std::make_shared chunked_out; + RETURN_NOT_OK(impl_->NextBatch(records_to_read, &chunked_out)); + return GetSingleChunk(*chunked_out, out); +} + // StructImpl methods Status StructImpl::DefLevelsToNullArray(std::shared_ptr* null_bitmap_out, @@ -1576,17 +1631,21 @@ Status StructImpl::GetRepLevels(const int16_t** data, size_t* length) { return Status::NotImplemented("GetRepLevels is not implemented for struct"); } -Status StructImpl::NextBatch(int64_t records_to_read, std::shared_ptr* out) { +Status StructImpl::NextBatch(int64_t records_to_read, + std::shared_ptr* out) { std::vector> children_arrays; std::shared_ptr null_bitmap; int64_t null_count; // Gather children arrays and def levels for (auto& child : children_) { - std::shared_ptr child_array; + std::shared_ptr field; + RETURN_NOT_OK(child->NextBatch(records_to_read, &field)); - RETURN_NOT_OK(child->NextBatch(records_to_read, &child_array)); - children_arrays.push_back(child_array); + if (field->num_chunks() > 1) { + return Status::Invalid("Chunked field reads not yet supported with StructArray"); + } + children_arrays.push_back(field->chunk(0)); } RETURN_NOT_OK(DefLevelsToNullArray(&null_bitmap, &null_count)); @@ -1600,8 +1659,9 @@ Status StructImpl::NextBatch(int64_t records_to_read, std::shared_ptr* ou } } - *out = std::make_shared(field()->type(), struct_length, children_arrays, - null_bitmap, null_count); + auto result = std::make_shared(field()->type(), struct_length, + children_arrays, null_bitmap, null_count); + *out = std::make_shared(result); return Status::OK(); } @@ -1631,11 +1691,7 @@ Status ColumnChunkReader::Read(std::shared_ptr<::arrow::ChunkedArray>* out) { Status ColumnChunkReader::Read(std::shared_ptr<::arrow::Array>* out) { std::make_shared chunked_out; RETURN_NOT_OK(impl_->ReadColumnChunk(column_index_, row_group_index_, &chunked_out)); - DCHECK_GT(chunked_out->num_chunks(), 0); - if (chunked_out->num_chunks() > 1) { - return Status::Invalid("ColumnChunk result is a chunked array"); - } - return chunked_out->chunk(0); + return GetSingleChunk(*chunked_out, out); } ColumnChunkReader::~ColumnChunkReader() {} diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index 1b1bc3bb013..9783241d6ee 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -30,6 +30,7 @@ namespace arrow { class Array; +class ChunkedArray; class MemoryPool; class RecordBatchReader; class Schema; @@ -125,6 +126,9 @@ class PARQUET_EXPORT FileReader { std::shared_ptr<::arrow::Schema>* out); // Read column as a whole into an Array. + ::arrow::Status ReadColumn(int i, std::shared_ptr<::arrow::ChunkedArray>* out); + + PARQUET_DEPRECATED("Use version with ChunkedArray output") ::arrow::Status ReadColumn(int i, std::shared_ptr<::arrow::Array>* out); // NOTE: Experimental API @@ -139,27 +143,10 @@ class PARQUET_EXPORT FileReader { // 2 foo3 // // i=0 will read the entire foo struct, i=1 the foo2 primitive column etc - ::arrow::Status ReadSchemaField(int i, std::shared_ptr<::arrow::Array>* out); + ::arrow::Status ReadSchemaField(int i, std::shared_ptr<::arrow::ChunkedArray>* out); - // NOTE: Experimental API - // Reads a specific top level schema field into an Array, while keeping only chosen - // leaf columns. - // The index i refers the index of the top level schema field, which may - // be nested or flat, and indices vector refers to the leaf column indices - e.g. - // - // i indices - // 0 0 foo.bar - // 0 1 foo.bar.baz - // 0 2 foo.qux - // 1 3 foo2 - // 2 4 foo3 - // - // i=0 indices={0,2} will read a partial struct with foo.bar and foo.quox columns - // i=1 indices={3} will read foo2 column - // i=1 indices={2} will result in out=nullptr - // leaf indices which are unrelated to the schema field are ignored - ::arrow::Status ReadSchemaField(int i, const std::vector& indices, - std::shared_ptr<::arrow::Array>* out); + PARQUET_DEPRECATED("Use version with ChunkedArray output") + ::arrow::Status ReadSchemaField(int i, std::shared_ptr<::arrow::Array>* out); /// \brief Return a RecordBatchReader of row groups selected from row_group_indices, the /// ordering in row_group_indices matters. @@ -250,6 +237,7 @@ class PARQUET_EXPORT ColumnChunkReader { public: ::arrow::Status Read(std::shared_ptr<::arrow::ChunkedArray>* out); + PARQUET_DEPRECATED("Use version with ChunkedArray output") ::arrow::Status Read(std::shared_ptr<::arrow::Array>* out); virtual ~ColumnChunkReader(); @@ -283,6 +271,10 @@ class PARQUET_EXPORT ColumnReader { // // Returns Status::OK on a successful read, including if you have exhausted // the data available in the file. + ::arrow::Status NextBatch(int64_t batch_size, + std::shared_ptr<::arrow::ChunkedArray>* out); + + PARQUET_DEPRECATED("Use version with ChunkedArray output") ::arrow::Status NextBatch(int64_t batch_size, std::shared_ptr<::arrow::Array>* out); private: diff --git a/cpp/src/parquet/arrow/record_reader.cc b/cpp/src/parquet/arrow/record_reader.cc index fdbb7095077..d60dfae017e 100644 --- a/cpp/src/parquet/arrow/record_reader.cc +++ b/cpp/src/parquet/arrow/record_reader.cc @@ -381,7 +381,7 @@ class RecordReader::RecordReaderImpl { virtual void DebugPrintState() = 0; - virtual std::vector> GetChunks() = 0; + virtual std::vector> GetBuilderChunks() = 0; protected: virtual bool ReadNewPage() = 0; @@ -558,7 +558,7 @@ class TypedRecordReader : public RecordReader::RecordReaderImpl { std::cout << std::endl; } - virtual std::vector> GetChunks() override { + std::vector> GetBuilderChunks() override { throw ParquetException("GetChunks only implemented for binary types"); } @@ -591,14 +591,14 @@ template <> void TypedRecordReader::DebugPrintState() {} template <> -::arrow::ArrayVector TypedRecordReader::GetChunks() { +::arrow::ArrayVector TypedRecordReader::GetBuilderChunks() { ::arrow::ArrayVector chunks; PARQUET_THROW_NOT_OK(builder_->Finish(&chunks)); return chunks; } template <> -::arrow::ArrayVector TypedRecordReader::GetChunks() { +::arrow::ArrayVector TypedRecordReader::GetBuilderChunks() { std::shared_ptr chunk; PARQUET_THROW_NOT_OK(builder_->Finish(&chunk)); return ::arrow::ArrayVector({chunk}); diff --git a/cpp/src/parquet/arrow/record_reader.h b/cpp/src/parquet/arrow/record_reader.h index 7efd0d54899..0f62b744f32 100644 --- a/cpp/src/parquet/arrow/record_reader.h +++ b/cpp/src/parquet/arrow/record_reader.h @@ -20,6 +20,7 @@ #include #include +#include #include "arrow/memory_pool.h" @@ -28,7 +29,7 @@ namespace arrow { -class ArrayBuilder; +class Array; } // namespace arrow @@ -77,7 +78,6 @@ class RecordReader { std::shared_ptr ReleaseValues(); std::shared_ptr ReleaseIsValid(); - ::arrow::ArrayBuilder* builder(); /// \brief Number of values written including nulls (if any) int64_t values_written() const; @@ -106,6 +106,9 @@ class RecordReader { void DebugPrintState(); + // For BYTE_ARRAY, FIXED_LEN_BYTE_ARRAY types that may have chunked output + std::vector> GetBuilderChunks(); + private: std::unique_ptr impl_; explicit RecordReader(RecordReaderImpl* impl); From 2efae064c558b16ec8c029f84d4a363cb1a46e88 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 13 Dec 2018 17:00:39 -0600 Subject: [PATCH 04/12] Finish scaffolding. Get fully compiling again and original parquet-arrow test suite passing Change-Id: Icb260f6ffc4f41ee7519653bf8d3f48c2da30091 --- cpp/src/arrow/compute/compute-test.cc | 4 +++ cpp/src/arrow/compute/kernel.h | 9 +++++ cpp/src/arrow/table.h | 3 +- cpp/src/parquet/arrow/reader.cc | 40 +++++++++++++-------- cpp/src/parquet/arrow/reader.h | 12 ++++--- cpp/src/parquet/arrow/record_reader.cc | 48 ++++++++++++++++++-------- 6 files changed, 81 insertions(+), 35 deletions(-) diff --git a/cpp/src/arrow/compute/compute-test.cc b/cpp/src/arrow/compute/compute-test.cc index 2d23eebebe2..e34a086d8e2 100644 --- a/cpp/src/arrow/compute/compute-test.cc +++ b/cpp/src/arrow/compute/compute-test.cc @@ -82,6 +82,10 @@ void CheckImplicitConstructor(enum Datum::type expected_kind) { TEST(TestDatum, ImplicitConstructors) { CheckImplicitConstructor(Datum::ARRAY); + + // Instantiate from array subclass + CheckImplicitConstructor(Datum::ARRAY); + CheckImplicitConstructor(Datum::CHUNKED_ARRAY); CheckImplicitConstructor(Datum::RECORD_BATCH); CheckImplicitConstructor
(Datum::TABLE); diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h index bdd0a7cdb71..fcc06f37be8 100644 --- a/cpp/src/arrow/compute/kernel.h +++ b/cpp/src/arrow/compute/kernel.h @@ -65,8 +65,10 @@ struct ARROW_EXPORT Datum { : value(value) {} Datum(const std::shared_ptr& value) // NOLINT implicit conversion : value(value) {} + Datum(const std::shared_ptr& value) // NOLINT implicit conversion : Datum(value ? value->data() : NULLPTR) {} + Datum(const std::shared_ptr& value) // NOLINT implicit conversion : value(value) {} Datum(const std::shared_ptr& value) // NOLINT implicit conversion @@ -76,6 +78,13 @@ struct ARROW_EXPORT Datum { Datum(const std::vector& value) // NOLINT implicit conversion : value(value) {} + // Cast from subtypes of Array to Datum + template ::value, + void>::type> + Datum(const std::shared_ptr& value) // NOLINT implicit conversion + : Datum(std::shared_ptr(value)) {} + ~Datum() {} Datum(const Datum& other) noexcept { this->value = other.value; } diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index 9299ca6c1ba..2ac34b4cde5 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -46,7 +46,8 @@ class ARROW_EXPORT ChunkedArray { explicit ChunkedArray(const ArrayVector& chunks); /// \brief Construct a chunked array from a single Array - explicit ChunkedArray(const std::shared_ptr& chunk) : ChunkedArray({chunk}) {} + explicit ChunkedArray(const std::shared_ptr& chunk) + : ChunkedArray(ArrayVector({chunk})) {} /// \brief Construct a chunked array from a vector of arrays and a data type /// diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index a327c44476e..12093eda8b6 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -92,7 +92,7 @@ static inline int64_t impala_timestamp_to_nanoseconds(const Int96& impala_timest template using ArrayType = typename ::arrow::TypeTraits::ArrayType; -namespace internal { +namespace { Status GetSingleChunk(const ChunkedArray& chunked, std::shared_ptr* out) { DCHECK_GT(chunked.num_chunks(), 0); @@ -103,7 +103,7 @@ Status GetSingleChunk(const ChunkedArray& chunked, std::shared_ptr* out) return Status::OK(); } -} // namespace internal +} // namespace // ---------------------------------------------------------------------- // Iteration utilities @@ -814,7 +814,7 @@ Status PrimitiveImpl::WrapIntoListArray(Datum* inout_array) { flat_array = inout_array->chunked_array()->chunk(0); } else { DCHECK_EQ(Datum::ARRAY, inout_array->kind()); - flat_array = inout_array->array(); + flat_array = inout_array->make_array(); } const int16_t* def_levels = record_reader_->def_levels(); @@ -919,7 +919,7 @@ Status PrimitiveImpl::WrapIntoListArray(Datum* inout_array) { valid_bits.emplace_back(std::static_pointer_cast(array)->values()); } - std::shared_ptr output(*flat_array); + std::shared_ptr output = flat_array; for (int64_t j = list_depth - 1; j >= 0; j--) { auto list_type = ::arrow::list(::arrow::field("item", output->type(), nullable[j + 1])); @@ -1089,8 +1089,11 @@ struct TransferFunctor<::arrow::Date64Type, Int32Type> { template struct TransferFunctor< ArrowType, ParquetType, - typename std::enable_if::value || - std::is_same::value>::type> { + typename std::enable_if< + (std::is_base_of<::arrow::BinaryType, ArrowType>::value || + std::is_same<::arrow::FixedSizeBinaryType, ArrowType>::value) && + (std::is_same::value || + std::is_same::value)>::type> { Status operator()(RecordReader* reader, MemoryPool* pool, const std::shared_ptr<::arrow::DataType>& type, Datum* out) { std::vector> chunks = reader->GetBuilderChunks(); @@ -1214,6 +1217,12 @@ static inline void RawBytesToDecimalBytes(const uint8_t* value, int32_t byte_wid // ---------------------------------------------------------------------- // BYTE_ARRAY / FIXED_LEN_BYTE_ARRAY -> Decimal128 +template +Status ConvertToDecimal128(const Array& array, const std::shared_ptr<::arrow::DataType>&, + MemoryPool* pool, std::shared_ptr*) { + return Status::NotImplemented("not implemented"); +} + template <> Status ConvertToDecimal128(const Array& array, const std::shared_ptr<::arrow::DataType>& type, @@ -1263,10 +1272,10 @@ Status ConvertToDecimal128(const Array& array, } template <> - static Status ConvertToDecimal128 < - ByteArrayType(const Array& array, const std::shared_ptr<::arrow::DataType>& type, - std::shared_ptr* out) { - const auto& binary_array = static_cast(*array); +Status ConvertToDecimal128(const Array& array, + const std::shared_ptr<::arrow::DataType>& type, + MemoryPool* pool, std::shared_ptr* out) { + const auto& binary_array = static_cast(array); const int64_t length = binary_array.length(); const auto& decimal_type = static_cast(*type); @@ -1311,9 +1320,10 @@ template <> /// 2. Allocating a buffer for the arrow::Decimal128Array /// 3. Converting the big-endian bytes in each BinaryArray entry to two integers /// representing the high and low bits of each decimal value. +template struct TransferFunctor< ArrowType, ParquetType, - typename std::enable_if && + typename std::enable_if::value && (std::is_same::value || std::is_same::value)>::type> { Status operator()(RecordReader* reader, MemoryPool* pool, @@ -1408,7 +1418,7 @@ struct TransferFunctor<::arrow::Decimal128Type, Int64Type> { #define TRANSFER_DATA(ArrowType, ParquetType) \ TransferFunctor func; \ RETURN_NOT_OK(func(record_reader_.get(), pool_, field_->type(), &result)); \ - RETURN_NOT_OK(WrapIntoListArray(out)) + RETURN_NOT_OK(WrapIntoListArray(&result)) #define TRANSFER_CASE(ENUM, ArrowType, ParquetType) \ case ::arrow::Type::ENUM: { \ @@ -1505,7 +1515,7 @@ Status PrimitiveImpl::NextBatch(int64_t records_to_read, DCHECK_NE(result.kind(), Datum::NONE); if (result.kind() == Datum::ARRAY) { - *out = std::make_shared(result.array()); + *out = std::make_shared(result.make_array()); } else if (result.kind() == Datum::CHUNKED_ARRAY) { *out = result.chunked_array(); } else { @@ -1542,7 +1552,7 @@ Status ColumnReader::NextBatch(int64_t records_to_read, } Status ColumnReader::NextBatch(int64_t records_to_read, std::shared_ptr* out) { - std::make_shared chunked_out; + std::shared_ptr chunked_out; RETURN_NOT_OK(impl_->NextBatch(records_to_read, &chunked_out)); return GetSingleChunk(*chunked_out, out); } @@ -1689,7 +1699,7 @@ Status ColumnChunkReader::Read(std::shared_ptr<::arrow::ChunkedArray>* out) { } Status ColumnChunkReader::Read(std::shared_ptr<::arrow::Array>* out) { - std::make_shared chunked_out; + std::shared_ptr chunked_out; RETURN_NOT_OK(impl_->ReadColumnChunk(column_index_, row_group_index_, &chunked_out)); return GetSingleChunk(*chunked_out, out); } diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index 9783241d6ee..5286e742b08 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -128,7 +128,8 @@ class PARQUET_EXPORT FileReader { // Read column as a whole into an Array. ::arrow::Status ReadColumn(int i, std::shared_ptr<::arrow::ChunkedArray>* out); - PARQUET_DEPRECATED("Use version with ChunkedArray output") + /// \note Deprecated since 0.12 + ARROW_DEPRECATED("Use version with ChunkedArray output") ::arrow::Status ReadColumn(int i, std::shared_ptr<::arrow::Array>* out); // NOTE: Experimental API @@ -145,7 +146,8 @@ class PARQUET_EXPORT FileReader { // i=0 will read the entire foo struct, i=1 the foo2 primitive column etc ::arrow::Status ReadSchemaField(int i, std::shared_ptr<::arrow::ChunkedArray>* out); - PARQUET_DEPRECATED("Use version with ChunkedArray output") + /// \note Deprecated since 0.12 + ARROW_DEPRECATED("Use version with ChunkedArray output") ::arrow::Status ReadSchemaField(int i, std::shared_ptr<::arrow::Array>* out); /// \brief Return a RecordBatchReader of row groups selected from row_group_indices, the @@ -237,7 +239,8 @@ class PARQUET_EXPORT ColumnChunkReader { public: ::arrow::Status Read(std::shared_ptr<::arrow::ChunkedArray>* out); - PARQUET_DEPRECATED("Use version with ChunkedArray output") + /// \note Deprecated since 0.12 + ARROW_DEPRECATED("Use version with ChunkedArray output") ::arrow::Status Read(std::shared_ptr<::arrow::Array>* out); virtual ~ColumnChunkReader(); @@ -274,7 +277,8 @@ class PARQUET_EXPORT ColumnReader { ::arrow::Status NextBatch(int64_t batch_size, std::shared_ptr<::arrow::ChunkedArray>* out); - PARQUET_DEPRECATED("Use version with ChunkedArray output") + /// \note Deprecated since 0.12 + ARROW_DEPRECATED("Use version with ChunkedArray output") ::arrow::Status NextBatch(int64_t batch_size, std::shared_ptr<::arrow::Array>* out); private: diff --git a/cpp/src/parquet/arrow/record_reader.cc b/cpp/src/parquet/arrow/record_reader.cc index d60dfae017e..d1bf2c5cdfd 100644 --- a/cpp/src/parquet/arrow/record_reader.cc +++ b/cpp/src/parquet/arrow/record_reader.cc @@ -381,7 +381,7 @@ class RecordReader::RecordReaderImpl { virtual void DebugPrintState() = 0; - virtual std::vector> GetBuilderChunks() = 0; + virtual std::vector> GetBuilderChunks() = 0; protected: virtual bool ReadNewPage() = 0; @@ -440,12 +440,17 @@ class RecordReader::RecordReaderImpl { template struct RecordReaderTraits { - using BuilderType = void; + using BuilderType = ::arrow::ArrayBuilder; }; template <> struct RecordReaderTraits { - using BuilderType = ::arrow::ChunkedBinaryBuilder; + using BuilderType = ::arrow::internal::ChunkedBinaryBuilder; +}; + +template <> +struct RecordReaderTraits { + using BuilderType = ::arrow::FixedSizeBinaryBuilder; }; template @@ -455,18 +460,9 @@ class TypedRecordReader : public RecordReader::RecordReaderImpl { using BuilderType = typename RecordReaderTraits::BuilderType; - // Maximum of 16MB chunks - static constexpr int32_t kBinaryChunksize = 1 << 24; - TypedRecordReader(const ColumnDescriptor* descr, ::arrow::MemoryPool* pool) : RecordReader::RecordReaderImpl(descr, pool), current_decoder_(nullptr) { - if (descr->physical_type() == Type::BYTE_ARRAY) { - builder_.reset(new ::arrow::internal::ChunkedBinaryBuilder(kBinaryChunksize, pool)); - } else if (descr->physical_type() == Type::FIXED_LEN_BYTE_ARRAY) { - int byte_width = descr->type_length(); - std::shared_ptr<::arrow::DataType> type = ::arrow::fixed_size_binary(byte_width); - builder_.reset(new ::arrow::FixedSizeBinaryBuilder(type, pool)); - } + InitializeBuilder(); } void ResetDecoders() override { decoders_.clear(); } @@ -558,7 +554,7 @@ class TypedRecordReader : public RecordReader::RecordReaderImpl { std::cout << std::endl; } - std::vector> GetBuilderChunks() override { + std::vector> GetBuilderChunks() override { throw ParquetException("GetChunks only implemented for binary types"); } @@ -577,6 +573,8 @@ class TypedRecordReader : public RecordReader::RecordReaderImpl { // Advance to the next data page bool ReadNewPage() override; + void InitializeBuilder() {} + void ConfigureDictionary(const DictionaryPage* page); }; @@ -590,6 +588,22 @@ void TypedRecordReader::DebugPrintState() {} template <> void TypedRecordReader::DebugPrintState() {} +template <> +void TypedRecordReader::InitializeBuilder() { + // Maximum of 16MB chunks + constexpr int32_t kBinaryChunksize = 1 << 24; + DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY); + builder_.reset(new ::arrow::internal::ChunkedBinaryBuilder(kBinaryChunksize, pool_)); +} + +template <> +void TypedRecordReader::InitializeBuilder() { + DCHECK_EQ(descr_->physical_type(), Type::FIXED_LEN_BYTE_ARRAY); + int byte_width = descr_->type_length(); + std::shared_ptr<::arrow::DataType> type = ::arrow::fixed_size_binary(byte_width); + builder_.reset(new ::arrow::FixedSizeBinaryBuilder(type, pool_)); +} + template <> ::arrow::ArrayVector TypedRecordReader::GetBuilderChunks() { ::arrow::ArrayVector chunks; @@ -599,7 +613,7 @@ ::arrow::ArrayVector TypedRecordReader::GetBuilderChunks() { template <> ::arrow::ArrayVector TypedRecordReader::GetBuilderChunks() { - std::shared_ptr chunk; + std::shared_ptr<::arrow::Array> chunk; PARQUET_THROW_NOT_OK(builder_->Finish(&chunk)); return ::arrow::ArrayVector({chunk}); } @@ -888,6 +902,10 @@ void RecordReader::SetPageReader(std::unique_ptr reader) { impl_->SetPageReader(std::move(reader)); } +::arrow::ArrayVector RecordReader::GetBuilderChunks() { + return impl_->GetBuilderChunks(); +} + void RecordReader::DebugPrintState() { impl_->DebugPrintState(); } } // namespace internal From 81e787c691f435001d1ec603c82fef1b22dcaf05 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 13 Dec 2018 17:10:12 -0600 Subject: [PATCH 05/12] Fix up Python bindings, unit test Change-Id: I35ab3ace0e4ca7a80fc7d85e55ac55ea222b15dc --- cpp/src/parquet/arrow/CMakeLists.txt | 4 +++- python/pyarrow/_parquet.pxd | 6 +++--- python/pyarrow/_parquet.pyx | 23 +++++++---------------- python/pyarrow/lib.pxd | 2 ++ python/pyarrow/tests/test_parquet.py | 4 +++- 5 files changed, 18 insertions(+), 21 deletions(-) diff --git a/cpp/src/parquet/arrow/CMakeLists.txt b/cpp/src/parquet/arrow/CMakeLists.txt index d7e308e2ed7..3c915d47853 100644 --- a/cpp/src/parquet/arrow/CMakeLists.txt +++ b/cpp/src/parquet/arrow/CMakeLists.txt @@ -21,7 +21,9 @@ ADD_PARQUET_TEST(arrow-reader-writer-test) ADD_BENCHMARK(reader-writer-benchmark PREFIX "parquet-arrow" EXTRA_LINK_LIBS ${PARQUET_BENCHMARK_LINK_LIBRARIES}) -add_dependencies(parquet parquet-arrow-reader-writer-benchmark) +if (TARGET parquet-arrow-reader-writer-benchmark) + add_dependencies(parquet parquet-arrow-reader-writer-benchmark) +endif() # Headers: top level install(FILES diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index 9e1a24961af..b63e72c57cf 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -19,7 +19,7 @@ # cython: language_level = 3 from pyarrow.includes.common cimport * -from pyarrow.includes.libarrow cimport (CArray, CSchema, CStatus, +from pyarrow.includes.libarrow cimport (CChunkedArray, CSchema, CStatus, CTable, CMemoryPool, CKeyValueMetadata, RandomAccessFile, OutputStream, @@ -272,8 +272,8 @@ cdef extern from "parquet/arrow/reader.h" namespace "parquet::arrow" nogil: cdef cppclass FileReader: FileReader(CMemoryPool* pool, unique_ptr[ParquetFileReader] reader) - CStatus ReadColumn(int i, shared_ptr[CArray]* out) - CStatus ReadSchemaField(int i, shared_ptr[CArray]* out) + CStatus ReadColumn(int i, shared_ptr[CChunkedArray]* out) + CStatus ReadSchemaField(int i, shared_ptr[CChunkedArray]* out) int num_row_groups() CStatus ReadRowGroup(int i, shared_ptr[CTable]* out) diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 8112504e9e4..36a4d345c6a 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -26,6 +26,7 @@ from pyarrow.lib cimport (Array, Schema, check_status, MemoryPool, maybe_unbox_memory_pool, Table, + pyarrow_wrap_chunked_array, pyarrow_wrap_schema, pyarrow_wrap_table, NativeFile, get_reader, get_writer) @@ -770,28 +771,18 @@ cdef class ParquetReader: return self._column_idx_map[tobytes(column_name)] def read_column(self, int column_index): - cdef: - Array array = Array() - shared_ptr[CArray] carray - + cdef shared_ptr[CChunkedArray] out with nogil: check_status(self.reader.get() - .ReadColumn(column_index, &carray)) - - array.init(carray) - return array + .ReadColumn(column_index, &out)) + return pyarrow_wrap_chunked_array(out) def read_schema_field(self, int field_index): - cdef: - Array array = Array() - shared_ptr[CArray] carray - + cdef shared_ptr[CChunkedArray] out with nogil: check_status(self.reader.get() - .ReadSchemaField(field_index, &carray)) - - array.init(carray) - return array + .ReadSchemaField(field_index, &out)) + return pyarrow_wrap_chunked_array(out) cdef class ParquetWriter: diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 745a049e32a..3e628263ba3 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -396,6 +396,8 @@ cdef object pyarrow_wrap_metadata( # cdef public object pyarrow_wrap_array(const shared_ptr[CArray]& sp_array) +cdef public object pyarrow_wrap_chunked_array( + const shared_ptr[CChunkedArray]& sp_array) # XXX pyarrow.h calls it `wrap_record_batch` cdef public object pyarrow_wrap_batch(const shared_ptr[CRecordBatch]& cbatch) cdef public object pyarrow_wrap_buffer(const shared_ptr[CBuffer]& buf) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 77a101a1f42..5c27a9b86a3 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -1979,7 +1979,9 @@ def test_binary_array_overflow_to_chunked(): col0_data = read_tbl[0].data assert isinstance(col0_data, pa.ChunkedArray) - assert col0_data.num_chunks == 2 + + # Split up into 16MB chunks. 128 * 16 = 2048, so 129 + assert col0_data.num_chunks == 129 assert tbl.equals(read_tbl) From 8ffaec1ef74a4d1cee98b8e07e649c15153a36e2 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 13 Dec 2018 21:06:18 -0600 Subject: [PATCH 06/12] Address preliminary code comments. Check in missing files Change-Id: I8f0a35ae4e8581790f7731ee2ed023a54caf0f31 --- cpp/src/arrow/array-binary-test.cc | 8 +++++++ cpp/src/arrow/columnar/CMakeLists.txt | 27 ++++++++++++++++++++++++ cpp/src/arrow/columnar/builder_binary.cc | 14 +++++------- cpp/src/arrow/columnar/builder_nested.h | 7 ++---- cpp/src/arrow/compute/kernel.h | 3 +-- cpp/src/parquet/arrow/reader.cc | 2 +- 6 files changed, 44 insertions(+), 17 deletions(-) create mode 100644 cpp/src/arrow/columnar/CMakeLists.txt diff --git a/cpp/src/arrow/array-binary-test.cc b/cpp/src/arrow/array-binary-test.cc index 5bfdc993161..6f938c82bfd 100644 --- a/cpp/src/arrow/array-binary-test.cc +++ b/cpp/src/arrow/array-binary-test.cc @@ -754,6 +754,14 @@ TEST_F(TestChunkedBinaryBuilder, LargeElements) { ArrayVector chunks; ASSERT_OK(builder_->Finish(&chunks)); ASSERT_EQ(iterations, static_cast(chunks.size())); + + int64_t total_data_size = 0; + for (auto chunk : chunks) { + ASSERT_EQ(1, chunk->length()); + total_data_size += + static_cast(static_cast(*chunk).GetView(0).size()); + } + ASSERT_EQ(iterations * bufsize, total_data_size); } TEST(TestChunkedStringBuilder, BasicOperation) { diff --git a/cpp/src/arrow/columnar/CMakeLists.txt b/cpp/src/arrow/columnar/CMakeLists.txt new file mode 100644 index 00000000000..3cd675f4010 --- /dev/null +++ b/cpp/src/arrow/columnar/CMakeLists.txt @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Headers: top level +install(FILES + builder_adaptive.h + builder_base.h + builder_binary.h + builder_decimal.h + builder_dict.h + builder_nested.h + builder_primitive.h + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/columnar") diff --git a/cpp/src/arrow/columnar/builder_binary.cc b/cpp/src/arrow/columnar/builder_binary.cc index 36a78a4d920..82334134612 100644 --- a/cpp/src/arrow/columnar/builder_binary.cc +++ b/cpp/src/arrow/columnar/builder_binary.cc @@ -305,17 +305,13 @@ Status ChunkedBinaryBuilder::NextChunk() { } Status ChunkedStringBuilder::Finish(ArrayVector* out) { - ArrayVector temp; - RETURN_NOT_OK(ChunkedBinaryBuilder::Finish(&temp)); + RETURN_NOT_OK(ChunkedBinaryBuilder::Finish(out)); - out->clear(); - out->reserve(temp.size()); - - // XXX: Change data type to string/utf8 - for (const auto& chunk : temp) { - std::shared_ptr data = chunk->data(); + // Change data type to string/utf8 + for (size_t i = 0; i < out->size(); ++i) { + std::shared_ptr data = (*out)[i]->data(); data->type = ::arrow::utf8(); - out->emplace_back(std::make_shared(data)); + (*out)[i] = std::make_shared(data); } return Status::OK(); } diff --git a/cpp/src/arrow/columnar/builder_nested.h b/cpp/src/arrow/columnar/builder_nested.h index aa6c439dffd..a9cb412b644 100644 --- a/cpp/src/arrow/columnar/builder_nested.h +++ b/cpp/src/arrow/columnar/builder_nested.h @@ -113,12 +113,9 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder { void Reset() override; - ArrayBuilder* field_builder(int i) const { return field_builders_[i].get(); } + ArrayBuilder* field_builder(int i) const { return children_[i].get(); } - int num_fields() const { return static_cast(field_builders_.size()); } - - protected: - std::vector> field_builders_; + int num_fields() const { return static_cast(children_.size()); } }; } // namespace arrow diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h index fcc06f37be8..87080b1000d 100644 --- a/cpp/src/arrow/compute/kernel.h +++ b/cpp/src/arrow/compute/kernel.h @@ -80,8 +80,7 @@ struct ARROW_EXPORT Datum { // Cast from subtypes of Array to Datum template ::value, - void>::type> + typename = typename std::enable_if::value>::type> Datum(const std::shared_ptr& value) // NOLINT implicit conversion : Datum(std::shared_ptr(value)) {} diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 12093eda8b6..2a7730d42ad 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -32,7 +32,7 @@ #include "arrow/util/logging.h" #include "arrow/util/thread-pool.h" -// XXX(wesm): For arrow::compute::Datum. This should perhaps be promoted +// For arrow::compute::Datum. This should perhaps be promoted. See ARROW-4022 #include "arrow/compute/kernel.h" #include "parquet/arrow/record_reader.h" From 5fdbbb2613a48f91eb397e6be0f9d8fad0c76a18 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 14 Dec 2018 09:09:54 -0600 Subject: [PATCH 07/12] Rename columnar/ directory to array/ Change-Id: I7fac456a34aa81683fa7315ae1b287be7f0d16e0 --- cpp/src/arrow/CMakeLists.txt | 21 ++++++++++--------- .../arrow/{columnar => array}/CMakeLists.txt | 2 +- cpp/src/arrow/array/README.md | 20 ++++++++++++++++++ .../{columnar => array}/builder_adaptive.cc | 2 +- .../{columnar => array}/builder_adaptive.h | 2 +- .../arrow/{columnar => array}/builder_base.cc | 2 +- .../arrow/{columnar => array}/builder_base.h | 2 +- .../{columnar => array}/builder_binary.cc | 2 +- .../{columnar => array}/builder_binary.h | 2 +- .../{columnar => array}/builder_decimal.cc | 2 +- .../{columnar => array}/builder_decimal.h | 4 ++-- .../arrow/{columnar => array}/builder_dict.cc | 2 +- .../arrow/{columnar => array}/builder_dict.h | 4 ++-- .../{columnar => array}/builder_nested.cc | 2 +- .../{columnar => array}/builder_nested.h | 2 +- .../{columnar => array}/builder_primitive.cc | 2 +- .../{columnar => array}/builder_primitive.h | 2 +- cpp/src/arrow/builder.h | 14 ++++++------- 18 files changed, 55 insertions(+), 34 deletions(-) rename cpp/src/arrow/{columnar => array}/CMakeLists.txt (94%) create mode 100644 cpp/src/arrow/array/README.md rename cpp/src/arrow/{columnar => array}/builder_adaptive.cc (99%) rename cpp/src/arrow/{columnar => array}/builder_adaptive.h (99%) rename cpp/src/arrow/{columnar => array}/builder_base.cc (99%) rename cpp/src/arrow/{columnar => array}/builder_base.h (99%) rename cpp/src/arrow/{columnar => array}/builder_binary.cc (99%) rename cpp/src/arrow/{columnar => array}/builder_binary.h (99%) rename cpp/src/arrow/{columnar => array}/builder_decimal.cc (97%) rename cpp/src/arrow/{columnar => array}/builder_decimal.h (94%) rename cpp/src/arrow/{columnar => array}/builder_dict.cc (99%) rename cpp/src/arrow/{columnar => array}/builder_dict.h (97%) rename cpp/src/arrow/{columnar => array}/builder_nested.cc (99%) rename cpp/src/arrow/{columnar => array}/builder_nested.h (99%) rename cpp/src/arrow/{columnar => array}/builder_primitive.cc (99%) rename cpp/src/arrow/{columnar => array}/builder_primitive.h (99%) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index e381d706c68..96da890b6cc 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -53,8 +53,17 @@ endfunction() set(ARROW_SRCS array.cc - buffer.cc + builder.cc + array/builder_adaptive.cc + array/builder_base.cc + array/builder_binary.cc + array/builder_decimal.cc + array/builder_dict.cc + array/builder_nested.cc + array/builder_primitive.cc + + buffer.cc compare.cc memory_pool.cc pretty_print.cc @@ -66,14 +75,6 @@ set(ARROW_SRCS type.cc visitor.cc - columnar/builder_adaptive.cc - columnar/builder_base.cc - columnar/builder_binary.cc - columnar/builder_decimal.cc - columnar/builder_dict.cc - columnar/builder_nested.cc - columnar/builder_primitive.cc - csv/converter.cc csv/chunker.cc csv/column-builder.cc @@ -302,7 +303,7 @@ ADD_ARROW_TEST(tensor-test) ADD_ARROW_BENCHMARK(builder-benchmark) ADD_ARROW_BENCHMARK(column-benchmark) -add_subdirectory(columnar) +add_subdirectory(array) add_subdirectory(csv) add_subdirectory(io) add_subdirectory(util) diff --git a/cpp/src/arrow/columnar/CMakeLists.txt b/cpp/src/arrow/array/CMakeLists.txt similarity index 94% rename from cpp/src/arrow/columnar/CMakeLists.txt rename to cpp/src/arrow/array/CMakeLists.txt index 3cd675f4010..a789c88dd9d 100644 --- a/cpp/src/arrow/columnar/CMakeLists.txt +++ b/cpp/src/arrow/array/CMakeLists.txt @@ -24,4 +24,4 @@ install(FILES builder_dict.h builder_nested.h builder_primitive.h - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/columnar") + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/array") diff --git a/cpp/src/arrow/array/README.md b/cpp/src/arrow/array/README.md new file mode 100644 index 00000000000..09580193aad --- /dev/null +++ b/cpp/src/arrow/array/README.md @@ -0,0 +1,20 @@ + + +## Implementation details related to columnnar (array) data structures diff --git a/cpp/src/arrow/columnar/builder_adaptive.cc b/cpp/src/arrow/array/builder_adaptive.cc similarity index 99% rename from cpp/src/arrow/columnar/builder_adaptive.cc rename to cpp/src/arrow/array/builder_adaptive.cc index a9f2d341042..599e9e1c38d 100644 --- a/cpp/src/arrow/columnar/builder_adaptive.cc +++ b/cpp/src/arrow/array/builder_adaptive.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "arrow/columnar/builder_adaptive.h" +#include "arrow/array/builder_adaptive.h" #include #include diff --git a/cpp/src/arrow/columnar/builder_adaptive.h b/cpp/src/arrow/array/builder_adaptive.h similarity index 99% rename from cpp/src/arrow/columnar/builder_adaptive.h rename to cpp/src/arrow/array/builder_adaptive.h index ff378ed4ed9..6523de41622 100644 --- a/cpp/src/arrow/columnar/builder_adaptive.h +++ b/cpp/src/arrow/array/builder_adaptive.h @@ -19,7 +19,7 @@ #include -#include "arrow/columnar/builder_base.h" +#include "arrow/array/builder_base.h" namespace arrow { diff --git a/cpp/src/arrow/columnar/builder_base.cc b/cpp/src/arrow/array/builder_base.cc similarity index 99% rename from cpp/src/arrow/columnar/builder_base.cc rename to cpp/src/arrow/array/builder_base.cc index be91f649b58..321aa44dab5 100644 --- a/cpp/src/arrow/columnar/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "arrow/columnar/builder_base.h" +#include "arrow/array/builder_base.h" #include #include diff --git a/cpp/src/arrow/columnar/builder_base.h b/cpp/src/arrow/array/builder_base.h similarity index 99% rename from cpp/src/arrow/columnar/builder_base.h rename to cpp/src/arrow/array/builder_base.h index d5c3fcf781c..ae400fc4638 100644 --- a/cpp/src/arrow/columnar/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -17,7 +17,7 @@ #pragma once -#include "arrow/columnar/builder_base.h" +#include "arrow/array/builder_base.h" #include // IWYU pragma: keep #include diff --git a/cpp/src/arrow/columnar/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc similarity index 99% rename from cpp/src/arrow/columnar/builder_binary.cc rename to cpp/src/arrow/array/builder_binary.cc index 82334134612..fceb964ac78 100644 --- a/cpp/src/arrow/columnar/builder_binary.cc +++ b/cpp/src/arrow/array/builder_binary.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "arrow/columnar/builder_binary.h" +#include "arrow/array/builder_binary.h" #include #include diff --git a/cpp/src/arrow/columnar/builder_binary.h b/cpp/src/arrow/array/builder_binary.h similarity index 99% rename from cpp/src/arrow/columnar/builder_binary.h rename to cpp/src/arrow/array/builder_binary.h index 5c65e001f8a..ac644f40598 100644 --- a/cpp/src/arrow/columnar/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -23,7 +23,7 @@ #include #include "arrow/array.h" -#include "arrow/columnar/builder_base.h" +#include "arrow/array/builder_base.h" #include "arrow/status.h" #include "arrow/type_traits.h" #include "arrow/util/macros.h" diff --git a/cpp/src/arrow/columnar/builder_decimal.cc b/cpp/src/arrow/array/builder_decimal.cc similarity index 97% rename from cpp/src/arrow/columnar/builder_decimal.cc rename to cpp/src/arrow/array/builder_decimal.cc index b8b23a09638..d64c4db6f0c 100644 --- a/cpp/src/arrow/columnar/builder_decimal.cc +++ b/cpp/src/arrow/array/builder_decimal.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "arrow/columnar/builder_decimal.h" +#include "arrow/array/builder_decimal.h" #include #include diff --git a/cpp/src/arrow/columnar/builder_decimal.h b/cpp/src/arrow/array/builder_decimal.h similarity index 94% rename from cpp/src/arrow/columnar/builder_decimal.h rename to cpp/src/arrow/array/builder_decimal.h index 6c9d31eea28..fb40a7950ab 100644 --- a/cpp/src/arrow/columnar/builder_decimal.h +++ b/cpp/src/arrow/array/builder_decimal.h @@ -19,8 +19,8 @@ #include -#include "arrow/columnar/builder_base.h" -#include "arrow/columnar/builder_binary.h" +#include "arrow/array/builder_base.h" +#include "arrow/array/builder_binary.h" namespace arrow { diff --git a/cpp/src/arrow/columnar/builder_dict.cc b/cpp/src/arrow/array/builder_dict.cc similarity index 99% rename from cpp/src/arrow/columnar/builder_dict.cc rename to cpp/src/arrow/array/builder_dict.cc index f7eff6deb0c..0891e4c0829 100644 --- a/cpp/src/arrow/columnar/builder_dict.cc +++ b/cpp/src/arrow/array/builder_dict.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "arrow/columnar/builder_dict.h" +#include "arrow/array/builder_dict.h" #include #include diff --git a/cpp/src/arrow/columnar/builder_dict.h b/cpp/src/arrow/array/builder_dict.h similarity index 97% rename from cpp/src/arrow/columnar/builder_dict.h rename to cpp/src/arrow/array/builder_dict.h index 87b7feff8f9..6f0271683ae 100644 --- a/cpp/src/arrow/columnar/builder_dict.h +++ b/cpp/src/arrow/array/builder_dict.h @@ -19,8 +19,8 @@ #include -#include "arrow/columnar/builder_adaptive.h" // IWYU pragma: export -#include "arrow/columnar/builder_base.h" // IWYU pragma: export +#include "arrow/array/builder_adaptive.h" // IWYU pragma: export +#include "arrow/array/builder_base.h" // IWYU pragma: export namespace arrow { diff --git a/cpp/src/arrow/columnar/builder_nested.cc b/cpp/src/arrow/array/builder_nested.cc similarity index 99% rename from cpp/src/arrow/columnar/builder_nested.cc rename to cpp/src/arrow/array/builder_nested.cc index af48a1f5252..e73324323af 100644 --- a/cpp/src/arrow/columnar/builder_nested.cc +++ b/cpp/src/arrow/array/builder_nested.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "arrow/columnar/builder_nested.h" +#include "arrow/array/builder_nested.h" #include #include diff --git a/cpp/src/arrow/columnar/builder_nested.h b/cpp/src/arrow/array/builder_nested.h similarity index 99% rename from cpp/src/arrow/columnar/builder_nested.h rename to cpp/src/arrow/array/builder_nested.h index a9cb412b644..863e6fef06f 100644 --- a/cpp/src/arrow/columnar/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -20,7 +20,7 @@ #include #include -#include "arrow/columnar/builder_base.h" +#include "arrow/array/builder_base.h" namespace arrow { diff --git a/cpp/src/arrow/columnar/builder_primitive.cc b/cpp/src/arrow/array/builder_primitive.cc similarity index 99% rename from cpp/src/arrow/columnar/builder_primitive.cc rename to cpp/src/arrow/array/builder_primitive.cc index ded7af0b091..bc14000c3e1 100644 --- a/cpp/src/arrow/columnar/builder_primitive.cc +++ b/cpp/src/arrow/array/builder_primitive.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "arrow/columnar/builder_primitive.h" +#include "arrow/array/builder_primitive.h" #include #include diff --git a/cpp/src/arrow/columnar/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h similarity index 99% rename from cpp/src/arrow/columnar/builder_primitive.h rename to cpp/src/arrow/array/builder_primitive.h index 0d243ae29e1..13f6c229b2a 100644 --- a/cpp/src/arrow/columnar/builder_primitive.h +++ b/cpp/src/arrow/array/builder_primitive.h @@ -21,7 +21,7 @@ #include #include -#include "arrow/columnar/builder_base.h" +#include "arrow/array/builder_base.h" #include "arrow/type.h" namespace arrow { diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 6d58b8f2a0d..a7ab22c1bee 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -19,13 +19,13 @@ #include -#include "arrow/columnar/builder_adaptive.h" // IWYU pragma: export -#include "arrow/columnar/builder_base.h" // IWYU pragma: export -#include "arrow/columnar/builder_binary.h" // IWYU pragma: export -#include "arrow/columnar/builder_decimal.h" // IWYU pragma: export -#include "arrow/columnar/builder_dict.h" // IWYU pragma: export -#include "arrow/columnar/builder_nested.h" // IWYU pragma: export -#include "arrow/columnar/builder_primitive.h" // IWYU pragma: export +#include "arrow/array/builder_adaptive.h" // IWYU pragma: export +#include "arrow/array/builder_base.h" // IWYU pragma: export +#include "arrow/array/builder_binary.h" // IWYU pragma: export +#include "arrow/array/builder_decimal.h" // IWYU pragma: export +#include "arrow/array/builder_dict.h" // IWYU pragma: export +#include "arrow/array/builder_nested.h" // IWYU pragma: export +#include "arrow/array/builder_primitive.h" // IWYU pragma: export #include "arrow/status.h" #include "arrow/util/visibility.h" From 3669201beca0be4f607e4b2f9c5864261345ac53 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 14 Dec 2018 09:19:19 -0600 Subject: [PATCH 08/12] Fix deprecated API use Change-Id: I47f93c7d8561b83414ab34f709fec66a6eb462d2 --- cpp/examples/parquet/CMakeLists.txt | 2 +- cpp/examples/parquet/parquet-arrow/CMakeLists.txt | 2 +- .../parquet-arrow/{src => }/reader-writer.cc | 4 ++-- cpp/src/arrow/pretty_print-test.cc | 2 +- cpp/src/parquet/arrow/arrow-reader-writer-test.cc | 15 ++++++++++----- 5 files changed, 15 insertions(+), 10 deletions(-) rename cpp/examples/parquet/parquet-arrow/{src => }/reader-writer.cc (98%) diff --git a/cpp/examples/parquet/CMakeLists.txt b/cpp/examples/parquet/CMakeLists.txt index 98c5cd9402b..db172a2534f 100644 --- a/cpp/examples/parquet/CMakeLists.txt +++ b/cpp/examples/parquet/CMakeLists.txt @@ -22,7 +22,7 @@ target_include_directories(parquet-low-level-example2 PRIVATE low-level-api/) target_link_libraries(parquet-low-level-example parquet_static) target_link_libraries(parquet-low-level-example2 parquet_static) -add_executable(parquet-arrow-example parquet-arrow/src/reader-writer.cc) +add_executable(parquet-arrow-example parquet-arrow/reader-writer.cc) target_link_libraries(parquet-arrow-example parquet_shared) add_dependencies(parquet diff --git a/cpp/examples/parquet/parquet-arrow/CMakeLists.txt b/cpp/examples/parquet/parquet-arrow/CMakeLists.txt index d9e01acd3ee..915930ec228 100644 --- a/cpp/examples/parquet/parquet-arrow/CMakeLists.txt +++ b/cpp/examples/parquet/parquet-arrow/CMakeLists.txt @@ -38,5 +38,5 @@ find_package(Parquet) include_directories(SYSTEM ${ARROW_INCLUDE_DIR} ${PARQUET_INCLUDE_DIR}) -add_executable(parquet-arrow-example src/reader-writer.cc) +add_executable(parquet-arrow-example reader-writer.cc) target_link_libraries(parquet-arrow-example ${PARQUET_SHARED_LIB} ${ARROW_SHARED_LIB}) diff --git a/cpp/examples/parquet/parquet-arrow/src/reader-writer.cc b/cpp/examples/parquet/parquet-arrow/reader-writer.cc similarity index 98% rename from cpp/examples/parquet/parquet-arrow/src/reader-writer.cc rename to cpp/examples/parquet/parquet-arrow/reader-writer.cc index 8d474486e74..a5f928b6d4f 100644 --- a/cpp/examples/parquet/parquet-arrow/src/reader-writer.cc +++ b/cpp/examples/parquet/parquet-arrow/reader-writer.cc @@ -100,7 +100,7 @@ void read_single_column() { std::unique_ptr reader; PARQUET_THROW_NOT_OK( parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); - std::shared_ptr array; + std::shared_ptr array; PARQUET_THROW_NOT_OK(reader->ReadColumn(0, &array)); PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout)); std::cout << std::endl; @@ -119,7 +119,7 @@ void read_single_column_chunk() { std::unique_ptr reader; PARQUET_THROW_NOT_OK( parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); - std::shared_ptr array; + std::shared_ptr array; PARQUET_THROW_NOT_OK(reader->RowGroup(0)->Column(0)->Read(&array)); PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout)); std::cout << std::endl; diff --git a/cpp/src/arrow/pretty_print-test.cc b/cpp/src/arrow/pretty_print-test.cc index 052a1803920..a1acfb81aef 100644 --- a/cpp/src/arrow/pretty_print-test.cc +++ b/cpp/src/arrow/pretty_print-test.cc @@ -340,7 +340,7 @@ TEST_F(TestPrettyPrint, DictionaryType) { TEST_F(TestPrettyPrint, ChunkedArrayPrimitiveType) { auto array = ArrayFromJSON(int32(), "[0, 1, null, 3, null]"); - ChunkedArray chunked_array({array}); + ChunkedArray chunked_array(array); static const char* expected = R"expected([ [ diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc index 24ec0dd24ee..07124ebb305 100644 --- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc @@ -464,7 +464,11 @@ class TestParquetIO : public ::testing::Test { ASSERT_OK_NO_THROW(file_reader->GetColumn(0, &column_reader)); ASSERT_NE(nullptr, column_reader.get()); - ASSERT_OK(column_reader->NextBatch(SMALL_SIZE, out)); + std::shared_ptr chunked_out; + ASSERT_OK(column_reader->NextBatch(SMALL_SIZE, &chunked_out)); + + ASSERT_EQ(1, chunked_out->num_chunks()); + *out = chunked_out->chunk(0); ASSERT_NE(nullptr, out->get()); } @@ -1745,10 +1749,11 @@ TEST(TestArrowReadWrite, ListLargeRecords) { std::vector> pieces; for (int i = 0; i < num_rows; ++i) { - std::shared_ptr piece; - ASSERT_OK(col_reader->NextBatch(1, &piece)); - ASSERT_EQ(1, piece->length()); - pieces.push_back(piece); + std::shared_ptr chunked_piece; + ASSERT_OK(col_reader->NextBatch(1, &chunked_piece)); + ASSERT_EQ(1, chunked_piece->length()); + ASSERT_EQ(1, chunked_piece->num_chunks()); + pieces.push_back(chunked_piece->chunk(0)); } auto chunked = std::make_shared<::arrow::ChunkedArray>(pieces); From b90eb4b7177e775823c8bf855a580e5180f77e69 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 14 Dec 2018 09:57:41 -0600 Subject: [PATCH 09/12] Restore sstream include to pretty_print.cc Change-Id: I8266354f04c8e14819fe4c72d28474e09843c13c --- cpp/src/arrow/pretty_print.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index 4568e172a03..c524039c3e8 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -19,6 +19,7 @@ #include #include #include +#include // IWYU pragma: keep #include #include #include From 5a525115c9d36b0d56796d8427c79eca78adbdb3 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 14 Dec 2018 13:01:21 -0600 Subject: [PATCH 10/12] Use strnlen to compute string length. Inline BinaryBuilder::AppendNextOffset Change-Id: Ibfc09617b365c937e7af6a4943c274843f6e7a33 --- cpp/cmake_modules/SetupCxxFlags.cmake | 3 +++ cpp/src/arrow/array/builder_binary.cc | 14 +++++--------- cpp/src/arrow/array/builder_binary.h | 11 ++++++++++- cpp/src/arrow/builder-benchmark.cc | 20 ++++++++++---------- cpp/src/arrow/python/numpy_to_arrow.cc | 11 ++++------- 5 files changed, 32 insertions(+), 27 deletions(-) diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index 893ec360d3e..61fd14ca2cf 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -25,6 +25,9 @@ CHECK_CXX_COMPILER_FLAG("-maltivec" CXX_SUPPORTS_ALTIVEC) # Arm64 compiler flags CHECK_CXX_COMPILER_FLAG("-march=armv8-a+crc" CXX_SUPPORTS_ARMCRC) +# Support C11 +set(CMAKE_C_STANDARD 11) + # This ensures that things like gnu++11 get passed correctly set(CMAKE_CXX_STANDARD 11) diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc index fceb964ac78..613bd806e8d 100644 --- a/cpp/src/arrow/array/builder_binary.cc +++ b/cpp/src/arrow/array/builder_binary.cc @@ -69,15 +69,11 @@ Status BinaryBuilder::ReserveData(int64_t elements) { return Status::OK(); } -Status BinaryBuilder::AppendNextOffset() { - const int64_t num_bytes = value_data_builder_.length(); - if (ARROW_PREDICT_FALSE(num_bytes > kBinaryMemoryLimit)) { - std::stringstream ss; - ss << "BinaryArray cannot contain more than " << kBinaryMemoryLimit << " bytes, have " - << num_bytes; - return Status::CapacityError(ss.str()); - } - return offsets_builder_.Append(static_cast(num_bytes)); +Status BinaryBuilder::AppendOverflow(int32_t num_bytes) { + std::stringstream ss; + ss << "BinaryArray cannot contain more than " << kBinaryMemoryLimit << " bytes, have " + << num_bytes; + return Status::CapacityError(ss.str()); } Status BinaryBuilder::FinishInternal(std::shared_ptr* out) { diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index ac644f40598..3486c18aa0e 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -120,7 +121,15 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { TypedBufferBuilder offsets_builder_; TypedBufferBuilder value_data_builder_; - Status AppendNextOffset(); + Status AppendOverflow(int32_t num_bytes); + + Status AppendNextOffset() { + const int64_t num_bytes = value_data_builder_.length(); + if (ARROW_PREDICT_FALSE(num_bytes > kBinaryMemoryLimit)) { + return AppendOverflow(num_bytes); + } + return offsets_builder_.Append(static_cast(num_bytes)); + } void UnsafeAppendNextOffset() { const int64_t num_bytes = value_data_builder_.length(); diff --git a/cpp/src/arrow/builder-benchmark.cc b/cpp/src/arrow/builder-benchmark.cc index 2dd56478c0a..fae9c89a14f 100644 --- a/cpp/src/arrow/builder-benchmark.cc +++ b/cpp/src/arrow/builder-benchmark.cc @@ -163,10 +163,11 @@ static void BM_BuildBooleanArrayNoNulls( } static void BM_BuildBinaryArray(benchmark::State& state) { // NOLINT non-const reference - const int64_t iterations = 1 << 20; - + // About 160MB + const int64_t iterations = 1 << 24; std::string value = "1234567890"; - while (state.KeepRunning()) { + + for (auto _ : state) { BinaryBuilder builder; for (int64_t i = 0; i < iterations; i++) { ABORT_NOT_OK(builder.Append(value)); @@ -179,10 +180,11 @@ static void BM_BuildBinaryArray(benchmark::State& state) { // NOLINT non-const static void BM_BuildChunkedBinaryArray( benchmark::State& state) { // NOLINT non-const reference - const int64_t iterations = 1 << 20; - + // About 160MB + const int64_t iterations = 1 << 24; std::string value = "1234567890"; - while (state.KeepRunning()) { + + for (auto _ : state) { // 1MB chunks const int32_t chunksize = 1 << 20; internal::ChunkedBinaryBuilder builder(chunksize); @@ -390,10 +392,8 @@ BENCHMARK(BM_BuildAdaptiveUIntNoNullsScalarAppend) ->Repetitions(kRepetitions) ->Unit(benchmark::kMicrosecond); -BENCHMARK(BM_BuildBinaryArray)->Repetitions(kRepetitions)->Unit(benchmark::kMicrosecond); -BENCHMARK(BM_BuildChunkedBinaryArray) - ->Repetitions(kRepetitions) - ->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_BuildBinaryArray)->MinTime(1.0)->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_BuildChunkedBinaryArray)->MinTime(1.0)->Unit(benchmark::kMicrosecond); BENCHMARK(BM_BuildFixedSizeBinaryArray) ->Repetitions(kRepetitions) ->Unit(benchmark::kMicrosecond); diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index 4920796c15b..da288d3c686 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -542,13 +543,9 @@ Status NumPyConverter::Visit(const BinaryType& type) { auto AppendNotNull = [&builder, this](const uint8_t* data) { // This is annoying. NumPy allows strings to have nul-terminators, so // we must check for them here - int item_length = 0; - for (; item_length < itemsize_; ++item_length) { - if (data[item_length] == 0) { - break; - } - } - return builder.Append(data, item_length); + const size_t item_size = + strnlen(reinterpret_cast(data), static_cast(itemsize_)); + return builder.Append(data, static_cast(item_size)); }; if (mask_ != nullptr) { From 695ffc9dfd49ae5d77a76090f1b5ecb2a12bc583 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 14 Dec 2018 08:56:49 -0800 Subject: [PATCH 11/12] Remove unimplemented and unused ChunkedBinaryBuilder ctor --- cpp/src/arrow/array/builder_binary.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index 3486c18aa0e..80d1e2480ee 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -283,9 +283,6 @@ class ARROW_EXPORT ChunkedBinaryBuilder { virtual Status Finish(ArrayVector* out); protected: - ChunkedBinaryBuilder(const std::shared_ptr& type, int32_t max_chunk_size, - MemoryPool* pool); - Status NextChunk(); int32_t max_chunk_size_; From 8224512804d0353d7b20f786e96a0758839b7d89 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 14 Dec 2018 13:27:23 -0600 Subject: [PATCH 12/12] Fix int conversion warning on Windows Change-Id: I48147645784402e7cf004a82151d66f337d1664e --- cpp/src/arrow/array/builder_binary.cc | 2 +- cpp/src/arrow/array/builder_binary.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc index 613bd806e8d..ad6ba11a484 100644 --- a/cpp/src/arrow/array/builder_binary.cc +++ b/cpp/src/arrow/array/builder_binary.cc @@ -69,7 +69,7 @@ Status BinaryBuilder::ReserveData(int64_t elements) { return Status::OK(); } -Status BinaryBuilder::AppendOverflow(int32_t num_bytes) { +Status BinaryBuilder::AppendOverflow(int64_t num_bytes) { std::stringstream ss; ss << "BinaryArray cannot contain more than " << kBinaryMemoryLimit << " bytes, have " << num_bytes; diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index 80d1e2480ee..7c101bdffc5 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -121,7 +121,7 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { TypedBufferBuilder offsets_builder_; TypedBufferBuilder value_data_builder_; - Status AppendOverflow(int32_t num_bytes); + Status AppendOverflow(int64_t num_bytes); Status AppendNextOffset() { const int64_t num_bytes = value_data_builder_.length();