From 21fc450832649ea258472ff2b8d007ca2ea4d853 Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Thu, 14 Feb 2019 06:23:39 -0500 Subject: [PATCH 01/35] Add some basic unittests that exercise the DecodeArrow methods --- cpp/src/parquet/encoding-test.cc | 78 ++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/cpp/src/parquet/encoding-test.cc b/cpp/src/parquet/encoding-test.cc index 28d98126ec8..0c5040727a9 100644 --- a/cpp/src/parquet/encoding-test.cc +++ b/cpp/src/parquet/encoding-test.cc @@ -22,6 +22,10 @@ #include #include +#include "arrow/array.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/util.h" +#include "arrow/type.h" #include "arrow/util/bit-util.h" #include "parquet/encoding.h" @@ -314,6 +318,80 @@ TEST(TestDictionaryEncoding, CannotDictDecodeBoolean) { ASSERT_THROW(MakeDictDecoder(nullptr), ParquetException); } +// ---------------------------------------------------------------------- +// Arrow builder decode tests +class TestDecodeArrow : public ::testing::Test { + public: + void SetUp() override { + allocator_ = default_memory_pool(); + + // Build a dictionary array and its dense representation for testing building both + auto dict_values = ::arrow::ArrayFromJSON(::arrow::binary(), "[\"foo\", \"bar\"]"); + auto dtype = ::arrow::dictionary(::arrow::int8(), dict_values); + auto indices = ::arrow::ArrayFromJSON(::arrow::int8(), "[0, 1, 0, 0]"); + ASSERT_OK(::arrow::DictionaryArray::FromArrays(dtype, indices, &expected_dict_)); + expected_array_ = + ArrayFromJSON(::arrow::binary(), "[\"foo\", \"bar\", \"foo\", \"foo\"]"); + num_values_ = static_cast(expected_array_->length()); + + // arrow::BinaryType maps to parquet::ByteArray + const auto& binary_array = static_cast(*expected_array_); + for (int64_t i = 0; i < binary_array.length(); i++) { + auto view = binary_array.GetView(i); + input_data_.emplace_back(view.length(), + reinterpret_cast(view.data())); + } + + valid_bits_ = vector(::arrow::BitUtil::BytesForBits(num_values_) + 1, 255); + + // Setup encoder/decoder pair + encoder_ = MakeTypedEncoder(Encoding::PLAIN); + decoder_ = MakeTypedDecoder(Encoding::PLAIN); + encoder_->Put(input_data_.data(), num_values_); + buffer_ = encoder_->FlushValues(); + decoder_->SetData(num_values_, buffer_->data(), static_cast(buffer_->size())); + } + + void TearDown() override {} + + protected: + MemoryPool* allocator_; + std::shared_ptr<::arrow::Array> expected_dict_; + std::shared_ptr<::arrow::Array> expected_array_; + int num_values_; + vector input_data_; + vector valid_bits_; + std::unique_ptr encoder_; + std::unique_ptr decoder_; + std::shared_ptr buffer_; +}; + +TEST_F(TestDecodeArrow, DecodeDense) { + ::arrow::internal::ChunkedBinaryBuilder builder(static_cast(buffer_->size()), + allocator_); + auto actual_num_values = + decoder_->DecodeArrow(num_values_, 0, valid_bits_.data(), 0, &builder); + + ASSERT_EQ(actual_num_values, num_values_); + ::arrow::ArrayVector actual_vec; + ASSERT_OK(builder.Finish(&actual_vec)); + ASSERT_EQ(actual_vec.size(), 1); + ASSERT_TRUE(actual_vec[0]->Equals(expected_array_)) + << "Actual: " << actual_vec[0] << "\nExpected: " << expected_array_; +} + +TEST_F(TestDecodeArrow, DecodeDictionary) { + ::arrow::BinaryDictionaryBuilder builder(allocator_); + auto actual_num_values = + decoder_->DecodeArrow(num_values_, 0, valid_bits_.data(), 0, &builder); + + ASSERT_EQ(actual_num_values, num_values_); + std::shared_ptr<::arrow::Array> actual; + ASSERT_OK(builder.Finish(&actual)); + ASSERT_TRUE(actual->Equals(expected_dict_)) + << "Actual: " << actual << "\nExpected: " << expected_dict_; +} + } // namespace test } // namespace parquet From fa504158ffa9c18d1bd8df2a2630dbf2b4e66958 Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Thu, 14 Feb 2019 07:40:49 -0500 Subject: [PATCH 02/35] Implement DecodeArrowNonNull and unit tests --- cpp/src/parquet/encoding-test.cc | 24 ++++++++++++++++++++++++ cpp/src/parquet/encoding.cc | 17 +++++++++++++++-- cpp/src/parquet/encoding.h | 3 +++ 3 files changed, 42 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/encoding-test.cc b/cpp/src/parquet/encoding-test.cc index 0c5040727a9..cac4583824c 100644 --- a/cpp/src/parquet/encoding-test.cc +++ b/cpp/src/parquet/encoding-test.cc @@ -380,6 +380,19 @@ TEST_F(TestDecodeArrow, DecodeDense) { << "Actual: " << actual_vec[0] << "\nExpected: " << expected_array_; } +TEST_F(TestDecodeArrow, DecodeNonNullDense) { + ::arrow::internal::ChunkedBinaryBuilder builder(static_cast(buffer_->size()), + allocator_); + auto actual_num_values = decoder_->DecodeArrowNonNull(num_values_, &builder); + + ASSERT_EQ(actual_num_values, num_values_); + ::arrow::ArrayVector actual_vec; + ASSERT_OK(builder.Finish(&actual_vec)); + ASSERT_EQ(actual_vec.size(), 1); + ASSERT_TRUE(actual_vec[0]->Equals(expected_array_)) + << "Actual: " << actual_vec[0] << "\nExpected: " << expected_array_; +} + TEST_F(TestDecodeArrow, DecodeDictionary) { ::arrow::BinaryDictionaryBuilder builder(allocator_); auto actual_num_values = @@ -392,6 +405,17 @@ TEST_F(TestDecodeArrow, DecodeDictionary) { << "Actual: " << actual << "\nExpected: " << expected_dict_; } +TEST_F(TestDecodeArrow, DecodeNonNullDictionary) { + ::arrow::BinaryDictionaryBuilder builder(allocator_); + auto actual_num_values = decoder_->DecodeArrowNonNull(num_values_, &builder); + + ASSERT_EQ(actual_num_values, num_values_); + std::shared_ptr<::arrow::Array> actual; + ASSERT_OK(builder.Finish(&actual)); + ASSERT_TRUE(actual->Equals(expected_dict_)) + << "Actual: " << actual << "\nExpected: " << expected_dict_; +} + } // namespace test } // namespace parquet diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index da630671f79..e09f08d875a 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -722,6 +722,12 @@ class PlainByteArrayDecoder : public PlainDecoder, return result; } + int DecodeArrowNonNull(int num_values, ::arrow::BinaryDictionaryBuilder* out) override { + int result = 0; + PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, out, &result)); + return result; + } + private: template ::arrow::Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, @@ -762,8 +768,8 @@ class PlainByteArrayDecoder : public PlainDecoder, return ::arrow::Status::OK(); } - ::arrow::Status DecodeArrowNonNull(int num_values, - ::arrow::internal::ChunkedBinaryBuilder* out, + template + ::arrow::Status DecodeArrowNonNull(int num_values, BuilderType* out, int* values_decoded) { num_values = std::min(num_values, num_values_); ARROW_RETURN_NOT_OK(out->Reserve(num_values)); @@ -779,6 +785,7 @@ class PlainByteArrayDecoder : public PlainDecoder, data += increment; data_size -= increment; bytes_decoded += increment; + ++i; } data_ += bytes_decoded; @@ -941,6 +948,12 @@ class DictByteArrayDecoder : public DictDecoderImpl, return result; } + int DecodeArrowNonNull(int num_values, ::arrow::BinaryDictionaryBuilder* out) override { + int result = 0; + PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, out, &result)); + return result; + } + private: template ::arrow::Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index 046296cdb14..59c0a0fd05f 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -219,6 +219,9 @@ class ByteArrayDecoder : virtual public TypedDecoder { // See also ARROW-3772, ARROW-3769 virtual int DecodeArrowNonNull(int num_values, ::arrow::internal::ChunkedBinaryBuilder* builder) = 0; + + virtual int DecodeArrowNonNull(int num_values, + ::arrow::BinaryDictionaryBuilder* builder) = 0; }; class FLBADecoder : virtual public TypedDecoder { From 4a26f7405c9e9e36a0e9ce5e0c5c652d679681e5 Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Thu, 14 Feb 2019 07:53:38 -0500 Subject: [PATCH 03/35] remove todo message --- cpp/src/parquet/encoding.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index 59c0a0fd05f..3f6cf645f52 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -215,8 +215,6 @@ class ByteArrayDecoder : virtual public TypedDecoder { int64_t valid_bits_offset, ::arrow::BinaryDictionaryBuilder* builder) = 0; - // TODO(wesm): Implement DecodeArrowNonNull as part of ARROW-3325 - // See also ARROW-3772, ARROW-3769 virtual int DecodeArrowNonNull(int num_values, ::arrow::internal::ChunkedBinaryBuilder* builder) = 0; From 31667ffbbe0b016a64d480d68014dc86752a82b4 Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Mon, 18 Feb 2019 08:39:42 -0500 Subject: [PATCH 04/35] added tests for DictByteArrayDecoder and reworked previous tests --- cpp/src/parquet/encoding-test.cc | 167 +++++++++++++++++++++++-------- cpp/src/parquet/encoding.cc | 3 +- 2 files changed, 126 insertions(+), 44 deletions(-) diff --git a/cpp/src/parquet/encoding-test.cc b/cpp/src/parquet/encoding-test.cc index cac4583824c..089fd46b466 100644 --- a/cpp/src/parquet/encoding-test.cc +++ b/cpp/src/parquet/encoding-test.cc @@ -319,12 +319,15 @@ TEST(TestDictionaryEncoding, CannotDictDecodeBoolean) { } // ---------------------------------------------------------------------- -// Arrow builder decode tests +// Shared arrow builder decode tests class TestDecodeArrow : public ::testing::Test { public: void SetUp() override { - allocator_ = default_memory_pool(); + InitDataInputs(); + SetupEncoderDecoder(); + } + void InitDataInputs() { // Build a dictionary array and its dense representation for testing building both auto dict_values = ::arrow::ArrayFromJSON(::arrow::binary(), "[\"foo\", \"bar\"]"); auto dtype = ::arrow::dictionary(::arrow::int8(), dict_values); @@ -343,19 +346,31 @@ class TestDecodeArrow : public ::testing::Test { } valid_bits_ = vector(::arrow::BitUtil::BytesForBits(num_values_) + 1, 255); + } - // Setup encoder/decoder pair - encoder_ = MakeTypedEncoder(Encoding::PLAIN); - decoder_ = MakeTypedDecoder(Encoding::PLAIN); - encoder_->Put(input_data_.data(), num_values_); - buffer_ = encoder_->FlushValues(); - decoder_->SetData(num_values_, buffer_->data(), static_cast(buffer_->size())); + // Setup encoder/decoder pair for testing with + virtual void SetupEncoderDecoder() = 0; + + void CheckDecodeDense(const int actual_num_values, + ::arrow::internal::ChunkedBinaryBuilder& builder) { + ASSERT_EQ(actual_num_values, num_values_); + ::arrow::ArrayVector actual_vec; + ASSERT_OK(builder.Finish(&actual_vec)); + ASSERT_EQ(actual_vec.size(), 1); + ASSERT_TRUE(actual_vec[0]->Equals(expected_array_)) + << "Actual: " << actual_vec[0] << "\nExpected: " << expected_array_; } - void TearDown() override {} + void CheckDecodeDict(const int actual_num_values, + ::arrow::BinaryDictionaryBuilder& builder) { + ASSERT_EQ(actual_num_values, num_values_); + std::shared_ptr<::arrow::Array> actual; + ASSERT_OK(builder.Finish(&actual)); + ASSERT_TRUE(actual->Equals(expected_dict_)) + << "Actual: " << actual << "\nExpected: " << expected_dict_; + } protected: - MemoryPool* allocator_; std::shared_ptr<::arrow::Array> expected_dict_; std::shared_ptr<::arrow::Array> expected_array_; int num_values_; @@ -366,54 +381,120 @@ class TestDecodeArrow : public ::testing::Test { std::shared_ptr buffer_; }; -TEST_F(TestDecodeArrow, DecodeDense) { +// ---------------------------------------------------------------------- +// Arrow builder decode tests for PlainByteArrayDecoder +class TestDecodeArrowPlain : public TestDecodeArrow { + public: + void SetupEncoderDecoder() override { + encoder_ = MakeTypedEncoder(Encoding::PLAIN); + decoder_ = MakeTypedDecoder(Encoding::PLAIN); + encoder_->Put(input_data_.data(), num_values_); + buffer_ = encoder_->FlushValues(); + decoder_->SetData(num_values_, buffer_->data(), static_cast(buffer_->size())); + } + + void TearDown() override {} +}; + +TEST_F(TestDecodeArrowPlain, DecodeDense) { ::arrow::internal::ChunkedBinaryBuilder builder(static_cast(buffer_->size()), - allocator_); + default_memory_pool()); auto actual_num_values = decoder_->DecodeArrow(num_values_, 0, valid_bits_.data(), 0, &builder); - - ASSERT_EQ(actual_num_values, num_values_); - ::arrow::ArrayVector actual_vec; - ASSERT_OK(builder.Finish(&actual_vec)); - ASSERT_EQ(actual_vec.size(), 1); - ASSERT_TRUE(actual_vec[0]->Equals(expected_array_)) - << "Actual: " << actual_vec[0] << "\nExpected: " << expected_array_; + CheckDecodeDense(actual_num_values, builder); } -TEST_F(TestDecodeArrow, DecodeNonNullDense) { +TEST_F(TestDecodeArrowPlain, DecodeNonNullDense) { ::arrow::internal::ChunkedBinaryBuilder builder(static_cast(buffer_->size()), - allocator_); + default_memory_pool()); auto actual_num_values = decoder_->DecodeArrowNonNull(num_values_, &builder); - - ASSERT_EQ(actual_num_values, num_values_); - ::arrow::ArrayVector actual_vec; - ASSERT_OK(builder.Finish(&actual_vec)); - ASSERT_EQ(actual_vec.size(), 1); - ASSERT_TRUE(actual_vec[0]->Equals(expected_array_)) - << "Actual: " << actual_vec[0] << "\nExpected: " << expected_array_; + CheckDecodeDense(actual_num_values, builder); } -TEST_F(TestDecodeArrow, DecodeDictionary) { - ::arrow::BinaryDictionaryBuilder builder(allocator_); +TEST_F(TestDecodeArrowPlain, DecodeDictionary) { + ::arrow::BinaryDictionaryBuilder builder(default_memory_pool()); auto actual_num_values = decoder_->DecodeArrow(num_values_, 0, valid_bits_.data(), 0, &builder); - - ASSERT_EQ(actual_num_values, num_values_); - std::shared_ptr<::arrow::Array> actual; - ASSERT_OK(builder.Finish(&actual)); - ASSERT_TRUE(actual->Equals(expected_dict_)) - << "Actual: " << actual << "\nExpected: " << expected_dict_; + CheckDecodeDict(actual_num_values, builder); } -TEST_F(TestDecodeArrow, DecodeNonNullDictionary) { - ::arrow::BinaryDictionaryBuilder builder(allocator_); +TEST_F(TestDecodeArrowPlain, DecodeNonNullDictionary) { + ::arrow::BinaryDictionaryBuilder builder(default_memory_pool()); auto actual_num_values = decoder_->DecodeArrowNonNull(num_values_, &builder); + CheckDecodeDict(actual_num_values, builder); +} + +// ---------------------------------------------------------------------- +// Arrow builder decode tests for DictByteArrayDecoder +class TestDecodeArrowDict : public TestDecodeArrow { + public: + void SetupEncoderDecoder() override { + auto node = schema::ByteArray("name"); + descr_ = std::unique_ptr(new ColumnDescriptor(node, 0, 0)); + encoder_ = MakeTypedEncoder(Encoding::PLAIN, /*use_dictionary=*/true, + descr_.get()); + ASSERT_NO_THROW(encoder_->Put(input_data_.data(), num_values_)); + buffer_ = encoder_->FlushValues(); + + auto dict_encoder = dynamic_cast*>(encoder_.get()); + ASSERT_NE(dict_encoder, nullptr); + dict_buffer_ = + AllocateBuffer(default_memory_pool(), dict_encoder->dict_encoded_size()); + dict_encoder->WriteDict(dict_buffer_->mutable_data()); + + // Simulate reading the dictionary page followed by a data page + decoder_ = MakeTypedDecoder(Encoding::PLAIN, descr_.get()); + decoder_->SetData(dict_encoder->num_entries(), dict_buffer_->data(), + static_cast(dict_buffer_->size())); + + dict_decoder_ = MakeDictDecoder(descr_.get()); + dict_decoder_->SetDict(decoder_.get()); + dict_decoder_->SetData(num_values_, buffer_->data(), + static_cast(buffer_->size())); + } + + void TearDown() override {} + + protected: + std::unique_ptr descr_; + std::unique_ptr> dict_decoder_; + std::shared_ptr dict_buffer_; +}; + +TEST_F(TestDecodeArrowDict, DecodeDense) { + ::arrow::internal::ChunkedBinaryBuilder builder(static_cast(dict_buffer_->size()), + default_memory_pool()); + auto byte_array_decoder = dynamic_cast(dict_decoder_.get()); + ASSERT_NE(byte_array_decoder, nullptr); + auto actual_num_values = + byte_array_decoder->DecodeArrow(num_values_, 0, valid_bits_.data(), 0, &builder); + CheckDecodeDense(actual_num_values, builder); +} + +TEST_F(TestDecodeArrowDict, DecodeNonNullDense) { + ::arrow::internal::ChunkedBinaryBuilder builder(static_cast(dict_buffer_->size()), + default_memory_pool()); + auto byte_array_decoder = dynamic_cast(dict_decoder_.get()); + ASSERT_NE(byte_array_decoder, nullptr); + auto actual_num_values = byte_array_decoder->DecodeArrowNonNull(num_values_, &builder); + CheckDecodeDense(actual_num_values, builder); +} + +TEST_F(TestDecodeArrowDict, DecodeDictionary) { + ::arrow::BinaryDictionaryBuilder builder(default_memory_pool()); + auto byte_array_decoder = dynamic_cast(dict_decoder_.get()); + ASSERT_NE(byte_array_decoder, nullptr); + auto actual_num_values = + byte_array_decoder->DecodeArrow(num_values_, 0, valid_bits_.data(), 0, &builder); + CheckDecodeDict(actual_num_values, builder); +} - ASSERT_EQ(actual_num_values, num_values_); - std::shared_ptr<::arrow::Array> actual; - ASSERT_OK(builder.Finish(&actual)); - ASSERT_TRUE(actual->Equals(expected_dict_)) - << "Actual: " << actual << "\nExpected: " << expected_dict_; +TEST_F(TestDecodeArrowDict, DecodeNonNullDictionary) { + ::arrow::BinaryDictionaryBuilder builder(default_memory_pool()); + auto byte_array_decoder = dynamic_cast(dict_decoder_.get()); + ASSERT_NE(byte_array_decoder, nullptr); + auto actual_num_values = byte_array_decoder->DecodeArrowNonNull(num_values_, &builder); + CheckDecodeDict(actual_num_values, builder); } } // namespace test diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index e09f08d875a..f8e1931fd30 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1015,7 +1015,8 @@ class DictByteArrayDecoder : public DictDecoderImpl, int32_t indices_buffer[buffer_size]; int values_decoded = 0; while (values_decoded < num_values) { - int num_indices = idx_decoder_.GetBatch(indices_buffer, buffer_size); + int32_t batch_size = std::min(buffer_size, num_values - values_decoded); + int num_indices = idx_decoder_.GetBatch(indices_buffer, batch_size); if (num_indices == 0) break; for (int i = 0; i < num_indices; ++i) { const auto& val = dictionary_[indices_buffer[i]]; From ef55081a00b95d7c86e06ac7e6c5007f9b59a024 Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Mon, 18 Feb 2019 13:00:25 -0500 Subject: [PATCH 05/35] added benchmarks for decoding plain encoded data using arrow builders --- cpp/src/parquet/encoding-benchmark.cc | 92 +++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/cpp/src/parquet/encoding-benchmark.cc b/cpp/src/parquet/encoding-benchmark.cc index 8031aeb7ce1..fece1f26733 100644 --- a/cpp/src/parquet/encoding-benchmark.cc +++ b/cpp/src/parquet/encoding-benchmark.cc @@ -17,6 +17,11 @@ #include "benchmark/benchmark.h" +#include "arrow/array.h" +#include "arrow/array/builder_binary.h" +#include "arrow/array/builder_dict.h" +#include "arrow/type.h" + #include "parquet/encoding.h" #include "parquet/schema.h" #include "parquet/util/memory.h" @@ -163,4 +168,91 @@ static void BM_DictDecodingInt64_literals(benchmark::State& state) { BENCHMARK(BM_DictDecodingInt64_literals)->Range(1024, 65536); +class BM_PlainDecodingByteArray : public ::benchmark::Fixture { + public: + void SetUp(const ::benchmark::State& state) override { + num_values_ = static_cast(state.range()); + input_string_ = "foo"; + byte_array_ = ByteArray(static_cast(input_string_.size()), + reinterpret_cast(input_string_.data())); + values_ = std::vector(num_values_, byte_array_); + valid_bits_ = + std::vector(::arrow::BitUtil::BytesForBits(num_values_) + 1, 255); + + auto encoder = MakeTypedEncoder(Encoding::PLAIN); + encoder->Put(values_.data(), num_values_); + buffer_ = encoder->FlushValues(); + } + + void TearDown(const ::benchmark::State& state) override {} + + protected: + int num_values_; + std::string input_string_; + ByteArray byte_array_; + std::vector values_; + std::vector valid_bits_; + std::shared_ptr buffer_; +}; + +BENCHMARK_DEFINE_F(BM_PlainDecodingByteArray, DecodeArrow_Dense) +(benchmark::State& state) { + while (state.KeepRunning()) { + auto decoder = MakeTypedDecoder(Encoding::PLAIN); + decoder->SetData(num_values_, buffer_->data(), static_cast(buffer_->size())); + ::arrow::internal::ChunkedBinaryBuilder builder(static_cast(buffer_->size()), + ::arrow::default_memory_pool()); + decoder->DecodeArrow(num_values_, 0, valid_bits_.data(), 0, &builder); + } + + state.SetBytesProcessed(state.iterations() * state.range(0) * input_string_.length()); +} + +BENCHMARK_REGISTER_F(BM_PlainDecodingByteArray, DecodeArrow_Dense)->Range(1024, 65536); + +BENCHMARK_DEFINE_F(BM_PlainDecodingByteArray, DecodeArrowNonNull_Dense) +(benchmark::State& state) { + while (state.KeepRunning()) { + auto decoder = MakeTypedDecoder(Encoding::PLAIN); + decoder->SetData(num_values_, buffer_->data(), static_cast(buffer_->size())); + ::arrow::internal::ChunkedBinaryBuilder builder(static_cast(buffer_->size()), + ::arrow::default_memory_pool()); + decoder->DecodeArrowNonNull(num_values_, &builder); + } + + state.SetBytesProcessed(state.iterations() * state.range(0) * input_string_.length()); +} + +BENCHMARK_REGISTER_F(BM_PlainDecodingByteArray, DecodeArrowNonNull_Dense) + ->Range(1024, 65536); + +BENCHMARK_DEFINE_F(BM_PlainDecodingByteArray, DecodeArrow_Dict) +(benchmark::State& state) { + while (state.KeepRunning()) { + auto decoder = MakeTypedDecoder(Encoding::PLAIN); + decoder->SetData(num_values_, buffer_->data(), static_cast(buffer_->size())); + ::arrow::BinaryDictionaryBuilder builder(::arrow::default_memory_pool()); + decoder->DecodeArrow(num_values_, 0, valid_bits_.data(), 0, &builder); + } + + state.SetBytesProcessed(state.iterations() * state.range(0) * input_string_.length()); +} + +BENCHMARK_REGISTER_F(BM_PlainDecodingByteArray, DecodeArrow_Dict)->Range(1024, 65536); + +BENCHMARK_DEFINE_F(BM_PlainDecodingByteArray, DecodeArrowNonNull_Dict) +(benchmark::State& state) { + while (state.KeepRunning()) { + auto decoder = MakeTypedDecoder(Encoding::PLAIN); + decoder->SetData(num_values_, buffer_->data(), static_cast(buffer_->size())); + ::arrow::BinaryDictionaryBuilder builder(::arrow::default_memory_pool()); + decoder->DecodeArrowNonNull(num_values_, &builder); + } + + state.SetBytesProcessed(state.iterations() * state.range(0) * input_string_.length()); +} + +BENCHMARK_REGISTER_F(BM_PlainDecodingByteArray, DecodeArrowNonNull_Dict) + ->Range(1024, 65536); + } // namespace parquet From 89de5d5be32ecf8e859a7adb2a4cbb63ffac0f2c Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Tue, 19 Feb 2019 14:08:25 -0500 Subject: [PATCH 06/35] rework data generation so that decoding benchmark runs using a more realistic dataset --- cpp/src/parquet/encoding-benchmark.cc | 64 ++++++++++++++++++++++----- 1 file changed, 54 insertions(+), 10 deletions(-) diff --git a/cpp/src/parquet/encoding-benchmark.cc b/cpp/src/parquet/encoding-benchmark.cc index fece1f26733..d8a15a4d1f6 100644 --- a/cpp/src/parquet/encoding-benchmark.cc +++ b/cpp/src/parquet/encoding-benchmark.cc @@ -20,12 +20,16 @@ #include "arrow/array.h" #include "arrow/array/builder_binary.h" #include "arrow/array/builder_dict.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/util.h" #include "arrow/type.h" #include "parquet/encoding.h" #include "parquet/schema.h" #include "parquet/util/memory.h" +#include + using arrow::default_memory_pool; using arrow::MemoryPool; @@ -168,14 +172,54 @@ static void BM_DictDecodingInt64_literals(benchmark::State& state) { BENCHMARK(BM_DictDecodingInt64_literals)->Range(1024, 65536); +std::shared_ptr<::arrow::Array> MakeRandomStringsWithRepeats(size_t num_unique, + size_t num_values) { + const int64_t min_length = 2; + const int64_t max_length = 10; + std::vector buffer(max_length); + std::vector dictionary(num_unique); + + uint32_t seed = 0; + std::default_random_engine gen(seed); + std::uniform_int_distribution length_dist(min_length, max_length); + + std::generate(dictionary.begin(), dictionary.end(), [&] { + auto length = length_dist(gen); + ::arrow::random_ascii(length, seed++, buffer.data()); + return std::string(buffer.begin(), buffer.begin() + length); + }); + + std::uniform_int_distribution indices_dist(0, num_unique - 1); + ::arrow::StringBuilder builder; + + for (size_t i = 0; i < num_values; i++) { + const auto index = indices_dist(gen); + const auto value = dictionary[index]; + ABORT_NOT_OK(builder.Append(value)); + } + + std::shared_ptr<::arrow::Array> result; + ABORT_NOT_OK(builder.Finish(&result)); + return result; +} + class BM_PlainDecodingByteArray : public ::benchmark::Fixture { public: void SetUp(const ::benchmark::State& state) override { num_values_ = static_cast(state.range()); - input_string_ = "foo"; - byte_array_ = ByteArray(static_cast(input_string_.size()), - reinterpret_cast(input_string_.data())); - values_ = std::vector(num_values_, byte_array_); + + input_array_ = MakeRandomStringsWithRepeats(num_values_ / 8, num_values_); + const auto& binary_array = static_cast(*input_array_); + values_ = std::vector(); + values_.reserve(num_values_); + total_size_ = 0; + + for (int64_t i = 0; i < binary_array.length(); i++) { + auto view = binary_array.GetView(i); + values_.emplace_back(view.length(), reinterpret_cast(view.data())); + total_size_ += view.length(); + } + valid_bits_ = std::vector(::arrow::BitUtil::BytesForBits(num_values_) + 1, 255); @@ -188,8 +232,8 @@ class BM_PlainDecodingByteArray : public ::benchmark::Fixture { protected: int num_values_; - std::string input_string_; - ByteArray byte_array_; + std::shared_ptr<::arrow::Array> input_array_; + uint64_t total_size_; std::vector values_; std::vector valid_bits_; std::shared_ptr buffer_; @@ -205,7 +249,7 @@ BENCHMARK_DEFINE_F(BM_PlainDecodingByteArray, DecodeArrow_Dense) decoder->DecodeArrow(num_values_, 0, valid_bits_.data(), 0, &builder); } - state.SetBytesProcessed(state.iterations() * state.range(0) * input_string_.length()); + state.SetBytesProcessed(state.iterations() * total_size_); } BENCHMARK_REGISTER_F(BM_PlainDecodingByteArray, DecodeArrow_Dense)->Range(1024, 65536); @@ -220,7 +264,7 @@ BENCHMARK_DEFINE_F(BM_PlainDecodingByteArray, DecodeArrowNonNull_Dense) decoder->DecodeArrowNonNull(num_values_, &builder); } - state.SetBytesProcessed(state.iterations() * state.range(0) * input_string_.length()); + state.SetBytesProcessed(state.iterations() * total_size_); } BENCHMARK_REGISTER_F(BM_PlainDecodingByteArray, DecodeArrowNonNull_Dense) @@ -235,7 +279,7 @@ BENCHMARK_DEFINE_F(BM_PlainDecodingByteArray, DecodeArrow_Dict) decoder->DecodeArrow(num_values_, 0, valid_bits_.data(), 0, &builder); } - state.SetBytesProcessed(state.iterations() * state.range(0) * input_string_.length()); + state.SetBytesProcessed(state.iterations() * total_size_); } BENCHMARK_REGISTER_F(BM_PlainDecodingByteArray, DecodeArrow_Dict)->Range(1024, 65536); @@ -249,7 +293,7 @@ BENCHMARK_DEFINE_F(BM_PlainDecodingByteArray, DecodeArrowNonNull_Dict) decoder->DecodeArrowNonNull(num_values_, &builder); } - state.SetBytesProcessed(state.iterations() * state.range(0) * input_string_.length()); + state.SetBytesProcessed(state.iterations() * total_size_); } BENCHMARK_REGISTER_F(BM_PlainDecodingByteArray, DecodeArrowNonNull_Dict) From 39a5f1994d80427009e919dc8c238cbe435d08f8 Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Thu, 21 Feb 2019 05:31:33 -0500 Subject: [PATCH 07/35] fix appveyor windows failure --- cpp/src/parquet/encoding-benchmark.cc | 3 ++- cpp/src/parquet/encoding-test.cc | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/encoding-benchmark.cc b/cpp/src/parquet/encoding-benchmark.cc index d8a15a4d1f6..92875bdfee4 100644 --- a/cpp/src/parquet/encoding-benchmark.cc +++ b/cpp/src/parquet/encoding-benchmark.cc @@ -216,7 +216,8 @@ class BM_PlainDecodingByteArray : public ::benchmark::Fixture { for (int64_t i = 0; i < binary_array.length(); i++) { auto view = binary_array.GetView(i); - values_.emplace_back(view.length(), reinterpret_cast(view.data())); + values_.emplace_back(static_cast(view.length()), + reinterpret_cast(view.data())); total_size_ += view.length(); } diff --git a/cpp/src/parquet/encoding-test.cc b/cpp/src/parquet/encoding-test.cc index 089fd46b466..01de15ff7c3 100644 --- a/cpp/src/parquet/encoding-test.cc +++ b/cpp/src/parquet/encoding-test.cc @@ -341,7 +341,7 @@ class TestDecodeArrow : public ::testing::Test { const auto& binary_array = static_cast(*expected_array_); for (int64_t i = 0; i < binary_array.length(); i++) { auto view = binary_array.GetView(i); - input_data_.emplace_back(view.length(), + input_data_.emplace_back(static_cast(view.length()), reinterpret_cast(view.data())); } From 4fbcf1fab32cb3c5be8c8a212c95666661b04095 Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Thu, 21 Feb 2019 11:04:47 -0500 Subject: [PATCH 08/35] fix loop increment in templated PlainByteArrayDecoder::DecodeArrow method --- cpp/src/parquet/encoding.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index f8e1931fd30..edcd5ecc0b6 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -754,11 +754,11 @@ class PlainByteArrayDecoder : public PlainDecoder, data += increment; data_size -= increment; bytes_decoded += increment; - ++i; } else { ARROW_RETURN_NOT_OK(out->AppendNull()); } bit_reader.Next(); + ++i; } data_ += bytes_decoded; From f234ca2a2268de56dba1ce93f924613165555cb0 Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Fri, 22 Feb 2019 11:14:55 -0500 Subject: [PATCH 09/35] respond to code review feedback - many readability fixes in benchmark and tests --- cpp/src/parquet/encoding-benchmark.cc | 39 ++++++++++++++++++--------- cpp/src/parquet/encoding-test.cc | 35 ++++++++++++------------ 2 files changed, 43 insertions(+), 31 deletions(-) diff --git a/cpp/src/parquet/encoding-benchmark.cc b/cpp/src/parquet/encoding-benchmark.cc index 92875bdfee4..feb85a553c8 100644 --- a/cpp/src/parquet/encoding-benchmark.cc +++ b/cpp/src/parquet/encoding-benchmark.cc @@ -33,6 +33,12 @@ using arrow::default_memory_pool; using arrow::MemoryPool; +namespace { +// The min/max number of values used to drive each family of encoding benchmarks +constexpr int MIN_RANGE = 1024; +constexpr int MAX_RANGE = 65536; +} // namespace + namespace parquet { using schema::PrimitiveNode; @@ -55,7 +61,7 @@ static void BM_PlainEncodingBoolean(benchmark::State& state) { state.SetBytesProcessed(state.iterations() * state.range(0) * sizeof(bool)); } -BENCHMARK(BM_PlainEncodingBoolean)->Range(1024, 65536); +BENCHMARK(BM_PlainEncodingBoolean)->Range(MIN_RANGE, MAX_RANGE); static void BM_PlainDecodingBoolean(benchmark::State& state) { std::vector values(state.range(0), true); @@ -76,7 +82,7 @@ static void BM_PlainDecodingBoolean(benchmark::State& state) { delete[] output; } -BENCHMARK(BM_PlainDecodingBoolean)->Range(1024, 65536); +BENCHMARK(BM_PlainDecodingBoolean)->Range(MIN_RANGE, MAX_RANGE); static void BM_PlainEncodingInt64(benchmark::State& state) { std::vector values(state.range(0), 64); @@ -88,7 +94,7 @@ static void BM_PlainEncodingInt64(benchmark::State& state) { state.SetBytesProcessed(state.iterations() * state.range(0) * sizeof(int64_t)); } -BENCHMARK(BM_PlainEncodingInt64)->Range(1024, 65536); +BENCHMARK(BM_PlainEncodingInt64)->Range(MIN_RANGE, MAX_RANGE); static void BM_PlainDecodingInt64(benchmark::State& state) { std::vector values(state.range(0), 64); @@ -105,7 +111,7 @@ static void BM_PlainDecodingInt64(benchmark::State& state) { state.SetBytesProcessed(state.iterations() * state.range(0) * sizeof(int64_t)); } -BENCHMARK(BM_PlainDecodingInt64)->Range(1024, 65536); +BENCHMARK(BM_PlainDecodingInt64)->Range(MIN_RANGE, MAX_RANGE); template static void DecodeDict(std::vector& values, @@ -157,7 +163,7 @@ static void BM_DictDecodingInt64_repeats(benchmark::State& state) { DecodeDict(values, state); } -BENCHMARK(BM_DictDecodingInt64_repeats)->Range(1024, 65536); +BENCHMARK(BM_DictDecodingInt64_repeats)->Range(MIN_RANGE, MAX_RANGE); static void BM_DictDecodingInt64_literals(benchmark::State& state) { typedef Int64Type Type; @@ -170,7 +176,7 @@ static void BM_DictDecodingInt64_literals(benchmark::State& state) { DecodeDict(values, state); } -BENCHMARK(BM_DictDecodingInt64_literals)->Range(1024, 65536); +BENCHMARK(BM_DictDecodingInt64_literals)->Range(MIN_RANGE, MAX_RANGE); std::shared_ptr<::arrow::Array> MakeRandomStringsWithRepeats(size_t num_unique, size_t num_values) { @@ -191,6 +197,7 @@ std::shared_ptr<::arrow::Array> MakeRandomStringsWithRepeats(size_t num_unique, std::uniform_int_distribution indices_dist(0, num_unique - 1); ::arrow::StringBuilder builder; + ABORT_NOT_OK(builder.Reserve(num_values)); for (size_t i = 0; i < num_values; i++) { const auto index = indices_dist(gen); @@ -207,8 +214,14 @@ class BM_PlainDecodingByteArray : public ::benchmark::Fixture { public: void SetUp(const ::benchmark::State& state) override { num_values_ = static_cast(state.range()); + constexpr int repeat_factor = 8; + input_array_ = MakeRandomStringsWithRepeats(num_values_ / repeat_factor, num_values_); + PlainEncodeInputArray(); + } + + void TearDown(const ::benchmark::State& state) override {} - input_array_ = MakeRandomStringsWithRepeats(num_values_ / 8, num_values_); + void PlainEncodeInputArray() { const auto& binary_array = static_cast(*input_array_); values_ = std::vector(); values_.reserve(num_values_); @@ -229,8 +242,6 @@ class BM_PlainDecodingByteArray : public ::benchmark::Fixture { buffer_ = encoder->FlushValues(); } - void TearDown(const ::benchmark::State& state) override {} - protected: int num_values_; std::shared_ptr<::arrow::Array> input_array_; @@ -253,7 +264,8 @@ BENCHMARK_DEFINE_F(BM_PlainDecodingByteArray, DecodeArrow_Dense) state.SetBytesProcessed(state.iterations() * total_size_); } -BENCHMARK_REGISTER_F(BM_PlainDecodingByteArray, DecodeArrow_Dense)->Range(1024, 65536); +BENCHMARK_REGISTER_F(BM_PlainDecodingByteArray, DecodeArrow_Dense) + ->Range(MIN_RANGE, MAX_RANGE); BENCHMARK_DEFINE_F(BM_PlainDecodingByteArray, DecodeArrowNonNull_Dense) (benchmark::State& state) { @@ -269,7 +281,7 @@ BENCHMARK_DEFINE_F(BM_PlainDecodingByteArray, DecodeArrowNonNull_Dense) } BENCHMARK_REGISTER_F(BM_PlainDecodingByteArray, DecodeArrowNonNull_Dense) - ->Range(1024, 65536); + ->Range(MIN_RANGE, MAX_RANGE); BENCHMARK_DEFINE_F(BM_PlainDecodingByteArray, DecodeArrow_Dict) (benchmark::State& state) { @@ -283,7 +295,8 @@ BENCHMARK_DEFINE_F(BM_PlainDecodingByteArray, DecodeArrow_Dict) state.SetBytesProcessed(state.iterations() * total_size_); } -BENCHMARK_REGISTER_F(BM_PlainDecodingByteArray, DecodeArrow_Dict)->Range(1024, 65536); +BENCHMARK_REGISTER_F(BM_PlainDecodingByteArray, DecodeArrow_Dict) + ->Range(MIN_RANGE, MAX_RANGE); BENCHMARK_DEFINE_F(BM_PlainDecodingByteArray, DecodeArrowNonNull_Dict) (benchmark::State& state) { @@ -298,6 +311,6 @@ BENCHMARK_DEFINE_F(BM_PlainDecodingByteArray, DecodeArrowNonNull_Dict) } BENCHMARK_REGISTER_F(BM_PlainDecodingByteArray, DecodeArrowNonNull_Dict) - ->Range(1024, 65536); + ->Range(MIN_RANGE, MAX_RANGE); } // namespace parquet diff --git a/cpp/src/parquet/encoding-test.cc b/cpp/src/parquet/encoding-test.cc index 01de15ff7c3..3c476b294a6 100644 --- a/cpp/src/parquet/encoding-test.cc +++ b/cpp/src/parquet/encoding-test.cc @@ -330,16 +330,17 @@ class TestDecodeArrow : public ::testing::Test { void InitDataInputs() { // Build a dictionary array and its dense representation for testing building both auto dict_values = ::arrow::ArrayFromJSON(::arrow::binary(), "[\"foo\", \"bar\"]"); - auto dtype = ::arrow::dictionary(::arrow::int8(), dict_values); - auto indices = ::arrow::ArrayFromJSON(::arrow::int8(), "[0, 1, 0, 0]"); - ASSERT_OK(::arrow::DictionaryArray::FromArrays(dtype, indices, &expected_dict_)); - expected_array_ = + auto dict_type = ::arrow::dictionary(::arrow::int8(), dict_values); + auto dict_indices = ::arrow::ArrayFromJSON(::arrow::int8(), "[0, 1, 0, 0]"); + ASSERT_OK( + ::arrow::DictionaryArray::FromArrays(dict_type, dict_indices, &expected_dict_)); + expected_dense_ = ArrayFromJSON(::arrow::binary(), "[\"foo\", \"bar\", \"foo\", \"foo\"]"); - num_values_ = static_cast(expected_array_->length()); + num_values_ = static_cast(expected_dense_->length()); - // arrow::BinaryType maps to parquet::ByteArray - const auto& binary_array = static_cast(*expected_array_); - for (int64_t i = 0; i < binary_array.length(); i++) { + // Initialize input_data_ for the encoder from the expected_array_ values + const auto& binary_array = static_cast(*expected_dense_); + for (int64_t i = 0; i < binary_array.length(); ++i) { auto view = binary_array.GetView(i); input_data_.emplace_back(static_cast(view.length()), reinterpret_cast(view.data())); @@ -357,8 +358,7 @@ class TestDecodeArrow : public ::testing::Test { ::arrow::ArrayVector actual_vec; ASSERT_OK(builder.Finish(&actual_vec)); ASSERT_EQ(actual_vec.size(), 1); - ASSERT_TRUE(actual_vec[0]->Equals(expected_array_)) - << "Actual: " << actual_vec[0] << "\nExpected: " << expected_array_; + ASSERT_ARRAYS_EQUAL(*actual_vec[0], *expected_dense_); } void CheckDecodeDict(const int actual_num_values, @@ -366,13 +366,12 @@ class TestDecodeArrow : public ::testing::Test { ASSERT_EQ(actual_num_values, num_values_); std::shared_ptr<::arrow::Array> actual; ASSERT_OK(builder.Finish(&actual)); - ASSERT_TRUE(actual->Equals(expected_dict_)) - << "Actual: " << actual << "\nExpected: " << expected_dict_; + ASSERT_ARRAYS_EQUAL(*actual, *expected_dict_); } protected: std::shared_ptr<::arrow::Array> expected_dict_; - std::shared_ptr<::arrow::Array> expected_array_; + std::shared_ptr<::arrow::Array> expected_dense_; int num_values_; vector input_data_; vector valid_bits_; @@ -388,7 +387,7 @@ class TestDecodeArrowPlain : public TestDecodeArrow { void SetupEncoderDecoder() override { encoder_ = MakeTypedEncoder(Encoding::PLAIN); decoder_ = MakeTypedDecoder(Encoding::PLAIN); - encoder_->Put(input_data_.data(), num_values_); + ASSERT_NO_THROW(encoder_->Put(input_data_.data(), num_values_)); buffer_ = encoder_->FlushValues(); decoder_->SetData(num_values_, buffer_->data(), static_cast(buffer_->size())); } @@ -411,14 +410,14 @@ TEST_F(TestDecodeArrowPlain, DecodeNonNullDense) { CheckDecodeDense(actual_num_values, builder); } -TEST_F(TestDecodeArrowPlain, DecodeDictionary) { +TEST_F(TestDecodeArrowPlain, DecodeDict) { ::arrow::BinaryDictionaryBuilder builder(default_memory_pool()); auto actual_num_values = decoder_->DecodeArrow(num_values_, 0, valid_bits_.data(), 0, &builder); CheckDecodeDict(actual_num_values, builder); } -TEST_F(TestDecodeArrowPlain, DecodeNonNullDictionary) { +TEST_F(TestDecodeArrowPlain, DecodeNonNullDict) { ::arrow::BinaryDictionaryBuilder builder(default_memory_pool()); auto actual_num_values = decoder_->DecodeArrowNonNull(num_values_, &builder); CheckDecodeDict(actual_num_values, builder); @@ -480,7 +479,7 @@ TEST_F(TestDecodeArrowDict, DecodeNonNullDense) { CheckDecodeDense(actual_num_values, builder); } -TEST_F(TestDecodeArrowDict, DecodeDictionary) { +TEST_F(TestDecodeArrowDict, DecodeDict) { ::arrow::BinaryDictionaryBuilder builder(default_memory_pool()); auto byte_array_decoder = dynamic_cast(dict_decoder_.get()); ASSERT_NE(byte_array_decoder, nullptr); @@ -489,7 +488,7 @@ TEST_F(TestDecodeArrowDict, DecodeDictionary) { CheckDecodeDict(actual_num_values, builder); } -TEST_F(TestDecodeArrowDict, DecodeNonNullDictionary) { +TEST_F(TestDecodeArrowDict, DecodeNonNullDict) { ::arrow::BinaryDictionaryBuilder builder(default_memory_pool()); auto byte_array_decoder = dynamic_cast(dict_decoder_.get()); ASSERT_NE(byte_array_decoder, nullptr); From 84df23bfa8deaad2984ee0b5160698251405784c Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Fri, 22 Feb 2019 11:57:24 -0500 Subject: [PATCH 10/35] prefer range-based for loop to KeepRunning while loop pattern --- cpp/src/parquet/encoding-benchmark.cc | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/src/parquet/encoding-benchmark.cc b/cpp/src/parquet/encoding-benchmark.cc index feb85a553c8..6434b1b9ffa 100644 --- a/cpp/src/parquet/encoding-benchmark.cc +++ b/cpp/src/parquet/encoding-benchmark.cc @@ -54,7 +54,7 @@ static void BM_PlainEncodingBoolean(benchmark::State& state) { auto encoder = MakeEncoder(Type::BOOLEAN, Encoding::PLAIN); auto typed_encoder = dynamic_cast(encoder.get()); - while (state.KeepRunning()) { + for (auto _ : state) { typed_encoder->Put(values, static_cast(values.size())); typed_encoder->FlushValues(); } @@ -71,7 +71,7 @@ static void BM_PlainDecodingBoolean(benchmark::State& state) { typed_encoder->Put(values, static_cast(values.size())); std::shared_ptr buf = encoder->FlushValues(); - while (state.KeepRunning()) { + for (auto _ : state) { auto decoder = MakeTypedDecoder(Encoding::PLAIN); decoder->SetData(static_cast(values.size()), buf->data(), static_cast(buf->size())); @@ -87,7 +87,7 @@ BENCHMARK(BM_PlainDecodingBoolean)->Range(MIN_RANGE, MAX_RANGE); static void BM_PlainEncodingInt64(benchmark::State& state) { std::vector values(state.range(0), 64); auto encoder = MakeTypedEncoder(Encoding::PLAIN); - while (state.KeepRunning()) { + for (auto _ : state) { encoder->Put(values.data(), static_cast(values.size())); encoder->FlushValues(); } @@ -102,7 +102,7 @@ static void BM_PlainDecodingInt64(benchmark::State& state) { encoder->Put(values.data(), static_cast(values.size())); std::shared_ptr buf = encoder->FlushValues(); - while (state.KeepRunning()) { + for (auto _ : state) { auto decoder = MakeTypedDecoder(Encoding::PLAIN); decoder->SetData(static_cast(values.size()), buf->data(), static_cast(buf->size())); @@ -141,7 +141,7 @@ static void DecodeDict(std::vector& values, PARQUET_THROW_NOT_OK(indices->Resize(actual_bytes)); - while (state.KeepRunning()) { + for (auto _ : state) { auto dict_decoder = MakeTypedDecoder(Encoding::PLAIN, descr.get()); dict_decoder->SetData(dict_traits->num_entries(), dict_buffer->data(), static_cast(dict_buffer->size())); @@ -253,7 +253,7 @@ class BM_PlainDecodingByteArray : public ::benchmark::Fixture { BENCHMARK_DEFINE_F(BM_PlainDecodingByteArray, DecodeArrow_Dense) (benchmark::State& state) { - while (state.KeepRunning()) { + for (auto _ : state) { auto decoder = MakeTypedDecoder(Encoding::PLAIN); decoder->SetData(num_values_, buffer_->data(), static_cast(buffer_->size())); ::arrow::internal::ChunkedBinaryBuilder builder(static_cast(buffer_->size()), @@ -269,7 +269,7 @@ BENCHMARK_REGISTER_F(BM_PlainDecodingByteArray, DecodeArrow_Dense) BENCHMARK_DEFINE_F(BM_PlainDecodingByteArray, DecodeArrowNonNull_Dense) (benchmark::State& state) { - while (state.KeepRunning()) { + for (auto _ : state) { auto decoder = MakeTypedDecoder(Encoding::PLAIN); decoder->SetData(num_values_, buffer_->data(), static_cast(buffer_->size())); ::arrow::internal::ChunkedBinaryBuilder builder(static_cast(buffer_->size()), @@ -285,7 +285,7 @@ BENCHMARK_REGISTER_F(BM_PlainDecodingByteArray, DecodeArrowNonNull_Dense) BENCHMARK_DEFINE_F(BM_PlainDecodingByteArray, DecodeArrow_Dict) (benchmark::State& state) { - while (state.KeepRunning()) { + for (auto _ : state) { auto decoder = MakeTypedDecoder(Encoding::PLAIN); decoder->SetData(num_values_, buffer_->data(), static_cast(buffer_->size())); ::arrow::BinaryDictionaryBuilder builder(::arrow::default_memory_pool()); @@ -300,7 +300,7 @@ BENCHMARK_REGISTER_F(BM_PlainDecodingByteArray, DecodeArrow_Dict) BENCHMARK_DEFINE_F(BM_PlainDecodingByteArray, DecodeArrowNonNull_Dict) (benchmark::State& state) { - while (state.KeepRunning()) { + for (auto _ : state) { auto decoder = MakeTypedDecoder(Encoding::PLAIN); decoder->SetData(num_values_, buffer_->data(), static_cast(buffer_->size())); ::arrow::BinaryDictionaryBuilder builder(::arrow::default_memory_pool()); From 78eddb8afacf6abb34ace0d43561eabb5caf40d5 Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Tue, 26 Feb 2019 10:47:48 -0500 Subject: [PATCH 11/35] Use value parameterization in decoding tests --- cpp/src/parquet/encoding-test.cc | 83 +++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 28 deletions(-) diff --git a/cpp/src/parquet/encoding-test.cc b/cpp/src/parquet/encoding-test.cc index 3c476b294a6..476d78b084a 100644 --- a/cpp/src/parquet/encoding-test.cc +++ b/cpp/src/parquet/encoding-test.cc @@ -40,6 +40,13 @@ using arrow::MemoryPool; using std::string; using std::vector; +// TODO(hatemhelal): investigate whether this can be replaced with GTEST_SKIP in a future +// gtest release that contains https://github.com/google/googletest/pull/1544 +#define SKIP_TEST_IF(condition) \ + if (condition) { \ + return; \ + } + namespace parquet { namespace test { @@ -320,33 +327,38 @@ TEST(TestDictionaryEncoding, CannotDictDecodeBoolean) { // ---------------------------------------------------------------------- // Shared arrow builder decode tests -class TestDecodeArrow : public ::testing::Test { +class TestDecodeArrow : public ::testing::TestWithParam { public: void SetUp() override { - InitDataInputs(); + InitFromJSON(GetParam()); SetupEncoderDecoder(); } - void InitDataInputs() { - // Build a dictionary array and its dense representation for testing building both - auto dict_values = ::arrow::ArrayFromJSON(::arrow::binary(), "[\"foo\", \"bar\"]"); - auto dict_type = ::arrow::dictionary(::arrow::int8(), dict_values); - auto dict_indices = ::arrow::ArrayFromJSON(::arrow::int8(), "[0, 1, 0, 0]"); - ASSERT_OK( - ::arrow::DictionaryArray::FromArrays(dict_type, dict_indices, &expected_dict_)); - expected_dense_ = - ArrayFromJSON(::arrow::binary(), "[\"foo\", \"bar\", \"foo\", \"foo\"]"); + void InitFromJSON(const char* json) { + // Use input JSON to initialize the dense array + expected_dense_ = ::arrow::ArrayFromJSON(::arrow::binary(), json); num_values_ = static_cast(expected_dense_->length()); + null_count_ = static_cast(expected_dense_->null_count()); + valid_bits_ = expected_dense_->null_bitmap()->data(); - // Initialize input_data_ for the encoder from the expected_array_ values + // Use builder to create the expected dictionary encoded array as well as initialize + // input_data_ for the encoder from the expected_array_ values + ::arrow::BinaryDictionaryBuilder builder(default_memory_pool()); const auto& binary_array = static_cast(*expected_dense_); + for (int64_t i = 0; i < binary_array.length(); ++i) { auto view = binary_array.GetView(i); input_data_.emplace_back(static_cast(view.length()), reinterpret_cast(view.data())); + + if (binary_array.IsNull(i)) { + ASSERT_OK(builder.AppendNull()); + } else { + ASSERT_OK(builder.Append(view)); + } } - valid_bits_ = vector(::arrow::BitUtil::BytesForBits(num_values_) + 1, 255); + ASSERT_OK(builder.Finish(&expected_dict_)); } // Setup encoder/decoder pair for testing with @@ -373,8 +385,9 @@ class TestDecodeArrow : public ::testing::Test { std::shared_ptr<::arrow::Array> expected_dict_; std::shared_ptr<::arrow::Array> expected_dense_; int num_values_; + int null_count_; vector input_data_; - vector valid_bits_; + const uint8_t* valid_bits_; std::unique_ptr encoder_; std::unique_ptr decoder_; std::shared_ptr buffer_; @@ -382,7 +395,7 @@ class TestDecodeArrow : public ::testing::Test { // ---------------------------------------------------------------------- // Arrow builder decode tests for PlainByteArrayDecoder -class TestDecodeArrowPlain : public TestDecodeArrow { +class FromPlainEncoding : public TestDecodeArrow { public: void SetupEncoderDecoder() override { encoder_ = MakeTypedEncoder(Encoding::PLAIN); @@ -395,29 +408,33 @@ class TestDecodeArrowPlain : public TestDecodeArrow { void TearDown() override {} }; -TEST_F(TestDecodeArrowPlain, DecodeDense) { +TEST_P(FromPlainEncoding, DecodeDense) { ::arrow::internal::ChunkedBinaryBuilder builder(static_cast(buffer_->size()), default_memory_pool()); auto actual_num_values = - decoder_->DecodeArrow(num_values_, 0, valid_bits_.data(), 0, &builder); + decoder_->DecodeArrow(num_values_, null_count_, valid_bits_, 0, &builder); CheckDecodeDense(actual_num_values, builder); } -TEST_F(TestDecodeArrowPlain, DecodeNonNullDense) { +TEST_P(FromPlainEncoding, DecodeNonNullDense) { + // Skip this test if input data contains nulls (optionals) + SKIP_TEST_IF(null_count_ > 0) ::arrow::internal::ChunkedBinaryBuilder builder(static_cast(buffer_->size()), default_memory_pool()); auto actual_num_values = decoder_->DecodeArrowNonNull(num_values_, &builder); CheckDecodeDense(actual_num_values, builder); } -TEST_F(TestDecodeArrowPlain, DecodeDict) { +TEST_P(FromPlainEncoding, DecodeDict) { ::arrow::BinaryDictionaryBuilder builder(default_memory_pool()); auto actual_num_values = - decoder_->DecodeArrow(num_values_, 0, valid_bits_.data(), 0, &builder); + decoder_->DecodeArrow(num_values_, null_count_, valid_bits_, 0, &builder); CheckDecodeDict(actual_num_values, builder); } -TEST_F(TestDecodeArrowPlain, DecodeNonNullDict) { +TEST_P(FromPlainEncoding, DecodeNonNullDict) { + // Skip this test if input data contains nulls (optionals) + SKIP_TEST_IF(null_count_ > 0) ::arrow::BinaryDictionaryBuilder builder(default_memory_pool()); auto actual_num_values = decoder_->DecodeArrowNonNull(num_values_, &builder); CheckDecodeDict(actual_num_values, builder); @@ -425,7 +442,7 @@ TEST_F(TestDecodeArrowPlain, DecodeNonNullDict) { // ---------------------------------------------------------------------- // Arrow builder decode tests for DictByteArrayDecoder -class TestDecodeArrowDict : public TestDecodeArrow { +class FromDictEncoding : public TestDecodeArrow { public: void SetupEncoderDecoder() override { auto node = schema::ByteArray("name"); @@ -460,17 +477,19 @@ class TestDecodeArrowDict : public TestDecodeArrow { std::shared_ptr dict_buffer_; }; -TEST_F(TestDecodeArrowDict, DecodeDense) { +TEST_P(FromDictEncoding, DecodeDense) { ::arrow::internal::ChunkedBinaryBuilder builder(static_cast(dict_buffer_->size()), default_memory_pool()); auto byte_array_decoder = dynamic_cast(dict_decoder_.get()); ASSERT_NE(byte_array_decoder, nullptr); auto actual_num_values = - byte_array_decoder->DecodeArrow(num_values_, 0, valid_bits_.data(), 0, &builder); + byte_array_decoder->DecodeArrow(num_values_, null_count_, valid_bits_, 0, &builder); CheckDecodeDense(actual_num_values, builder); } -TEST_F(TestDecodeArrowDict, DecodeNonNullDense) { +TEST_P(FromDictEncoding, DecodeNonNullDense) { + // Skip this test if input data contains nulls (optionals) + SKIP_TEST_IF(null_count_ > 0) ::arrow::internal::ChunkedBinaryBuilder builder(static_cast(dict_buffer_->size()), default_memory_pool()); auto byte_array_decoder = dynamic_cast(dict_decoder_.get()); @@ -479,16 +498,18 @@ TEST_F(TestDecodeArrowDict, DecodeNonNullDense) { CheckDecodeDense(actual_num_values, builder); } -TEST_F(TestDecodeArrowDict, DecodeDict) { +TEST_P(FromDictEncoding, DecodeDict) { ::arrow::BinaryDictionaryBuilder builder(default_memory_pool()); auto byte_array_decoder = dynamic_cast(dict_decoder_.get()); ASSERT_NE(byte_array_decoder, nullptr); auto actual_num_values = - byte_array_decoder->DecodeArrow(num_values_, 0, valid_bits_.data(), 0, &builder); + byte_array_decoder->DecodeArrow(num_values_, null_count_, valid_bits_, 0, &builder); CheckDecodeDict(actual_num_values, builder); } -TEST_F(TestDecodeArrowDict, DecodeNonNullDict) { +TEST_P(FromDictEncoding, DecodeNonNullDict) { + // Skip this test if input data contains nulls (optionals) + SKIP_TEST_IF(null_count_ > 0) ::arrow::BinaryDictionaryBuilder builder(default_memory_pool()); auto byte_array_decoder = dynamic_cast(dict_decoder_.get()); ASSERT_NE(byte_array_decoder, nullptr); @@ -496,6 +517,12 @@ TEST_F(TestDecodeArrowDict, DecodeNonNullDict) { CheckDecodeDict(actual_num_values, builder); } +const char* json[] = {"[\"foo\", \"bar\", \"foo\", \"foo\"]", + "[\"foo\", \"bar\", \"foo\", null]"}; + +INSTANTIATE_TEST_CASE_P(TestDecodeArrow, FromPlainEncoding, ::testing::ValuesIn(json)); + +INSTANTIATE_TEST_CASE_P(TestDecodeArrow, FromDictEncoding, ::testing::ValuesIn(json)); } // namespace test } // namespace parquet From ff380211c607fbdf266502a7d89712a65a6f9307 Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Wed, 27 Feb 2019 08:59:51 +0000 Subject: [PATCH 12/35] prefer mersenne twister prng over default one which is implemenation defined --- cpp/src/parquet/encoding-benchmark.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/encoding-benchmark.cc b/cpp/src/parquet/encoding-benchmark.cc index 6434b1b9ffa..dbb5335f97c 100644 --- a/cpp/src/parquet/encoding-benchmark.cc +++ b/cpp/src/parquet/encoding-benchmark.cc @@ -186,7 +186,7 @@ std::shared_ptr<::arrow::Array> MakeRandomStringsWithRepeats(size_t num_unique, std::vector dictionary(num_unique); uint32_t seed = 0; - std::default_random_engine gen(seed); + std::mt19937 gen(seed); std::uniform_int_distribution length_dist(min_length, max_length); std::generate(dictionary.begin(), dictionary.end(), [&] { From b16eaa978751cb2cebe417adb5d6ea1dddf725ba Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Wed, 27 Feb 2019 13:28:40 -0500 Subject: [PATCH 13/35] prefer default_random_engine to avoid potential slowdown with Mersenne Twister prng --- cpp/src/parquet/encoding-benchmark.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/encoding-benchmark.cc b/cpp/src/parquet/encoding-benchmark.cc index dbb5335f97c..6434b1b9ffa 100644 --- a/cpp/src/parquet/encoding-benchmark.cc +++ b/cpp/src/parquet/encoding-benchmark.cc @@ -186,7 +186,7 @@ std::shared_ptr<::arrow::Array> MakeRandomStringsWithRepeats(size_t num_unique, std::vector dictionary(num_unique); uint32_t seed = 0; - std::mt19937 gen(seed); + std::default_random_engine gen(seed); std::uniform_int_distribution length_dist(min_length, max_length); std::generate(dictionary.begin(), dictionary.end(), [&] { From 8f59198e8c6718c5adf23a2878e1cdd8ee562a30 Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Thu, 28 Feb 2019 07:59:22 +0000 Subject: [PATCH 14/35] Add overloads for decoding using a StringDictionaryBuilder --- cpp/src/parquet/encoding-test.cc | 76 +++++++++++++++++++++++++++----- cpp/src/parquet/encoding.cc | 30 +++++++++++++ cpp/src/parquet/encoding.h | 8 ++++ 3 files changed, 103 insertions(+), 11 deletions(-) diff --git a/cpp/src/parquet/encoding-test.cc b/cpp/src/parquet/encoding-test.cc index 476d78b084a..582fdacbab3 100644 --- a/cpp/src/parquet/encoding-test.cc +++ b/cpp/src/parquet/encoding-test.cc @@ -341,24 +341,36 @@ class TestDecodeArrow : public ::testing::TestWithParam { null_count_ = static_cast(expected_dense_->null_count()); valid_bits_ = expected_dense_->null_bitmap()->data(); - // Use builder to create the expected dictionary encoded array as well as initialize - // input_data_ for the encoder from the expected_array_ values - ::arrow::BinaryDictionaryBuilder builder(default_memory_pool()); + // Build both binary and string dictionary arrays from the dense array. + BuildDict<::arrow::BinaryDictionaryBuilder>(&expected_bin_dict_); + BuildDict<::arrow::StringDictionaryBuilder>(&expected_str_dict_); + + // Initialize input_data_ for the encoder from the expected_array_ values const auto& binary_array = static_cast(*expected_dense_); + input_data_.reserve(binary_array.length()); for (int64_t i = 0; i < binary_array.length(); ++i) { auto view = binary_array.GetView(i); input_data_.emplace_back(static_cast(view.length()), reinterpret_cast(view.data())); + } + } + + // Builds a dictionary encoded array from the dense array using BuilderType + template + void BuildDict(std::shared_ptr<::arrow::Array>* result) { + const auto& binary_array = static_cast(*expected_dense_); + BuilderType builder(default_memory_pool()); + for (int64_t i = 0; i < binary_array.length(); ++i) { if (binary_array.IsNull(i)) { ASSERT_OK(builder.AppendNull()); } else { - ASSERT_OK(builder.Append(view)); + ASSERT_OK(builder.Append(binary_array.GetView(i))); } } - ASSERT_OK(builder.Finish(&expected_dict_)); + ASSERT_OK(builder.Finish(result)); } // Setup encoder/decoder pair for testing with @@ -378,11 +390,20 @@ class TestDecodeArrow : public ::testing::TestWithParam { ASSERT_EQ(actual_num_values, num_values_); std::shared_ptr<::arrow::Array> actual; ASSERT_OK(builder.Finish(&actual)); - ASSERT_ARRAYS_EQUAL(*actual, *expected_dict_); + ASSERT_ARRAYS_EQUAL(*actual, *expected_bin_dict_); + } + + void CheckDecodeDict(const int actual_num_values, + ::arrow::StringDictionaryBuilder& builder) { + ASSERT_EQ(actual_num_values, num_values_); + std::shared_ptr<::arrow::Array> actual; + ASSERT_OK(builder.Finish(&actual)); + ASSERT_ARRAYS_EQUAL(*actual, *expected_str_dict_); } protected: - std::shared_ptr<::arrow::Array> expected_dict_; + std::shared_ptr<::arrow::Array> expected_bin_dict_; + std::shared_ptr<::arrow::Array> expected_str_dict_; std::shared_ptr<::arrow::Array> expected_dense_; int num_values_; int null_count_; @@ -425,14 +446,21 @@ TEST_P(FromPlainEncoding, DecodeNonNullDense) { CheckDecodeDense(actual_num_values, builder); } -TEST_P(FromPlainEncoding, DecodeDict) { +TEST_P(FromPlainEncoding, DecodeBinaryDict) { ::arrow::BinaryDictionaryBuilder builder(default_memory_pool()); auto actual_num_values = decoder_->DecodeArrow(num_values_, null_count_, valid_bits_, 0, &builder); CheckDecodeDict(actual_num_values, builder); } -TEST_P(FromPlainEncoding, DecodeNonNullDict) { +TEST_P(FromPlainEncoding, DecodeStringDict) { + ::arrow::StringDictionaryBuilder builder(default_memory_pool()); + auto actual_num_values = + decoder_->DecodeArrow(num_values_, null_count_, valid_bits_, 0, &builder); + CheckDecodeDict(actual_num_values, builder); +} + +TEST_P(FromPlainEncoding, DecodeNonNullBinaryDict) { // Skip this test if input data contains nulls (optionals) SKIP_TEST_IF(null_count_ > 0) ::arrow::BinaryDictionaryBuilder builder(default_memory_pool()); @@ -440,6 +468,14 @@ TEST_P(FromPlainEncoding, DecodeNonNullDict) { CheckDecodeDict(actual_num_values, builder); } +TEST_P(FromPlainEncoding, DecodeNonNullStringDict) { + // Skip this test if input data contains nulls (optionals) + SKIP_TEST_IF(null_count_ > 0) + ::arrow::StringDictionaryBuilder builder(default_memory_pool()); + auto actual_num_values = decoder_->DecodeArrowNonNull(num_values_, &builder); + CheckDecodeDict(actual_num_values, builder); +} + // ---------------------------------------------------------------------- // Arrow builder decode tests for DictByteArrayDecoder class FromDictEncoding : public TestDecodeArrow { @@ -498,7 +534,7 @@ TEST_P(FromDictEncoding, DecodeNonNullDense) { CheckDecodeDense(actual_num_values, builder); } -TEST_P(FromDictEncoding, DecodeDict) { +TEST_P(FromDictEncoding, DecodeBinaryDict) { ::arrow::BinaryDictionaryBuilder builder(default_memory_pool()); auto byte_array_decoder = dynamic_cast(dict_decoder_.get()); ASSERT_NE(byte_array_decoder, nullptr); @@ -507,7 +543,16 @@ TEST_P(FromDictEncoding, DecodeDict) { CheckDecodeDict(actual_num_values, builder); } -TEST_P(FromDictEncoding, DecodeNonNullDict) { +TEST_P(FromDictEncoding, DecodeStringDict) { + ::arrow::StringDictionaryBuilder builder(default_memory_pool()); + auto byte_array_decoder = dynamic_cast(dict_decoder_.get()); + ASSERT_NE(byte_array_decoder, nullptr); + auto actual_num_values = + byte_array_decoder->DecodeArrow(num_values_, null_count_, valid_bits_, 0, &builder); + CheckDecodeDict(actual_num_values, builder); +} + +TEST_P(FromDictEncoding, DecodeNonNullBinaryDict) { // Skip this test if input data contains nulls (optionals) SKIP_TEST_IF(null_count_ > 0) ::arrow::BinaryDictionaryBuilder builder(default_memory_pool()); @@ -516,6 +561,15 @@ TEST_P(FromDictEncoding, DecodeNonNullDict) { auto actual_num_values = byte_array_decoder->DecodeArrowNonNull(num_values_, &builder); CheckDecodeDict(actual_num_values, builder); } +TEST_P(FromDictEncoding, DecodeNonNullStringDict) { + // Skip this test if input data contains nulls (optionals) + SKIP_TEST_IF(null_count_ > 0) + ::arrow::StringDictionaryBuilder builder(default_memory_pool()); + auto byte_array_decoder = dynamic_cast(dict_decoder_.get()); + ASSERT_NE(byte_array_decoder, nullptr); + auto actual_num_values = byte_array_decoder->DecodeArrowNonNull(num_values_, &builder); + CheckDecodeDict(actual_num_values, builder); +} const char* json[] = {"[\"foo\", \"bar\", \"foo\", \"foo\"]", "[\"foo\", \"bar\", \"foo\", null]"}; diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index edcd5ecc0b6..c09a32b6c7f 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -715,6 +715,15 @@ class PlainByteArrayDecoder : public PlainDecoder, return result; } + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + ::arrow::StringDictionaryBuilder* out) override { + int result = 0; + PARQUET_THROW_NOT_OK( + DecodeArrow(num_values, null_count, valid_bits, valid_bits_offset, out, &result)); + return result; + } + int DecodeArrowNonNull(int num_values, ::arrow::internal::ChunkedBinaryBuilder* out) override { int result = 0; @@ -728,6 +737,12 @@ class PlainByteArrayDecoder : public PlainDecoder, return result; } + int DecodeArrowNonNull(int num_values, ::arrow::StringDictionaryBuilder* out) override { + int result = 0; + PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, out, &result)); + return result; + } + private: template ::arrow::Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, @@ -941,6 +956,15 @@ class DictByteArrayDecoder : public DictDecoderImpl, return result; } + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + ::arrow::StringDictionaryBuilder* out) override { + int result = 0; + PARQUET_THROW_NOT_OK( + DecodeArrow(num_values, null_count, valid_bits, valid_bits_offset, out, &result)); + return result; + } + int DecodeArrowNonNull(int num_values, ::arrow::internal::ChunkedBinaryBuilder* out) override { int result = 0; @@ -954,6 +978,12 @@ class DictByteArrayDecoder : public DictDecoderImpl, return result; } + int DecodeArrowNonNull(int num_values, ::arrow::StringDictionaryBuilder* out) override { + int result = 0; + PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, out, &result)); + return result; + } + private: template ::arrow::Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index 3f6cf645f52..c869a8c6cfc 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -35,6 +35,7 @@ namespace arrow { class BinaryDictionaryBuilder; +class StringDictionaryBuilder; namespace internal { @@ -215,11 +216,18 @@ class ByteArrayDecoder : virtual public TypedDecoder { int64_t valid_bits_offset, ::arrow::BinaryDictionaryBuilder* builder) = 0; + virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + ::arrow::StringDictionaryBuilder* builder) = 0; + virtual int DecodeArrowNonNull(int num_values, ::arrow::internal::ChunkedBinaryBuilder* builder) = 0; virtual int DecodeArrowNonNull(int num_values, ::arrow::BinaryDictionaryBuilder* builder) = 0; + + virtual int DecodeArrowNonNull(int num_values, + ::arrow::StringDictionaryBuilder* builder) = 0; }; class FLBADecoder : virtual public TypedDecoder { From 28d76b7b22c4ed4d2c4cb2c9701182e2b937b9b8 Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Thu, 28 Feb 2019 08:17:58 +0000 Subject: [PATCH 15/35] Add benchmark for dictionary decoding using arrow builder --- cpp/src/parquet/encoding-benchmark.cc | 159 +++++++++++++++++++++++--- 1 file changed, 143 insertions(+), 16 deletions(-) diff --git a/cpp/src/parquet/encoding-benchmark.cc b/cpp/src/parquet/encoding-benchmark.cc index 6434b1b9ffa..88f795678e3 100644 --- a/cpp/src/parquet/encoding-benchmark.cc +++ b/cpp/src/parquet/encoding-benchmark.cc @@ -210,22 +210,26 @@ std::shared_ptr<::arrow::Array> MakeRandomStringsWithRepeats(size_t num_unique, return result; } -class BM_PlainDecodingByteArray : public ::benchmark::Fixture { +// ---------------------------------------------------------------------- +// Shared benchmarks for decoding using arrow builders +class BenchmarkDecodeArrow : public ::benchmark::Fixture { public: void SetUp(const ::benchmark::State& state) override { num_values_ = static_cast(state.range()); - constexpr int repeat_factor = 8; - input_array_ = MakeRandomStringsWithRepeats(num_values_ / repeat_factor, num_values_); - PlainEncodeInputArray(); + InitDataInputs(); + DoEncodeData(); } void TearDown(const ::benchmark::State& state) override {} - void PlainEncodeInputArray() { - const auto& binary_array = static_cast(*input_array_); + void InitDataInputs() { + constexpr int repeat_factor = 8; + input_array_ = MakeRandomStringsWithRepeats(num_values_ / repeat_factor, num_values_); + valid_bits_ = input_array_->null_bitmap()->data(); values_ = std::vector(); values_.reserve(num_values_); total_size_ = 0; + const auto& binary_array = static_cast(*input_array_); for (int64_t i = 0; i < binary_array.length(); i++) { auto view = binary_array.GetView(i); @@ -233,24 +237,30 @@ class BM_PlainDecodingByteArray : public ::benchmark::Fixture { reinterpret_cast(view.data())); total_size_ += view.length(); } - - valid_bits_ = - std::vector(::arrow::BitUtil::BytesForBits(num_values_) + 1, 255); - - auto encoder = MakeTypedEncoder(Encoding::PLAIN); - encoder->Put(values_.data(), num_values_); - buffer_ = encoder->FlushValues(); } + virtual void DoEncodeData() = 0; + protected: int num_values_; std::shared_ptr<::arrow::Array> input_array_; uint64_t total_size_; std::vector values_; - std::vector valid_bits_; + const uint8_t* valid_bits_; std::shared_ptr buffer_; }; +// ---------------------------------------------------------------------- +// Benchmark Decoding from Plain Encoding +class BM_PlainDecodingByteArray : public BenchmarkDecodeArrow { + public: + void DoEncodeData() override { + auto encoder = MakeTypedEncoder(Encoding::PLAIN); + encoder->Put(values_.data(), num_values_); + buffer_ = encoder->FlushValues(); + } +}; + BENCHMARK_DEFINE_F(BM_PlainDecodingByteArray, DecodeArrow_Dense) (benchmark::State& state) { for (auto _ : state) { @@ -258,7 +268,7 @@ BENCHMARK_DEFINE_F(BM_PlainDecodingByteArray, DecodeArrow_Dense) decoder->SetData(num_values_, buffer_->data(), static_cast(buffer_->size())); ::arrow::internal::ChunkedBinaryBuilder builder(static_cast(buffer_->size()), ::arrow::default_memory_pool()); - decoder->DecodeArrow(num_values_, 0, valid_bits_.data(), 0, &builder); + decoder->DecodeArrow(num_values_, 0, valid_bits_, 0, &builder); } state.SetBytesProcessed(state.iterations() * total_size_); @@ -289,7 +299,7 @@ BENCHMARK_DEFINE_F(BM_PlainDecodingByteArray, DecodeArrow_Dict) auto decoder = MakeTypedDecoder(Encoding::PLAIN); decoder->SetData(num_values_, buffer_->data(), static_cast(buffer_->size())); ::arrow::BinaryDictionaryBuilder builder(::arrow::default_memory_pool()); - decoder->DecodeArrow(num_values_, 0, valid_bits_.data(), 0, &builder); + decoder->DecodeArrow(num_values_, 0, valid_bits_, 0, &builder); } state.SetBytesProcessed(state.iterations() * total_size_); @@ -313,4 +323,121 @@ BENCHMARK_DEFINE_F(BM_PlainDecodingByteArray, DecodeArrowNonNull_Dict) BENCHMARK_REGISTER_F(BM_PlainDecodingByteArray, DecodeArrowNonNull_Dict) ->Range(MIN_RANGE, MAX_RANGE); +// ---------------------------------------------------------------------- +// Benchmark Decoding from Dictionary Encoding +class BM_DictDecodingByteArray : public BenchmarkDecodeArrow { + public: + void DoEncodeData() override { + auto node = schema::ByteArray("name"); + descr_ = std::unique_ptr(new ColumnDescriptor(node, 0, 0)); + auto encoder = MakeTypedEncoder(Encoding::PLAIN, + /*use_dictionary=*/true, descr_.get()); + ASSERT_NO_THROW(encoder->Put(values_.data(), num_values_)); + buffer_ = encoder->FlushValues(); + + auto dict_encoder = dynamic_cast*>(encoder.get()); + ASSERT_NE(dict_encoder, nullptr); + dict_buffer_ = + AllocateBuffer(default_memory_pool(), dict_encoder->dict_encoded_size()); + dict_encoder->WriteDict(dict_buffer_->mutable_data()); + num_dict_entries_ = dict_encoder->num_entries(); + } + + protected: + std::unique_ptr descr_; + std::unique_ptr> dict_decoder_; + std::shared_ptr dict_buffer_; + int num_dict_entries_; +}; + +BENCHMARK_DEFINE_F(BM_DictDecodingByteArray, DecodeArrow_Dense) +(benchmark::State& state) { + for (auto _ : state) { + auto decoder = MakeTypedDecoder(Encoding::PLAIN, descr_.get()); + decoder->SetData(num_dict_entries_, dict_buffer_->data(), + static_cast(dict_buffer_->size())); + auto dict_decoder = MakeDictDecoder(descr_.get()); + dict_decoder->SetDict(decoder.get()); + dict_decoder->SetData(num_values_, buffer_->data(), + static_cast(buffer_->size())); + auto byte_array_decoder = dynamic_cast(dict_decoder.get()); + + ::arrow::internal::ChunkedBinaryBuilder builder( + static_cast(dict_buffer_->size()), ::arrow::default_memory_pool()); + byte_array_decoder->DecodeArrow(num_values_, 0, valid_bits_, 0, &builder); + } + + state.SetBytesProcessed(state.iterations() * total_size_); +} + +BENCHMARK_REGISTER_F(BM_DictDecodingByteArray, DecodeArrow_Dense) + ->Range(MIN_RANGE, MAX_RANGE); + +BENCHMARK_DEFINE_F(BM_DictDecodingByteArray, DecodeArrowNonNull_Dense) +(benchmark::State& state) { + for (auto _ : state) { + auto decoder = MakeTypedDecoder(Encoding::PLAIN, descr_.get()); + decoder->SetData(num_dict_entries_, dict_buffer_->data(), + static_cast(dict_buffer_->size())); + auto dict_decoder = MakeDictDecoder(descr_.get()); + dict_decoder->SetDict(decoder.get()); + dict_decoder->SetData(num_values_, buffer_->data(), + static_cast(buffer_->size())); + auto byte_array_decoder = dynamic_cast(dict_decoder.get()); + + ::arrow::internal::ChunkedBinaryBuilder builder( + static_cast(dict_buffer_->size()), ::arrow::default_memory_pool()); + byte_array_decoder->DecodeArrowNonNull(num_values_, &builder); + } + + state.SetBytesProcessed(state.iterations() * total_size_); +} + +BENCHMARK_REGISTER_F(BM_DictDecodingByteArray, DecodeArrowNonNull_Dense) + ->Range(MIN_RANGE, MAX_RANGE); + +BENCHMARK_DEFINE_F(BM_DictDecodingByteArray, DecodeArrow_Dict) +(benchmark::State& state) { + for (auto _ : state) { + auto decoder = MakeTypedDecoder(Encoding::PLAIN, descr_.get()); + decoder->SetData(num_dict_entries_, dict_buffer_->data(), + static_cast(dict_buffer_->size())); + auto dict_decoder = MakeDictDecoder(descr_.get()); + dict_decoder->SetDict(decoder.get()); + dict_decoder->SetData(num_values_, buffer_->data(), + static_cast(buffer_->size())); + auto byte_array_decoder = dynamic_cast(dict_decoder.get()); + + ::arrow::BinaryDictionaryBuilder builder(::arrow::default_memory_pool()); + byte_array_decoder->DecodeArrow(num_values_, 0, valid_bits_, 0, &builder); + } + + state.SetBytesProcessed(state.iterations() * total_size_); +} + +BENCHMARK_REGISTER_F(BM_DictDecodingByteArray, DecodeArrow_Dict) + ->Range(MIN_RANGE, MAX_RANGE); + +BENCHMARK_DEFINE_F(BM_DictDecodingByteArray, DecodeArrowNonNull_Dict) +(benchmark::State& state) { + for (auto _ : state) { + auto decoder = MakeTypedDecoder(Encoding::PLAIN, descr_.get()); + decoder->SetData(num_dict_entries_, dict_buffer_->data(), + static_cast(dict_buffer_->size())); + auto dict_decoder = MakeDictDecoder(descr_.get()); + dict_decoder->SetDict(decoder.get()); + dict_decoder->SetData(num_values_, buffer_->data(), + static_cast(buffer_->size())); + auto byte_array_decoder = dynamic_cast(dict_decoder.get()); + + ::arrow::BinaryDictionaryBuilder builder(::arrow::default_memory_pool()); + byte_array_decoder->DecodeArrowNonNull(num_values_, &builder); + } + + state.SetBytesProcessed(state.iterations() * total_size_); +} + +BENCHMARK_REGISTER_F(BM_DictDecodingByteArray, DecodeArrowNonNull_Dict) + ->Range(MIN_RANGE, MAX_RANGE); + } // namespace parquet From a8c15354eaa03c6875d6803617ea61ebc6bcd78e Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Tue, 5 Mar 2019 20:07:12 +0000 Subject: [PATCH 16/35] Add support for requesting a parquet column be read as a DictionaryArray --- cpp/src/parquet/arrow/reader.cc | 49 +++-- cpp/src/parquet/arrow/reader.h | 50 ++++- cpp/src/parquet/arrow/record_reader.cc | 260 ++++++++++++++----------- cpp/src/parquet/arrow/record_reader.h | 7 +- 4 files changed, 233 insertions(+), 133 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 61f5bb28b79..7535de55fb6 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -236,8 +236,9 @@ using FileColumnIteratorFactory = class FileReader::Impl { public: - Impl(MemoryPool* pool, std::unique_ptr reader) - : pool_(pool), reader_(std::move(reader)), use_threads_(false) {} + Impl(MemoryPool* pool, std::unique_ptr reader, + const ArrowReaderProperties& properties) + : pool_(pool), reader_(std::move(reader)), reader_properties_(properties) {} virtual ~Impl() {} @@ -279,14 +280,16 @@ class FileReader::Impl { int num_columns() const { return reader_->metadata()->num_columns(); } - void set_use_threads(bool use_threads) { use_threads_ = use_threads; } + void set_use_threads(bool use_threads) { + reader_properties_.set_use_threads(use_threads); + } ParquetFileReader* reader() { return reader_.get(); } private: MemoryPool* pool_; std::unique_ptr reader_; - bool use_threads_; + ArrowReaderProperties reader_properties_; }; class ColumnReader::ColumnReaderImpl { @@ -302,9 +305,10 @@ class ColumnReader::ColumnReaderImpl { // Reader implementation for primitive arrays class PARQUET_NO_EXPORT PrimitiveImpl : public ColumnReader::ColumnReaderImpl { public: - PrimitiveImpl(MemoryPool* pool, std::unique_ptr input) + PrimitiveImpl(MemoryPool* pool, std::unique_ptr input, + const bool read_dictionary) : pool_(pool), input_(std::move(input)), descr_(input_->descr()) { - record_reader_ = RecordReader::Make(descr_, pool_); + record_reader_ = RecordReader::Make(descr_, pool_, read_dictionary); Status s = NodeToField(*input_->descr()->schema_node(), &field_); DCHECK_OK(s); NextRowGroup(); @@ -358,17 +362,19 @@ class PARQUET_NO_EXPORT StructImpl : public ColumnReader::ColumnReaderImpl { const std::vector>& children); }; -FileReader::FileReader(MemoryPool* pool, std::unique_ptr reader) - : impl_(new FileReader::Impl(pool, std::move(reader))) {} +FileReader::FileReader(MemoryPool* pool, std::unique_ptr reader, + const ArrowReaderProperties& properties) + : impl_(new FileReader::Impl(pool, std::move(reader), properties)) {} FileReader::~FileReader() {} Status FileReader::Impl::GetColumn(int i, FileColumnIteratorFactory iterator_factory, std::unique_ptr* out) { std::unique_ptr input(iterator_factory(i, reader_.get())); + bool read_dict = reader_properties_.read_dictionary(i); std::unique_ptr impl( - new PrimitiveImpl(pool_, std::move(input))); + new PrimitiveImpl(pool_, std::move(input), read_dict)); *out = std::unique_ptr(new ColumnReader(std::move(impl))); return Status::OK(); } @@ -552,7 +558,7 @@ Status FileReader::Impl::ReadRowGroup(int row_group_index, return Status::OK(); }; - if (use_threads_) { + if (reader_properties_.use_threads()) { std::vector> futures; auto pool = ::arrow::internal::GetCpuThreadPool(); for (int i = 0; i < num_fields; i++) { @@ -599,7 +605,7 @@ Status FileReader::Impl::ReadTable(const std::vector& indices, return Status::OK(); }; - if (use_threads_) { + if (reader_properties_.use_threads()) { std::vector> futures; auto pool = ::arrow::internal::GetCpuThreadPool(); for (int i = 0; i < num_fields; i++) { @@ -683,6 +689,18 @@ Status OpenFile(const std::shared_ptr<::arrow::io::RandomAccessFile>& file, reader); } +Status OpenFile(const std::shared_ptr<::arrow::io::ReadableFileInterface>& file, + ::arrow::MemoryPool* allocator, const ArrowReaderProperties& properties, + std::unique_ptr* reader) { + std::unique_ptr io_wrapper(new ArrowInputFile(file)); + std::unique_ptr pq_reader; + PARQUET_CATCH_NOT_OK( + pq_reader = ParquetReader::Open(std::move(io_wrapper), + ::parquet::default_reader_properties(), nullptr)); + reader->reset(new FileReader(allocator, std::move(pq_reader), properties)); + return Status::OK(); +} + Status FileReader::GetColumn(int i, std::unique_ptr* out) { FileColumnIteratorFactory iterator_factory = [](int i, ParquetFileReader* reader) { return new AllRowGroupsIterator(i, reader); @@ -1138,15 +1156,6 @@ struct TransferFunctor< Status operator()(RecordReader* reader, MemoryPool* pool, const std::shared_ptr<::arrow::DataType>& type, Datum* out) { std::vector> chunks = reader->GetBuilderChunks(); - - if (type->id() == ::arrow::Type::STRING) { - // Convert from BINARY type to STRING - for (size_t i = 0; i < chunks.size(); ++i) { - auto new_data = chunks[i]->data()->Copy(); - new_data->type = type; - chunks[i] = ::arrow::MakeArray(new_data); - } - } *out = std::make_shared(chunks); return Status::OK(); } diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index 7ef21fddf58..0b0078f8427 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -20,6 +20,7 @@ #include #include +#include #include #include "parquet/util/visibility.h" @@ -51,6 +52,46 @@ class ColumnChunkReader; class ColumnReader; class RowGroupReader; +static constexpr bool DEFAULT_USE_THREADS = false; +static constexpr bool DEFAULT_READ_DICTIONARY = false; + +class PARQUET_EXPORT ArrowReaderProperties { + public: + explicit ArrowReaderProperties(const bool use_threads = DEFAULT_USE_THREADS, + bool read_dict = DEFAULT_READ_DICTIONARY) + : use_threads_(use_threads), default_read_dict_(read_dict), read_dict_indices_() {} + + void set_use_threads(const bool use_threads) { use_threads_ = use_threads; } + + bool use_threads() const { return use_threads_; } + + void set_read_dictionary(const int column_index, const bool read_dict) { + if (read_dict) { + read_dict_indices_.emplace(column_index); + } else { + read_dict_indices_.erase(column_index); + } + } + bool read_dictionary(const int column_index) const { + if (read_dict_indices_.find(column_index) != read_dict_indices_.end()) { + return true; + } else { + return default_read_dict_; + } + } + + private: + bool use_threads_; + bool default_read_dict_; + std::unordered_set read_dict_indices_; +}; + +PARQUET_EXPORT +ArrowReaderProperties default_arrow_reader_properties() { + static ArrowReaderProperties default_reader_props; + return default_reader_props; +} + // Arrow read adapter class for deserializing Parquet files as Arrow row // batches. // @@ -109,7 +150,8 @@ class RowGroupReader; // arrays class PARQUET_EXPORT FileReader { public: - FileReader(::arrow::MemoryPool* pool, std::unique_ptr reader); + FileReader(::arrow::MemoryPool* pool, std::unique_ptr reader, + const ArrowReaderProperties& properties = default_arrow_reader_properties()); // Since the distribution of columns amongst a Parquet file's row groups may // be uneven (the number of values in each column chunk can be different), we @@ -306,6 +348,12 @@ ::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::RandomAccessFile>& f ::arrow::MemoryPool* allocator, std::unique_ptr* reader); +PARQUET_EXPORT +::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::ReadableFileInterface>& file, + ::arrow::MemoryPool* allocator, + const ArrowReaderProperties& properties, + std::unique_ptr* reader); + } // namespace arrow } // namespace parquet diff --git a/cpp/src/parquet/arrow/record_reader.cc b/cpp/src/parquet/arrow/record_reader.cc index c800f36d2ea..67bff957eff 100644 --- a/cpp/src/parquet/arrow/record_reader.cc +++ b/cpp/src/parquet/arrow/record_reader.cc @@ -448,36 +448,17 @@ class RecordReader::RecordReaderImpl { std::shared_ptr<::arrow::ResizableBuffer> rep_levels_; }; -template -struct RecordReaderTraits { - using BuilderType = ::arrow::ArrayBuilder; -}; - -template <> -struct RecordReaderTraits { - using BuilderType = ::arrow::internal::ChunkedBinaryBuilder; -}; - -template <> -struct RecordReaderTraits { - using BuilderType = ::arrow::FixedSizeBinaryBuilder; -}; - template class TypedRecordReader : public RecordReader::RecordReaderImpl { public: using T = typename DType::c_type; - using BuilderType = typename RecordReaderTraits::BuilderType; - TypedRecordReader(const ColumnDescriptor* descr, ::arrow::MemoryPool* pool) - : RecordReader::RecordReaderImpl(descr, pool), current_decoder_(nullptr) { - InitializeBuilder(); - } + : RecordReader::RecordReaderImpl(descr, pool), current_decoder_(nullptr) {} void ResetDecoders() override { decoders_.clear(); } - inline void ReadValuesSpaced(int64_t values_with_nulls, int64_t null_count) { + virtual inline void ReadValuesSpaced(int64_t values_with_nulls, int64_t null_count) { uint8_t* valid_bits = valid_bits_->mutable_data(); const int64_t valid_bits_offset = values_written_; @@ -487,7 +468,7 @@ class TypedRecordReader : public RecordReader::RecordReaderImpl { DCHECK_EQ(num_decoded, values_with_nulls); } - inline void ReadValuesDense(int64_t values_to_read) { + virtual inline void ReadValuesDense(int64_t values_to_read) { int64_t num_decoded = current_decoder_->Decode(ValuesHead(), static_cast(values_to_read)); DCHECK_EQ(num_decoded, values_to_read); @@ -568,18 +549,17 @@ class TypedRecordReader : public RecordReader::RecordReaderImpl { throw ParquetException("GetChunks only implemented for binary types"); } - private: + protected: using DecoderType = typename EncodingTraits::Decoder; + DecoderType* current_decoder_; + + private: // Map of encoding type to the respective decoder object. For example, a // column chunk's data pages may include both dictionary-encoded and // plain-encoded data. std::unordered_map> decoders_; - std::unique_ptr builder_; - - DecoderType* current_decoder_; - // Initialize repetition and definition level decoders on the next data page. int64_t InitializeLevelDecoders(const DataPage& page, Encoding::type repetition_level_encoding, @@ -590,103 +570,143 @@ class TypedRecordReader : public RecordReader::RecordReaderImpl { // Advance to the next data page bool ReadNewPage() override; - void InitializeBuilder() {} - void ConfigureDictionary(const DictionaryPage* page); }; -// TODO(wesm): Implement these to some satisfaction -template <> -void TypedRecordReader::DebugPrintState() {} +class FLBARecordReader : public TypedRecordReader { + public: + FLBARecordReader(const ColumnDescriptor* descr, ::arrow::MemoryPool* pool) + : TypedRecordReader(descr, pool), builder_(nullptr) { + DCHECK_EQ(descr_->physical_type(), Type::FIXED_LEN_BYTE_ARRAY); + int byte_width = descr_->type_length(); + std::shared_ptr<::arrow::DataType> type = ::arrow::fixed_size_binary(byte_width); + builder_.reset(new ::arrow::FixedSizeBinaryBuilder(type, pool_)); + } -template <> -void TypedRecordReader::DebugPrintState() {} + ::arrow::ArrayVector GetBuilderChunks() override { + std::shared_ptr<::arrow::Array> chunk; + PARQUET_THROW_NOT_OK(builder_->Finish(&chunk)); + return ::arrow::ArrayVector({chunk}); + } -template <> -void TypedRecordReader::DebugPrintState() {} + void ReadValuesDense(int64_t values_to_read) override { + auto values = ValuesHead(); + int64_t num_decoded = + current_decoder_->Decode(values, static_cast(values_to_read)); + DCHECK_EQ(num_decoded, values_to_read); -template <> -void TypedRecordReader::InitializeBuilder() { - // Maximum of 16MB chunks - constexpr int32_t kBinaryChunksize = 1 << 24; - DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY); - builder_.reset(new ::arrow::internal::ChunkedBinaryBuilder(kBinaryChunksize, pool_)); -} + for (int64_t i = 0; i < num_decoded; i++) { + PARQUET_THROW_NOT_OK(builder_->Append(values[i].ptr)); + } + ResetValues(); + } -template <> -void TypedRecordReader::InitializeBuilder() { - DCHECK_EQ(descr_->physical_type(), Type::FIXED_LEN_BYTE_ARRAY); - int byte_width = descr_->type_length(); - std::shared_ptr<::arrow::DataType> type = ::arrow::fixed_size_binary(byte_width); - builder_.reset(new ::arrow::FixedSizeBinaryBuilder(type, pool_)); -} + void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override { + uint8_t* valid_bits = valid_bits_->mutable_data(); + const int64_t valid_bits_offset = values_written_; + auto values = ValuesHead(); -template <> -::arrow::ArrayVector TypedRecordReader::GetBuilderChunks() { - ::arrow::ArrayVector chunks; - PARQUET_THROW_NOT_OK(builder_->Finish(&chunks)); - return chunks; -} + int64_t num_decoded = current_decoder_->DecodeSpaced( + values, static_cast(values_to_read), static_cast(null_count), + valid_bits, valid_bits_offset); + DCHECK_EQ(num_decoded, values_to_read); -template <> -::arrow::ArrayVector TypedRecordReader::GetBuilderChunks() { - std::shared_ptr<::arrow::Array> chunk; - PARQUET_THROW_NOT_OK(builder_->Finish(&chunk)); - return ::arrow::ArrayVector({chunk}); -} + for (int64_t i = 0; i < num_decoded; i++) { + if (::arrow::BitUtil::GetBit(valid_bits, valid_bits_offset + i)) { + PARQUET_THROW_NOT_OK(builder_->Append(values[i].ptr)); + } else { + PARQUET_THROW_NOT_OK(builder_->AppendNull()); + } + } + ResetValues(); + } -template <> -inline void TypedRecordReader::ReadValuesDense(int64_t values_to_read) { - int64_t num_decoded = current_decoder_->DecodeArrowNonNull( - static_cast(values_to_read), builder_.get()); - DCHECK_EQ(num_decoded, values_to_read); - ResetValues(); -} + private: + std::unique_ptr<::arrow::FixedSizeBinaryBuilder> builder_; +}; -template <> -inline void TypedRecordReader::ReadValuesDense(int64_t values_to_read) { - auto values = ValuesHead(); - int64_t num_decoded = - current_decoder_->Decode(values, static_cast(values_to_read)); - DCHECK_EQ(num_decoded, values_to_read); +class ByteArrayChunkedRecordReader : public TypedRecordReader { + public: + ByteArrayChunkedRecordReader(const ColumnDescriptor* descr, ::arrow::MemoryPool* pool) + : TypedRecordReader(descr, pool), builder_(nullptr) { + // Maximum of 16MB chunks + constexpr int32_t kBinaryChunksize = 1 << 24; + DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY); + if (descr_->logical_type() == LogicalType::UTF8) { + builder_.reset( + new ::arrow::internal::ChunkedStringBuilder(kBinaryChunksize, pool_)); + } else { + builder_.reset( + new ::arrow::internal::ChunkedBinaryBuilder(kBinaryChunksize, pool_)); + } + } - for (int64_t i = 0; i < num_decoded; i++) { - PARQUET_THROW_NOT_OK(builder_->Append(values[i].ptr)); + ::arrow::ArrayVector GetBuilderChunks() override { + ::arrow::ArrayVector chunks; + PARQUET_THROW_NOT_OK(builder_->Finish(&chunks)); + return chunks; } - ResetValues(); -} + void ReadValuesDense(int64_t values_to_read) override { + int64_t num_decoded = current_decoder_->DecodeArrowNonNull( + static_cast(values_to_read), builder_.get()); + DCHECK_EQ(num_decoded, values_to_read); + ResetValues(); + } + + void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override { + int64_t num_decoded = current_decoder_->DecodeArrow( + static_cast(values_to_read), static_cast(null_count), + valid_bits_->mutable_data(), values_written_, builder_.get()); + DCHECK_EQ(num_decoded, values_to_read); + ResetValues(); + } + + private: + std::unique_ptr<::arrow::internal::ChunkedBinaryBuilder> builder_; +}; + +template +class ByteArrayDictionaryRecordReader : public TypedRecordReader { + public: + ByteArrayDictionaryRecordReader(const ColumnDescriptor* descr, + ::arrow::MemoryPool* pool) + : TypedRecordReader(descr, pool), builder_(new BuilderType(pool)) {} + + ::arrow::ArrayVector GetBuilderChunks() override { + std::shared_ptr<::arrow::Array> chunk; + PARQUET_THROW_NOT_OK(builder_->Finish(&chunk)); + return ::arrow::ArrayVector({chunk}); + } + + void ReadValuesDense(int64_t values_to_read) override { + int64_t num_decoded = current_decoder_->DecodeArrowNonNull( + static_cast(values_to_read), builder_.get()); + DCHECK_EQ(num_decoded, values_to_read); + ResetValues(); + } + + void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override { + int64_t num_decoded = current_decoder_->DecodeArrow( + static_cast(values_to_read), static_cast(null_count), + valid_bits_->mutable_data(), values_written_, builder_.get()); + DCHECK_EQ(num_decoded, values_to_read); + ResetValues(); + } + + private: + std::unique_ptr builder_; +}; + +// TODO(wesm): Implement these to some satisfaction template <> -inline void TypedRecordReader::ReadValuesSpaced(int64_t values_to_read, - int64_t null_count) { - int64_t num_decoded = current_decoder_->DecodeArrow( - static_cast(values_to_read), static_cast(null_count), - valid_bits_->mutable_data(), values_written_, builder_.get()); - DCHECK_EQ(num_decoded, values_to_read); - ResetValues(); -} +void TypedRecordReader::DebugPrintState() {} template <> -inline void TypedRecordReader::ReadValuesSpaced(int64_t values_to_read, - int64_t null_count) { - uint8_t* valid_bits = valid_bits_->mutable_data(); - const int64_t valid_bits_offset = values_written_; - auto values = ValuesHead(); - - int64_t num_decoded = current_decoder_->DecodeSpaced( - values, static_cast(values_to_read), static_cast(null_count), valid_bits, - valid_bits_offset); - DCHECK_EQ(num_decoded, values_to_read); - - for (int64_t i = 0; i < num_decoded; i++) { - if (::arrow::BitUtil::GetBit(valid_bits, valid_bits_offset + i)) { - PARQUET_THROW_NOT_OK(builder_->Append(values[i].ptr)); - } else { - PARQUET_THROW_NOT_OK(builder_->AppendNull()); - } - } - ResetValues(); -} +void TypedRecordReader::DebugPrintState() {} + +template <> +void TypedRecordReader::DebugPrintState() {} template inline void TypedRecordReader::ConfigureDictionary(const DictionaryPage* page) { @@ -845,8 +865,27 @@ bool TypedRecordReader::ReadNewPage() { return true; } +std::shared_ptr RecordReader::MakeByteArrayRecordReader( + const ColumnDescriptor* descr, arrow::MemoryPool* pool, bool read_dictionary) { + if (read_dictionary) { + if (descr->logical_type() == LogicalType::UTF8) { + using Builder = ::arrow::StringDictionaryBuilder; + return std::shared_ptr( + new RecordReader(new ByteArrayDictionaryRecordReader(descr, pool))); + } else { + using Builder = ::arrow::BinaryDictionaryBuilder; + return std::shared_ptr( + new RecordReader(new ByteArrayDictionaryRecordReader(descr, pool))); + } + } else { + return std::shared_ptr( + new RecordReader(new ByteArrayChunkedRecordReader(descr, pool))); + } +} + std::shared_ptr RecordReader::Make(const ColumnDescriptor* descr, - MemoryPool* pool) { + MemoryPool* pool, + const bool read_dictionary) { switch (descr->physical_type()) { case Type::BOOLEAN: return std::shared_ptr( @@ -867,11 +906,10 @@ std::shared_ptr RecordReader::Make(const ColumnDescriptor* descr, return std::shared_ptr( new RecordReader(new TypedRecordReader(descr, pool))); case Type::BYTE_ARRAY: - return std::shared_ptr( - new RecordReader(new TypedRecordReader(descr, pool))); + return RecordReader::MakeByteArrayRecordReader(descr, pool, read_dictionary); case Type::FIXED_LEN_BYTE_ARRAY: return std::shared_ptr( - new RecordReader(new TypedRecordReader(descr, pool))); + new RecordReader(new FLBARecordReader(descr, pool))); default: { // PARQUET-1481: This can occur if the file is corrupt std::stringstream ss; diff --git a/cpp/src/parquet/arrow/record_reader.h b/cpp/src/parquet/arrow/record_reader.h index cc932c28650..c999dd00ef2 100644 --- a/cpp/src/parquet/arrow/record_reader.h +++ b/cpp/src/parquet/arrow/record_reader.h @@ -51,7 +51,8 @@ class RecordReader { static std::shared_ptr Make( const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), + const bool read_dictionary = false); virtual ~RecordReader(); @@ -111,6 +112,10 @@ class RecordReader { private: std::unique_ptr impl_; explicit RecordReader(RecordReaderImpl* impl); + + static std::shared_ptr MakeByteArrayRecordReader( + const ColumnDescriptor* descr, ::arrow::MemoryPool* pool, + const bool read_dictionary); }; } // namespace internal From a6740f31edf276606962650ba27e0aedad41d998 Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Tue, 5 Mar 2019 22:35:09 +0000 Subject: [PATCH 17/35] Make sure to update the schema when reading a column as a DictionaryArray --- cpp/src/parquet/arrow/reader.cc | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 7535de55fb6..9bd32c8f9ec 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -554,6 +554,14 @@ Status FileReader::Impl::ReadRowGroup(int row_group_index, this](int i) { std::shared_ptr array; RETURN_NOT_OK(ReadColumnChunk(field_indices[i], indices, row_group_index, &array)); + + // Update the schema when reading dictionary array + if (reader_properties_.read_dictionary(i)) { + auto dict_field = + std::make_shared<::arrow::Field>(schema->field(i)->name(), array->type()); + RETURN_NOT_OK(schema->SetField(i, dict_field, &schema)); + } + columns[i] = std::make_shared(schema->field(i), array); return Status::OK(); }; @@ -601,6 +609,14 @@ Status FileReader::Impl::ReadTable(const std::vector& indices, auto ReadColumnFunc = [&indices, &field_indices, &schema, &columns, this](int i) { std::shared_ptr array; RETURN_NOT_OK(ReadSchemaField(field_indices[i], indices, &array)); + + // Update the schema when reading dictionary array + if (reader_properties_.read_dictionary(i)) { + auto dict_field = + std::make_shared<::arrow::Field>(schema->field(i)->name(), array->type()); + RETURN_NOT_OK(schema->SetField(i, dict_field, &schema)); + } + columns[i] = std::make_shared(schema->field(i), array); return Status::OK(); }; From a35754456b485dc0a8334373efce8c9ebfe6b77d Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Tue, 5 Mar 2019 23:02:43 +0000 Subject: [PATCH 18/35] Basic unittests for reading DictionaryArray directly from parquet --- .../parquet/arrow/arrow-reader-writer-test.cc | 75 +++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc index 3995d2c2407..b80d4481705 100644 --- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc @@ -2439,6 +2439,81 @@ TEST(TestArrowWriterAdHoc, SchemaMismatch) { ASSERT_RAISES(Invalid, writer->WriteTable(*tbl, 1)); } +// ---------------------------------------------------------------------- +// Tests for directly reading DictionaryArray +class TestArrowReadDictionary : public ::testing::TestWithParam { + public: + void SetUp() override { + InitFromJSON(GetParam()); + ASSERT_NO_FATAL_FAILURE( + WriteTableToBuffer(expected_dense_, expected_dense_->num_rows() / 2, + default_arrow_writer_properties(), &buffer_)); + + properties_ = default_arrow_reader_properties(); + } + + void InitFromJSON(const char* jsonStr) { + auto dense_array = ::arrow::ArrayFromJSON(::arrow::utf8(), jsonStr); + expected_dense_ = MakeSimpleTable(dense_array, /*nullable=*/true); + + ::arrow::StringDictionaryBuilder builder(default_memory_pool()); + const auto& string_array = static_cast(*dense_array); + + for (int64_t i = 0; i < string_array.length(); ++i) { + if (string_array.IsNull(i)) { + ASSERT_OK(builder.AppendNull()); + } else { + ASSERT_OK(builder.Append(string_array.GetView(i))); + } + } + + std::shared_ptr<::arrow::Array> dict_array; + ASSERT_OK(builder.Finish(&dict_array)); + expected_dict_ = MakeSimpleTable(dict_array, /*nullable=*/true); + + // TODO(hatemhelal): Figure out if we can use the following to init the expected_dict_ + // Currently fails due to DataType mismatch for indices array. + // Datum out; + // FunctionContext ctx(default_memory_pool()); + // ASSERT_OK(DictionaryEncode(&ctx, Datum(dense_array), &out)); + // expected_dict_ = MakeSimpleTable(out.make_array(), /*nullable=*/true); + } + + void TearDown() override {} + + void CheckReadWholeFile(const Table& expected) { + std::unique_ptr reader; + ASSERT_OK_NO_THROW(OpenFile(std::make_shared(buffer_), + ::arrow::default_memory_pool(), properties_, &reader)); + + std::shared_ptr actual; + ASSERT_OK_NO_THROW(reader->ReadTable(&actual)); + ::arrow::AssertTablesEqual(*actual, expected, false); + } + + protected: + std::shared_ptr
expected_dense_; + std::shared_ptr
expected_dict_; + std::shared_ptr buffer_; + ArrowReaderProperties properties_; +}; + +TEST_P(TestArrowReadDictionary, ReadWholeFileDict) { + properties_.set_read_dictionary(0, true); + CheckReadWholeFile(*expected_dict_); +} + +TEST_P(TestArrowReadDictionary, ReadWholeFileDense) { + properties_.set_read_dictionary(0, false); + CheckReadWholeFile(*expected_dense_); +} + +const char* jsonInputs[] = {"[\"foo\", \"bar\", \"foo\", \"foo\"]", + "[\"foo\", \"bar\", \"foo\", \"null\"]"}; + +INSTANTIATE_TEST_CASE_P(ReadDictionary, TestArrowReadDictionary, + ::testing::ValuesIn(jsonInputs)); + } // namespace arrow } // namespace parquet From 077a8f1ae286190c202897e9129da78c89d9d1b4 Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Wed, 6 Mar 2019 11:14:02 +0000 Subject: [PATCH 19/35] Move function definition to (hopefully) resolve appveyor build failure due to C2491 --- cpp/src/parquet/arrow/reader.cc | 5 +++++ cpp/src/parquet/arrow/reader.h | 5 +---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 9bd32c8f9ec..f490e029646 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -101,6 +101,11 @@ Status GetSingleChunk(const ChunkedArray& chunked, std::shared_ptr* out) } // namespace +ArrowReaderProperties default_arrow_reader_properties() { + static ArrowReaderProperties default_reader_props; + return default_reader_props; +} + // ---------------------------------------------------------------------- // Iteration utilities diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index 0b0078f8427..c3031352ee8 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -87,10 +87,7 @@ class PARQUET_EXPORT ArrowReaderProperties { }; PARQUET_EXPORT -ArrowReaderProperties default_arrow_reader_properties() { - static ArrowReaderProperties default_reader_props; - return default_reader_props; -} +ArrowReaderProperties default_arrow_reader_properties(); // Arrow read adapter class for deserializing Parquet files as Arrow row // batches. From 7aac84c4576b08709fd185563f0103206a6caa79 Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Sat, 9 Mar 2019 19:38:18 +0000 Subject: [PATCH 20/35] Reworked encoding benchmark to reduce code duplication --- cpp/src/parquet/encoding-benchmark.cc | 182 ++++++++++---------------- 1 file changed, 66 insertions(+), 116 deletions(-) diff --git a/cpp/src/parquet/encoding-benchmark.cc b/cpp/src/parquet/encoding-benchmark.cc index 88f795678e3..c7ec1f14025 100644 --- a/cpp/src/parquet/encoding-benchmark.cc +++ b/cpp/src/parquet/encoding-benchmark.cc @@ -241,6 +241,33 @@ class BenchmarkDecodeArrow : public ::benchmark::Fixture { virtual void DoEncodeData() = 0; + virtual std::unique_ptr InitializeDecoder() = 0; + + template + std::unique_ptr CreateBuilder(); + + template + void DecodeArrowBenchmark(benchmark::State& state) { + for (auto _ : state) { + auto decoder = InitializeDecoder(); + auto builder = CreateBuilder(); + decoder->DecodeArrow(num_values_, 0, valid_bits_, 0, builder.get()); + } + + state.SetBytesProcessed(state.iterations() * total_size_); + } + + template + void DecodeArrowNonNullBenchmark(benchmark::State& state) { + for (auto _ : state) { + auto decoder = InitializeDecoder(); + auto builder = CreateBuilder(); + decoder->DecodeArrowNonNull(num_values_, builder.get()); + } + + state.SetBytesProcessed(state.iterations() * total_size_); + } + protected: int num_values_; std::shared_ptr<::arrow::Array> input_array_; @@ -250,6 +277,22 @@ class BenchmarkDecodeArrow : public ::benchmark::Fixture { std::shared_ptr buffer_; }; +using ::arrow::BinaryDictionaryBuilder; +using ::arrow::internal::ChunkedBinaryBuilder; + +template <> +std::unique_ptr BenchmarkDecodeArrow::CreateBuilder() { + int chunk_size = static_cast(buffer_->size()); + return std::unique_ptr( + new ChunkedBinaryBuilder(chunk_size, default_memory_pool())); +} + +template <> +std::unique_ptr BenchmarkDecodeArrow::CreateBuilder() { + return std::unique_ptr( + new BinaryDictionaryBuilder(default_memory_pool())); +} + // ---------------------------------------------------------------------- // Benchmark Decoding from Plain Encoding class BM_PlainDecodingByteArray : public BenchmarkDecodeArrow { @@ -259,67 +302,31 @@ class BM_PlainDecodingByteArray : public BenchmarkDecodeArrow { encoder->Put(values_.data(), num_values_); buffer_ = encoder->FlushValues(); } -}; -BENCHMARK_DEFINE_F(BM_PlainDecodingByteArray, DecodeArrow_Dense) -(benchmark::State& state) { - for (auto _ : state) { + std::unique_ptr InitializeDecoder() override { auto decoder = MakeTypedDecoder(Encoding::PLAIN); decoder->SetData(num_values_, buffer_->data(), static_cast(buffer_->size())); - ::arrow::internal::ChunkedBinaryBuilder builder(static_cast(buffer_->size()), - ::arrow::default_memory_pool()); - decoder->DecodeArrow(num_values_, 0, valid_bits_, 0, &builder); + return decoder; } +}; - state.SetBytesProcessed(state.iterations() * total_size_); -} - +BENCHMARK_DEFINE_F(BM_PlainDecodingByteArray, DecodeArrow_Dense) +(benchmark::State& state) { DecodeArrowBenchmark(state); } BENCHMARK_REGISTER_F(BM_PlainDecodingByteArray, DecodeArrow_Dense) ->Range(MIN_RANGE, MAX_RANGE); BENCHMARK_DEFINE_F(BM_PlainDecodingByteArray, DecodeArrowNonNull_Dense) -(benchmark::State& state) { - for (auto _ : state) { - auto decoder = MakeTypedDecoder(Encoding::PLAIN); - decoder->SetData(num_values_, buffer_->data(), static_cast(buffer_->size())); - ::arrow::internal::ChunkedBinaryBuilder builder(static_cast(buffer_->size()), - ::arrow::default_memory_pool()); - decoder->DecodeArrowNonNull(num_values_, &builder); - } - - state.SetBytesProcessed(state.iterations() * total_size_); -} - +(benchmark::State& state) { DecodeArrowNonNullBenchmark(state); } BENCHMARK_REGISTER_F(BM_PlainDecodingByteArray, DecodeArrowNonNull_Dense) ->Range(MIN_RANGE, MAX_RANGE); BENCHMARK_DEFINE_F(BM_PlainDecodingByteArray, DecodeArrow_Dict) -(benchmark::State& state) { - for (auto _ : state) { - auto decoder = MakeTypedDecoder(Encoding::PLAIN); - decoder->SetData(num_values_, buffer_->data(), static_cast(buffer_->size())); - ::arrow::BinaryDictionaryBuilder builder(::arrow::default_memory_pool()); - decoder->DecodeArrow(num_values_, 0, valid_bits_, 0, &builder); - } - - state.SetBytesProcessed(state.iterations() * total_size_); -} - +(benchmark::State& state) { DecodeArrowBenchmark(state); } BENCHMARK_REGISTER_F(BM_PlainDecodingByteArray, DecodeArrow_Dict) ->Range(MIN_RANGE, MAX_RANGE); BENCHMARK_DEFINE_F(BM_PlainDecodingByteArray, DecodeArrowNonNull_Dict) -(benchmark::State& state) { - for (auto _ : state) { - auto decoder = MakeTypedDecoder(Encoding::PLAIN); - decoder->SetData(num_values_, buffer_->data(), static_cast(buffer_->size())); - ::arrow::BinaryDictionaryBuilder builder(::arrow::default_memory_pool()); - decoder->DecodeArrowNonNull(num_values_, &builder); - } - - state.SetBytesProcessed(state.iterations() * total_size_); -} - +(benchmark::State& state) { DecodeArrowNonNullBenchmark(state); } BENCHMARK_REGISTER_F(BM_PlainDecodingByteArray, DecodeArrowNonNull_Dict) ->Range(MIN_RANGE, MAX_RANGE); @@ -343,16 +350,7 @@ class BM_DictDecodingByteArray : public BenchmarkDecodeArrow { num_dict_entries_ = dict_encoder->num_entries(); } - protected: - std::unique_ptr descr_; - std::unique_ptr> dict_decoder_; - std::shared_ptr dict_buffer_; - int num_dict_entries_; -}; - -BENCHMARK_DEFINE_F(BM_DictDecodingByteArray, DecodeArrow_Dense) -(benchmark::State& state) { - for (auto _ : state) { + std::unique_ptr InitializeDecoder() override { auto decoder = MakeTypedDecoder(Encoding::PLAIN, descr_.get()); decoder->SetData(num_dict_entries_, dict_buffer_->data(), static_cast(dict_buffer_->size())); @@ -360,83 +358,35 @@ BENCHMARK_DEFINE_F(BM_DictDecodingByteArray, DecodeArrow_Dense) dict_decoder->SetDict(decoder.get()); dict_decoder->SetData(num_values_, buffer_->data(), static_cast(buffer_->size())); - auto byte_array_decoder = dynamic_cast(dict_decoder.get()); - - ::arrow::internal::ChunkedBinaryBuilder builder( - static_cast(dict_buffer_->size()), ::arrow::default_memory_pool()); - byte_array_decoder->DecodeArrow(num_values_, 0, valid_bits_, 0, &builder); + return std::unique_ptr( + dynamic_cast(dict_decoder.release())); } - state.SetBytesProcessed(state.iterations() * total_size_); -} + protected: + std::unique_ptr descr_; + std::unique_ptr> dict_decoder_; + std::shared_ptr dict_buffer_; + int num_dict_entries_; +}; +BENCHMARK_DEFINE_F(BM_DictDecodingByteArray, DecodeArrow_Dense)(benchmark::State& state) { + DecodeArrowBenchmark(state); +} BENCHMARK_REGISTER_F(BM_DictDecodingByteArray, DecodeArrow_Dense) ->Range(MIN_RANGE, MAX_RANGE); BENCHMARK_DEFINE_F(BM_DictDecodingByteArray, DecodeArrowNonNull_Dense) -(benchmark::State& state) { - for (auto _ : state) { - auto decoder = MakeTypedDecoder(Encoding::PLAIN, descr_.get()); - decoder->SetData(num_dict_entries_, dict_buffer_->data(), - static_cast(dict_buffer_->size())); - auto dict_decoder = MakeDictDecoder(descr_.get()); - dict_decoder->SetDict(decoder.get()); - dict_decoder->SetData(num_values_, buffer_->data(), - static_cast(buffer_->size())); - auto byte_array_decoder = dynamic_cast(dict_decoder.get()); - - ::arrow::internal::ChunkedBinaryBuilder builder( - static_cast(dict_buffer_->size()), ::arrow::default_memory_pool()); - byte_array_decoder->DecodeArrowNonNull(num_values_, &builder); - } - - state.SetBytesProcessed(state.iterations() * total_size_); -} - +(benchmark::State& state) { DecodeArrowNonNullBenchmark(state); } BENCHMARK_REGISTER_F(BM_DictDecodingByteArray, DecodeArrowNonNull_Dense) ->Range(MIN_RANGE, MAX_RANGE); BENCHMARK_DEFINE_F(BM_DictDecodingByteArray, DecodeArrow_Dict) -(benchmark::State& state) { - for (auto _ : state) { - auto decoder = MakeTypedDecoder(Encoding::PLAIN, descr_.get()); - decoder->SetData(num_dict_entries_, dict_buffer_->data(), - static_cast(dict_buffer_->size())); - auto dict_decoder = MakeDictDecoder(descr_.get()); - dict_decoder->SetDict(decoder.get()); - dict_decoder->SetData(num_values_, buffer_->data(), - static_cast(buffer_->size())); - auto byte_array_decoder = dynamic_cast(dict_decoder.get()); - - ::arrow::BinaryDictionaryBuilder builder(::arrow::default_memory_pool()); - byte_array_decoder->DecodeArrow(num_values_, 0, valid_bits_, 0, &builder); - } - - state.SetBytesProcessed(state.iterations() * total_size_); -} - +(benchmark::State& state) { DecodeArrowBenchmark(state); } BENCHMARK_REGISTER_F(BM_DictDecodingByteArray, DecodeArrow_Dict) ->Range(MIN_RANGE, MAX_RANGE); BENCHMARK_DEFINE_F(BM_DictDecodingByteArray, DecodeArrowNonNull_Dict) -(benchmark::State& state) { - for (auto _ : state) { - auto decoder = MakeTypedDecoder(Encoding::PLAIN, descr_.get()); - decoder->SetData(num_dict_entries_, dict_buffer_->data(), - static_cast(dict_buffer_->size())); - auto dict_decoder = MakeDictDecoder(descr_.get()); - dict_decoder->SetDict(decoder.get()); - dict_decoder->SetData(num_values_, buffer_->data(), - static_cast(buffer_->size())); - auto byte_array_decoder = dynamic_cast(dict_decoder.get()); - - ::arrow::BinaryDictionaryBuilder builder(::arrow::default_memory_pool()); - byte_array_decoder->DecodeArrowNonNull(num_values_, &builder); - } - - state.SetBytesProcessed(state.iterations() * total_size_); -} - +(benchmark::State& state) { DecodeArrowNonNullBenchmark(state); } BENCHMARK_REGISTER_F(BM_DictDecodingByteArray, DecodeArrowNonNull_Dict) ->Range(MIN_RANGE, MAX_RANGE); From a267a27d4fd4e54202a5caa5d091471fda623712 Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Mon, 11 Mar 2019 06:51:54 -0400 Subject: [PATCH 21/35] remove unnecessary inlines --- cpp/src/parquet/arrow/record_reader.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/arrow/record_reader.cc b/cpp/src/parquet/arrow/record_reader.cc index 67bff957eff..42334b91439 100644 --- a/cpp/src/parquet/arrow/record_reader.cc +++ b/cpp/src/parquet/arrow/record_reader.cc @@ -458,7 +458,7 @@ class TypedRecordReader : public RecordReader::RecordReaderImpl { void ResetDecoders() override { decoders_.clear(); } - virtual inline void ReadValuesSpaced(int64_t values_with_nulls, int64_t null_count) { + virtual void ReadValuesSpaced(int64_t values_with_nulls, int64_t null_count) { uint8_t* valid_bits = valid_bits_->mutable_data(); const int64_t valid_bits_offset = values_written_; @@ -468,7 +468,7 @@ class TypedRecordReader : public RecordReader::RecordReaderImpl { DCHECK_EQ(num_decoded, values_with_nulls); } - virtual inline void ReadValuesDense(int64_t values_to_read) { + virtual void ReadValuesDense(int64_t values_to_read) { int64_t num_decoded = current_decoder_->Decode(ValuesHead(), static_cast(values_to_read)); DCHECK_EQ(num_decoded, values_to_read); From babe52e38fe92ae16ae5a87afa9b769b8e4588f1 Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Mon, 11 Mar 2019 06:55:20 -0400 Subject: [PATCH 22/35] replace deprecated ReadableFileInterface with RandomAccessFile --- cpp/src/parquet/arrow/reader.cc | 2 +- cpp/src/parquet/arrow/reader.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index f490e029646..ade342266a5 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -710,7 +710,7 @@ Status OpenFile(const std::shared_ptr<::arrow::io::RandomAccessFile>& file, reader); } -Status OpenFile(const std::shared_ptr<::arrow::io::ReadableFileInterface>& file, +Status OpenFile(const std::shared_ptr<::arrow::io::RandomAccessFile>& file, ::arrow::MemoryPool* allocator, const ArrowReaderProperties& properties, std::unique_ptr* reader) { std::unique_ptr io_wrapper(new ArrowInputFile(file)); diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index c3031352ee8..6312113158f 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -330,7 +330,7 @@ class PARQUET_EXPORT ColumnReader { }; // Helper function to create a file reader from an implementation of an Arrow -// readable file +// random access file // // metadata : separately-computed file metadata, can be nullptr PARQUET_EXPORT @@ -346,7 +346,7 @@ ::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::RandomAccessFile>& f std::unique_ptr* reader); PARQUET_EXPORT -::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::ReadableFileInterface>& file, +::arrow::Status OpenFile(const std::shared_ptr<::arrow::io::RandomAccessFile>& file, ::arrow::MemoryPool* allocator, const ArrowReaderProperties& properties, std::unique_ptr* reader); From 6e65fdbdff831c02abd3cc4d2771c579b0967660 Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Mon, 11 Mar 2019 06:56:39 -0400 Subject: [PATCH 23/35] simplify ArrowReaderProperties and mark as experimental --- cpp/src/parquet/arrow/reader.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index 6312113158f..52fcec8f0d7 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -53,39 +53,38 @@ class ColumnReader; class RowGroupReader; static constexpr bool DEFAULT_USE_THREADS = false; -static constexpr bool DEFAULT_READ_DICTIONARY = false; +/// EXPERIMENTAL: Properties for configuring FileReader behavior. class PARQUET_EXPORT ArrowReaderProperties { public: - explicit ArrowReaderProperties(const bool use_threads = DEFAULT_USE_THREADS, - bool read_dict = DEFAULT_READ_DICTIONARY) - : use_threads_(use_threads), default_read_dict_(read_dict), read_dict_indices_() {} + explicit ArrowReaderProperties(bool use_threads = DEFAULT_USE_THREADS) + : use_threads_(use_threads), read_dict_indices_() {} - void set_use_threads(const bool use_threads) { use_threads_ = use_threads; } + void set_use_threads(bool use_threads) { use_threads_ = use_threads; } bool use_threads() const { return use_threads_; } - void set_read_dictionary(const int column_index, const bool read_dict) { + void set_read_dictionary(int column_index, bool read_dict) { if (read_dict) { - read_dict_indices_.emplace(column_index); + read_dict_indices_.insert(column_index); } else { read_dict_indices_.erase(column_index); } } - bool read_dictionary(const int column_index) const { + bool read_dictionary(int column_index) const { if (read_dict_indices_.find(column_index) != read_dict_indices_.end()) { return true; } else { - return default_read_dict_; + return false; } } private: bool use_threads_; - bool default_read_dict_; std::unordered_set read_dict_indices_; }; +/// EXPERIMENTAL: Constructs the default ArrowReaderProperties PARQUET_EXPORT ArrowReaderProperties default_arrow_reader_properties(); From 4d7bb30de55e553fb771e1200ed6d7de02cf8218 Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Tue, 12 Mar 2019 08:28:43 +0000 Subject: [PATCH 24/35] Refactor dictionary data generation into RandomArrayGenerator --- cpp/src/arrow/testing/random.cc | 63 ++++++++++++++++++++++++++- cpp/src/arrow/testing/random.h | 29 ++++++++++++ cpp/src/parquet/encoding-benchmark.cc | 41 ++++------------- 3 files changed, 98 insertions(+), 35 deletions(-) diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index b050c5d69b7..f693a4535e9 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -79,7 +79,7 @@ std::shared_ptr RandomArrayGenerator::Boolean(int64_t size, double probab // only calls the GenerateBitmap method. using GenOpt = GenerateOptions>; - std::vector> buffers{2}; + BufferVector buffers{2}; // Need 2 distinct generators such that probabilities are not shared. GenOpt value_gen(seed(), 0, 1, probability); GenOpt null_gen(seed(), 0, 1, null_probability); @@ -100,7 +100,7 @@ static std::shared_ptr> GenerateNumericArray(int64_t siz OptionType options) { using CType = typename ArrowType::c_type; auto type = TypeTraits::type_singleton(); - std::vector> buffers{2}; + BufferVector buffers{2}; int64_t null_count = 0; ABORT_NOT_OK(AllocateEmptyBitmap(size, &buffers[0])); @@ -145,5 +145,64 @@ PRIMITIVE_RAND_FLOAT_IMPL(Float64, double, DoubleType) #undef PRIMITIVE_RAND_FLOAT_IMPL #undef PRIMITIVE_RAND_IMPL +std::shared_ptr RandomArrayGenerator::String(int64_t size, + int32_t min_length, + int32_t max_length, + double null_probability) { + if (null_probability < 0 || null_probability > 1) { + ABORT_NOT_OK(Status::Invalid("null_probability must be between 0 and 1")); + } + + auto int32_lengths = Int32(size, min_length, max_length, null_probability); + auto lengths = std::dynamic_pointer_cast(int32_lengths); + + // Visual Studio does not implement uniform_int_distribution for char types. + using GenOpt = GenerateOptions>; + GenOpt options(seed(), static_cast('A'), static_cast('z'), + /*null_probability=*/0); + + std::vector str_buffer(max_length); + StringBuilder builder; + + for (int64_t i = 0; i < size; ++i) { + if (lengths->IsValid(i)) { + options.GenerateData(str_buffer.data(), lengths->Value(i)); + ABORT_NOT_OK(builder.Append(str_buffer.data(), lengths->Value(i))); + } else { + ABORT_NOT_OK(builder.AppendNull()); + } + } + + std::shared_ptr result; + ABORT_NOT_OK(builder.Finish(&result)); + return result; +} + +std::shared_ptr RandomArrayGenerator::StringWithRepeats( + int64_t size, int64_t unique, int32_t min_length, int32_t max_length, + double null_probability) { + // Generate a random string dictionary without any nulls + auto array = String(unique, min_length, max_length, /*null_probability=*/0); + auto dictionary = std::dynamic_pointer_cast(array); + + // Generate random indices to sample the dictionary with + auto id_array = Int64(size, 0, unique - 1, null_probability); + auto indices = std::dynamic_pointer_cast(id_array); + StringBuilder builder; + + for (int64_t i = 0; i < size; ++i) { + if (indices->IsValid(i)) { + const auto index = indices->Value(i); + const auto value = dictionary->GetView(index); + ABORT_NOT_OK(builder.Append(value)); + } else { + ABORT_NOT_OK(builder.AppendNull()); + } + } + + std::shared_ptr result; + ABORT_NOT_OK(builder.Finish(&result)); + return result; +} } // namespace random } // namespace arrow diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index 1ed8d03e8a1..f69b7058db3 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -198,6 +198,35 @@ class ARROW_EXPORT RandomArrayGenerator { } } + /// \brief Generates a random StringArray + /// + /// \param[in] size the size of the array to generate + /// \param[in] min_length the lower bound of the string length + /// determined by the uniform distribution + /// \param[in] max_length the upper bound of the string length + /// determined by the uniform distribution + /// \param[in] null_probability the probability of a row being null + /// + /// \return a generated Array + std::shared_ptr String(int64_t size, int32_t min_length, + int32_t max_length, double null_probability); + + /// \brief Generates a random StringArray with repeated values + /// + /// \param[in] size the size of the array to generate + /// \param[in] unique the number of unique string values used + /// to populate the array + /// \param[in] min_length the lower bound of the string length + /// determined by the uniform distribution + /// \param[in] max_length the upper bound of the string length + /// determined by the uniform distribution + /// \param[in] null_probability the probability of a row being null + /// + /// \return a generated Array + std::shared_ptr StringWithRepeats(int64_t size, int64_t unique, + int32_t min_length, int32_t max_length, + double null_probability); + private: SeedType seed() { return seed_distribution_(seed_rng_); } diff --git a/cpp/src/parquet/encoding-benchmark.cc b/cpp/src/parquet/encoding-benchmark.cc index c7ec1f14025..0e881903525 100644 --- a/cpp/src/parquet/encoding-benchmark.cc +++ b/cpp/src/parquet/encoding-benchmark.cc @@ -21,6 +21,7 @@ #include "arrow/array/builder_binary.h" #include "arrow/array/builder_dict.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" #include "arrow/testing/util.h" #include "arrow/type.h" @@ -178,38 +179,6 @@ static void BM_DictDecodingInt64_literals(benchmark::State& state) { BENCHMARK(BM_DictDecodingInt64_literals)->Range(MIN_RANGE, MAX_RANGE); -std::shared_ptr<::arrow::Array> MakeRandomStringsWithRepeats(size_t num_unique, - size_t num_values) { - const int64_t min_length = 2; - const int64_t max_length = 10; - std::vector buffer(max_length); - std::vector dictionary(num_unique); - - uint32_t seed = 0; - std::default_random_engine gen(seed); - std::uniform_int_distribution length_dist(min_length, max_length); - - std::generate(dictionary.begin(), dictionary.end(), [&] { - auto length = length_dist(gen); - ::arrow::random_ascii(length, seed++, buffer.data()); - return std::string(buffer.begin(), buffer.begin() + length); - }); - - std::uniform_int_distribution indices_dist(0, num_unique - 1); - ::arrow::StringBuilder builder; - ABORT_NOT_OK(builder.Reserve(num_values)); - - for (size_t i = 0; i < num_values; i++) { - const auto index = indices_dist(gen); - const auto value = dictionary[index]; - ABORT_NOT_OK(builder.Append(value)); - } - - std::shared_ptr<::arrow::Array> result; - ABORT_NOT_OK(builder.Finish(&result)); - return result; -} - // ---------------------------------------------------------------------- // Shared benchmarks for decoding using arrow builders class BenchmarkDecodeArrow : public ::benchmark::Fixture { @@ -223,8 +192,14 @@ class BenchmarkDecodeArrow : public ::benchmark::Fixture { void TearDown(const ::benchmark::State& state) override {} void InitDataInputs() { + // Generate a random string dictionary without any nulls so that this dataset can be + // used for benchmarking the DecodeArrowNonNull API constexpr int repeat_factor = 8; - input_array_ = MakeRandomStringsWithRepeats(num_values_ / repeat_factor, num_values_); + constexpr int64_t min_length = 2; + constexpr int64_t max_length = 10; + ::arrow::random::RandomArrayGenerator rag(0); + input_array_ = rag.StringWithRepeats(num_values_, num_values_ / repeat_factor, + min_length, max_length, /*null_probability=*/0); valid_bits_ = input_array_->null_bitmap()->data(); values_ = std::vector(); values_.reserve(num_values_); From 5fb9e860acad192052756da23b5efc7434fe89ae Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Tue, 12 Mar 2019 12:50:55 -0400 Subject: [PATCH 25/35] Rework parquet encoding tests --- cpp/src/parquet/encoding-test.cc | 289 ++++++++++++++++--------------- 1 file changed, 145 insertions(+), 144 deletions(-) diff --git a/cpp/src/parquet/encoding-test.cc b/cpp/src/parquet/encoding-test.cc index 582fdacbab3..378e8cb474f 100644 --- a/cpp/src/parquet/encoding-test.cc +++ b/cpp/src/parquet/encoding-test.cc @@ -23,7 +23,9 @@ #include #include "arrow/array.h" +#include "arrow/compute/api.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" #include "arrow/testing/util.h" #include "arrow/type.h" #include "arrow/util/bit-util.h" @@ -327,23 +329,61 @@ TEST(TestDictionaryEncoding, CannotDictDecodeBoolean) { // ---------------------------------------------------------------------- // Shared arrow builder decode tests -class TestDecodeArrow : public ::testing::TestWithParam { +template +struct BuilderTraits {}; + +template <> +struct BuilderTraits<::arrow::BinaryType> { + using DenseArrayBuilder = ::arrow::internal::ChunkedBinaryBuilder; + using DictArrayBuilder = ::arrow::BinaryDictionaryBuilder; +}; + +template <> +struct BuilderTraits<::arrow::StringType> { + using DenseArrayBuilder = ::arrow::internal::ChunkedStringBuilder; + using DictArrayBuilder = ::arrow::StringDictionaryBuilder; +}; + +template +class TestArrowBuilderDecoding : public ::testing::Test { public: - void SetUp() override { - InitFromJSON(GetParam()); + using DenseBuilder = typename BuilderTraits::DenseArrayBuilder; + using DictBuilder = typename BuilderTraits::DictArrayBuilder; + + void SetUp() override { null_probabilities_ = {0.1}; } + void TearDown() override {} + + void InitTestCase(double null_probability) { + InitFromJSON(null_probability); SetupEncoderDecoder(); } - void InitFromJSON(const char* json) { - // Use input JSON to initialize the dense array - expected_dense_ = ::arrow::ArrayFromJSON(::arrow::binary(), json); + void InitFromJSON(double null_probability) { + constexpr int num_unique = 100; + constexpr int repeat = 10; + constexpr int64_t min_length = 2; + constexpr int64_t max_length = 10; + ::arrow::random::RandomArrayGenerator rag(0); + expected_dense_ = rag.StringWithRepeats(repeat * num_unique, num_unique, min_length, + max_length, null_probability); + + std::shared_ptr<::arrow::DataType> data_type = std::make_shared(); + + if (data_type->id() == ::arrow::BinaryType::type_id) { + // TODO(hatemhelal): this is a kludge. Probably best to extend the + // RandomArrayGenerator to also generate BinaryType arrays. + auto data = expected_dense_->data()->Copy(); + data->type = data_type; + expected_dense_ = std::make_shared<::arrow::BinaryArray>(data); + } + num_values_ = static_cast(expected_dense_->length()); null_count_ = static_cast(expected_dense_->null_count()); valid_bits_ = expected_dense_->null_bitmap()->data(); - // Build both binary and string dictionary arrays from the dense array. - BuildDict<::arrow::BinaryDictionaryBuilder>(&expected_bin_dict_); - BuildDict<::arrow::StringDictionaryBuilder>(&expected_str_dict_); + auto builder = CreateDictBuilder(); + ASSERT_OK(builder->AppendArray(*expected_dense_)); + ASSERT_OK(builder->Finish(&expected_dict_)); // Initialize input_data_ for the encoder from the expected_array_ values const auto& binary_array = static_cast(*expected_dense_); @@ -351,33 +391,27 @@ class TestDecodeArrow : public ::testing::TestWithParam { for (int64_t i = 0; i < binary_array.length(); ++i) { auto view = binary_array.GetView(i); - input_data_.emplace_back(static_cast(view.length()), - reinterpret_cast(view.data())); + input_data_[i] = {static_cast(view.length()), + reinterpret_cast(view.data())}; } } - // Builds a dictionary encoded array from the dense array using BuilderType - template - void BuildDict(std::shared_ptr<::arrow::Array>* result) { - const auto& binary_array = static_cast(*expected_dense_); - BuilderType builder(default_memory_pool()); - - for (int64_t i = 0; i < binary_array.length(); ++i) { - if (binary_array.IsNull(i)) { - ASSERT_OK(builder.AppendNull()); - } else { - ASSERT_OK(builder.Append(binary_array.GetView(i))); - } - } + std::unique_ptr CreateDenseBuilder() { + // Use same default chunk size of 16MB as used in ByteArrayChunkedRecordReader + constexpr int32_t kChunkSize = 1 << 24; + return std::unique_ptr( + new DenseBuilder(kChunkSize, default_memory_pool())); + } - ASSERT_OK(builder.Finish(result)); + std::unique_ptr CreateDictBuilder() { + return std::unique_ptr(new DictBuilder(default_memory_pool())); } // Setup encoder/decoder pair for testing with virtual void SetupEncoderDecoder() = 0; - void CheckDecodeDense(const int actual_num_values, - ::arrow::internal::ChunkedBinaryBuilder& builder) { + template + void CheckDense(int actual_num_values, Builder& builder) { ASSERT_EQ(actual_num_values, num_values_); ::arrow::ArrayVector actual_vec; ASSERT_OK(builder.Finish(&actual_vec)); @@ -385,25 +419,57 @@ class TestDecodeArrow : public ::testing::TestWithParam { ASSERT_ARRAYS_EQUAL(*actual_vec[0], *expected_dense_); } - void CheckDecodeDict(const int actual_num_values, - ::arrow::BinaryDictionaryBuilder& builder) { + template + void CheckDict(int actual_num_values, Builder& builder) { ASSERT_EQ(actual_num_values, num_values_); std::shared_ptr<::arrow::Array> actual; ASSERT_OK(builder.Finish(&actual)); - ASSERT_ARRAYS_EQUAL(*actual, *expected_bin_dict_); + ASSERT_ARRAYS_EQUAL(*actual, *expected_dict_); } - void CheckDecodeDict(const int actual_num_values, - ::arrow::StringDictionaryBuilder& builder) { - ASSERT_EQ(actual_num_values, num_values_); - std::shared_ptr<::arrow::Array> actual; - ASSERT_OK(builder.Finish(&actual)); - ASSERT_ARRAYS_EQUAL(*actual, *expected_str_dict_); + void CheckDecodeArrowUsingDenseBuilder() { + for (auto np : null_probabilities_) { + InitTestCase(np); + auto builder = CreateDenseBuilder(); + auto actual_num_values = + decoder_->DecodeArrow(num_values_, null_count_, valid_bits_, 0, builder.get()); + CheckDense(actual_num_values, *builder); + } + } + + void CheckDecodeArrowUsingDictBuilder() { + for (auto np : null_probabilities_) { + InitTestCase(np); + auto builder = CreateDictBuilder(); + auto actual_num_values = + decoder_->DecodeArrow(num_values_, null_count_, valid_bits_, 0, builder.get()); + CheckDict(actual_num_values, *builder); + } + } + + void CheckDecodeArrowNonNullUsingDenseBuilder() { + for (auto np : null_probabilities_) { + InitTestCase(np); + SKIP_TEST_IF(null_count_ > 0) + auto builder = CreateDenseBuilder(); + auto actual_num_values = decoder_->DecodeArrowNonNull(num_values_, builder.get()); + CheckDense(actual_num_values, *builder); + } + } + + void CheckDecodeArrowNonNullUsingDictBuilder() { + for (auto np : null_probabilities_) { + InitTestCase(np); + SKIP_TEST_IF(null_count_ > 0) + auto builder = CreateDictBuilder(); + auto actual_num_values = decoder_->DecodeArrowNonNull(num_values_, builder.get()); + CheckDict(actual_num_values, *builder); + } } protected: - std::shared_ptr<::arrow::Array> expected_bin_dict_; - std::shared_ptr<::arrow::Array> expected_str_dict_; + std::vector null_probabilities_; + std::shared_ptr<::arrow::Array> expected_dict_; std::shared_ptr<::arrow::Array> expected_dense_; int num_values_; int null_count_; @@ -414,9 +480,15 @@ class TestDecodeArrow : public ::testing::TestWithParam { std::shared_ptr buffer_; }; -// ---------------------------------------------------------------------- -// Arrow builder decode tests for PlainByteArrayDecoder -class FromPlainEncoding : public TestDecodeArrow { +#define TEST_ARROW_BUILDER_BASE_MEMBERS() \ + using TestArrowBuilderDecoding::encoder_; \ + using TestArrowBuilderDecoding::decoder_; \ + using TestArrowBuilderDecoding::input_data_; \ + using TestArrowBuilderDecoding::num_values_; \ + using TestArrowBuilderDecoding::buffer_ + +template +class PlainEncoding : public TestArrowBuilderDecoding { public: void SetupEncoderDecoder() override { encoder_ = MakeTypedEncoder(Encoding::PLAIN); @@ -426,59 +498,32 @@ class FromPlainEncoding : public TestDecodeArrow { decoder_->SetData(num_values_, buffer_->data(), static_cast(buffer_->size())); } - void TearDown() override {} + protected: + TEST_ARROW_BUILDER_BASE_MEMBERS(); }; -TEST_P(FromPlainEncoding, DecodeDense) { - ::arrow::internal::ChunkedBinaryBuilder builder(static_cast(buffer_->size()), - default_memory_pool()); - auto actual_num_values = - decoder_->DecodeArrow(num_values_, null_count_, valid_bits_, 0, &builder); - CheckDecodeDense(actual_num_values, builder); -} - -TEST_P(FromPlainEncoding, DecodeNonNullDense) { - // Skip this test if input data contains nulls (optionals) - SKIP_TEST_IF(null_count_ > 0) - ::arrow::internal::ChunkedBinaryBuilder builder(static_cast(buffer_->size()), - default_memory_pool()); - auto actual_num_values = decoder_->DecodeArrowNonNull(num_values_, &builder); - CheckDecodeDense(actual_num_values, builder); -} +using BuilderArrayTypes = ::testing::Types<::arrow::BinaryType, ::arrow::StringType>; +// using BuilderArrayTypes = ::testing::Types<::arrow::StringType>; +TYPED_TEST_CASE(PlainEncoding, BuilderArrayTypes); -TEST_P(FromPlainEncoding, DecodeBinaryDict) { - ::arrow::BinaryDictionaryBuilder builder(default_memory_pool()); - auto actual_num_values = - decoder_->DecodeArrow(num_values_, null_count_, valid_bits_, 0, &builder); - CheckDecodeDict(actual_num_values, builder); +TYPED_TEST(PlainEncoding, CheckDecodeArrowUsingDenseBuilder) { + this->CheckDecodeArrowUsingDenseBuilder(); } -TEST_P(FromPlainEncoding, DecodeStringDict) { - ::arrow::StringDictionaryBuilder builder(default_memory_pool()); - auto actual_num_values = - decoder_->DecodeArrow(num_values_, null_count_, valid_bits_, 0, &builder); - CheckDecodeDict(actual_num_values, builder); +TYPED_TEST(PlainEncoding, CheckDecodeArrowUsingDictBuilder) { + this->CheckDecodeArrowUsingDictBuilder(); } -TEST_P(FromPlainEncoding, DecodeNonNullBinaryDict) { - // Skip this test if input data contains nulls (optionals) - SKIP_TEST_IF(null_count_ > 0) - ::arrow::BinaryDictionaryBuilder builder(default_memory_pool()); - auto actual_num_values = decoder_->DecodeArrowNonNull(num_values_, &builder); - CheckDecodeDict(actual_num_values, builder); +TYPED_TEST(PlainEncoding, CheckDecodeArrowNonNullDenseBuilder) { + this->CheckDecodeArrowNonNullUsingDenseBuilder(); } -TEST_P(FromPlainEncoding, DecodeNonNullStringDict) { - // Skip this test if input data contains nulls (optionals) - SKIP_TEST_IF(null_count_ > 0) - ::arrow::StringDictionaryBuilder builder(default_memory_pool()); - auto actual_num_values = decoder_->DecodeArrowNonNull(num_values_, &builder); - CheckDecodeDict(actual_num_values, builder); +TYPED_TEST(PlainEncoding, CheckDecodeArrowNonNullDictBuilder) { + this->CheckDecodeArrowNonNullUsingDictBuilder(); } -// ---------------------------------------------------------------------- -// Arrow builder decode tests for DictByteArrayDecoder -class FromDictEncoding : public TestDecodeArrow { +template +class DictEncoding : public TestArrowBuilderDecoding { public: void SetupEncoderDecoder() override { auto node = schema::ByteArray("name"); @@ -495,88 +540,44 @@ class FromDictEncoding : public TestDecodeArrow { dict_encoder->WriteDict(dict_buffer_->mutable_data()); // Simulate reading the dictionary page followed by a data page - decoder_ = MakeTypedDecoder(Encoding::PLAIN, descr_.get()); - decoder_->SetData(dict_encoder->num_entries(), dict_buffer_->data(), - static_cast(dict_buffer_->size())); + plain_decoder_ = MakeTypedDecoder(Encoding::PLAIN, descr_.get()); + plain_decoder_->SetData(dict_encoder->num_entries(), dict_buffer_->data(), + static_cast(dict_buffer_->size())); dict_decoder_ = MakeDictDecoder(descr_.get()); - dict_decoder_->SetDict(decoder_.get()); + dict_decoder_->SetDict(plain_decoder_.get()); dict_decoder_->SetData(num_values_, buffer_->data(), static_cast(buffer_->size())); + decoder_ = std::unique_ptr( + dynamic_cast(dict_decoder_.release())); } - void TearDown() override {} - protected: + TEST_ARROW_BUILDER_BASE_MEMBERS(); std::unique_ptr descr_; + std::unique_ptr plain_decoder_; std::unique_ptr> dict_decoder_; std::shared_ptr dict_buffer_; }; -TEST_P(FromDictEncoding, DecodeDense) { - ::arrow::internal::ChunkedBinaryBuilder builder(static_cast(dict_buffer_->size()), - default_memory_pool()); - auto byte_array_decoder = dynamic_cast(dict_decoder_.get()); - ASSERT_NE(byte_array_decoder, nullptr); - auto actual_num_values = - byte_array_decoder->DecodeArrow(num_values_, null_count_, valid_bits_, 0, &builder); - CheckDecodeDense(actual_num_values, builder); -} +TYPED_TEST_CASE(DictEncoding, BuilderArrayTypes); -TEST_P(FromDictEncoding, DecodeNonNullDense) { - // Skip this test if input data contains nulls (optionals) - SKIP_TEST_IF(null_count_ > 0) - ::arrow::internal::ChunkedBinaryBuilder builder(static_cast(dict_buffer_->size()), - default_memory_pool()); - auto byte_array_decoder = dynamic_cast(dict_decoder_.get()); - ASSERT_NE(byte_array_decoder, nullptr); - auto actual_num_values = byte_array_decoder->DecodeArrowNonNull(num_values_, &builder); - CheckDecodeDense(actual_num_values, builder); +TYPED_TEST(DictEncoding, CheckDecodeArrowUsingDenseBuilder) { + this->CheckDecodeArrowUsingDenseBuilder(); } -TEST_P(FromDictEncoding, DecodeBinaryDict) { - ::arrow::BinaryDictionaryBuilder builder(default_memory_pool()); - auto byte_array_decoder = dynamic_cast(dict_decoder_.get()); - ASSERT_NE(byte_array_decoder, nullptr); - auto actual_num_values = - byte_array_decoder->DecodeArrow(num_values_, null_count_, valid_bits_, 0, &builder); - CheckDecodeDict(actual_num_values, builder); + TYPED_TEST(DictEncoding, CheckDecodeArrowUsingDictBuilder) { + this->CheckDecodeArrowUsingDictBuilder(); } -TEST_P(FromDictEncoding, DecodeStringDict) { - ::arrow::StringDictionaryBuilder builder(default_memory_pool()); - auto byte_array_decoder = dynamic_cast(dict_decoder_.get()); - ASSERT_NE(byte_array_decoder, nullptr); - auto actual_num_values = - byte_array_decoder->DecodeArrow(num_values_, null_count_, valid_bits_, 0, &builder); - CheckDecodeDict(actual_num_values, builder); + TYPED_TEST(DictEncoding, CheckDecodeArrowNonNullDenseBuilder) { + this->CheckDecodeArrowNonNullUsingDenseBuilder(); } -TEST_P(FromDictEncoding, DecodeNonNullBinaryDict) { - // Skip this test if input data contains nulls (optionals) - SKIP_TEST_IF(null_count_ > 0) - ::arrow::BinaryDictionaryBuilder builder(default_memory_pool()); - auto byte_array_decoder = dynamic_cast(dict_decoder_.get()); - ASSERT_NE(byte_array_decoder, nullptr); - auto actual_num_values = byte_array_decoder->DecodeArrowNonNull(num_values_, &builder); - CheckDecodeDict(actual_num_values, builder); + TYPED_TEST(DictEncoding, CheckDecodeArrowNonNullDictBuilder) { + this->CheckDecodeArrowNonNullUsingDictBuilder(); } -TEST_P(FromDictEncoding, DecodeNonNullStringDict) { - // Skip this test if input data contains nulls (optionals) - SKIP_TEST_IF(null_count_ > 0) - ::arrow::StringDictionaryBuilder builder(default_memory_pool()); - auto byte_array_decoder = dynamic_cast(dict_decoder_.get()); - ASSERT_NE(byte_array_decoder, nullptr); - auto actual_num_values = byte_array_decoder->DecodeArrowNonNull(num_values_, &builder); - CheckDecodeDict(actual_num_values, builder); -} - -const char* json[] = {"[\"foo\", \"bar\", \"foo\", \"foo\"]", - "[\"foo\", \"bar\", \"foo\", null]"}; - -INSTANTIATE_TEST_CASE_P(TestDecodeArrow, FromPlainEncoding, ::testing::ValuesIn(json)); -INSTANTIATE_TEST_CASE_P(TestDecodeArrow, FromDictEncoding, ::testing::ValuesIn(json)); } // namespace test } // namespace parquet From 7347cfa26fe918749d18e34cef37f57e5701c183 Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Tue, 12 Mar 2019 12:51:13 -0400 Subject: [PATCH 26/35] Fix DecodeArrow from plain encoded columns --- cpp/src/parquet/encoding.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index c09a32b6c7f..4fcf4d99c90 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -759,19 +759,19 @@ class PlainByteArrayDecoder : public PlainDecoder, int64_t data_size = len_; int bytes_decoded = 0; while (i < num_values) { + uint32_t len = *reinterpret_cast(data); + increment = static_cast(sizeof(uint32_t) + len); + if (data_size < increment) { + ParquetException::EofException(); + } if (bit_reader.IsSet()) { - uint32_t len = *reinterpret_cast(data); - increment = static_cast(sizeof(uint32_t) + len); - if (data_size < increment) { - ParquetException::EofException(); - } ARROW_RETURN_NOT_OK(out->Append(data + sizeof(uint32_t), len)); - data += increment; - data_size -= increment; - bytes_decoded += increment; } else { ARROW_RETURN_NOT_OK(out->AppendNull()); } + data += increment; + data_size -= increment; + bytes_decoded += increment; bit_reader.Next(); ++i; } From 9da1331424df83c048818f74dd518c9c2a859b2e Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Tue, 12 Mar 2019 12:52:31 -0400 Subject: [PATCH 27/35] Temporarily disable tests for arrow builder decoding from dictionary encoded col --- cpp/src/parquet/encoding-test.cc | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/cpp/src/parquet/encoding-test.cc b/cpp/src/parquet/encoding-test.cc index 378e8cb474f..11501a19e7b 100644 --- a/cpp/src/parquet/encoding-test.cc +++ b/cpp/src/parquet/encoding-test.cc @@ -562,21 +562,21 @@ class DictEncoding : public TestArrowBuilderDecoding { TYPED_TEST_CASE(DictEncoding, BuilderArrayTypes); -TYPED_TEST(DictEncoding, CheckDecodeArrowUsingDenseBuilder) { - this->CheckDecodeArrowUsingDenseBuilder(); -} - - TYPED_TEST(DictEncoding, CheckDecodeArrowUsingDictBuilder) { - this->CheckDecodeArrowUsingDictBuilder(); -} - - TYPED_TEST(DictEncoding, CheckDecodeArrowNonNullDenseBuilder) { - this->CheckDecodeArrowNonNullUsingDenseBuilder(); -} - - TYPED_TEST(DictEncoding, CheckDecodeArrowNonNullDictBuilder) { - this->CheckDecodeArrowNonNullUsingDictBuilder(); -} +//TYPED_TEST(DictEncoding, CheckDecodeArrowUsingDenseBuilder) { +// this->CheckDecodeArrowUsingDenseBuilder(); +//} +// +// TYPED_TEST(DictEncoding, CheckDecodeArrowUsingDictBuilder) { +// this->CheckDecodeArrowUsingDictBuilder(); +//} +// +// TYPED_TEST(DictEncoding, CheckDecodeArrowNonNullDenseBuilder) { +// this->CheckDecodeArrowNonNullUsingDenseBuilder(); +//} +// +// TYPED_TEST(DictEncoding, CheckDecodeArrowNonNullDictBuilder) { +// this->CheckDecodeArrowNonNullUsingDictBuilder(); +//} } // namespace test From e6ca0db43646f04e1486ab4a49d74c1ca07198c5 Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Wed, 13 Mar 2019 00:17:03 +0000 Subject: [PATCH 28/35] Fix DictEncoding test: need to use PutSpaced instead of Put in setup --- cpp/src/parquet/encoding-test.cc | 39 ++++++++++++++++---------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/cpp/src/parquet/encoding-test.cc b/cpp/src/parquet/encoding-test.cc index 11501a19e7b..66b2ac7692c 100644 --- a/cpp/src/parquet/encoding-test.cc +++ b/cpp/src/parquet/encoding-test.cc @@ -350,15 +350,15 @@ class TestArrowBuilderDecoding : public ::testing::Test { using DenseBuilder = typename BuilderTraits::DenseArrayBuilder; using DictBuilder = typename BuilderTraits::DictArrayBuilder; - void SetUp() override { null_probabilities_ = {0.1}; } + void SetUp() override { null_probabilities_ = {0.0, 0.5, 1.0}; } void TearDown() override {} void InitTestCase(double null_probability) { - InitFromJSON(null_probability); + GenerateInputData(null_probability); SetupEncoderDecoder(); } - void InitFromJSON(double null_probability) { + void GenerateInputData(double null_probability) { constexpr int num_unique = 100; constexpr int repeat = 10; constexpr int64_t min_length = 2; @@ -484,6 +484,7 @@ class TestArrowBuilderDecoding : public ::testing::Test { using TestArrowBuilderDecoding::encoder_; \ using TestArrowBuilderDecoding::decoder_; \ using TestArrowBuilderDecoding::input_data_; \ + using TestArrowBuilderDecoding::valid_bits_; \ using TestArrowBuilderDecoding::num_values_; \ using TestArrowBuilderDecoding::buffer_ @@ -530,7 +531,7 @@ class DictEncoding : public TestArrowBuilderDecoding { descr_ = std::unique_ptr(new ColumnDescriptor(node, 0, 0)); encoder_ = MakeTypedEncoder(Encoding::PLAIN, /*use_dictionary=*/true, descr_.get()); - ASSERT_NO_THROW(encoder_->Put(input_data_.data(), num_values_)); + ASSERT_NO_THROW(encoder_->PutSpaced(input_data_.data(), num_values_, valid_bits_, 0)); buffer_ = encoder_->FlushValues(); auto dict_encoder = dynamic_cast*>(encoder_.get()); @@ -562,21 +563,21 @@ class DictEncoding : public TestArrowBuilderDecoding { TYPED_TEST_CASE(DictEncoding, BuilderArrayTypes); -//TYPED_TEST(DictEncoding, CheckDecodeArrowUsingDenseBuilder) { -// this->CheckDecodeArrowUsingDenseBuilder(); -//} -// -// TYPED_TEST(DictEncoding, CheckDecodeArrowUsingDictBuilder) { -// this->CheckDecodeArrowUsingDictBuilder(); -//} -// -// TYPED_TEST(DictEncoding, CheckDecodeArrowNonNullDenseBuilder) { -// this->CheckDecodeArrowNonNullUsingDenseBuilder(); -//} -// -// TYPED_TEST(DictEncoding, CheckDecodeArrowNonNullDictBuilder) { -// this->CheckDecodeArrowNonNullUsingDictBuilder(); -//} +TYPED_TEST(DictEncoding, CheckDecodeArrowUsingDenseBuilder) { + this->CheckDecodeArrowUsingDenseBuilder(); +} + +TYPED_TEST(DictEncoding, CheckDecodeArrowUsingDictBuilder) { + this->CheckDecodeArrowUsingDictBuilder(); +} + +TYPED_TEST(DictEncoding, CheckDecodeArrowNonNullDenseBuilder) { + this->CheckDecodeArrowNonNullUsingDenseBuilder(); +} + +TYPED_TEST(DictEncoding, CheckDecodeArrowNonNullDictBuilder) { + this->CheckDecodeArrowNonNullUsingDictBuilder(); +} } // namespace test From 7719b944f718f1f1e22b9b2d0ed3a11c4cab9448 Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Wed, 13 Mar 2019 00:34:12 +0000 Subject: [PATCH 29/35] Use random string generator instead of poor JSON --- .../parquet/arrow/arrow-reader-writer-test.cc | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc index b80d4481705..b54c3f41a5d 100644 --- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc @@ -30,6 +30,7 @@ #include #include "arrow/api.h" +#include "arrow/testing/random.h" #include "arrow/testing/util.h" #include "arrow/type_traits.h" #include "arrow/util/decimal.h" @@ -2441,10 +2442,10 @@ TEST(TestArrowWriterAdHoc, SchemaMismatch) { // ---------------------------------------------------------------------- // Tests for directly reading DictionaryArray -class TestArrowReadDictionary : public ::testing::TestWithParam { +class TestArrowReadDictionary : public ::testing::TestWithParam { public: void SetUp() override { - InitFromJSON(GetParam()); + GenerateData(GetParam()); ASSERT_NO_FATAL_FAILURE( WriteTableToBuffer(expected_dense_, expected_dense_->num_rows() / 2, default_arrow_writer_properties(), &buffer_)); @@ -2452,20 +2453,19 @@ class TestArrowReadDictionary : public ::testing::TestWithParam { properties_ = default_arrow_reader_properties(); } - void InitFromJSON(const char* jsonStr) { - auto dense_array = ::arrow::ArrayFromJSON(::arrow::utf8(), jsonStr); + void GenerateData(double null_probability) { + constexpr int num_unique = 100; + constexpr int repeat = 10; + constexpr int64_t min_length = 2; + constexpr int64_t max_length = 10; + ::arrow::random::RandomArrayGenerator rag(0); + auto dense_array = rag.StringWithRepeats(repeat * num_unique, num_unique, min_length, + max_length, null_probability); expected_dense_ = MakeSimpleTable(dense_array, /*nullable=*/true); ::arrow::StringDictionaryBuilder builder(default_memory_pool()); const auto& string_array = static_cast(*dense_array); - - for (int64_t i = 0; i < string_array.length(); ++i) { - if (string_array.IsNull(i)) { - ASSERT_OK(builder.AppendNull()); - } else { - ASSERT_OK(builder.Append(string_array.GetView(i))); - } - } + ASSERT_OK(builder.AppendArray(string_array)); std::shared_ptr<::arrow::Array> dict_array; ASSERT_OK(builder.Finish(&dict_array)); @@ -2488,9 +2488,11 @@ class TestArrowReadDictionary : public ::testing::TestWithParam { std::shared_ptr
actual; ASSERT_OK_NO_THROW(reader->ReadTable(&actual)); - ::arrow::AssertTablesEqual(*actual, expected, false); + ::arrow::AssertTablesEqual(*actual, expected, /*same_chunk_layout=*/false); } + static std::vector null_probabilites() { return {0.0, 0.5, 1}; } + protected: std::shared_ptr
expected_dense_; std::shared_ptr
expected_dict_; @@ -2508,11 +2510,9 @@ TEST_P(TestArrowReadDictionary, ReadWholeFileDense) { CheckReadWholeFile(*expected_dense_); } -const char* jsonInputs[] = {"[\"foo\", \"bar\", \"foo\", \"foo\"]", - "[\"foo\", \"bar\", \"foo\", \"null\"]"}; - -INSTANTIATE_TEST_CASE_P(ReadDictionary, TestArrowReadDictionary, - ::testing::ValuesIn(jsonInputs)); +INSTANTIATE_TEST_CASE_P( + ReadDictionary, TestArrowReadDictionary, + ::testing::ValuesIn(TestArrowReadDictionary::null_probabilites())); } // namespace arrow From 2c8fa7efdbbea17cd6e6c77ad92938b47fdd42ad Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Wed, 13 Mar 2019 10:27:47 -0400 Subject: [PATCH 30/35] revert incorrect changes to PlainByteArrayDecoder::DecodeArrow method --- cpp/src/parquet/encoding.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 4fcf4d99c90..c09a32b6c7f 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -759,19 +759,19 @@ class PlainByteArrayDecoder : public PlainDecoder, int64_t data_size = len_; int bytes_decoded = 0; while (i < num_values) { - uint32_t len = *reinterpret_cast(data); - increment = static_cast(sizeof(uint32_t) + len); - if (data_size < increment) { - ParquetException::EofException(); - } if (bit_reader.IsSet()) { + uint32_t len = *reinterpret_cast(data); + increment = static_cast(sizeof(uint32_t) + len); + if (data_size < increment) { + ParquetException::EofException(); + } ARROW_RETURN_NOT_OK(out->Append(data + sizeof(uint32_t), len)); + data += increment; + data_size -= increment; + bytes_decoded += increment; } else { ARROW_RETURN_NOT_OK(out->AppendNull()); } - data += increment; - data_size -= increment; - bytes_decoded += increment; bit_reader.Next(); ++i; } From 5bc933b973e60b1e0ad82f7f970e72c7fcb37840 Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Wed, 13 Mar 2019 10:28:10 -0400 Subject: [PATCH 31/35] use PutSpaced in test setup to correctly initialize encoded data --- cpp/src/parquet/encoding-test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/encoding-test.cc b/cpp/src/parquet/encoding-test.cc index 66b2ac7692c..4ec537a9e84 100644 --- a/cpp/src/parquet/encoding-test.cc +++ b/cpp/src/parquet/encoding-test.cc @@ -494,7 +494,7 @@ class PlainEncoding : public TestArrowBuilderDecoding { void SetupEncoderDecoder() override { encoder_ = MakeTypedEncoder(Encoding::PLAIN); decoder_ = MakeTypedDecoder(Encoding::PLAIN); - ASSERT_NO_THROW(encoder_->Put(input_data_.data(), num_values_)); + ASSERT_NO_THROW(encoder_->PutSpaced(input_data_.data(), num_values_, valid_bits_, 0)); buffer_ = encoder_->FlushValues(); decoder_->SetData(num_values_, buffer_->data(), static_cast(buffer_->size())); } From 2026b513cc4045069c7320e1a8f06381cfc2d0ae Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Wed, 13 Mar 2019 13:13:33 -0400 Subject: [PATCH 32/35] Rework ByteArrayDecoder interface to reduce code duplication --- cpp/src/parquet/encoding.cc | 137 ++++++------------------------------ cpp/src/parquet/encoding.h | 64 +++++++++++++---- 2 files changed, 70 insertions(+), 131 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index c09a32b6c7f..c6c3b3f2c6b 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -697,61 +697,12 @@ class PlainByteArrayDecoder : public PlainDecoder, using Base::DecodeSpaced; using Base::PlainDecoder; - int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - ::arrow::internal::ChunkedBinaryBuilder* out) override { - int result = 0; - PARQUET_THROW_NOT_OK( - DecodeArrow(num_values, null_count, valid_bits, valid_bits_offset, out, &result)); - return result; - } - - int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - ::arrow::BinaryDictionaryBuilder* out) override { - int result = 0; - PARQUET_THROW_NOT_OK( - DecodeArrow(num_values, null_count, valid_bits, valid_bits_offset, out, &result)); - return result; - } - - int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - ::arrow::StringDictionaryBuilder* out) override { - int result = 0; - PARQUET_THROW_NOT_OK( - DecodeArrow(num_values, null_count, valid_bits, valid_bits_offset, out, &result)); - return result; - } - - int DecodeArrowNonNull(int num_values, - ::arrow::internal::ChunkedBinaryBuilder* out) override { - int result = 0; - PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, out, &result)); - return result; - } - - int DecodeArrowNonNull(int num_values, ::arrow::BinaryDictionaryBuilder* out) override { - int result = 0; - PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, out, &result)); - return result; - } - - int DecodeArrowNonNull(int num_values, ::arrow::StringDictionaryBuilder* out) override { - int result = 0; - PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, out, &result)); - return result; - } - private: - template ::arrow::Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, BuilderType* out, - int* values_decoded) { + int64_t valid_bits_offset, WrappedBuilderInterface* builder, + int* values_decoded) override { num_values = std::min(num_values, num_values_); - - ARROW_RETURN_NOT_OK(out->Reserve(num_values)); - + builder->Reserve(num_values); ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); int increment; int i = 0; @@ -765,12 +716,12 @@ class PlainByteArrayDecoder : public PlainDecoder, if (data_size < increment) { ParquetException::EofException(); } - ARROW_RETURN_NOT_OK(out->Append(data + sizeof(uint32_t), len)); + builder->Append(data + sizeof(uint32_t), len); data += increment; data_size -= increment; bytes_decoded += increment; } else { - ARROW_RETURN_NOT_OK(out->AppendNull()); + builder->AppendNull(); } bit_reader.Next(); ++i; @@ -783,20 +734,20 @@ class PlainByteArrayDecoder : public PlainDecoder, return ::arrow::Status::OK(); } - template - ::arrow::Status DecodeArrowNonNull(int num_values, BuilderType* out, - int* values_decoded) { + ::arrow::Status DecodeArrowNonNull(int num_values, WrappedBuilderInterface* builder, + int* values_decoded) override { num_values = std::min(num_values, num_values_); - ARROW_RETURN_NOT_OK(out->Reserve(num_values)); + builder->Reserve(num_values); int i = 0; const uint8_t* data = data_; int64_t data_size = len_; int bytes_decoded = 0; + while (i < num_values) { uint32_t len = *reinterpret_cast(data); int increment = static_cast(sizeof(uint32_t) + len); if (data_size < increment) ParquetException::EofException(); - ARROW_RETURN_NOT_OK(out->Append(data + sizeof(uint32_t), len)); + builder->Append(data + sizeof(uint32_t), len); data += increment; data_size -= increment; bytes_decoded += increment; @@ -938,60 +889,13 @@ class DictByteArrayDecoder : public DictDecoderImpl, using BASE = DictDecoderImpl; using BASE::DictDecoderImpl; - int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - ::arrow::internal::ChunkedBinaryBuilder* out) override { - int result = 0; - PARQUET_THROW_NOT_OK( - DecodeArrow(num_values, null_count, valid_bits, valid_bits_offset, out, &result)); - return result; - } - - int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - ::arrow::BinaryDictionaryBuilder* out) override { - int result = 0; - PARQUET_THROW_NOT_OK( - DecodeArrow(num_values, null_count, valid_bits, valid_bits_offset, out, &result)); - return result; - } - - int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - ::arrow::StringDictionaryBuilder* out) override { - int result = 0; - PARQUET_THROW_NOT_OK( - DecodeArrow(num_values, null_count, valid_bits, valid_bits_offset, out, &result)); - return result; - } - - int DecodeArrowNonNull(int num_values, - ::arrow::internal::ChunkedBinaryBuilder* out) override { - int result = 0; - PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, out, &result)); - return result; - } - - int DecodeArrowNonNull(int num_values, ::arrow::BinaryDictionaryBuilder* out) override { - int result = 0; - PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, out, &result)); - return result; - } - - int DecodeArrowNonNull(int num_values, ::arrow::StringDictionaryBuilder* out) override { - int result = 0; - PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, out, &result)); - return result; - } - private: - template ::arrow::Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, BuilderType* builder, - int* out_num_values) { + int64_t valid_bits_offset, WrappedBuilderInterface* builder, + int* out_num_values) override { constexpr int32_t buffer_size = 1024; int32_t indices_buffer[buffer_size]; - + builder->Reserve(num_values); ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); int values_decoded = 0; @@ -1009,10 +913,10 @@ class DictByteArrayDecoder : public DictDecoderImpl, // Consume all indices if (is_valid) { const auto& val = dictionary_[indices_buffer[i]]; - ARROW_RETURN_NOT_OK(builder->Append(val.ptr, val.len)); + builder->Append(val.ptr, val.len); ++i; } else { - ARROW_RETURN_NOT_OK(builder->AppendNull()); + builder->AppendNull(); --null_count; } ++values_decoded; @@ -1025,7 +929,7 @@ class DictByteArrayDecoder : public DictDecoderImpl, bit_reader.Next(); } } else { - ARROW_RETURN_NOT_OK(builder->AppendNull()); + builder->AppendNull(); --null_count; ++values_decoded; } @@ -1038,19 +942,20 @@ class DictByteArrayDecoder : public DictDecoderImpl, return ::arrow::Status::OK(); } - template - ::arrow::Status DecodeArrowNonNull(int num_values, BuilderType* builder, - int* out_num_values) { + ::arrow::Status DecodeArrowNonNull(int num_values, WrappedBuilderInterface* builder, + int* out_num_values) override { constexpr int32_t buffer_size = 2048; int32_t indices_buffer[buffer_size]; int values_decoded = 0; + builder->Reserve(num_values); + while (values_decoded < num_values) { int32_t batch_size = std::min(buffer_size, num_values - values_decoded); int num_indices = idx_decoder_.GetBatch(indices_buffer, batch_size); if (num_indices == 0) break; for (int i = 0; i < num_indices; ++i) { const auto& val = dictionary_[indices_buffer[i]]; - PARQUET_THROW_NOT_OK(builder->Append(val.ptr, val.len)); + builder->Append(val.ptr, val.len); } values_decoded += num_indices; } diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index c869a8c6cfc..db98bac6ce6 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -208,26 +208,60 @@ using DoubleDecoder = TypedDecoder; class ByteArrayDecoder : virtual public TypedDecoder { public: using TypedDecoder::DecodeSpaced; - virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - ::arrow::internal::ChunkedBinaryBuilder* builder) = 0; - virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - ::arrow::BinaryDictionaryBuilder* builder) = 0; + class WrappedBuilderInterface { + public: + virtual void Reserve(int64_t values) = 0; + virtual void Append(const uint8_t* value, uint32_t length) = 0; + virtual void AppendNull() = 0; + }; - virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, - ::arrow::StringDictionaryBuilder* builder) = 0; + template + class WrappedBuilder : public WrappedBuilderInterface { + public: + explicit WrappedBuilder(Builder* builder) : builder_(builder) {} - virtual int DecodeArrowNonNull(int num_values, - ::arrow::internal::ChunkedBinaryBuilder* builder) = 0; + void Reserve(int64_t values) override { + PARQUET_THROW_NOT_OK(builder_->Reserve(values)); + } + void Append(const uint8_t* value, uint32_t length) override { + PARQUET_THROW_NOT_OK(builder_->Append(value, length)); + } + + void AppendNull() override { PARQUET_THROW_NOT_OK(builder_->AppendNull()); } + + private: + Builder* builder_; + }; + + template + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, Builder* builder) { + int result = 0; + WrappedBuilder wrapped_builder(builder); + PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, + valid_bits_offset, &wrapped_builder, &result)); + return result; + } + + template + int DecodeArrowNonNull(int num_values, Builder* builder) { + int result = 0; + WrappedBuilder wrapped_builder(builder); + PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, &wrapped_builder, &result)); + return result; + } - virtual int DecodeArrowNonNull(int num_values, - ::arrow::BinaryDictionaryBuilder* builder) = 0; + private: + virtual ::arrow::Status DecodeArrow(int num_values, int null_count, + const uint8_t* valid_bits, + int64_t valid_bits_offset, + WrappedBuilderInterface* builder, + int* values_decoded) = 0; - virtual int DecodeArrowNonNull(int num_values, - ::arrow::StringDictionaryBuilder* builder) = 0; + virtual ::arrow::Status DecodeArrowNonNull(int num_values, + WrappedBuilderInterface* builder, + int* values_decoded) = 0; }; class FLBADecoder : virtual public TypedDecoder { From 99e9dee12987cd27659ead0f8ab5a2043f352435 Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Wed, 13 Mar 2019 13:39:31 -0400 Subject: [PATCH 33/35] Removed dependencies on arrow builder in parquet/encoding --- cpp/src/parquet/encoding.cc | 1 - cpp/src/parquet/encoding.h | 12 ------------ 2 files changed, 13 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index c6c3b3f2c6b..217ee808e23 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -24,7 +24,6 @@ #include #include -#include "arrow/builder.h" #include "arrow/status.h" #include "arrow/util/bit-stream-utils.h" #include "arrow/util/bit-util.h" diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index db98bac6ce6..20f33ba52e8 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -32,18 +32,6 @@ #include "parquet/util/memory.h" #include "parquet/util/visibility.h" -namespace arrow { - -class BinaryDictionaryBuilder; -class StringDictionaryBuilder; - -namespace internal { - -class ChunkedBinaryBuilder; - -} // namespace internal -} // namespace arrow - namespace parquet { class ColumnDescriptor; From 023c022c3ba52aeece058fe4ee94945df31b8a5b Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Wed, 13 Mar 2019 14:03:16 -0400 Subject: [PATCH 34/35] Add virtual destructor to WrappedBuilderInterface --- cpp/src/parquet/encoding.h | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index 20f33ba52e8..09c1d0ffc77 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -202,6 +202,7 @@ class ByteArrayDecoder : virtual public TypedDecoder { virtual void Reserve(int64_t values) = 0; virtual void Append(const uint8_t* value, uint32_t length) = 0; virtual void AppendNull() = 0; + virtual ~WrappedBuilderInterface() = default; }; template From f644fff9cd53b6c6aba4fced063f3bdfbaa3ebfd Mon Sep 17 00:00:00 2001 From: Hatem Helal Date: Thu, 14 Mar 2019 18:32:57 +0000 Subject: [PATCH 35/35] Move schema fix logic to post-processing step --- cpp/src/parquet/arrow/reader.cc | 63 ++++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 17 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index ade342266a5..f891682387e 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -291,6 +291,11 @@ class FileReader::Impl { ParquetFileReader* reader() { return reader_.get(); } + std::vector GetDictionaryIndices(const std::vector& indices); + std::shared_ptr<::arrow::Schema> FixSchema( + const ::arrow::Schema& old_schema, const std::vector& dict_indices, + std::vector>& columns); + private: MemoryPool* pool_; std::unique_ptr reader_; @@ -559,14 +564,6 @@ Status FileReader::Impl::ReadRowGroup(int row_group_index, this](int i) { std::shared_ptr array; RETURN_NOT_OK(ReadColumnChunk(field_indices[i], indices, row_group_index, &array)); - - // Update the schema when reading dictionary array - if (reader_properties_.read_dictionary(i)) { - auto dict_field = - std::make_shared<::arrow::Field>(schema->field(i)->name(), array->type()); - RETURN_NOT_OK(schema->SetField(i, dict_field, &schema)); - } - columns[i] = std::make_shared(schema->field(i), array); return Status::OK(); }; @@ -591,7 +588,15 @@ Status FileReader::Impl::ReadRowGroup(int row_group_index, } } - *out = Table::Make(schema, columns); + auto dict_indices = GetDictionaryIndices(indices); + + if (!dict_indices.empty()) { + schema = FixSchema(*schema, dict_indices, columns); + } + + std::shared_ptr
table = Table::Make(schema, columns); + RETURN_NOT_OK(table->Validate()); + *out = table; return Status::OK(); } @@ -614,14 +619,6 @@ Status FileReader::Impl::ReadTable(const std::vector& indices, auto ReadColumnFunc = [&indices, &field_indices, &schema, &columns, this](int i) { std::shared_ptr array; RETURN_NOT_OK(ReadSchemaField(field_indices[i], indices, &array)); - - // Update the schema when reading dictionary array - if (reader_properties_.read_dictionary(i)) { - auto dict_field = - std::make_shared<::arrow::Field>(schema->field(i)->name(), array->type()); - RETURN_NOT_OK(schema->SetField(i, dict_field, &schema)); - } - columns[i] = std::make_shared(schema->field(i), array); return Status::OK(); }; @@ -646,6 +643,12 @@ Status FileReader::Impl::ReadTable(const std::vector& indices, } } + auto dict_indices = GetDictionaryIndices(indices); + + if (!dict_indices.empty()) { + schema = FixSchema(*schema, dict_indices, columns); + } + std::shared_ptr
table = Table::Make(schema, columns); RETURN_NOT_OK(table->Validate()); *out = table; @@ -691,6 +694,32 @@ Status FileReader::Impl::ReadRowGroup(int i, std::shared_ptr
* table) { return ReadRowGroup(i, indices, table); } +std::vector FileReader::Impl::GetDictionaryIndices(const std::vector& indices) { + // Select the column indices that were read as DictionaryArray + std::vector dict_indices(indices); + auto remove_func = [this](int i) { return !reader_properties_.read_dictionary(i); }; + auto it = std::remove_if(dict_indices.begin(), dict_indices.end(), remove_func); + dict_indices.erase(it, dict_indices.end()); + return dict_indices; +} + +std::shared_ptr<::arrow::Schema> FileReader::Impl::FixSchema( + const ::arrow::Schema& old_schema, const std::vector& dict_indices, + std::vector>& columns) { + // Fix the schema with the actual DictionaryType that was read + auto fields = old_schema.fields(); + + for (int idx : dict_indices) { + auto name = columns[idx]->name(); + auto dict_array = columns[idx]->data(); + auto dict_field = std::make_shared<::arrow::Field>(name, dict_array->type()); + fields[idx] = dict_field; + columns[idx] = std::make_shared(dict_field, dict_array); + } + + return std::make_shared<::arrow::Schema>(fields, old_schema.metadata()); +} + // Static ctor Status OpenFile(const std::shared_ptr<::arrow::io::RandomAccessFile>& file, MemoryPool* allocator, const ReaderProperties& props,