From cbeea802ef1b3ab96f410163483f2a089350cf76 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 18 Jul 2019 21:53:28 -0500 Subject: [PATCH 01/11] Some refactoring / cleaning to prep Add methods to DictionaryBuilder to insert dictionary memo values and indices independently Prototype writing dictionary values / indices directly [skip ci] --- cpp/src/arrow/array-dict-test.cc | 67 +++-- cpp/src/arrow/array/builder_dict.cc | 67 +++-- cpp/src/arrow/array/builder_dict.h | 45 +++- cpp/src/arrow/testing/random.cc | 10 + cpp/src/arrow/testing/random.h | 5 + cpp/src/arrow/type_traits.h | 6 + cpp/src/arrow/util/hashing.h | 4 +- cpp/src/parquet/arrow/reader.cc | 378 ++++++++++++---------------- cpp/src/parquet/column_reader.cc | 130 ++++++---- cpp/src/parquet/column_reader.h | 16 +- cpp/src/parquet/encoding-test.cc | 71 +----- cpp/src/parquet/encoding.cc | 85 ++++++- cpp/src/parquet/encoding.h | 96 ++++--- cpp/src/parquet/types.h | 3 +- 14 files changed, 550 insertions(+), 433 deletions(-) diff --git a/cpp/src/arrow/array-dict-test.cc b/cpp/src/arrow/array-dict-test.cc index ed37df30264..eafdcfc3470 100644 --- a/cpp/src/arrow/array-dict-test.cc +++ b/cpp/src/arrow/array-dict-test.cc @@ -53,7 +53,7 @@ TYPED_TEST_CASE(TestDictionaryBuilder, PrimitiveDictionaries); TYPED_TEST(TestDictionaryBuilder, Basic) { using c_type = typename TypeParam::c_type; - DictionaryBuilder builder(default_memory_pool()); + DictionaryBuilder builder; ASSERT_OK(builder.Append(static_cast(1))); ASSERT_OK(builder.Append(static_cast(2))); ASSERT_OK(builder.Append(static_cast(1))); @@ -81,7 +81,7 @@ TYPED_TEST(TestDictionaryBuilder, ArrayInit) { auto dict_array = ArrayFromJSON(value_type, "[1, 2]"); auto dict_type = dictionary(int8(), value_type); - DictionaryBuilder builder(dict_array, default_memory_pool()); + DictionaryBuilder builder(dict_array); ASSERT_OK(builder.Append(static_cast(1))); ASSERT_OK(builder.Append(static_cast(2))); ASSERT_OK(builder.Append(static_cast(1))); @@ -134,7 +134,7 @@ TYPED_TEST(TestDictionaryBuilder, ArrayConversion) { auto type = std::make_shared(); auto intermediate_result = ArrayFromJSON(type, "[1, 2, 1]"); - DictionaryBuilder dictionary_builder(default_memory_pool()); + DictionaryBuilder dictionary_builder; ASSERT_OK(dictionary_builder.AppendArray(*intermediate_result)); std::shared_ptr result; ASSERT_OK(dictionary_builder.Finish(&result)); @@ -154,7 +154,7 @@ TYPED_TEST(TestDictionaryBuilder, DoubleTableSize) { // Skip this test for (u)int8 if (sizeof(Scalar) > 1) { // Build the dictionary Array - DictionaryBuilder builder(default_memory_pool()); + DictionaryBuilder builder; // Build expected data NumericBuilder dict_builder; Int16Builder int_builder; @@ -192,7 +192,7 @@ TYPED_TEST(TestDictionaryBuilder, DeltaDictionary) { using c_type = typename TypeParam::c_type; auto type = std::make_shared(); - DictionaryBuilder builder(default_memory_pool()); + DictionaryBuilder builder; ASSERT_OK(builder.Append(static_cast(1))); ASSERT_OK(builder.Append(static_cast(2))); @@ -232,7 +232,7 @@ TYPED_TEST(TestDictionaryBuilder, DoubleDeltaDictionary) { auto type = std::make_shared(); auto dict_type = dictionary(int8(), type); - DictionaryBuilder builder(default_memory_pool()); + DictionaryBuilder builder; ASSERT_OK(builder.Append(static_cast(1))); ASSERT_OK(builder.Append(static_cast(2))); @@ -284,7 +284,7 @@ TYPED_TEST(TestDictionaryBuilder, DoubleDeltaDictionary) { TEST(TestStringDictionaryBuilder, Basic) { // Build the dictionary Array - StringDictionaryBuilder builder(default_memory_pool()); + 
StringDictionaryBuilder builder; ASSERT_OK(builder.Append("test")); ASSERT_OK(builder.Append("test2")); ASSERT_OK(builder.Append("test")); @@ -301,12 +301,40 @@ TEST(TestStringDictionaryBuilder, Basic) { ASSERT_TRUE(expected.Equals(result)); } +TEST(TestStringDictionaryBuilder, AppendIndices) { + auto ex_dict = ArrayFromJSON(utf8(), R"(["c", "a", "b", "d"])"); + auto invalid_dict = ArrayFromJSON(binary(), R"(["e", "f"])"); + + StringDictionaryBuilder builder; + ASSERT_OK(builder.InsertMemoValues(*ex_dict)); + + // Inserting again should have no effect + ASSERT_OK(builder.InsertMemoValues(*ex_dict)); + + // Type mismatch + ASSERT_RAISES(Invalid, builder.InsertMemoValues(*invalid_dict)); + + std::vector raw_indices = {0, 1, 2, -1, 3}; + std::vector is_valid = {1, 1, 1, 0, 1}; + for (int i = 0; i < 2; ++i) { + ASSERT_OK(builder.AppendIndices( + raw_indices.data(), static_cast(raw_indices.size()), is_valid.data())); + } + std::shared_ptr result; + ASSERT_OK(builder.Finish(&result)); + + auto ex_indices = ArrayFromJSON(int8(), R"([0, 1, 2, null, 3, 0, 1, 2, null, 3])"); + auto dtype = dictionary(int8(), utf8()); + DictionaryArray expected(dtype, ex_indices, ex_dict); + ASSERT_TRUE(expected.Equals(result)); +} + TEST(TestStringDictionaryBuilder, ArrayInit) { auto dict_array = ArrayFromJSON(utf8(), R"(["test", "test2"])"); auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]"); // Build the dictionary Array - StringDictionaryBuilder builder(dict_array, default_memory_pool()); + StringDictionaryBuilder builder(dict_array); ASSERT_OK(builder.Append("test")); ASSERT_OK(builder.Append("test2")); ASSERT_OK(builder.Append("test")); @@ -345,7 +373,7 @@ TEST(TestStringDictionaryBuilder, MakeBuilder) { // ARROW-4367 TEST(TestStringDictionaryBuilder, OnlyNull) { // Build the dictionary Array - StringDictionaryBuilder builder(default_memory_pool()); + StringDictionaryBuilder builder; ASSERT_OK(builder.AppendNull()); std::shared_ptr result; @@ -362,7 +390,7 @@ TEST(TestStringDictionaryBuilder, OnlyNull) { TEST(TestStringDictionaryBuilder, DoubleTableSize) { // Build the dictionary Array - StringDictionaryBuilder builder(default_memory_pool()); + StringDictionaryBuilder builder; // Build expected data StringBuilder str_builder; Int16Builder int_builder; @@ -398,7 +426,7 @@ TEST(TestStringDictionaryBuilder, DoubleTableSize) { TEST(TestStringDictionaryBuilder, DeltaDictionary) { // Build the dictionary Array - StringDictionaryBuilder builder(default_memory_pool()); + StringDictionaryBuilder builder; ASSERT_OK(builder.Append("test")); ASSERT_OK(builder.Append("test2")); ASSERT_OK(builder.Append("test")); @@ -432,7 +460,7 @@ TEST(TestStringDictionaryBuilder, DeltaDictionary) { TEST(TestStringDictionaryBuilder, BigDeltaDictionary) { constexpr int16_t kTestLength = 2048; // Build the dictionary Array - StringDictionaryBuilder builder(default_memory_pool()); + StringDictionaryBuilder builder; StringBuilder str_builder1; Int16Builder int_builder1; @@ -518,8 +546,7 @@ TEST(TestStringDictionaryBuilder, BigDeltaDictionary) { TEST(TestFixedSizeBinaryDictionaryBuilder, Basic) { // Build the dictionary Array - DictionaryBuilder builder(arrow::fixed_size_binary(4), - default_memory_pool()); + DictionaryBuilder builder(arrow::fixed_size_binary(4)); std::vector test{12, 12, 11, 12}; std::vector test2{12, 12, 11, 11}; ASSERT_OK(builder.Append(test.data())); @@ -555,7 +582,7 @@ TEST(TestFixedSizeBinaryDictionaryBuilder, ArrayInit) { auto value_type = fixed_size_binary(4); auto dict_array = ArrayFromJSON(value_type, R"(["abcd", 
"wxyz"])"); util::string_view test = "abcd", test2 = "wxyz"; - DictionaryBuilder builder(dict_array, default_memory_pool()); + DictionaryBuilder builder(dict_array); ASSERT_OK(builder.Append(test)); ASSERT_OK(builder.Append(test2)); ASSERT_OK(builder.Append(test)); @@ -597,7 +624,7 @@ TEST(TestFixedSizeBinaryDictionaryBuilder, DeltaDictionary) { auto value_type = arrow::fixed_size_binary(4); auto dict_type = dictionary(int8(), value_type); - DictionaryBuilder builder(value_type, default_memory_pool()); + DictionaryBuilder builder(value_type); std::vector test{12, 12, 11, 12}; std::vector test2{12, 12, 11, 11}; std::vector test3{12, 12, 11, 10}; @@ -656,7 +683,7 @@ TEST(TestFixedSizeBinaryDictionaryBuilder, DoubleTableSize) { auto value_type = arrow::fixed_size_binary(4); auto dict_type = dictionary(int16(), value_type); - DictionaryBuilder builder(value_type, default_memory_pool()); + DictionaryBuilder builder(value_type); // Build expected data FixedSizeBinaryBuilder fsb_builder(value_type); Int16Builder int_builder; @@ -693,7 +720,7 @@ TEST(TestFixedSizeBinaryDictionaryBuilder, DoubleTableSize) { TEST(TestFixedSizeBinaryDictionaryBuilder, InvalidTypeAppend) { // Build the dictionary Array auto value_type = arrow::fixed_size_binary(4); - DictionaryBuilder builder(value_type, default_memory_pool()); + DictionaryBuilder builder(value_type); // Build an array with different byte width FixedSizeBinaryBuilder fsb_builder(arrow::fixed_size_binary(5)); std::vector value{100, 1, 1, 1, 1}; @@ -707,7 +734,7 @@ TEST(TestFixedSizeBinaryDictionaryBuilder, InvalidTypeAppend) { TEST(TestDecimalDictionaryBuilder, Basic) { // Build the dictionary Array auto decimal_type = arrow::decimal(2, 0); - DictionaryBuilder builder(decimal_type, default_memory_pool()); + DictionaryBuilder builder(decimal_type); // Test data std::vector test{12, 12, 11, 12}; @@ -730,7 +757,7 @@ TEST(TestDecimalDictionaryBuilder, DoubleTableSize) { const auto& decimal_type = arrow::decimal(21, 0); // Build the dictionary Array - DictionaryBuilder builder(decimal_type, default_memory_pool()); + DictionaryBuilder builder(decimal_type); // Build expected data FixedSizeBinaryBuilder fsb_builder(decimal_type); diff --git a/cpp/src/arrow/array/builder_dict.cc b/cpp/src/arrow/array/builder_dict.cc index 8d5814ad81f..6034465b374 100644 --- a/cpp/src/arrow/array/builder_dict.cc +++ b/cpp/src/arrow/array/builder_dict.cc @@ -40,6 +40,16 @@ using internal::checked_cast; // ---------------------------------------------------------------------- // DictionaryType unification +template +using enable_if_memoize = typename std::enable_if< + !std::is_same::MemoTableType, void>::value, + Out>::type; + +template +using enable_if_no_memoize = typename std::enable_if< + std::is_same::MemoTableType, void>::value, + Out>::type; + struct UnifyDictionaryValues { MemoryPool* pool_; std::shared_ptr value_type_; @@ -48,21 +58,15 @@ struct UnifyDictionaryValues { std::shared_ptr* out_values_; std::vector>* out_transpose_maps_; - Status Visit(const DataType&, void* = nullptr) { + template + enable_if_no_memoize Visit(const T&) { // Default implementation for non-dictionary-supported datatypes return Status::NotImplemented("Unification of ", value_type_, " dictionaries is not implemented"); } - Status Visit(const DayTimeIntervalType&, void* = nullptr) { - return Status::NotImplemented( - "Unification of DayTime" - " dictionaries is not implemented"); - } - template - Status Visit(const T&, - typename internal::DictionaryTraits::MemoTableType* = nullptr) { + 
enable_if_memoize Visit(const T&) { using ArrayType = typename TypeTraits::ArrayType; using DictTraits = typename internal::DictionaryTraits; using MemoTableType = typename DictTraits::MemoTableType; @@ -163,14 +167,14 @@ class internal::DictionaryMemoTable::DictionaryMemoTableImpl { std::shared_ptr value_type_; std::unique_ptr* memo_table_; - Status Visit(const DataType&, void* = nullptr) { + template + enable_if_no_memoize Visit(const T&) { return Status::NotImplemented("Initialization of ", value_type_, " memo table is not implemented"); } template - Status Visit(const T&, - typename internal::DictionaryTraits::MemoTableType* = nullptr) { + enable_if_memoize Visit(const T&) { using MemoTable = typename internal::DictionaryTraits::MemoTableType; memo_table_->reset(new MemoTable(0)); return Status::OK(); @@ -179,23 +183,24 @@ class internal::DictionaryMemoTable::DictionaryMemoTableImpl { struct ArrayValuesInserter { DictionaryMemoTableImpl* impl_; + const Array& values_; template - Status Visit(const T& array) { - return InsertValues(array.type(), array); + Status Visit(const T& type) { + using ArrayType = typename TypeTraits::ArrayType; + return InsertValues(type, checked_cast(values_)); } private: - template - Status InsertValues(const DataType& type, const Array&, void* = nullptr) { + template + enable_if_no_memoize InsertValues(const DType& type, + const ArrayType&) { return Status::NotImplemented("Inserting array values of ", type, " is not implemented"); } - template - Status InsertValues( - const DataType&, const Array& array, - typename internal::DictionaryTraits::MemoTableType* = nullptr) { + template + enable_if_memoize InsertValues(const DType&, const ArrayType& array) { for (int64_t i = 0; i < array.length(); ++i) { ARROW_IGNORE_EXPR(impl_->GetOrInsert(array.GetView(i))); } @@ -210,14 +215,14 @@ class internal::DictionaryMemoTable::DictionaryMemoTableImpl { int64_t start_offset_; std::shared_ptr* out_; - Status Visit(const DataType&, void* = nullptr) { + template + enable_if_no_memoize Visit(const T&) { return Status::NotImplemented("Getting array data of ", value_type_, " is not implemented"); } template - Status Visit(const T&, - typename internal::DictionaryTraits::MemoTableType* = nullptr) { + enable_if_memoize Visit(const T&) { using ConcreteMemoTable = typename internal::DictionaryTraits::MemoTableType; auto memo_table = static_cast(memo_table_); return internal::DictionaryTraits::GetDictionaryArrayData( @@ -232,9 +237,13 @@ class internal::DictionaryMemoTable::DictionaryMemoTableImpl { ARROW_IGNORE_EXPR(VisitTypeInline(*type_, &visitor)); } - Status InsertValues(const std::shared_ptr& array) { - ArrayValuesInserter visitor{this}; - return VisitArrayInline(*array, &visitor); + Status InsertValues(const Array& array) { + if (!array.type()->Equals(*type_)) { + return Status::Invalid("Array value type does not match memo type: ", + array.type()->ToString()); + } + ArrayValuesInserter visitor{this, array}; + return VisitTypeInline(*array.type(), &visitor); } template @@ -267,7 +276,7 @@ internal::DictionaryMemoTable::DictionaryMemoTable(const std::shared_ptr& dictionary) : impl_(new DictionaryMemoTableImpl(dictionary->type())) { - ARROW_IGNORE_EXPR(impl_->InsertValues(dictionary)); + ARROW_IGNORE_EXPR(impl_->InsertValues(*dictionary)); } internal::DictionaryMemoTable::~DictionaryMemoTable() = default; @@ -325,6 +334,10 @@ Status internal::DictionaryMemoTable::GetArrayData(MemoryPool* pool, int64_t sta return impl_->GetArrayData(pool, start_offset, out); } +Status 
internal::DictionaryMemoTable::InsertValues(const Array& array) { + return impl_->InsertValues(array); +} + int32_t internal::DictionaryMemoTable::size() const { return impl_->size(); } } // namespace arrow diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h index 93cad2975a2..a932fa3f1be 100644 --- a/cpp/src/arrow/array/builder_dict.h +++ b/cpp/src/arrow/array/builder_dict.h @@ -74,6 +74,9 @@ class ARROW_EXPORT DictionaryMemoTable { Status GetArrayData(MemoryPool* pool, int64_t start_offset, std::shared_ptr* out); + /// \brief Insert new memo values + Status InsertValues(const Array& values); + int32_t size() const; private: @@ -103,7 +106,7 @@ class DictionaryBuilder : public ArrayBuilder { DictionaryBuilder( typename std::enable_if::value, const std::shared_ptr&>::type type, - MemoryPool* pool) + MemoryPool* pool = default_memory_pool()) : ArrayBuilder(type, pool), memo_table_(new internal::DictionaryMemoTable(type)), delta_offset_(0), @@ -114,7 +117,7 @@ class DictionaryBuilder : public ArrayBuilder { explicit DictionaryBuilder( typename std::enable_if::value, const std::shared_ptr&>::type type, - MemoryPool* pool) + MemoryPool* pool = default_memory_pool()) : ArrayBuilder(type, pool), memo_table_(new internal::DictionaryMemoTable(type)), delta_offset_(0), @@ -123,10 +126,12 @@ class DictionaryBuilder : public ArrayBuilder { template explicit DictionaryBuilder( - typename std::enable_if::is_parameter_free, MemoryPool*>::type pool) + typename std::enable_if::is_parameter_free, MemoryPool*>::type pool = + default_memory_pool()) : DictionaryBuilder(TypeTraits::type_singleton(), pool) {} - DictionaryBuilder(const std::shared_ptr& dictionary, MemoryPool* pool) + DictionaryBuilder(const std::shared_ptr& dictionary, + MemoryPool* pool = default_memory_pool()) : ArrayBuilder(dictionary->type(), pool), memo_table_(new internal::DictionaryMemoTable(dictionary)), delta_offset_(0), @@ -175,6 +180,26 @@ class DictionaryBuilder : public ArrayBuilder { return values_builder_.AppendNulls(length); } + /// \brief Insert values into the dictionary's memo, but do not append any + /// indices. Can be used to initialize a new builder with known dictionary + /// values + /// \param[in] values dictionary values to add to memo. 
Type must match + /// builder type + Status InsertMemoValues(const Array& values) { + return memo_table_->InsertValues(values); + } + + /// \brief Append dictionary indices directly without modifying memo + /// + /// NOTE: Experimental API + Status AppendIndices(const int64_t* values, int64_t length, + const uint8_t* valid_bytes = NULLPTR) { + int64_t null_count_before = values_builder_.null_count(); + ARROW_RETURN_NOT_OK(values_builder_.AppendValues(values, length, valid_bytes)); + null_count_ += values_builder_.null_count() - null_count_before; + return Status::OK(); + } + /// \brief Append a whole dense array to the builder template Status AppendArray( @@ -277,12 +302,14 @@ class DictionaryBuilder : public ArrayBuilder { template <> class DictionaryBuilder : public ArrayBuilder { public: - DictionaryBuilder(const std::shared_ptr& type, MemoryPool* pool) + DictionaryBuilder(const std::shared_ptr& type, + MemoryPool* pool = default_memory_pool()) : ArrayBuilder(type, pool), values_builder_(pool) {} - explicit DictionaryBuilder(MemoryPool* pool) + explicit DictionaryBuilder(MemoryPool* pool = default_memory_pool()) : ArrayBuilder(null(), pool), values_builder_(pool) {} - DictionaryBuilder(const std::shared_ptr& dictionary, MemoryPool* pool) + DictionaryBuilder(const std::shared_ptr& dictionary, + MemoryPool* pool = default_memory_pool()) : ArrayBuilder(dictionary->type(), pool), values_builder_(pool) {} /// \brief Append a scalar null value @@ -342,6 +369,8 @@ class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder(value), length); } @@ -357,6 +386,8 @@ class ARROW_EXPORT StringDictionaryBuilder : public DictionaryBuilder(value), length); } diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index f693a4535e9..3346e631ab5 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -178,6 +178,16 @@ std::shared_ptr RandomArrayGenerator::String(int64_t size, return result; } +std::shared_ptr RandomArrayGenerator::BinaryWithRepeats( + int64_t size, int64_t unique, int32_t min_length, int32_t max_length, + double null_probability) { + auto strings = + StringWithRepeats(size, unique, min_length, max_length, null_probability); + auto data = strings->data()->Copy(); + data->type = binary(); + return MakeArray(data); +} + std::shared_ptr RandomArrayGenerator::StringWithRepeats( int64_t size, int64_t unique, int32_t min_length, int32_t max_length, double null_probability) { diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index 6b188fd573b..3126a6901d0 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -230,6 +230,11 @@ class ARROW_EXPORT RandomArrayGenerator { int32_t min_length, int32_t max_length, double null_probability); + /// \brief Like StringWithRepeats but return BinaryArray + std::shared_ptr BinaryWithRepeats(int64_t size, int64_t unique, + int32_t min_length, int32_t max_length, + double null_probability); + private: SeedType seed() { return seed_distribution_(seed_rng_); } diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 4902f5c6334..c4c549f371b 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -270,6 +270,12 @@ struct CTypeTraits : public TypeTraits { using ArrowType = StringType; }; +template <> +struct CTypeTraits + : public TypeTraits { + using ArrowType = DayTimeIntervalType; +}; + template <> struct TypeTraits { using ArrayType = ListArray; diff --git a/cpp/src/arrow/util/hashing.h 
b/cpp/src/arrow/util/hashing.h index bad2b49905e..6a00c1edad7 100644 --- a/cpp/src/arrow/util/hashing.h +++ b/cpp/src/arrow/util/hashing.h @@ -833,7 +833,9 @@ static inline Status ComputeNullBitmap(MemoryPool* pool, const MemoTableType& me } template -struct DictionaryTraits {}; +struct DictionaryTraits { + using MemoTableType = void; +}; template <> struct DictionaryTraits { diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 45071b5003f..7b6ecb9b789 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -38,6 +38,7 @@ #include "arrow/util/int-util.h" #include "arrow/util/logging.h" #include "arrow/util/thread-pool.h" +#include "arrow/util/ubsan.h" // For arrow::compute::Datum. This should perhaps be promoted. See ARROW-4022 #include "arrow/compute/kernel.h" @@ -65,6 +66,10 @@ using arrow::StructArray; using arrow::Table; using arrow::TimestampArray; +using ::arrow::BitUtil::FromBigEndian; +using ::arrow::internal::SafeLeftShift; +using ::arrow::util::SafeLoadAs; + // For Array/ChunkedArray variant using arrow::compute::Datum; @@ -79,10 +84,6 @@ using parquet::internal::RecordReader; namespace parquet { namespace arrow { -using ::arrow::BitUtil::FromBigEndian; -using ::arrow::internal::SafeLeftShift; -using ::arrow::util::SafeLoadAs; - template using ArrayType = typename ::arrow::TypeTraits::ArrayType; @@ -308,7 +309,6 @@ class PARQUET_NO_EXPORT PrimitiveImpl : public ColumnReader::ColumnReaderImpl { Status NextBatch(int64_t records_to_read, std::shared_ptr* out) override; - template Status WrapIntoListArray(Datum* inout_array); Status GetDefLevels(const int16_t** data, size_t* length) override; @@ -880,7 +880,6 @@ const ParquetFileReader* FileReader::parquet_reader() const { return impl_->parquet_reader(); } -template Status PrimitiveImpl::WrapIntoListArray(Datum* inout_array) { if (descr_->max_repetition_level() == 0) { // Flat, no action @@ -1016,108 +1015,60 @@ Status PrimitiveImpl::WrapIntoListArray(Datum* inout_array) { return Status::OK(); } -template -struct supports_fast_path_impl { - using ArrowCType = typename ArrowType::c_type; - using ParquetCType = typename ParquetType::c_type; - static constexpr bool value = std::is_same::value; -}; - -template -struct supports_fast_path_impl { - static constexpr bool value = false; -}; +// ---------------------------------------------------------------------- +// Primitive types -template -struct supports_fast_path_impl { - static constexpr bool value = false; -}; +template +struct TransferFunctor {}; template -using supports_fast_path = - typename std::enable_if::value>::type; - -template -struct TransferFunctor { +Status TransferInt(RecordReader* reader, MemoryPool* pool, + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { using ArrowCType = typename ArrowType::c_type; using ParquetCType = typename ParquetType::c_type; + int64_t length = reader->values_written(); + std::shared_ptr data; + RETURN_NOT_OK(::arrow::AllocateBuffer(pool, length * sizeof(ArrowCType), &data)); - Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, Datum* out) { - static_assert(!std::is_same::value, - "The fast path transfer functor should be used " - "for primitive values"); + auto values = reinterpret_cast(reader->values()); + auto out_ptr = reinterpret_cast(data->mutable_data()); + std::copy(values, values + length, out_ptr); + *out = std::make_shared>( + type, length, data, reader->ReleaseIsValid(), reader->null_count()); + return 
Status::OK(); +} - int64_t length = reader->values_written(); - std::shared_ptr data; - RETURN_NOT_OK(::arrow::AllocateBuffer(pool, length * sizeof(ArrowCType), &data)); +std::shared_ptr TransferZeroCopy(RecordReader* reader, + const std::shared_ptr<::arrow::DataType>& type) { + std::vector> buffers = {reader->ReleaseIsValid(), + reader->ReleaseValues()}; + auto data = std::make_shared<::arrow::ArrayData>(type, reader->values_written(), + buffers, reader->null_count()); + return MakeArray(data); +} - auto values = reinterpret_cast(reader->values()); - auto out_ptr = reinterpret_cast(data->mutable_data()); - std::copy(values, values + length, out_ptr); +Status TransferBool(RecordReader* reader, MemoryPool* pool, Datum* out) { + int64_t length = reader->values_written(); + std::shared_ptr data; - if (reader->nullable_values()) { - std::shared_ptr is_valid = reader->ReleaseIsValid(); - *out = std::make_shared>(type, length, data, is_valid, - reader->null_count()); - } else { - *out = std::make_shared>(type, length, data); - } - return Status::OK(); - } -}; + const int64_t buffer_size = BitUtil::BytesForBits(length); + RETURN_NOT_OK(::arrow::AllocateBuffer(pool, buffer_size, &data)); -template -struct TransferFunctor> { - Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, Datum* out) { - int64_t length = reader->values_written(); - std::shared_ptr values = reader->ReleaseValues(); + // Transfer boolean values to packed bitmap + auto values = reinterpret_cast(reader->values()); + uint8_t* data_ptr = data->mutable_data(); + memset(data_ptr, 0, buffer_size); - if (reader->nullable_values()) { - std::shared_ptr is_valid = reader->ReleaseIsValid(); - *out = std::make_shared>(type, length, values, is_valid, - reader->null_count()); - } else { - *out = std::make_shared>(type, length, values); + for (int64_t i = 0; i < length; i++) { + if (values[i]) { + ::arrow::BitUtil::SetBit(data_ptr, i); } - return Status::OK(); } -}; -template <> -struct TransferFunctor<::arrow::BooleanType, BooleanType> { - Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, Datum* out) { - int64_t length = reader->values_written(); - std::shared_ptr data; - - const int64_t buffer_size = BitUtil::BytesForBits(length); - RETURN_NOT_OK(::arrow::AllocateBuffer(pool, buffer_size, &data)); - - // Transfer boolean values to packed bitmap - auto values = reinterpret_cast(reader->values()); - uint8_t* data_ptr = data->mutable_data(); - memset(data_ptr, 0, buffer_size); - - for (int64_t i = 0; i < length; i++) { - if (values[i]) { - ::arrow::BitUtil::SetBit(data_ptr, i); - } - } - - if (reader->nullable_values()) { - std::shared_ptr is_valid = reader->ReleaseIsValid(); - RETURN_NOT_OK(is_valid->Resize(BitUtil::BytesForBits(length), false)); - *out = std::make_shared(type, length, data, is_valid, - reader->null_count()); - } else { - *out = std::make_shared(type, length, data); - } - return Status::OK(); - } -}; + *out = std::make_shared(length, data, reader->ReleaseIsValid(), + reader->null_count()); + return Status::OK(); +} template <> struct TransferFunctor<::arrow::TimestampType, Int96Type> { @@ -1134,59 +1085,49 @@ struct TransferFunctor<::arrow::TimestampType, Int96Type> { *data_ptr++ = Int96GetNanoSeconds(values[i]); } - if (reader->nullable_values()) { - std::shared_ptr is_valid = reader->ReleaseIsValid(); - *out = std::make_shared(type, length, data, is_valid, - reader->null_count()); - } else { - *out = 
std::make_shared(type, length, data); - } - + *out = std::make_shared(type, length, data, reader->ReleaseIsValid(), + reader->null_count()); return Status::OK(); } }; -template <> -struct TransferFunctor<::arrow::Date64Type, Int32Type> { - Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, Datum* out) { - int64_t length = reader->values_written(); - auto values = reinterpret_cast(reader->values()); +Status TransferDate64(RecordReader* reader, MemoryPool* pool, + const std::shared_ptr<::arrow::DataType>& type, Datum* out) { + int64_t length = reader->values_written(); + auto values = reinterpret_cast(reader->values()); - std::shared_ptr data; - RETURN_NOT_OK(::arrow::AllocateBuffer(pool, length * sizeof(int64_t), &data)); - auto out_ptr = reinterpret_cast(data->mutable_data()); - - for (int64_t i = 0; i < length; i++) { - *out_ptr++ = static_cast(values[i]) * kMillisecondsPerDay; - } + std::shared_ptr data; + RETURN_NOT_OK(::arrow::AllocateBuffer(pool, length * sizeof(int64_t), &data)); + auto out_ptr = reinterpret_cast(data->mutable_data()); - if (reader->nullable_values()) { - std::shared_ptr is_valid = reader->ReleaseIsValid(); - *out = std::make_shared<::arrow::Date64Array>(type, length, data, is_valid, - reader->null_count()); - } else { - *out = std::make_shared<::arrow::Date64Array>(type, length, data); - } - return Status::OK(); + for (int64_t i = 0; i < length; i++) { + *out_ptr++ = static_cast(values[i]) * kMillisecondsPerDay; } -}; -template -struct TransferFunctor< - ArrowType, ParquetType, - typename std::enable_if< - (std::is_base_of<::arrow::BinaryType, ArrowType>::value || - std::is_same<::arrow::FixedSizeBinaryType, ArrowType>::value) && - (std::is_same::value || - std::is_same::value)>::type> { - Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, Datum* out) { - std::vector> chunks = reader->GetBuilderChunks(); - *out = std::make_shared(chunks); - return Status::OK(); - } -}; + *out = std::make_shared<::arrow::Date64Array>( + type, length, data, reader->ReleaseIsValid(), reader->null_count()); + return Status::OK(); +} + +Status TransferBinary(RecordReader* reader, Datum* out) { + auto binary_reader = dynamic_cast(reader); + DCHECK(binary_reader); + *out = std::make_shared(binary_reader->GetBuilderChunks()); + return Status::OK(); +} + +// ---------------------------------------------------------------------- +// Direct to dictionary-encoded + +Status TransferDictionary(RecordReader* reader, Datum* out) { + auto dict_reader = dynamic_cast(reader); + DCHECK(dict_reader); + *out = dict_reader->GetResult(); + return Status::OK(); +} + +// ---------------------------------------------------------------------- +// INT32 / INT64 / BYTE_ARRAY / FIXED_LEN_BYTE_ARRAY -> Decimal128 static uint64_t BytesToInteger(const uint8_t* bytes, int32_t start, int32_t stop) { const int32_t length = stop - start; @@ -1305,9 +1246,6 @@ static inline void RawBytesToDecimalBytes(const uint8_t* value, int32_t byte_wid BytesToIntegerPair(value, byte_width, high, low); } -// ---------------------------------------------------------------------- -// BYTE_ARRAY / FIXED_LEN_BYTE_ARRAY -> Decimal128 - template Status ConvertToDecimal128(const Array& array, const std::shared_ptr<::arrow::DataType>&, MemoryPool* pool, std::shared_ptr*) { @@ -1405,37 +1343,6 @@ Status ConvertToDecimal128(const Array& array, return Status::OK(); } -/// \brief Convert an arrow::BinaryArray to an 
arrow::Decimal128Array -/// We do this by: -/// 1. Creating an arrow::BinaryArray from the RecordReader's builder -/// 2. Allocating a buffer for the arrow::Decimal128Array -/// 3. Converting the big-endian bytes in each BinaryArray entry to two integers -/// representing the high and low bits of each decimal value. -template -struct TransferFunctor< - ArrowType, ParquetType, - typename std::enable_if::value && - (std::is_same::value || - std::is_same::value)>::type> { - Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, Datum* out) { - DCHECK_EQ(type->id(), ::arrow::Type::DECIMAL); - - ::arrow::ArrayVector chunks = reader->GetBuilderChunks(); - - for (size_t i = 0; i < chunks.size(); ++i) { - std::shared_ptr chunk_as_decimal; - RETURN_NOT_OK( - ConvertToDecimal128(*chunks[i], type, pool, &chunk_as_decimal)); - - // Replace the chunk, which will hopefully also free memory as we go - chunks[i] = chunk_as_decimal; - } - *out = std::make_shared(chunks); - return Status::OK(); - } -}; - /// \brief Convert an Int32 or Int64 array into a Decimal128Array /// The parquet spec allows systems to write decimals in int32, int64 if the values are /// small enough to fit in less 4 bytes or less than 8 bytes, respectively. @@ -1490,32 +1397,62 @@ static Status DecimalIntegerTransfer(RecordReader* reader, MemoryPool* pool, return Status::OK(); } -template <> -struct TransferFunctor<::arrow::Decimal128Type, Int32Type> { +/// \brief Convert an arrow::BinaryArray to an arrow::Decimal128Array +/// We do this by: +/// 1. Creating an arrow::BinaryArray from the RecordReader's builder +/// 2. Allocating a buffer for the arrow::Decimal128Array +/// 3. Converting the big-endian bytes in each BinaryArray entry to two integers +/// representing the high and low bits of each decimal value. 
+template +struct TransferFunctor< + ArrowType, ParquetType, + typename std::enable_if::value && + (std::is_same::value || + std::is_same::value)>::type> { Status operator()(RecordReader* reader, MemoryPool* pool, const std::shared_ptr<::arrow::DataType>& type, Datum* out) { - return DecimalIntegerTransfer(reader, pool, type, out); - } -}; + DCHECK_EQ(type->id(), ::arrow::Type::DECIMAL); -template <> -struct TransferFunctor<::arrow::Decimal128Type, Int64Type> { - Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, Datum* out) { - return DecimalIntegerTransfer(reader, pool, type, out); + auto binary_reader = dynamic_cast(reader); + DCHECK(binary_reader); + ::arrow::ArrayVector chunks = binary_reader->GetBuilderChunks(); + + for (size_t i = 0; i < chunks.size(); ++i) { + std::shared_ptr chunk_as_decimal; + RETURN_NOT_OK( + ConvertToDecimal128(*chunks[i], type, pool, &chunk_as_decimal)); + + // Replace the chunk, which will hopefully also free memory as we go + chunks[i] = chunk_as_decimal; + } + *out = std::make_shared(chunks); + return Status::OK(); } }; -#define TRANSFER_DATA(ArrowType, ParquetType) \ - TransferFunctor func; \ - RETURN_NOT_OK(func(record_reader_.get(), pool_, field_->type(), &result)); \ - RETURN_NOT_OK(WrapIntoListArray(&result)) +#define TRANSFER_DATA(ArrowType, ParquetType) \ + TransferFunctor func; \ + RETURN_NOT_OK(func(record_reader_.get(), pool_, field_->type(), &result)); #define TRANSFER_CASE(ENUM, ArrowType, ParquetType) \ case ::arrow::Type::ENUM: { \ TRANSFER_DATA(ArrowType, ParquetType); \ } break; +#define TRANSFER_INT32(ENUM, ArrowType) \ + case ::arrow::Type::ENUM: { \ + Status s = TransferInt(record_reader_.get(), pool_, \ + field_->type(), &result); \ + RETURN_NOT_OK(s); \ + } break; + +#define TRANSFER_INT64(ENUM, ArrowType) \ + case ::arrow::Type::ENUM: { \ + Status s = TransferInt(record_reader_.get(), pool_, \ + field_->type(), &result); \ + RETURN_NOT_OK(s); \ + } break; + Status PrimitiveImpl::NextBatch(int64_t records_to_read, std::shared_ptr* out) { try { @@ -1539,34 +1476,48 @@ Status PrimitiveImpl::NextBatch(int64_t records_to_read, Datum result; switch (field_->type()->id()) { - TRANSFER_CASE(BOOL, ::arrow::BooleanType, BooleanType) - TRANSFER_CASE(UINT8, ::arrow::UInt8Type, Int32Type) - TRANSFER_CASE(INT8, ::arrow::Int8Type, Int32Type) - TRANSFER_CASE(UINT16, ::arrow::UInt16Type, Int32Type) - TRANSFER_CASE(INT16, ::arrow::Int16Type, Int32Type) - TRANSFER_CASE(UINT32, ::arrow::UInt32Type, Int32Type) - TRANSFER_CASE(INT32, ::arrow::Int32Type, Int32Type) - TRANSFER_CASE(UINT64, ::arrow::UInt64Type, Int64Type) - TRANSFER_CASE(INT64, ::arrow::Int64Type, Int64Type) - TRANSFER_CASE(FLOAT, ::arrow::FloatType, FloatType) - TRANSFER_CASE(DOUBLE, ::arrow::DoubleType, DoubleType) - TRANSFER_CASE(STRING, ::arrow::StringType, ByteArrayType) - TRANSFER_CASE(BINARY, ::arrow::BinaryType, ByteArrayType) - TRANSFER_CASE(DATE32, ::arrow::Date32Type, Int32Type) - TRANSFER_CASE(DATE64, ::arrow::Date64Type, Int32Type) - TRANSFER_CASE(FIXED_SIZE_BINARY, ::arrow::FixedSizeBinaryType, FLBAType) + case ::arrow::Type::DICTIONARY: { + RETURN_NOT_OK(TransferDictionary(record_reader_.get(), &result)); + break; + } case ::arrow::Type::NA: { result = std::make_shared<::arrow::NullArray>(record_reader_->values_written()); - RETURN_NOT_OK(WrapIntoListArray(&result)); break; } + case ::arrow::Type::INT32: + case ::arrow::Type::INT64: + case ::arrow::Type::FLOAT: + case ::arrow::Type::DOUBLE: + result = 
TransferZeroCopy(record_reader_.get(), field_->type()); + break; + case ::arrow::Type::BOOL: + RETURN_NOT_OK(TransferBool(record_reader_.get(), pool_, &result)); + break; + TRANSFER_INT32(UINT8, ::arrow::UInt8Type); + TRANSFER_INT32(INT8, ::arrow::Int8Type); + TRANSFER_INT32(UINT16, ::arrow::UInt16Type); + TRANSFER_INT32(INT16, ::arrow::Int16Type); + TRANSFER_INT32(UINT32, ::arrow::UInt32Type); + TRANSFER_INT32(DATE32, ::arrow::Date32Type); + TRANSFER_INT32(TIME32, ::arrow::Time32Type); + TRANSFER_INT64(TIME64, ::arrow::Time64Type); + case ::arrow::Type::DATE64: + RETURN_NOT_OK(TransferDate64(record_reader_.get(), pool_, field_->type(), &result)); + break; + case ::arrow::Type::FIXED_SIZE_BINARY: + case ::arrow::Type::BINARY: + case ::arrow::Type::STRING: + RETURN_NOT_OK(TransferBinary(record_reader_.get(), &result)); + break; case ::arrow::Type::DECIMAL: { switch (descr_->physical_type()) { case ::parquet::Type::INT32: { - TRANSFER_DATA(::arrow::Decimal128Type, Int32Type); + RETURN_NOT_OK(DecimalIntegerTransfer(record_reader_.get(), pool_, + field_->type(), &result)); } break; case ::parquet::Type::INT64: { - TRANSFER_DATA(::arrow::Decimal128Type, Int64Type); + RETURN_NOT_OK(DecimalIntegerTransfer(record_reader_.get(), pool_, + field_->type(), &result)); } break; case ::parquet::Type::BYTE_ARRAY: { TRANSFER_DATA(::arrow::Decimal128Type, ByteArrayType); @@ -1581,31 +1532,32 @@ Status PrimitiveImpl::NextBatch(int64_t records_to_read, } } break; case ::arrow::Type::TIMESTAMP: { - ::arrow::TimestampType* timestamp_type = - static_cast<::arrow::TimestampType*>(field_->type().get()); - switch (timestamp_type->unit()) { + const ::arrow::TimestampType& timestamp_type = + static_cast<::arrow::TimestampType&>(*field_->type()); + switch (timestamp_type.unit()) { case ::arrow::TimeUnit::MILLI: case ::arrow::TimeUnit::MICRO: { - TRANSFER_DATA(::arrow::TimestampType, Int64Type); + result = TransferZeroCopy(record_reader_.get(), field_->type()); } break; case ::arrow::TimeUnit::NANO: { if (descr_->physical_type() == ::parquet::Type::INT96) { TRANSFER_DATA(::arrow::TimestampType, Int96Type); } else { - TRANSFER_DATA(::arrow::TimestampType, Int64Type); + result = TransferZeroCopy(record_reader_.get(), field_->type()); } } break; default: return Status::NotImplemented("TimeUnit not supported"); } } break; - TRANSFER_CASE(TIME32, ::arrow::Time32Type, Int32Type) - TRANSFER_CASE(TIME64, ::arrow::Time64Type, Int64Type) default: return Status::NotImplemented("No support for reading columns of type ", field_->type()->ToString()); } + // Nest nested types + RETURN_NOT_OK(WrapIntoListArray(&result)); + DCHECK_NE(result.kind(), Datum::NONE); if (result.kind() == Datum::ARRAY) { diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 6727fe6fcd0..c44c6e68c55 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -25,6 +25,7 @@ #include "arrow/buffer.h" #include "arrow/builder.h" +#include "arrow/table.h" #include "arrow/util/bit-stream-utils.h" #include "arrow/util/compression.h" #include "arrow/util/logging.h" @@ -286,7 +287,8 @@ class ColumnReaderImplBase { num_buffered_values_(0), num_decoded_values_(0), pool_(pool), - current_decoder_(nullptr) {} + current_decoder_(nullptr), + current_encoding_(Encoding::UNKNOWN) {} virtual ~ColumnReaderImplBase() = default; @@ -409,6 +411,7 @@ class ColumnReaderImplBase { ParquetException::NYI("only plain dictionary encoding has been implemented"); } + new_dictionary_ = true; current_decoder_ = 
decoders_[encoding].get(); DCHECK(current_decoder_); } @@ -493,6 +496,7 @@ class ColumnReaderImplBase { throw ParquetException("Unknown encoding type."); } } + current_encoding_ = encoding; current_decoder_->SetData(static_cast(num_buffered_values_), buffer, static_cast(data_size)); } @@ -526,6 +530,11 @@ class ColumnReaderImplBase { using DecoderType = typename EncodingTraits::Decoder; DecoderType* current_decoder_; + Encoding::type current_encoding_; + + /// Flag to signal when a new dictionary has been set, for the benefit of + /// DictionaryRecordReader + bool new_dictionary_; // Map of encoding type to the respective decoder object. For example, a // column chunk's data pages may include both dictionary-encoded and @@ -770,7 +779,8 @@ namespace internal { constexpr int64_t kMinLevelBatchSize = 1024; template -class TypedRecordReader : public ColumnReaderImplBase, public RecordReader { +class TypedRecordReader : public ColumnReaderImplBase, + virtual public RecordReader { public: using T = typename DType::c_type; using BASE = ColumnReaderImplBase; @@ -883,9 +893,13 @@ class TypedRecordReader : public ColumnReaderImplBase, public RecordReade } std::shared_ptr ReleaseIsValid() override { - auto result = valid_bits_; - valid_bits_ = AllocateBuffer(this->pool_); - return result; + if (nullable_values_) { + auto result = valid_bits_; + valid_bits_ = AllocateBuffer(this->pool_); + return result; + } else { + return nullptr; + } } // Process written repetition/definition levels to reach the end of @@ -1113,10 +1127,6 @@ class TypedRecordReader : public ColumnReaderImplBase, public RecordReade std::cout << std::endl; } - std::vector> GetBuilderChunks() override { - throw ParquetException("GetChunks only implemented for binary types"); - } - void ResetValues() { if (values_written_ > 0) { // Resize to 0, but do not shrink to fit @@ -1137,7 +1147,8 @@ class TypedRecordReader : public ColumnReaderImplBase, public RecordReade } }; -class FLBARecordReader : public TypedRecordReader { +class FLBARecordReader : public TypedRecordReader, + virtual public BinaryRecordReader { public: FLBARecordReader(const ColumnDescriptor* descr, ::arrow::MemoryPool* pool) : TypedRecordReader(descr, pool), builder_(nullptr) { @@ -1189,22 +1200,16 @@ class FLBARecordReader : public TypedRecordReader { std::unique_ptr<::arrow::FixedSizeBinaryBuilder> builder_; }; -class ByteArrayChunkedRecordReader : public TypedRecordReader { +class ByteArrayChunkedRecordReader : public TypedRecordReader, + virtual public BinaryRecordReader { public: - using BuilderType = ::arrow::internal::ChunkedBinaryBuilder; - ByteArrayChunkedRecordReader(const ColumnDescriptor* descr, ::arrow::MemoryPool* pool) : TypedRecordReader(descr, pool), builder_(nullptr) { // ARROW-4688(wesm): Using 2^31 - 1 chunks for now constexpr int32_t kBinaryChunksize = 2147483647; DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY); - if (descr_->converted_type() == ConvertedType::UTF8) { - builder_.reset( - new ::arrow::internal::ChunkedStringBuilder(kBinaryChunksize, this->pool_)); - } else { - builder_.reset( - new ::arrow::internal::ChunkedBinaryBuilder(kBinaryChunksize, this->pool_)); - } + builder_.reset( + new ::arrow::internal::ChunkedBinaryBuilder(kBinaryChunksize, this->pool_)); } ::arrow::ArrayVector GetBuilderChunks() override { @@ -1229,39 +1234,84 @@ class ByteArrayChunkedRecordReader : public TypedRecordReader { } private: - std::unique_ptr builder_; + std::unique_ptr<::arrow::internal::ChunkedBinaryBuilder> builder_; }; -template -class 
ByteArrayDictionaryRecordReader : public TypedRecordReader { +class ByteArrayDictionaryRecordReader : public TypedRecordReader, + virtual public DictionaryRecordReader { public: ByteArrayDictionaryRecordReader(const ColumnDescriptor* descr, ::arrow::MemoryPool* pool) - : TypedRecordReader(descr, pool), builder_(new BuilderType(pool)) {} + : TypedRecordReader(descr, pool), builder_(pool) {} - ::arrow::ArrayVector GetBuilderChunks() override { - std::shared_ptr<::arrow::Array> chunk; - PARQUET_THROW_NOT_OK(builder_->Finish(&chunk)); - return ::arrow::ArrayVector({chunk}); + std::shared_ptr<::arrow::ChunkedArray> GetResult() override { + FlushBuilder(); + return std::make_shared<::arrow::ChunkedArray>(result_chunks_); + } + + void FlushBuilder() { + if (builder_.length() > 0) { + std::shared_ptr<::arrow::Array> chunk; + PARQUET_THROW_NOT_OK(builder_.Finish(&chunk)); + result_chunks_.emplace_back(std::move(chunk)); + + // Reset clears the dictionary memo table + builder_.Reset(); + } + } + + void WriteNewDictionary() { + if (this->new_dictionary_) { + auto decoder = dynamic_cast(this->current_decoder_); + decoder->InsertDictionary(&builder_); + this->new_dictionary_ = false; + } } void ReadValuesDense(int64_t values_to_read) override { - int64_t num_decoded = this->current_decoder_->DecodeArrowNonNull( - static_cast(values_to_read), builder_.get()); + int64_t num_decoded = 0; + if (current_encoding_ == Encoding::RLE_DICTIONARY) { + /// If there is a new dictionary, we may need to flush the builder, then + /// insert the new dictionary values + FlushBuilder(); + WriteNewDictionary(); + auto decoder = dynamic_cast(this->current_decoder_); + num_decoded = decoder->DecodeIndices(static_cast(values_to_read), &builder_); + } else { + num_decoded = this->current_decoder_->DecodeArrowNonNull( + static_cast(values_to_read), &builder_); + + /// Flush values since they have been copied into the builder + ResetValues(); + } DCHECK_EQ(num_decoded, values_to_read); - ResetValues(); } void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override { - int64_t num_decoded = this->current_decoder_->DecodeArrow( - static_cast(values_to_read), static_cast(null_count), - valid_bits_->mutable_data(), values_written_, builder_.get()); + int64_t num_decoded = 0; + if (current_encoding_ == Encoding::RLE_DICTIONARY) { + /// If there is a new dictionary, we may need to flush the builder, then + /// insert the new dictionary values + FlushBuilder(); + WriteNewDictionary(); + auto decoder = dynamic_cast(this->current_decoder_); + num_decoded = decoder->DecodeIndicesSpaced( + static_cast(values_to_read), static_cast(null_count), + valid_bits_->mutable_data(), values_written_, &builder_); + } else { + num_decoded = this->current_decoder_->DecodeArrow( + static_cast(values_to_read), static_cast(null_count), + valid_bits_->mutable_data(), values_written_, &builder_); + + /// Flush values since they have been copied into the builder + ResetValues(); + } DCHECK_EQ(num_decoded, values_to_read); - ResetValues(); } private: - std::unique_ptr builder_; + ::arrow::BinaryDictionaryBuilder builder_; + std::vector> result_chunks_; }; // TODO(wesm): Implement these to some satisfaction @@ -1278,13 +1328,7 @@ std::shared_ptr MakeByteArrayRecordReader(const ColumnDescriptor* arrow::MemoryPool* pool, bool read_dictionary) { if (read_dictionary) { - if (descr->converted_type() == ConvertedType::UTF8) { - using Builder = ::arrow::StringDictionaryBuilder; - return std::make_shared>(descr, pool); - } else { - using Builder = 
::arrow::BinaryDictionaryBuilder; - return std::make_shared>(descr, pool); - } + return std::make_shared(descr, pool); } else { return std::make_shared(descr, pool); } diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 461cf726733..8022be980e3 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -33,6 +33,7 @@ namespace arrow { class Array; +class ChunkedArray; namespace BitUtil { class BitReader; @@ -225,9 +226,6 @@ class RecordReader { virtual void DebugPrintState() = 0; - // For BYTE_ARRAY, FIXED_LEN_BYTE_ARRAY types that may have chunked output - virtual std::vector> GetBuilderChunks() = 0; - /// \brief Decoded definition levels int16_t* def_levels() const { return reinterpret_cast(def_levels_->mutable_data()); @@ -282,6 +280,18 @@ class RecordReader { std::shared_ptr<::arrow::ResizableBuffer> rep_levels_; }; +class BinaryRecordReader : virtual public RecordReader { + public: + virtual std::vector> GetBuilderChunks() = 0; +}; + +/// \brief Read records directly to dictionary-encoded Arrow form (int32 +/// indices). Only valid for BYTE_ARRAY columns +class DictionaryRecordReader : virtual public RecordReader { + public: + virtual std::shared_ptr<::arrow::ChunkedArray> GetResult() = 0; +}; + static inline void DefinitionLevelsToBitmap( const int16_t* def_levels, int64_t num_def_levels, const int16_t max_definition_level, const int16_t max_repetition_level, int64_t* values_read, int64_t* null_count, diff --git a/cpp/src/parquet/encoding-test.cc b/cpp/src/parquet/encoding-test.cc index cca9edd1310..4eaf232a16b 100644 --- a/cpp/src/parquet/encoding-test.cc +++ b/cpp/src/parquet/encoding-test.cc @@ -324,26 +324,11 @@ TEST(TestDictionaryEncoding, CannotDictDecodeBoolean) { // ---------------------------------------------------------------------- // Shared arrow builder decode tests -template -struct BuilderTraits {}; -template <> -struct BuilderTraits<::arrow::BinaryType> { - using DenseArrayBuilder = ::arrow::internal::ChunkedBinaryBuilder; - using DictArrayBuilder = ::arrow::BinaryDictionaryBuilder; -}; - -template <> -struct BuilderTraits<::arrow::StringType> { - using DenseArrayBuilder = ::arrow::internal::ChunkedStringBuilder; - using DictArrayBuilder = ::arrow::StringDictionaryBuilder; -}; - -template class TestArrowBuilderDecoding : public ::testing::Test { public: - using DenseBuilder = typename BuilderTraits::DenseArrayBuilder; - using DictBuilder = typename BuilderTraits::DictArrayBuilder; + using DenseBuilder = ::arrow::internal::ChunkedBinaryBuilder; + using DictBuilder = ::arrow::BinaryDictionaryBuilder; void SetUp() override { null_probabilities_ = {0.0, 0.5, 1.0}; } void TearDown() override {} @@ -359,19 +344,9 @@ class TestArrowBuilderDecoding : public ::testing::Test { constexpr int64_t min_length = 2; constexpr int64_t max_length = 10; ::arrow::random::RandomArrayGenerator rag(0); - expected_dense_ = rag.StringWithRepeats(repeat * num_unique, num_unique, min_length, + expected_dense_ = rag.BinaryWithRepeats(repeat * num_unique, num_unique, min_length, max_length, null_probability); - std::shared_ptr<::arrow::DataType> data_type = std::make_shared(); - - if (data_type->id() == ::arrow::BinaryType::type_id) { - // TODO(hatemhelal): this is a kludge. Probably best to extend the - // RandomArrayGenerator to also generate BinaryType arrays. 
- auto data = expected_dense_->data()->Copy(); - data->type = data_type; - expected_dense_ = std::make_shared<::arrow::BinaryArray>(data); - } - num_values_ = static_cast(expected_dense_->length()); null_count_ = static_cast(expected_dense_->null_count()); valid_bits_ = expected_dense_->null_bitmap()->data(); @@ -475,16 +450,7 @@ class TestArrowBuilderDecoding : public ::testing::Test { std::shared_ptr buffer_; }; -#define TEST_ARROW_BUILDER_BASE_MEMBERS() \ - using TestArrowBuilderDecoding::encoder_; \ - using TestArrowBuilderDecoding::decoder_; \ - using TestArrowBuilderDecoding::input_data_; \ - using TestArrowBuilderDecoding::valid_bits_; \ - using TestArrowBuilderDecoding::num_values_; \ - using TestArrowBuilderDecoding::buffer_ - -template -class PlainEncoding : public TestArrowBuilderDecoding { +class PlainEncoding : public TestArrowBuilderDecoding { public: void SetupEncoderDecoder() override { encoder_ = MakeTypedEncoder(Encoding::PLAIN); @@ -493,33 +459,25 @@ class PlainEncoding : public TestArrowBuilderDecoding { buffer_ = encoder_->FlushValues(); decoder_->SetData(num_values_, buffer_->data(), static_cast(buffer_->size())); } - - protected: - TEST_ARROW_BUILDER_BASE_MEMBERS(); }; -using BuilderArrayTypes = ::testing::Types<::arrow::BinaryType, ::arrow::StringType>; -// using BuilderArrayTypes = ::testing::Types<::arrow::StringType>; -TYPED_TEST_CASE(PlainEncoding, BuilderArrayTypes); - -TYPED_TEST(PlainEncoding, CheckDecodeArrowUsingDenseBuilder) { +TEST_F(PlainEncoding, CheckDecodeArrowUsingDenseBuilder) { this->CheckDecodeArrowUsingDenseBuilder(); } -TYPED_TEST(PlainEncoding, CheckDecodeArrowUsingDictBuilder) { +TEST_F(PlainEncoding, CheckDecodeArrowUsingDictBuilder) { this->CheckDecodeArrowUsingDictBuilder(); } -TYPED_TEST(PlainEncoding, CheckDecodeArrowNonNullDenseBuilder) { +TEST_F(PlainEncoding, CheckDecodeArrowNonNullDenseBuilder) { this->CheckDecodeArrowNonNullUsingDenseBuilder(); } -TYPED_TEST(PlainEncoding, CheckDecodeArrowNonNullDictBuilder) { +TEST_F(PlainEncoding, CheckDecodeArrowNonNullDictBuilder) { this->CheckDecodeArrowNonNullUsingDictBuilder(); } -template -class DictEncoding : public TestArrowBuilderDecoding { +class DictEncoding : public TestArrowBuilderDecoding { public: void SetupEncoderDecoder() override { auto node = schema::ByteArray("name"); @@ -549,28 +507,25 @@ class DictEncoding : public TestArrowBuilderDecoding { } protected: - TEST_ARROW_BUILDER_BASE_MEMBERS(); std::unique_ptr descr_; std::unique_ptr plain_decoder_; std::unique_ptr> dict_decoder_; std::shared_ptr dict_buffer_; }; -TYPED_TEST_CASE(DictEncoding, BuilderArrayTypes); - -TYPED_TEST(DictEncoding, CheckDecodeArrowUsingDenseBuilder) { +TEST_F(DictEncoding, CheckDecodeArrowUsingDenseBuilder) { this->CheckDecodeArrowUsingDenseBuilder(); } -TYPED_TEST(DictEncoding, CheckDecodeArrowUsingDictBuilder) { +TEST_F(DictEncoding, CheckDecodeArrowUsingDictBuilder) { this->CheckDecodeArrowUsingDictBuilder(); } -TYPED_TEST(DictEncoding, CheckDecodeArrowNonNullDenseBuilder) { +TEST_F(DictEncoding, CheckDecodeArrowNonNullDenseBuilder) { this->CheckDecodeArrowNonNullUsingDenseBuilder(); } -TYPED_TEST(DictEncoding, CheckDecodeArrowNonNullDictBuilder) { +TEST_F(DictEncoding, CheckDecodeArrowNonNullDictBuilder) { this->CheckDecodeArrowNonNullUsingDictBuilder(); } diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 304724b6b52..67c313b4e3c 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -706,10 +706,43 @@ class PlainByteArrayDecoder : public PlainDecoder, 
using Base::DecodeSpaced; using Base::PlainDecoder; + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + ::arrow::BinaryDictionaryBuilder* builder) override { + int result = 0; + PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, + valid_bits_offset, builder, &result)); + return result; + } + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + ::arrow::internal::ChunkedBinaryBuilder* builder) override { + int result = 0; + PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, + valid_bits_offset, builder, &result)); + return result; + } + + int DecodeArrowNonNull(int num_values, + ::arrow::BinaryDictionaryBuilder* builder) override { + int result = 0; + PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); + return result; + } + + int DecodeArrowNonNull(int num_values, + ::arrow::internal::ChunkedBinaryBuilder* builder) override { + int result = 0; + PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); + return result; + } + private: + template ::arrow::Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, WrappedBuilderInterface* builder, - int* values_decoded) override { + int64_t valid_bits_offset, BuilderType* builder, + int* values_decoded) { num_values = std::min(num_values, num_values_); builder->Reserve(num_values); ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); @@ -743,8 +776,9 @@ class PlainByteArrayDecoder : public PlainDecoder, return ::arrow::Status::OK(); } - ::arrow::Status DecodeArrowNonNull(int num_values, WrappedBuilderInterface* builder, - int* values_decoded) override { + template + ::arrow::Status DecodeArrowNonNull(int num_values, BuilderType* builder, + int* values_decoded) { num_values = std::min(num_values, num_values_); builder->Reserve(num_values); int i = 0; @@ -905,10 +939,43 @@ class DictByteArrayDecoder : public DictDecoderImpl, using BASE = DictDecoderImpl; using BASE::DictDecoderImpl; + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + ::arrow::BinaryDictionaryBuilder* builder) override { + int result = 0; + PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, + valid_bits_offset, builder, &result)); + return result; + } + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + ::arrow::internal::ChunkedBinaryBuilder* builder) override { + int result = 0; + PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, + valid_bits_offset, builder, &result)); + return result; + } + + int DecodeArrowNonNull(int num_values, + ::arrow::BinaryDictionaryBuilder* builder) override { + int result = 0; + PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); + return result; + } + + int DecodeArrowNonNull(int num_values, + ::arrow::internal::ChunkedBinaryBuilder* builder) override { + int result = 0; + PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); + return result; + } + private: + template ::arrow::Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, WrappedBuilderInterface* builder, - int* out_num_values) override { + int64_t valid_bits_offset, BuilderType* builder, + int* out_num_values) { constexpr int32_t buffer_size = 1024; int32_t indices_buffer[buffer_size]; 
builder->Reserve(num_values); @@ -960,8 +1027,9 @@ class DictByteArrayDecoder : public DictDecoderImpl, return ::arrow::Status::OK(); } - ::arrow::Status DecodeArrowNonNull(int num_values, WrappedBuilderInterface* builder, - int* out_num_values) override { + template + ::arrow::Status DecodeArrowNonNull(int num_values, BuilderType* builder, + int* out_num_values) { constexpr int32_t buffer_size = 2048; int32_t indices_buffer[buffer_size]; int values_decoded = 0; @@ -1245,5 +1313,4 @@ std::unique_ptr MakeDictDecoder(Type::type type_num, } } // namespace detail - } // namespace parquet diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index 28a9b98716f..479c60e6c1d 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -26,6 +26,17 @@ #include "parquet/platform.h" #include "parquet/types.h" +namespace arrow { + +class BinaryDictionaryBuilder; + +namespace internal { + +class ChunkedBinaryBuilder; + +} // namespace internal +} // namespace arrow + namespace parquet { class ColumnDescriptor; @@ -158,6 +169,30 @@ class DictDecoder : virtual public TypedDecoder { virtual void SetDict(TypedDecoder* dictionary) = 0; }; +class BinaryDictDecoder : virtual public DictDecoder { + public: + /// \brief Insert dictionary values into the Arrow dictionary builder's memo, + /// but do not append any indices + virtual void InsertDictionary(::arrow::BinaryDictionaryBuilder* builder) = 0; + + /// \brief Decode only dictionary indices and append to dictionary + /// builder. The builder must have had the dictionary from this decoder + /// inserted already. + /// + /// Remember to reset the builder each time the dict decoder is initialized + /// with a new dictionary page + virtual int DecodeIndicesSpaced(int num_values, int null_count, + const uint8_t* valid_bits, int64_t valid_bits_offset, + ::arrow::BinaryDictionaryBuilder* builder) = 0; + + /// \brief Decode only dictionary indices (no nulls) + /// + /// Remember to reset the builder each time the dict decoder is initialized + /// with a new dictionary page + virtual int DecodeIndices(int num_values, + ::arrow::BinaryDictionaryBuilder* builder) = 0; +}; + // ---------------------------------------------------------------------- // TypedEncoder specializations, traits, and factory functions @@ -191,60 +226,19 @@ class ByteArrayDecoder : virtual public TypedDecoder { public: using TypedDecoder::DecodeSpaced; - class WrappedBuilderInterface { - public: - virtual void Reserve(int64_t values) = 0; - virtual void Append(const uint8_t* value, uint32_t length) = 0; - virtual void AppendNull() = 0; - virtual ~WrappedBuilderInterface() = default; - }; - - template - class WrappedBuilder : public WrappedBuilderInterface { - public: - explicit WrappedBuilder(Builder* builder) : builder_(builder) {} - - void Reserve(int64_t values) override { - PARQUET_THROW_NOT_OK(builder_->Reserve(values)); - } - void Append(const uint8_t* value, uint32_t length) override { - PARQUET_THROW_NOT_OK(builder_->Append(value, length)); - } - - void AppendNull() override { PARQUET_THROW_NOT_OK(builder_->AppendNull()); } - - private: - Builder* builder_; - }; + virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + ::arrow::BinaryDictionaryBuilder* builder) = 0; - template - int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, Builder* builder) { - int result = 0; - WrappedBuilder wrapped_builder(builder); - 
PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, - valid_bits_offset, &wrapped_builder, &result)); - return result; - } - - template - int DecodeArrowNonNull(int num_values, Builder* builder) { - int result = 0; - WrappedBuilder wrapped_builder(builder); - PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, &wrapped_builder, &result)); - return result; - } + virtual int DecodeArrowNonNull(int num_values, + ::arrow::BinaryDictionaryBuilder* builder) = 0; - private: - virtual ::arrow::Status DecodeArrow(int num_values, int null_count, - const uint8_t* valid_bits, - int64_t valid_bits_offset, - WrappedBuilderInterface* builder, - int* values_decoded) = 0; + virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + ::arrow::internal::ChunkedBinaryBuilder* builder) = 0; - virtual ::arrow::Status DecodeArrowNonNull(int num_values, - WrappedBuilderInterface* builder, - int* values_decoded) = 0; + virtual int DecodeArrowNonNull(int num_values, + ::arrow::internal::ChunkedBinaryBuilder* builder) = 0; }; class FLBADecoder : virtual public TypedDecoder { diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index dfa056ebe56..0faec10cf67 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -452,7 +452,8 @@ struct Encoding { DELTA_BINARY_PACKED = 5, DELTA_LENGTH_BYTE_ARRAY = 6, DELTA_BYTE_ARRAY = 7, - RLE_DICTIONARY = 8 + RLE_DICTIONARY = 8, + UNKNOWN = 999 }; }; From 2d6558b418996bb3d7cd723d225fb546f6c08fde Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 20 Jul 2019 18:30:05 -0500 Subject: [PATCH 02/11] Add RleDecoder::GetBatchSpaced --- cpp/src/arrow/util/rle-encoding-test.cc | 71 +++++++++++++- cpp/src/arrow/util/rle-encoding.h | 97 +++++++++++++++++-- cpp/src/parquet/encoding.cc | 118 +++++++++++++++++++----- cpp/src/parquet/encoding.h | 10 +- 4 files changed, 255 insertions(+), 41 deletions(-) diff --git a/cpp/src/arrow/util/rle-encoding-test.cc b/cpp/src/arrow/util/rle-encoding-test.cc index 3d6da692de5..f049828359b 100644 --- a/cpp/src/arrow/util/rle-encoding-test.cc +++ b/cpp/src/arrow/util/rle-encoding-test.cc @@ -26,6 +26,10 @@ #include // IWYU pragma: export +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/testing/random.h" +#include "arrow/type.h" #include "arrow/util/bit-stream-utils.h" #include "arrow/util/bit-util.h" #include "arrow/util/rle-encoding.h" @@ -242,12 +246,13 @@ bool CheckRoundTrip(const std::vector& values, int bit_width) { // Verify batch read { - RleDecoder decoder(buffer, len, bit_width); + RleDecoder decoder(buffer, encoded_len, bit_width); std::vector values_read(values.size()); if (static_cast(values.size()) != decoder.GetBatch(values_read.data(), static_cast(values.size()))) { return false; } + if (values != values_read) { return false; } @@ -464,5 +469,69 @@ TEST(BitRle, Overflow) { } } +template +void CheckRoundTripSpaced(const Array& data, int bit_width) { + using ArrayType = typename TypeTraits::ArrayType; + using T = typename Type::c_type; + + int num_values = static_cast(data.length()); + int buffer_size = RleEncoder::MaxBufferSize(bit_width, num_values); + + const T* values = static_cast(data).raw_values(); + + std::vector buffer(buffer_size); + RleEncoder encoder(buffer.data(), buffer_size, bit_width); + for (int i = 0; i < num_values; ++i) { + if (data.IsValid(i)) { + if (!encoder.Put(static_cast(values[i]))) { + FAIL() << "Encoding failed"; + } + } + } + int encoded_size = encoder.Flush(); + + // Verify batch read + RleDecoder 
decoder(buffer.data(), encoded_size, bit_width); + std::vector values_read(num_values); + + if (num_values != decoder.GetBatchSpaced(num_values, data.null_count(), + data.null_bitmap_data(), data.offset(), + values_read.data())) { + FAIL(); + } + + for (int64_t i = 0; i < num_values; ++i) { + if (data.IsValid(i)) { + if (values_read[i] != values[i]) { + FAIL() << "Index " << i << " read " << values_read[i] << " but should be " + << values[i]; + } + } + } +} + +template +struct GetBatchSpacedTestCase { + T max_value; + int64_t size; + double null_probability; + int bit_width; +}; + +TEST(RleDecoder, GetBatchSpaced) { + uint32_t kSeed = 1337; + ::arrow::random::RandomArrayGenerator rand(kSeed); + + std::vector> int32_cases{ + {1, 100000, 0.01, 1}, {1, 100000, 0.1, 1}, {1, 100000, 0.5, 1}, + {4, 100000, 0.05, 3}, {100, 100000, 0.05, 7}, + }; + for (auto case_ : int32_cases) { + auto arr = rand.Int32(case_.size, /*min=*/0, case_.max_value, case_.null_probability); + CheckRoundTripSpaced(*arr, case_.bit_width); + CheckRoundTripSpaced(*arr->Slice(1), case_.bit_width); + } +} + } // namespace util } // namespace arrow diff --git a/cpp/src/arrow/util/rle-encoding.h b/cpp/src/arrow/util/rle-encoding.h index 739158a59a1..4b5da6fae4d 100644 --- a/cpp/src/arrow/util/rle-encoding.h +++ b/cpp/src/arrow/util/rle-encoding.h @@ -116,6 +116,11 @@ class RleDecoder { template int GetBatch(T* values, int batch_size); + /// Like GetBatch but add spacing for null entries + template + int GetBatchSpaced(int batch_size, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, T* out); + /// Like GetBatch but the values are then decoded using the provided dictionary template int GetBatchWithDict(const T* dictionary, T* values, int batch_size); @@ -310,6 +315,82 @@ inline int RleDecoder::GetBatch(T* values, int batch_size) { return values_read; } +template +inline int RleDecoder::GetBatchSpaced(int batch_size, int null_count, + const uint8_t* valid_bits, + int64_t valid_bits_offset, T* out) { + DCHECK_GE(bit_width_, 0); + int values_read = 0; + int remaining_nulls = null_count; + + arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, batch_size); + + while (values_read < batch_size) { + bool is_valid = bit_reader.IsSet(); + bit_reader.Next(); + + if (is_valid) { + if ((repeat_count_ == 0) && (literal_count_ == 0)) { + if (!NextCounts()) return values_read; + } + if (repeat_count_ > 0) { + // The current index is already valid, we don't need to check that again + int repeat_batch = 1; + repeat_count_--; + + while (repeat_count_ > 0 && (values_read + repeat_batch) < batch_size) { + if (bit_reader.IsSet()) { + repeat_count_--; + } else { + remaining_nulls--; + } + repeat_batch++; + + bit_reader.Next(); + } + std::fill(out, out + repeat_batch, current_value_); + out += repeat_batch; + values_read += repeat_batch; + } else if (literal_count_ > 0) { + int literal_batch = std::min(batch_size - values_read - remaining_nulls, + static_cast(literal_count_)); + + // Decode the literals + constexpr int kBufferSize = 1024; + T indices[kBufferSize]; + literal_batch = std::min(literal_batch, kBufferSize); + int actual_read = bit_reader_.GetBatch(bit_width_, &indices[0], literal_batch); + DCHECK_EQ(actual_read, literal_batch); + + int skipped = 0; + int literals_read = 1; + *out++ = indices[0]; + + // Read the first bitset to the end + while (literals_read < literal_batch) { + if (bit_reader.IsSet()) { + *out = indices[literals_read]; + literals_read++; + } else { + skipped++; + } + ++out; + 
bit_reader.Next(); + } + literal_count_ -= literal_batch; + values_read += literal_batch + skipped; + remaining_nulls -= skipped; + } + } else { + ++out; + values_read++; + remaining_nulls--; + } + } + + return values_read; +} + template inline int RleDecoder::GetBatchWithDict(const T* dictionary, T* values, int batch_size) { DCHECK_GE(bit_width_, 0); @@ -346,9 +427,8 @@ inline int RleDecoder::GetBatchWithDict(const T* dictionary, T* values, int batc } template -inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, T* values, - int batch_size, int null_count, - const uint8_t* valid_bits, +inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, T* out, int batch_size, + int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset) { DCHECK_GE(bit_width_, 0); int values_read = 0; @@ -380,7 +460,8 @@ inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, T* values, bit_reader.Next(); } - std::fill(values + values_read, values + values_read + repeat_batch, value); + std::fill(out, out + repeat_batch, value); + out += repeat_batch; values_read += repeat_batch; } else if (literal_count_ > 0) { int literal_batch = std::min(batch_size - values_read - remaining_nulls, @@ -395,18 +476,17 @@ inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, T* values, int skipped = 0; int literals_read = 1; - values[values_read] = dictionary[indices[0]]; + *out++ = dictionary[indices[0]]; // Read the first bitset to the end while (literals_read < literal_batch) { if (bit_reader.IsSet()) { - values[values_read + literals_read + skipped] = - dictionary[indices[literals_read]]; + *out = dictionary[indices[literals_read]]; literals_read++; } else { skipped++; } - + ++out; bit_reader.Next(); } literal_count_ -= literal_batch; @@ -414,6 +494,7 @@ inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, T* values, remaining_nulls -= skipped; } } else { + ++out; values_read++; remaining_nulls--; } diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 67c313b4e3c..6e27b36d095 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -25,6 +25,7 @@ #include #include "arrow/util/bit-stream-utils.h" +#include "arrow/util/checked_cast.h" #include "arrow/util/hashing.h" #include "arrow/util/logging.h" #include "arrow/util/rle-encoding.h" @@ -826,7 +827,10 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) : DecoderImpl(descr, Encoding::RLE_DICTIONARY), dictionary_(AllocateBuffer(pool, 0)), - byte_array_data_(AllocateBuffer(pool, 0)) {} + dictionary_length_(0), + byte_array_data_(AllocateBuffer(pool, 0)), + byte_array_offsets_(AllocateBuffer(pool, 0)), + indices_scratch_space_(AllocateBuffer(pool, 0)) {} // Perform type-specific initiatialization void SetDict(TypedDecoder* dictionary) override; @@ -840,92 +844,141 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { idx_decoder_ = ::arrow::util::RleDecoder(data, len, bit_width); } - int Decode(T* buffer, int max_values) override { - max_values = std::min(max_values, num_values_); + int Decode(T* buffer, int num_values) override { + num_values = std::min(num_values, num_values_); int decoded_values = idx_decoder_.GetBatchWithDict( - reinterpret_cast(dictionary_->data()), buffer, max_values); - if (decoded_values != max_values) { + reinterpret_cast(dictionary_->data()), buffer, num_values); + if (decoded_values != num_values) { ParquetException::EofException(); } - 
num_values_ -= max_values; - return max_values; + num_values_ -= num_values; + return num_values; } int DecodeSpaced(T* buffer, int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset) override { + num_values = std::min(num_values, num_values_); int decoded_values = idx_decoder_.GetBatchWithDictSpaced( reinterpret_cast(dictionary_->data()), buffer, num_values, null_count, valid_bits, valid_bits_offset); if (decoded_values != num_values) { ParquetException::EofException(); } + num_values_ -= num_values return decoded_values; + } + + void InsertDictionary(::arrow::ArrayBuilder* builder) override; + + int DecodeIndicesSpaced(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + ::arrow::DictionaryBuilder* builder) override { + num_values = std::min(num_values, num_values_); + int decoded_values = idx_decoder_.GetIndicesSpaced(buffer, num_values, null_count, + valid_bits, valid_bits_offset); + if (decoded_values != num_values) { + ParquetException::EofException(); + } + num_values_ -= num_values; return decoded_values; } + int DecodeIndices(int num_values, ::arrow::DictionaryBuilder* builder) override { + num_values = std::min(num_values, num_values_); + int decoded_values = idx_decoder_.GetIndices(buffer, num_values); + if (decoded_values != num_values) { + ParquetException::EofException(); + } + num_values_ -= num_values; + return num_values; + } + protected: inline void DecodeDict(TypedDecoder* dictionary) { - int num_dictionary_values = dictionary->values_left(); - PARQUET_THROW_NOT_OK(dictionary_->Resize(num_dictionary_values * sizeof(T))); + dictionary_length_ = static_cast(dictionary->values_left()); + PARQUET_THROW_NOT_OK(dictionary_->Resize(dictionary_length_ * sizeof(T), + /*shrink_to_fit=*/false)); dictionary->Decode(reinterpret_cast(dictionary_->mutable_data()), - num_dictionary_values); + dictionary_length_); } // Only one is set. std::shared_ptr dictionary_; + int32_t dictionary_length_; + // Data that contains the byte array data (byte_array_dictionary_ just has the // pointers). std::shared_ptr byte_array_data_; + // Arrow-style byte offsets for each dictionary value. We maintain two + // representations of the dictionary, one as ByteArray* for non-Arrow + // consumers and this one for Arrow conumers. 
Since dictionaries are + // generally pretty small to begin with this doesn't mean too much extra + // memory use in most cases + std::shared_ptr byte_array_offsets_; + + // Reusable buffer for decoding dictionary indices to be appended to a + // BinaryDictionaryBuilder + std::shared_ptr indices_scratch_space_; + ::arrow::util::RleDecoder idx_decoder_; }; template -inline void DictDecoderImpl::SetDict(TypedDecoder* dictionary) { +void DictDecoderImpl::SetDict(TypedDecoder* dictionary) { + DecodeDict(dictionary); +} + +void DictDecoderImpl::SetDict(TypedDecoder* dictionary) { DecodeDict(dictionary); } template <> -inline void DictDecoderImpl::SetDict(TypedDecoder* dictionary) { +void DictDecoderImpl::SetDict(TypedDecoder* dictionary) { ParquetException::NYI("Dictionary encoding is not implemented for boolean values"); } template <> -inline void DictDecoderImpl::SetDict( - TypedDecoder* dictionary) { - int num_dictionary_values = dictionary->values_left(); +void DictDecoderImpl::SetDict(TypedDecoder* dictionary) { DecodeDict(dictionary); auto dict_values = reinterpret_cast(dictionary_->mutable_data()); int total_size = 0; - for (int i = 0; i < num_dictionary_values; ++i) { + for (int i = 0; i < dictionary_length_; ++i) { total_size += dict_values[i].len; } if (total_size > 0) { - PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, false)); + PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, + /*shrink_to_fit=*/false)); + PARQUET_THROW_NOT_OK( + byte_array_offsets_->Resize((num_dictionary_values + 1) * sizeof(int32_t), + /*shrink_to_fit=*/false)); } - int offset = 0; + int32_t offset = 0; uint8_t* bytes_data = byte_array_data_->mutable_data(); - for (int i = 0; i < num_dictionary_values; ++i) { + int32_t* bytes_offsets = byte_array_offsets_->mutable_data(); + for (int i = 0; i < dictionary_length_; ++i) { memcpy(bytes_data + offset, dict_values[i].ptr, dict_values[i].len); + bytes_offsets[i] = offset; dict_values[i].ptr = bytes_data + offset; offset += dict_values[i].len; } + bytes_offsets[num_dictionary_values] = offset; } template <> inline void DictDecoderImpl::SetDict(TypedDecoder* dictionary) { - int num_dictionary_values = dictionary->values_left(); DecodeDict(dictionary); auto dict_values = reinterpret_cast(dictionary_->mutable_data()); int fixed_len = descr_->type_length(); - int total_size = num_dictionary_values * fixed_len; + int total_size = dictionary_length_ * fixed_len; - PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, false)); + PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, + /*shrink_to_fit=*/false)); uint8_t* bytes_data = byte_array_data_->mutable_data(); for (int32_t i = 0, offset = 0; i < num_dictionary_values; ++i, offset += fixed_len) { memcpy(bytes_data + offset, dict_values[i].ptr, fixed_len); @@ -933,11 +986,26 @@ inline void DictDecoderImpl::SetDict(TypedDecoder* dictionar } } -class DictByteArrayDecoder : public DictDecoderImpl, - virtual public ByteArrayDecoder { +template +void DictDecoderImpl::InsertDictionary(::arrow::ArrayBuilder* builder) { + ParquetException::NYI("InsertDictionary only implemented for BYTE_ARRAY types"); +} + +template <> +void DictDecoderImpl::InsertDictionary(::arrow::ArrayBuilder* builder) { + auto binary_builder = + ::arrow::util::checked_cast<::arrow::BinaryDictionaryBuilder*>(builder); + + // Make an BinaryArray referencing the internal dictionary data + auto arr = std::make_shared<::arrow::BinaryArray>( + dictionary_length_, byte_array_offsets_, byte_array_data_); + 
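+  // Editorial note (not in the patch): because the BinaryArray above is a
+  // zero-copy view over byte_array_offsets_ and byte_array_data_, the call
+  // below memoizes the whole dictionary in one shot instead of appending
+  // value by value.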
PARQUET_THROW_NOT_OK(binary_builder->InsertMemoValues(*arr)); +} + +class DictByteArrayDecoderImpl : public DictDecoderImpl, + virtual public DictByteArrayDecoder { public: using BASE = DictDecoderImpl; - using BASE::DictDecoderImpl; int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index 479c60e6c1d..ecc648aa08c 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -167,13 +167,10 @@ template class DictDecoder : virtual public TypedDecoder { public: virtual void SetDict(TypedDecoder* dictionary) = 0; -}; -class BinaryDictDecoder : virtual public DictDecoder { - public: /// \brief Insert dictionary values into the Arrow dictionary builder's memo, /// but do not append any indices - virtual void InsertDictionary(::arrow::BinaryDictionaryBuilder* builder) = 0; + virtual void InsertDictionary(::arrow::ArrayBuilder* builder) = 0; /// \brief Decode only dictionary indices and append to dictionary /// builder. The builder must have had the dictionary from this decoder @@ -183,14 +180,13 @@ class BinaryDictDecoder : virtual public DictDecoder { /// with a new dictionary page virtual int DecodeIndicesSpaced(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - ::arrow::BinaryDictionaryBuilder* builder) = 0; + ::arrow::DictionaryBuilder* builder) = 0; /// \brief Decode only dictionary indices (no nulls) /// /// Remember to reset the builder each time the dict decoder is initialized /// with a new dictionary page - virtual int DecodeIndices(int num_values, - ::arrow::BinaryDictionaryBuilder* builder) = 0; + virtual int DecodeIndices(int num_values, ::arrow::DictionaryBuilder* builder) = 0; }; // ---------------------------------------------------------------------- From 418ffe1952186f05bd092a2c2383b1442f0e9980 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 21 Jul 2019 09:11:38 -0500 Subject: [PATCH 03/11] Closer to working decoder implementation [skip ci] --- cpp/src/parquet/encoding.cc | 29 ++++++++++++++++++----------- cpp/src/parquet/encoding.h | 5 +++-- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 6e27b36d095..ecc00b6a16e 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -858,13 +858,12 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { int DecodeSpaced(T* buffer, int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset) override { num_values = std::min(num_values, num_values_); - int decoded_values = idx_decoder_.GetBatchWithDictSpaced( - reinterpret_cast(dictionary_->data()), buffer, num_values, null_count, - valid_bits, valid_bits_offset); - if (decoded_values != num_values) { + if (num_values != idx_decoder_.GetBatchWithDictSpaced( + reinterpret_cast(dictionary_->data()), buffer, + num_values, null_count, valid_bits, valid_bits_offset)) { ParquetException::EofException(); } - num_values_ -= num_values return decoded_values; + num_values_ -= num_values return num_values; } void InsertDictionary(::arrow::ArrayBuilder* builder) override; @@ -873,19 +872,27 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { int64_t valid_bits_offset, ::arrow::DictionaryBuilder* builder) override { num_values = std::min(num_values, num_values_); - int decoded_values = idx_decoder_.GetIndicesSpaced(buffer, num_values, null_count, - valid_bits, 
valid_bits_offset); - if (decoded_values != num_values) { + if (num_values > 0) { + PARQUET_THROW_NOT_OK(indices_scratch_space_->Resize(num_values * sizeof(int16_t), + /*shrink_to_fit=*/false)) + } + if (num_values != + idx_decoder_.GetIndicesSpaced(indices_scratch_space_->mutable_data(), num_values, + null_count, valid_bits, valid_bits_offset)) { ParquetException::EofException(); } num_values_ -= num_values; - return decoded_values; + return num_values; } int DecodeIndices(int num_values, ::arrow::DictionaryBuilder* builder) override { num_values = std::min(num_values, num_values_); - int decoded_values = idx_decoder_.GetIndices(buffer, num_values); - if (decoded_values != num_values) { + if (num_values > 0) { + PARQUET_THROW_NOT_OK(indices_scratch_space_->Resize(num_values * sizeof(int16_t), + /*shrink_to_fit=*/false)) + } + if (num_values != + idx_decoder_.GetBatch(indices_scratch_space_->mutable_data(), num_values)) { ParquetException::EofException(); } num_values_ -= num_values; diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index ecc648aa08c..4918a1362dd 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -28,6 +28,7 @@ namespace arrow { +class ArrayBuilder; class BinaryDictionaryBuilder; namespace internal { @@ -180,13 +181,13 @@ class DictDecoder : virtual public TypedDecoder { /// with a new dictionary page virtual int DecodeIndicesSpaced(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - ::arrow::DictionaryBuilder* builder) = 0; + ::arrow::ArrayBuilder* builder) = 0; /// \brief Decode only dictionary indices (no nulls) /// /// Remember to reset the builder each time the dict decoder is initialized /// with a new dictionary page - virtual int DecodeIndices(int num_values, ::arrow::DictionaryBuilder* builder) = 0; + virtual int DecodeIndices(int num_values, ::arrow::ArrayBuilder* builder) = 0; }; // ---------------------------------------------------------------------- From ad253ed1accd1c218f9dbf8cb6d0371e2b1f336f Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 25 Jul 2019 10:11:15 -0500 Subject: [PATCH 04/11] More refactoring [skip ci] --- cpp/src/parquet/column_reader.cc | 10 ++++-- cpp/src/parquet/encoding-test.cc | 26 ++++++++++----- cpp/src/parquet/encoding.cc | 55 +++++++++++++++++++------------- 3 files changed, 58 insertions(+), 33 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index c44c6e68c55..b50251c873e 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -27,6 +27,7 @@ #include "arrow/builder.h" #include "arrow/table.h" #include "arrow/util/bit-stream-utils.h" +#include "arrow/util/checked_cast.h" #include "arrow/util/compression.h" #include "arrow/util/logging.h" #include "arrow/util/rle-encoding.h" @@ -39,6 +40,7 @@ #include "parquet/thrift.h" using arrow::MemoryPool; +using arrow::internal::checked_cast; namespace parquet { @@ -1262,7 +1264,7 @@ class ByteArrayDictionaryRecordReader : public TypedRecordReader, void WriteNewDictionary() { if (this->new_dictionary_) { - auto decoder = dynamic_cast(this->current_decoder_); + auto decoder = checked_cast(this->current_decoder_); decoder->InsertDictionary(&builder_); this->new_dictionary_ = false; } @@ -1275,7 +1277,7 @@ class ByteArrayDictionaryRecordReader : public TypedRecordReader, /// insert the new dictionary values FlushBuilder(); WriteNewDictionary(); - auto decoder = dynamic_cast(this->current_decoder_); + auto decoder = 
checked_cast(this->current_decoder_); num_decoded = decoder->DecodeIndices(static_cast(values_to_read), &builder_); } else { num_decoded = this->current_decoder_->DecodeArrowNonNull( @@ -1294,7 +1296,7 @@ class ByteArrayDictionaryRecordReader : public TypedRecordReader, /// insert the new dictionary values FlushBuilder(); WriteNewDictionary(); - auto decoder = dynamic_cast(this->current_decoder_); + auto decoder = checked_cast(this->current_decoder_); num_decoded = decoder->DecodeIndicesSpaced( static_cast(values_to_read), static_cast(null_count), valid_bits_->mutable_data(), values_written_, &builder_); @@ -1310,6 +1312,8 @@ class ByteArrayDictionaryRecordReader : public TypedRecordReader, } private: + using BinaryDictDecoder = DictDecoder; + ::arrow::BinaryDictionaryBuilder builder_; std::vector> result_chunks_; }; diff --git a/cpp/src/parquet/encoding-test.cc b/cpp/src/parquet/encoding-test.cc index 4eaf232a16b..6340510d2b7 100644 --- a/cpp/src/parquet/encoding-test.cc +++ b/cpp/src/parquet/encoding-test.cc @@ -340,7 +340,7 @@ class TestArrowBuilderDecoding : public ::testing::Test { void GenerateInputData(double null_probability) { constexpr int num_unique = 100; - constexpr int repeat = 10; + constexpr int repeat = 100; constexpr int64_t min_length = 2; constexpr int64_t max_length = 10; ::arrow::random::RandomArrayGenerator rag(0); @@ -446,7 +446,9 @@ class TestArrowBuilderDecoding : public ::testing::Test { std::vector input_data_; const uint8_t* valid_bits_; std::unique_ptr encoder_; - std::unique_ptr decoder_; + ByteArrayDecoder* decoder_; + std::unique_ptr plain_decoder_; + std::unique_ptr> dict_decoder_; std::shared_ptr buffer_; }; @@ -454,7 +456,8 @@ class PlainEncoding : public TestArrowBuilderDecoding { public: void SetupEncoderDecoder() override { encoder_ = MakeTypedEncoder(Encoding::PLAIN); - decoder_ = MakeTypedDecoder(Encoding::PLAIN); + plain_decoder_ = MakeTypedDecoder(Encoding::PLAIN); + decoder_ = plain_decoder_.get(); ASSERT_NO_THROW(encoder_->PutSpaced(input_data_.data(), num_values_, valid_bits_, 0)); buffer_ = encoder_->FlushValues(); decoder_->SetData(num_values_, buffer_->data(), static_cast(buffer_->size())); @@ -502,14 +505,11 @@ class DictEncoding : public TestArrowBuilderDecoding { dict_decoder_->SetDict(plain_decoder_.get()); dict_decoder_->SetData(num_values_, buffer_->data(), static_cast(buffer_->size())); - decoder_ = std::unique_ptr( - dynamic_cast(dict_decoder_.release())); + decoder_ = dynamic_cast(dict_decoder_.get()); } protected: std::unique_ptr descr_; - std::unique_ptr plain_decoder_; - std::unique_ptr> dict_decoder_; std::shared_ptr dict_buffer_; }; @@ -529,6 +529,16 @@ TEST_F(DictEncoding, CheckDecodeArrowNonNullDictBuilder) { this->CheckDecodeArrowNonNullUsingDictBuilder(); } -} // namespace test +TEST_F(DictEncoding, CheckDecodeIndicesSpaced) { + for (auto np : null_probabilities_) { + InitTestCase(np); + auto builder = CreateDictBuilder(); + dict_decoder_->InsertDictionary(builder.get()); + auto actual_num_values = dict_decoder_->DecodeIndicesSpaced( + num_values_, null_count_, valid_bits_, 0, builder.get()); + CheckDict(actual_num_values, *builder); + } +} +} // namespace test } // namespace parquet diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index ecc00b6a16e..98435755b11 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -37,6 +37,8 @@ #include "parquet/schema.h" #include "parquet/types.h" +using arrow::internal::checked_cast; + namespace parquet { constexpr int64_t 
kInMemoryDefaultCapacity = 1024; @@ -829,8 +831,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { dictionary_(AllocateBuffer(pool, 0)), dictionary_length_(0), byte_array_data_(AllocateBuffer(pool, 0)), - byte_array_offsets_(AllocateBuffer(pool, 0)), - indices_scratch_space_(AllocateBuffer(pool, 0)) {} + byte_array_offsets_(AllocateBuffer(pool, 0)) {} // Perform type-specific initiatialization void SetDict(TypedDecoder* dictionary) override; @@ -863,38 +864,53 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { num_values, null_count, valid_bits, valid_bits_offset)) { ParquetException::EofException(); } - num_values_ -= num_values return num_values; + num_values_ -= num_values; + return num_values; } void InsertDictionary(::arrow::ArrayBuilder* builder) override; int DecodeIndicesSpaced(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - ::arrow::DictionaryBuilder* builder) override { + ::arrow::ArrayBuilder* builder) override { + auto binary_builder = checked_cast<::arrow::BinaryDictionaryBuilder*>(builder); + num_values = std::min(num_values, num_values_); - if (num_values > 0) { - PARQUET_THROW_NOT_OK(indices_scratch_space_->Resize(num_values * sizeof(int16_t), - /*shrink_to_fit=*/false)) - } - if (num_values != - idx_decoder_.GetIndicesSpaced(indices_scratch_space_->mutable_data(), num_values, - null_count, valid_bits, valid_bits_offset)) { + constexpr int64_t kBufferSize = 2048; + int64_t indices_buffer[kBufferSize]; + + if (num_values != idx_decoder_.GetBatchSpaced(indices_scratch_space_->mutable_data(), + num_values, null_count, valid_bits, + valid_bits_offset)) { ParquetException::EofException(); } + + /// XXX(wesm): Cannot append "valid bits" directly to the builder + std::vector valid_bytes(num_values); + ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); + for (int64_t i = 0; i < num_values; ++i) { + valid_bytes[i] = static_cast(bit_reader.IsSet()); + bit_reader.Next(); + } + + PARQUET_THROW_NOT_OK(binary_builder->AppendIndices(indices_scratch_space_.data(), + num_values, valid_bytes.data())); num_values_ -= num_values; return num_values; } - int DecodeIndices(int num_values, ::arrow::DictionaryBuilder* builder) override { + int DecodeIndices(int num_values, ::arrow::ArrayBuilder* builder) override { num_values = std::min(num_values, num_values_); - if (num_values > 0) { - PARQUET_THROW_NOT_OK(indices_scratch_space_->Resize(num_values * sizeof(int16_t), - /*shrink_to_fit=*/false)) - } + constexpr int64_t kBufferSize = 2048; + int64_t indices_buffer[kBufferSize]; + if (num_values != idx_decoder_.GetBatch(indices_scratch_space_->mutable_data(), num_values)) { ParquetException::EofException(); } + auto binary_builder = checked_cast<::arrow::BinaryDictionaryBuilder*>(builder); + PARQUET_THROW_NOT_OK( + binary_builder->AppendIndices(indices_scratch_space_.data(), num_values)); num_values_ -= num_values; return num_values; } @@ -924,10 +940,6 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { // memory use in most cases std::shared_ptr byte_array_offsets_; - // Reusable buffer for decoding dictionary indices to be appended to a - // BinaryDictionaryBuilder - std::shared_ptr indices_scratch_space_; - ::arrow::util::RleDecoder idx_decoder_; }; @@ -1000,8 +1012,7 @@ void DictDecoderImpl::InsertDictionary(::arrow::ArrayBuilder* builder) { template <> void DictDecoderImpl::InsertDictionary(::arrow::ArrayBuilder* builder) { - auto binary_builder 
= - ::arrow::util::checked_cast<::arrow::BinaryDictionaryBuilder*>(builder); + auto binary_builder = checked_cast<::arrow::BinaryDictionaryBuilder*>(builder); // Make an BinaryArray referencing the internal dictionary data auto arr = std::make_shared<::arrow::BinaryArray>( From 246e11cfd6424db6dd5260b56380e3dfd8ebd570 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 25 Jul 2019 14:57:29 -0500 Subject: [PATCH 05/11] Compiling again, but segfaulting [skip ci] --- cpp/src/arrow/array/builder_dict.h | 2 + cpp/src/arrow/util/rle-encoding.h | 4 +- cpp/src/parquet/column_reader.cc | 2 +- cpp/src/parquet/encoding.cc | 68 ++++++++++++++++++------------ 4 files changed, 45 insertions(+), 31 deletions(-) diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h index a932fa3f1be..8b70e60af7d 100644 --- a/cpp/src/arrow/array/builder_dict.h +++ b/cpp/src/arrow/array/builder_dict.h @@ -367,6 +367,7 @@ class DictionaryBuilder : public ArrayBuilder { class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder { public: using DictionaryBuilder::Append; + using DictionaryBuilder::AppendIndices; using DictionaryBuilder::DictionaryBuilder; BinaryDictionaryBuilder() : BinaryDictionaryBuilder(default_memory_pool()) {} @@ -384,6 +385,7 @@ class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder { public: using DictionaryBuilder::Append; + using DictionaryBuilder::AppendIndices; using DictionaryBuilder::DictionaryBuilder; StringDictionaryBuilder() : StringDictionaryBuilder(default_memory_pool()) {} diff --git a/cpp/src/arrow/util/rle-encoding.h b/cpp/src/arrow/util/rle-encoding.h index 4b5da6fae4d..e2210686b55 100644 --- a/cpp/src/arrow/util/rle-encoding.h +++ b/cpp/src/arrow/util/rle-encoding.h @@ -114,7 +114,7 @@ class RleDecoder { /// Gets a batch of values. Returns the number of decoded elements. 
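  /// Editorial note (not in the patch): after the rename below, the batch
  /// size comes first, matching GetBatchSpaced. A hedged call sketch:
  ///
  ///   int32_t out[64];
  ///   int n = decoder.GetBatch(/*batch_size=*/64, out);
  ///
  /// For GetBatchSpaced, each null position consumes one output slot but no
  /// encoded value, so only (batch_size - null_count) values are pulled from
  /// the RLE stream.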
template - int GetBatch(T* values, int batch_size); + int GetBatch(int batch_size, T* values); /// Like GetBatch but add spacing for null entries template @@ -287,7 +287,7 @@ inline bool RleDecoder::Get(T* val) { } template -inline int RleDecoder::GetBatch(T* values, int batch_size) { +inline int RleDecoder::GetBatch(int batch_size, T* values) { DCHECK_GE(bit_width_, 0); int values_read = 0; diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index b50251c873e..d919c4fdd0c 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -87,7 +87,7 @@ int LevelDecoder::Decode(int batch_size, int16_t* levels) { int num_values = std::min(num_values_remaining_, batch_size); if (encoding_ == Encoding::RLE) { - num_decoded = rle_decoder_->GetBatch(levels, num_values); + num_decoded = rle_decoder_->GetBatch(num_values, levels); } else { num_decoded = bit_packed_decoder_->GetBatch(bit_width_, levels, num_values); } diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 98435755b11..1f865e24cf6 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -873,15 +873,19 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { int DecodeIndicesSpaced(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, ::arrow::ArrayBuilder* builder) override { - auto binary_builder = checked_cast<::arrow::BinaryDictionaryBuilder*>(builder); - num_values = std::min(num_values, num_values_); - constexpr int64_t kBufferSize = 2048; - int64_t indices_buffer[kBufferSize]; + if (num_values > 0) { + // TODO(wesm): Refactor to batch reads for improved memory use. It is not + // trivial because the null_count is relative to the entire bitmap + PARQUET_THROW_NOT_OK(indices_scratch_space_->Resize(num_values * sizeof(int16_t), + /*shrink_to_fit=*/false)); + } + + auto indices_buffer = + reinterpret_cast(indices_scratch_space_->mutable_data()); - if (num_values != idx_decoder_.GetBatchSpaced(indices_scratch_space_->mutable_data(), - num_values, null_count, valid_bits, - valid_bits_offset)) { + if (num_values != idx_decoder_.GetBatchSpaced(num_values, null_count, valid_bits, + valid_bits_offset, indices_buffer)) { ParquetException::EofException(); } @@ -893,24 +897,30 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { bit_reader.Next(); } - PARQUET_THROW_NOT_OK(binary_builder->AppendIndices(indices_scratch_space_.data(), - num_values, valid_bytes.data())); + auto binary_builder = checked_cast<::arrow::BinaryDictionaryBuilder*>(builder); + PARQUET_THROW_NOT_OK( + binary_builder->AppendIndices(indices_buffer, num_values, valid_bytes.data())); num_values_ -= num_values; return num_values; } int DecodeIndices(int num_values, ::arrow::ArrayBuilder* builder) override { num_values = std::min(num_values, num_values_); - constexpr int64_t kBufferSize = 2048; - int64_t indices_buffer[kBufferSize]; - - if (num_values != - idx_decoder_.GetBatch(indices_scratch_space_->mutable_data(), num_values)) { + num_values = std::min(num_values, num_values_); + if (num_values > 0) { + // TODO(wesm): Refactor to batch reads for improved memory use. 
This is + // relatively simple here because we don't have to do any bookkeeping of + // nulls + PARQUET_THROW_NOT_OK(indices_scratch_space_->Resize(num_values * sizeof(int16_t), + /*shrink_to_fit=*/false)); + } + auto indices_buffer = + reinterpret_cast(indices_scratch_space_->mutable_data()); + if (num_values != idx_decoder_.GetBatch(num_values, indices_buffer)) { ParquetException::EofException(); } auto binary_builder = checked_cast<::arrow::BinaryDictionaryBuilder*>(builder); - PARQUET_THROW_NOT_OK( - binary_builder->AppendIndices(indices_scratch_space_.data(), num_values)); + PARQUET_THROW_NOT_OK(binary_builder->AppendIndices(indices_buffer, num_values)); num_values_ -= num_values; return num_values; } @@ -940,6 +950,10 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { // memory use in most cases std::shared_ptr byte_array_offsets_; + // Reusable buffer for decoding dictionary indices to be appended to a + // BinaryDictionaryBuilder + std::shared_ptr indices_scratch_space_; + ::arrow::util::RleDecoder idx_decoder_; }; @@ -948,10 +962,6 @@ void DictDecoderImpl::SetDict(TypedDecoder* dictionary) { DecodeDict(dictionary); } -void DictDecoderImpl::SetDict(TypedDecoder* dictionary) { - DecodeDict(dictionary); -} - template <> void DictDecoderImpl::SetDict(TypedDecoder* dictionary) { ParquetException::NYI("Dictionary encoding is not implemented for boolean values"); @@ -971,20 +981,21 @@ void DictDecoderImpl::SetDict(TypedDecoder* dictio PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, /*shrink_to_fit=*/false)); PARQUET_THROW_NOT_OK( - byte_array_offsets_->Resize((num_dictionary_values + 1) * sizeof(int32_t), + byte_array_offsets_->Resize((dictionary_length_ + 1) * sizeof(int32_t), /*shrink_to_fit=*/false)); } int32_t offset = 0; uint8_t* bytes_data = byte_array_data_->mutable_data(); - int32_t* bytes_offsets = byte_array_offsets_->mutable_data(); + int32_t* bytes_offsets = + reinterpret_cast(byte_array_offsets_->mutable_data()); for (int i = 0; i < dictionary_length_; ++i) { memcpy(bytes_data + offset, dict_values[i].ptr, dict_values[i].len); bytes_offsets[i] = offset; dict_values[i].ptr = bytes_data + offset; offset += dict_values[i].len; } - bytes_offsets[num_dictionary_values] = offset; + bytes_offsets[dictionary_length_] = offset; } template <> @@ -999,7 +1010,7 @@ inline void DictDecoderImpl::SetDict(TypedDecoder* dictionar PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size, /*shrink_to_fit=*/false)); uint8_t* bytes_data = byte_array_data_->mutable_data(); - for (int32_t i = 0, offset = 0; i < num_dictionary_values; ++i, offset += fixed_len) { + for (int32_t i = 0, offset = 0; i < dictionary_length_; ++i, offset += fixed_len) { memcpy(bytes_data + offset, dict_values[i].ptr, fixed_len); dict_values[i].ptr = bytes_data + offset; } @@ -1021,9 +1032,10 @@ void DictDecoderImpl::InsertDictionary(::arrow::ArrayBuilder* bui } class DictByteArrayDecoderImpl : public DictDecoderImpl, - virtual public DictByteArrayDecoder { + virtual public ByteArrayDecoder { public: using BASE = DictDecoderImpl; + using BASE::DictDecoderImpl; int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, @@ -1077,7 +1089,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, if (is_valid) { int32_t batch_size = std::min(buffer_size, num_values - values_decoded - null_count); - int num_indices = idx_decoder_.GetBatch(indices_buffer, batch_size); + int num_indices = idx_decoder_.GetBatch(batch_size, indices_buffer); int i = 0; 
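        // Editorial note (not in the patch): the loop below consumes the
        // batch of indices just decoded, appending a dictionary value for
        // each valid slot and a null for each cleared bit, until all
        // num_indices decoded indices have been appended.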
while (true) { @@ -1125,7 +1137,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, while (values_decoded < num_values) { int32_t batch_size = std::min(buffer_size, num_values - values_decoded); - int num_indices = idx_decoder_.GetBatch(indices_buffer, batch_size); + int num_indices = idx_decoder_.GetBatch(batch_size, indices_buffer); if (num_indices == 0) break; for (int i = 0; i < num_indices; ++i) { const auto& val = dict_values[indices_buffer[i]]; @@ -1388,7 +1400,7 @@ std::unique_ptr MakeDictDecoder(Type::type type_num, case Type::DOUBLE: return std::unique_ptr(new DictDecoderImpl(descr, pool)); case Type::BYTE_ARRAY: - return std::unique_ptr(new DictByteArrayDecoder(descr, pool)); + return std::unique_ptr(new DictByteArrayDecoderImpl(descr, pool)); case Type::FIXED_LEN_BYTE_ARRAY: return std::unique_ptr(new DictFLBADecoder(descr, pool)); default: From 21e0cc714e29c1d6979b38417a146a5805ccb0e0 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 25 Jul 2019 20:50:33 -0500 Subject: [PATCH 06/11] Fix up unit test now that each row group has a different dictionary --- cpp/src/arrow/array-dict-test.cc | 3 + cpp/src/arrow/array/builder_dict.h | 1 + cpp/src/arrow/table.cc | 1 + cpp/src/arrow/type.cc | 6 +- .../parquet/arrow/arrow-reader-writer-test.cc | 118 ++++++----- cpp/src/parquet/arrow/arrow-schema-test.cc | 13 +- cpp/src/parquet/arrow/reader.cc | 194 ++++++++++++------ cpp/src/parquet/arrow/reader.h | 13 +- cpp/src/parquet/arrow/schema.cc | 24 ++- cpp/src/parquet/arrow/schema.h | 49 +++-- cpp/src/parquet/arrow/test-util.h | 10 +- cpp/src/parquet/arrow/writer.cc | 2 + cpp/src/parquet/column_reader.cc | 15 +- cpp/src/parquet/encoding-test.cc | 8 + cpp/src/parquet/encoding.cc | 11 +- cpp/src/parquet/types.h | 18 +- 16 files changed, 310 insertions(+), 176 deletions(-) diff --git a/cpp/src/arrow/array-dict-test.cc b/cpp/src/arrow/array-dict-test.cc index eafdcfc3470..ff006205833 100644 --- a/cpp/src/arrow/array-dict-test.cc +++ b/cpp/src/arrow/array-dict-test.cc @@ -320,6 +320,9 @@ TEST(TestStringDictionaryBuilder, AppendIndices) { ASSERT_OK(builder.AppendIndices( raw_indices.data(), static_cast(raw_indices.size()), is_valid.data())); } + + ASSERT_EQ(10, builder.length()); + std::shared_ptr result; ASSERT_OK(builder.Finish(&result)); diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h index 8b70e60af7d..e1cfe8a2873 100644 --- a/cpp/src/arrow/array/builder_dict.h +++ b/cpp/src/arrow/array/builder_dict.h @@ -196,6 +196,7 @@ class DictionaryBuilder : public ArrayBuilder { const uint8_t* valid_bytes = NULLPTR) { int64_t null_count_before = values_builder_.null_count(); ARROW_RETURN_NOT_OK(values_builder_.AppendValues(values, length, valid_bytes)); + length_ += length; null_count_ += values_builder_.null_count() - null_count_before; return Status::OK(); } diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index 907cc8c2241..446010f93b1 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -42,6 +42,7 @@ using internal::checked_cast; ChunkedArray::ChunkedArray(const ArrayVector& chunks) : chunks_(chunks) { length_ = 0; null_count_ = 0; + ARROW_CHECK_GT(chunks.size(), 0) << "cannot construct ChunkedArray from empty vector and omitted type"; type_ = chunks[0]->type(); diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 54e0103fbc2..76be841a662 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -493,7 +493,11 @@ Schema::~Schema() {} int Schema::num_fields() const { return 
static_cast(impl_->fields_.size()); } -std::shared_ptr Schema::field(int i) const { return impl_->fields_[i]; } +std::shared_ptr Schema::field(int i) const { + DCHECK_GE(i, 0); + DCHECK_LT(i, num_fields()); + return impl_->fields_[i]; +} const std::vector>& Schema::fields() const { return impl_->fields_; diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc index c5d638d9788..cfc1c1fcda5 100644 --- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc @@ -85,7 +85,7 @@ static constexpr int LARGE_SIZE = 10000; static constexpr uint32_t kDefaultSeed = 0; -std::shared_ptr get_logical_type(const ::DataType& type) { +std::shared_ptr get_logical_type(const DataType& type) { switch (type.id()) { case ArrowId::UINT8: return LogicalType::Int(8, false); @@ -154,7 +154,7 @@ std::shared_ptr get_logical_type(const ::DataType& type) { return LogicalType::None(); } -ParquetType::type get_physical_type(const ::DataType& type) { +ParquetType::type get_physical_type(const DataType& type) { switch (type.id()) { case ArrowId::BOOL: return ParquetType::BOOLEAN; @@ -332,7 +332,7 @@ const std::string test_traits<::arrow::BinaryType>::value("\x00\x01\x02\x03"); const std::string test_traits<::arrow::FixedSizeBinaryType>::value("Fixed"); // NOLINT template -using ParquetDataType = DataType::parquet_enum>; +using ParquetDataType = PhysicalType::parquet_enum>; template using ParquetWriter = TypedColumnWriter>; @@ -401,11 +401,11 @@ void CheckConfiguredRoundtrip( if (expected_table) { ASSERT_NO_FATAL_FAILURE( ::arrow::AssertSchemaEqual(*actual_table->schema(), *expected_table->schema())); - ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*actual_table, *expected_table)); + ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*expected_table, *actual_table)); } else { ASSERT_NO_FATAL_FAILURE( ::arrow::AssertSchemaEqual(*actual_table->schema(), *input_table->schema())); - ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*actual_table, *input_table)); + ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*input_table, *actual_table)); } } @@ -444,14 +444,14 @@ void CheckSimpleRoundtrip(const std::shared_ptr& table, int64_t row_group ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*table, *result, false)); } -static std::shared_ptr MakeSimpleSchema(const ::DataType& type, +static std::shared_ptr MakeSimpleSchema(const DataType& type, Repetition::type repetition) { int32_t byte_width = -1; switch (type.id()) { case ::arrow::Type::DICTIONARY: { const auto& dict_type = static_cast(type); - const ::DataType& values_type = *dict_type.value_type(); + const DataType& values_type = *dict_type.value_type(); switch (values_type.id()) { case ::arrow::Type::FIXED_SIZE_BINARY: byte_width = @@ -595,7 +595,8 @@ class TestParquetIO : public ::testing::Test { SchemaDescriptor descriptor; ASSERT_NO_THROW(descriptor.Init(schema)); std::shared_ptr<::arrow::Schema> arrow_schema; - ASSERT_OK_NO_THROW(FromParquetSchema(&descriptor, &arrow_schema)); + ArrowReaderProperties props; + ASSERT_OK_NO_THROW(FromParquetSchema(&descriptor, props, &arrow_schema)); FileWriter writer(::arrow::default_memory_pool(), MakeWriter(schema), arrow_schema); ASSERT_OK_NO_THROW(writer.NewRowGroup(values->length())); ASSERT_OK_NO_THROW(writer.WriteColumnChunk(*values)); @@ -786,7 +787,8 @@ TYPED_TEST(TestParquetIO, SingleColumnRequiredChunkedWrite) { SchemaDescriptor descriptor; ASSERT_NO_THROW(descriptor.Init(schema)); 
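  // Editorial note (not in the patch): FromParquetSchema now threads
  // ArrowReaderProperties through schema conversion, so that per-column
  // read_dictionary settings can surface as Arrow dictionary types in the
  // converted schema.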
std::shared_ptr<::arrow::Schema> arrow_schema; - ASSERT_OK_NO_THROW(FromParquetSchema(&descriptor, &arrow_schema)); + ArrowReaderProperties props; + ASSERT_OK_NO_THROW(FromParquetSchema(&descriptor, props, &arrow_schema)); FileWriter writer(default_memory_pool(), this->MakeWriter(schema), arrow_schema); for (int i = 0; i < 4; i++) { ASSERT_OK_NO_THROW(writer.NewRowGroup(chunk_size)); @@ -855,7 +857,8 @@ TYPED_TEST(TestParquetIO, SingleColumnOptionalChunkedWrite) { SchemaDescriptor descriptor; ASSERT_NO_THROW(descriptor.Init(schema)); std::shared_ptr<::arrow::Schema> arrow_schema; - ASSERT_OK_NO_THROW(FromParquetSchema(&descriptor, &arrow_schema)); + ArrowReaderProperties props; + ASSERT_OK_NO_THROW(FromParquetSchema(&descriptor, props, &arrow_schema)); FileWriter writer(::arrow::default_memory_pool(), this->MakeWriter(schema), arrow_schema); for (int i = 0; i < 4; i++) { @@ -1791,7 +1794,7 @@ void MakeDoubleTable(int num_columns, int num_rows, int nchunks, } void MakeListArray(int num_rows, int max_value_length, - std::shared_ptr<::DataType>* out_type, + std::shared_ptr* out_type, std::shared_ptr* out_array) { std::vector length_draws; randint(num_rows, 0, max_value_length, &length_draws); @@ -1965,7 +1968,7 @@ TEST(TestArrowReadWrite, ListLargeRecords) { const int row_group_size = 100; std::shared_ptr list_array; - std::shared_ptr<::DataType> list_type; + std::shared_ptr list_type; MakeListArray(num_rows, 20, &list_type, &list_array); @@ -2009,14 +2012,14 @@ TEST(TestArrowReadWrite, ListLargeRecords) { ASSERT_TRUE(table->Equals(*chunked_table)); } -typedef std::function*, std::shared_ptr*)> +typedef std::function*, std::shared_ptr*)> ArrayFactory; template struct GenerateArrayFunctor { explicit GenerateArrayFunctor(double pct_null = 0.1) : pct_null(pct_null) {} - void operator()(int length, std::shared_ptr<::DataType>* type, + void operator()(int length, std::shared_ptr* type, std::shared_ptr* array) { using T = typename ArrowType::c_type; @@ -2034,16 +2037,16 @@ struct GenerateArrayFunctor { double pct_null; }; -typedef std::function*, std::shared_ptr*)> +typedef std::function*, std::shared_ptr*)> ArrayFactory; -auto GenerateInt32 = [](int length, std::shared_ptr<::DataType>* type, +auto GenerateInt32 = [](int length, std::shared_ptr* type, std::shared_ptr* array) { GenerateArrayFunctor<::arrow::Int32Type> func; func(length, type, array); }; -auto GenerateList = [](int length, std::shared_ptr<::DataType>* type, +auto GenerateList = [](int length, std::shared_ptr* type, std::shared_ptr* array) { MakeListArray(length, 100, type, array); }; @@ -2078,7 +2081,7 @@ TEST(TestArrowReadWrite, TableWithChunkedColumns) { for (const auto& datagen_func : functions) { ::arrow::ArrayVector arrays; std::shared_ptr arr; - std::shared_ptr<::DataType> type; + std::shared_ptr type; datagen_func(total_length, &type, &arr); int64_t offset = 0; @@ -2575,15 +2578,16 @@ void TryReadDataFile(const std::string& path, auto pool = ::arrow::default_memory_pool(); std::unique_ptr arrow_reader; + Status s = + FileReader::Make(pool, ParquetFileReader::OpenFile(path, false), &arrow_reader); + if (s.ok()) { + std::shared_ptr<::arrow::Table> table; + s = arrow_reader->ReadTable(&table); + } - arrow_reader.reset(new FileReader(pool, ParquetFileReader::OpenFile(path, false))); - std::shared_ptr<::arrow::Table> table; - auto status = arrow_reader->ReadTable(&table); - - ASSERT_TRUE(status.code() == expected_code) + ASSERT_TRUE(s.code() == expected_code) << "Expected reading file to return " - << arrow::Status(expected_code, 
"").CodeAsString() << ", but got " - << status.ToString(); + << arrow::Status(expected_code, "").CodeAsString() << ", but got " << s.ToString(); } TEST(TestArrowReaderAdHoc, Int96BadMemoryAccess) { @@ -2634,8 +2638,10 @@ TEST(TestArrowReaderAdHoc, DISABLED_LargeStringColumn) { array.reset(); auto reader = ParquetFileReader::Open(std::make_shared(tables_buffer)); - FileReader arrow_reader(default_memory_pool(), std::move(reader)); - ASSERT_OK_NO_THROW(arrow_reader.ReadTable(&table)); + + std::unique_ptr arrow_reader; + ASSERT_OK(FileReader::Make(default_memory_pool(), std::move(reader), &arrow_reader)); + ASSERT_OK_NO_THROW(arrow_reader->ReadTable(&table)); ASSERT_OK(table->Validate()); } @@ -2647,13 +2653,13 @@ TEST(TestArrowReaderAdHoc, HandleDictPageOffsetZero) { class TestArrowReaderAdHocSparkAndHvr : public ::testing::TestWithParam< - std::tuple>> {}; + std::tuple>> {}; TEST_P(TestArrowReaderAdHocSparkAndHvr, ReadDecimals) { std::string path(test::get_data_dir()); std::string filename; - std::shared_ptr<::DataType> decimal_type; + std::shared_ptr decimal_type; std::tie(filename, decimal_type) = GetParam(); path += "/" + filename; @@ -2662,8 +2668,8 @@ TEST_P(TestArrowReaderAdHocSparkAndHvr, ReadDecimals) { auto pool = ::arrow::default_memory_pool(); std::unique_ptr arrow_reader; - ASSERT_NO_THROW( - arrow_reader.reset(new FileReader(pool, ParquetFileReader::OpenFile(path, false)))); + ASSERT_OK_NO_THROW( + FileReader::Make(pool, ParquetFileReader::OpenFile(path, false), &arrow_reader)); std::shared_ptr<::arrow::Table> table; ASSERT_OK_NO_THROW(arrow_reader->ReadTable(&table)); @@ -2727,12 +2733,15 @@ TEST(TestArrowWriterAdHoc, SchemaMismatch) { // ---------------------------------------------------------------------- // Tests for directly reading DictionaryArray + class TestArrowReadDictionary : public ::testing::TestWithParam { public: void SetUp() override { GenerateData(GetParam()); + + // Write 4 row groups; each row group will have a different dictionary ASSERT_NO_FATAL_FAILURE( - WriteTableToBuffer(expected_dense_, expected_dense_->num_rows() / 2, + WriteTableToBuffer(expected_dense_, expected_dense_->num_rows() / 4, default_arrow_writer_properties(), &buffer_)); properties_ = default_arrow_reader_properties(); @@ -2740,28 +2749,13 @@ class TestArrowReadDictionary : public ::testing::TestWithParam { void GenerateData(double null_probability) { constexpr int num_unique = 100; - constexpr int repeat = 10; + constexpr int repeat = 100; constexpr int64_t min_length = 2; - constexpr int64_t max_length = 10; + constexpr int64_t max_length = 100; ::arrow::random::RandomArrayGenerator rag(0); - auto dense_array = rag.StringWithRepeats(repeat * num_unique, num_unique, min_length, - max_length, null_probability); - expected_dense_ = MakeSimpleTable(dense_array, /*nullable=*/true); - - ::arrow::StringDictionaryBuilder builder(default_memory_pool()); - const auto& string_array = static_cast(*dense_array); - ASSERT_OK(builder.AppendArray(string_array)); - - std::shared_ptr<::arrow::Array> dict_array; - ASSERT_OK(builder.Finish(&dict_array)); - expected_dict_ = MakeSimpleTable(dict_array, /*nullable=*/true); - - // TODO(hatemhelal): Figure out if we can use the following to init the expected_dict_ - // Currently fails due to DataType mismatch for indices array. 
- // Datum out; - // FunctionContext ctx(default_memory_pool()); - // ASSERT_OK(DictionaryEncode(&ctx, Datum(dense_array), &out)); - // expected_dict_ = MakeSimpleTable(out.make_array(), /*nullable=*/true); + dense_values_ = rag.StringWithRepeats(repeat * num_unique, num_unique, min_length, + max_length, null_probability); + expected_dense_ = MakeSimpleTable(dense_values_, /*nullable=*/true); } void TearDown() override {} @@ -2773,21 +2767,37 @@ class TestArrowReadDictionary : public ::testing::TestWithParam { std::shared_ptr
actual; ASSERT_OK_NO_THROW(reader->ReadTable(&actual)); - ::arrow::AssertTablesEqual(*actual, expected, /*same_chunk_layout=*/false); + ::arrow::AssertTablesEqual(expected, *actual, /*same_chunk_layout=*/false); } static std::vector<double> null_probabilites() { return {0.0, 0.5, 1}; } protected: + std::shared_ptr<Array> dense_values_; std::shared_ptr<Table>
expected_dense_; std::shared_ptr<Table>
expected_dict_; std::shared_ptr buffer_; ArrowReaderProperties properties_; }; +void AsDictionaryEncoded(const Array& arr, std::shared_ptr* out) { + ::arrow::StringDictionaryBuilder builder(default_memory_pool()); + const auto& string_array = static_cast(arr); + ASSERT_OK(builder.AppendArray(string_array)); + ASSERT_OK(builder.Finish(out)); +} + TEST_P(TestArrowReadDictionary, ReadWholeFileDict) { properties_.set_read_dictionary(0, true); - CheckReadWholeFile(*expected_dict_); + + std::vector> chunks(4); + const int64_t chunk_size = expected_dense_->num_rows() / 4; + for (int i = 0; i < 4; ++i) { + AsDictionaryEncoded(*dense_values_->Slice(chunk_size * i, chunk_size), &chunks[i]); + } + auto ex_table = MakeSimpleTable(std::make_shared(chunks), + /*nullable=*/true); + CheckReadWholeFile(*ex_table); } TEST_P(TestArrowReadDictionary, ReadWholeFileDense) { diff --git a/cpp/src/parquet/arrow/arrow-schema-test.cc b/cpp/src/parquet/arrow/arrow-schema-test.cc index dc0a02a7b87..06646b4cb81 100644 --- a/cpp/src/parquet/arrow/arrow-schema-test.cc +++ b/cpp/src/parquet/arrow/arrow-schema-test.cc @@ -20,6 +20,7 @@ #include "gtest/gtest.h" +#include "parquet/arrow/reader.h" #include "parquet/arrow/schema.h" #include "parquet/file_reader.h" #include "parquet/schema.h" @@ -74,14 +75,16 @@ class TestConvertParquetSchema : public ::testing::Test { ::arrow::Status ConvertSchema(const std::vector& nodes) { NodePtr schema = GroupNode::Make("schema", Repetition::REPEATED, nodes); descr_.Init(schema); - return FromParquetSchema(&descr_, &result_schema_); + ArrowReaderProperties props; + return FromParquetSchema(&descr_, props, &result_schema_); } ::arrow::Status ConvertSchema(const std::vector& nodes, const std::vector& column_indices) { NodePtr schema = GroupNode::Make("schema", Repetition::REPEATED, nodes); descr_.Init(schema); - return FromParquetSchema(&descr_, column_indices, &result_schema_); + ArrowReaderProperties props; + return FromParquetSchema(&descr_, column_indices, props, &result_schema_); } ::arrow::Status ConvertSchema( @@ -89,7 +92,8 @@ class TestConvertParquetSchema : public ::testing::Test { const std::shared_ptr& key_value_metadata) { NodePtr schema = GroupNode::Make("schema", Repetition::REPEATED, nodes); descr_.Init(schema); - return FromParquetSchema(&descr_, {}, key_value_metadata, &result_schema_); + ArrowReaderProperties props; + return FromParquetSchema(&descr_, {}, props, key_value_metadata, &result_schema_); } protected: @@ -1048,7 +1052,8 @@ TEST(TestFromParquetSchema, CorruptMetadata) { parquet::ParquetFileReader::OpenFile(path); const auto parquet_schema = reader->metadata()->schema(); std::shared_ptr<::arrow::Schema> arrow_schema; - ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, &arrow_schema)); + ArrowReaderProperties props; + ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, props, &arrow_schema)); } } // namespace arrow diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 7b6ecb9b789..fc953da2300 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -56,6 +56,7 @@ using arrow::Array; using arrow::BooleanArray; using arrow::ChunkedArray; +using arrow::DataType; using arrow::Field; using arrow::Int32Array; using arrow::ListArray; @@ -228,6 +229,11 @@ class FileReader::Impl { virtual ~Impl() {} + Status Init() { + // TODO(wesm): Smarter schema/column-reader initialization for nested data + return Status::OK(); + } + Status GetColumn(int i, FileColumnIteratorFactory iterator_factory, 
std::unique_ptr<ColumnReader>* out); @@ -247,6 +253,7 @@ class FileReader::Impl { Status GetSchema(std::shared_ptr<::arrow::Schema>* out); Status GetSchema(const std::vector<int>& indices, std::shared_ptr<::arrow::Schema>* out); + Status ReadRowGroup(int row_group_index, std::shared_ptr<Table>
* table); Status ReadRowGroup(int row_group_index, const std::vector& indices, std::shared_ptr<::arrow::Table>* out); @@ -299,8 +306,11 @@ class ColumnReader::ColumnReaderImpl { class PARQUET_NO_EXPORT PrimitiveImpl : public ColumnReader::ColumnReaderImpl { public: PrimitiveImpl(MemoryPool* pool, std::unique_ptr input, - const bool read_dictionary) - : pool_(pool), input_(std::move(input)), descr_(input_->descr()) { + bool read_dictionary) + : pool_(pool), + input_(std::move(input)), + descr_(input_->descr()), + read_dictionary_(read_dictionary) { record_reader_ = RecordReader::Make(descr_, pool_, read_dictionary); Status s = NodeToField(*input_->descr()->schema_node(), &field_); DCHECK_OK(s); @@ -324,8 +334,10 @@ class PARQUET_NO_EXPORT PrimitiveImpl : public ColumnReader::ColumnReaderImpl { const ColumnDescriptor* descr_; std::shared_ptr record_reader_; - std::shared_ptr field_; + + // Are we reading directly to dictionary-encoded? + bool read_dictionary_; }; // Reader implementation for struct array @@ -358,6 +370,20 @@ FileReader::FileReader(MemoryPool* pool, std::unique_ptr read const ArrowReaderProperties& properties) : impl_(new FileReader::Impl(pool, std::move(reader), properties)) {} +Status FileReader::Make(::arrow::MemoryPool* pool, + std::unique_ptr reader, + const ArrowReaderProperties& properties, + std::unique_ptr* out) { + out->reset(new FileReader(pool, std::move(reader), properties)); + return (*out)->impl_->Init(); +} + +Status FileReader::Make(::arrow::MemoryPool* pool, + std::unique_ptr reader, + std::unique_ptr* out) { + return Make(pool, std::move(reader), default_arrow_reader_properties(), out); +} + FileReader::~FileReader() {} Status FileReader::Impl::GetColumn(int i, FileColumnIteratorFactory iterator_factory, @@ -370,10 +396,9 @@ Status FileReader::Impl::GetColumn(int i, FileColumnIteratorFactory iterator_fac } std::unique_ptr input(iterator_factory(i, reader_.get())); - bool read_dict = reader_properties_.read_dictionary(i); std::unique_ptr impl( - new PrimitiveImpl(pool_, std::move(input), read_dict)); + new PrimitiveImpl(pool_, std::move(input), reader_properties_.read_dictionary(i))); *out = std::unique_ptr(new ColumnReader(std::move(impl))); return Status::OK(); } @@ -480,14 +505,14 @@ Status FileReader::Impl::ReadColumn(int i, std::shared_ptr* out) { return flat_column_reader->NextBatch(records_to_read, out); } -Status FileReader::Impl::GetSchema(const std::vector& indices, - std::shared_ptr<::arrow::Schema>* out) { - return FromParquetSchema(reader_->metadata()->schema(), indices, +Status FileReader::Impl::GetSchema(std::shared_ptr<::arrow::Schema>* out) { + return FromParquetSchema(reader_->metadata()->schema(), reader_properties_, reader_->metadata()->key_value_metadata(), out); } -Status FileReader::Impl::GetSchema(std::shared_ptr<::arrow::Schema>* out) { - return FromParquetSchema(reader_->metadata()->schema(), +Status FileReader::Impl::GetSchema(const std::vector& indices, + std::shared_ptr<::arrow::Schema>* out) { + return FromParquetSchema(reader_->metadata()->schema(), indices, reader_properties_, reader_->metadata()->key_value_metadata(), out); } @@ -576,7 +601,6 @@ Status FileReader::Impl::ReadRowGroup(int row_group_index, } auto dict_indices = GetDictionaryIndices(indices); - if (!dict_indices.empty()) { schema = FixSchema(*schema, dict_indices, columns); } @@ -628,7 +652,6 @@ Status FileReader::Impl::ReadTable(const std::vector& indices, } auto dict_indices = GetDictionaryIndices(indices); - if (!dict_indices.empty()) { schema = 
FixSchema(*schema, dict_indices, columns); } @@ -701,29 +724,27 @@ std::shared_ptr<::arrow::Schema> FileReader::Impl::FixSchema( // Static ctor Status OpenFile(const std::shared_ptr<::arrow::io::RandomAccessFile>& file, - MemoryPool* allocator, const ReaderProperties& props, + MemoryPool* pool, const ReaderProperties& props, const std::shared_ptr& metadata, std::unique_ptr* reader) { std::unique_ptr pq_reader; PARQUET_CATCH_NOT_OK(pq_reader = ParquetReader::Open(file, props, metadata)); - reader->reset(new FileReader(allocator, std::move(pq_reader))); - return Status::OK(); + return FileReader::Make(pool, std::move(pq_reader), default_arrow_reader_properties(), + reader); } Status OpenFile(const std::shared_ptr<::arrow::io::RandomAccessFile>& file, - MemoryPool* allocator, std::unique_ptr* reader) { - return OpenFile(file, allocator, ::parquet::default_reader_properties(), nullptr, - reader); + MemoryPool* pool, std::unique_ptr* reader) { + return OpenFile(file, pool, ::parquet::default_reader_properties(), nullptr, reader); } Status OpenFile(const std::shared_ptr<::arrow::io::RandomAccessFile>& file, - ::arrow::MemoryPool* allocator, const ArrowReaderProperties& properties, + ::arrow::MemoryPool* pool, const ArrowReaderProperties& properties, std::unique_ptr* reader) { std::unique_ptr pq_reader; PARQUET_CATCH_NOT_OK(pq_reader = ParquetReader::Open( file, ::parquet::default_reader_properties(), nullptr)); - reader->reset(new FileReader(allocator, std::move(pq_reader), properties)); - return Status::OK(); + return FileReader::Make(pool, std::move(pq_reader), properties, reader); } Status FileReader::GetColumn(int i, std::unique_ptr* out) { @@ -907,11 +928,18 @@ Status PrimitiveImpl::WrapIntoListArray(Datum* inout_array) { const int64_t total_levels_read = record_reader_->levels_position(); std::shared_ptr<::arrow::Schema> arrow_schema; - RETURN_NOT_OK(FromParquetSchema(input_->schema(), {input_->column_index()}, - input_->metadata()->key_value_metadata(), - &arrow_schema)); + RETURN_NOT_OK(FromParquetSchema( + input_->schema(), {input_->column_index()}, default_arrow_reader_properties(), + input_->metadata()->key_value_metadata(), &arrow_schema)); std::shared_ptr current_field = arrow_schema->field(0); + if (current_field->type()->num_children() > 0 && + flat_array->type_id() == ::arrow::Type::DICTIONARY) { + // XXX(wesm): Handling of nested types and dictionary encoding needs to be + // significantly refactored + return Status::Invalid("Cannot have nested types containing dictionary arrays yet"); + } + // Walk downwards to extract nullability std::vector nullable; std::vector> offset_builders; @@ -1023,15 +1051,15 @@ struct TransferFunctor {}; template Status TransferInt(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, Datum* out) { + const std::shared_ptr& type, Datum* out) { using ArrowCType = typename ArrowType::c_type; using ParquetCType = typename ParquetType::c_type; int64_t length = reader->values_written(); std::shared_ptr data; RETURN_NOT_OK(::arrow::AllocateBuffer(pool, length * sizeof(ArrowCType), &data)); - auto values = reinterpret_cast(reader->values()); - auto out_ptr = reinterpret_cast(data->mutable_data()); + auto values = reinterpret_cast(reader->values()); + auto out_ptr = reinterpret_cast(data->mutable_data()); std::copy(values, values + length, out_ptr); *out = std::make_shared>( type, length, data, reader->ReleaseIsValid(), reader->null_count()); @@ -1039,7 +1067,7 @@ Status TransferInt(RecordReader* reader, MemoryPool* pool, } 
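
The TransferInt template above exists because Parquet has only 32- and 64-bit physical integers: a narrower Arrow column such as int16 comes out of the decoder as int32_t values and must be copied element-wise into the logical C type. A minimal self-contained sketch of that widening/narrowing-copy pattern, using plain STL only; the function and variable names here are illustrative and not part of the patch:

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Element-wise copy from the Parquet physical C type to the Arrow
    // logical C type, mirroring the std::copy in TransferInt.
    template <typename ArrowCType, typename ParquetCType>
    std::vector<ArrowCType> TransferIntSketch(const std::vector<ParquetCType>& decoded) {
      std::vector<ArrowCType> out(decoded.size());
      std::copy(decoded.begin(), decoded.end(), out.begin());
      return out;
    }

    int main() {
      std::vector<int32_t> physical = {1, -2, 300};         // decoded INT32 values
      auto logical = TransferIntSketch<int16_t>(physical);  // Arrow int16 storage
      for (int16_t v : logical) std::cout << v << " ";      // prints: 1 -2 300
      std::cout << "\n";
      return 0;
    }
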
std::shared_ptr TransferZeroCopy(RecordReader* reader, - const std::shared_ptr<::arrow::DataType>& type) { + const std::shared_ptr& type) { std::vector> buffers = {reader->ReleaseIsValid(), reader->ReleaseValues()}; auto data = std::make_shared<::arrow::ArrayData>(type, reader->values_written(), @@ -1073,7 +1101,7 @@ Status TransferBool(RecordReader* reader, MemoryPool* pool, Datum* out) { template <> struct TransferFunctor<::arrow::TimestampType, Int96Type> { Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, Datum* out) { + const std::shared_ptr& type, Datum* out) { int64_t length = reader->values_written(); auto values = reinterpret_cast(reader->values()); @@ -1092,7 +1120,7 @@ struct TransferFunctor<::arrow::TimestampType, Int96Type> { }; Status TransferDate64(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, Datum* out) { + const std::shared_ptr& type, Datum* out) { int64_t length = reader->values_written(); auto values = reinterpret_cast(reader->values()); @@ -1109,20 +1137,59 @@ Status TransferDate64(RecordReader* reader, MemoryPool* pool, return Status::OK(); } -Status TransferBinary(RecordReader* reader, Datum* out) { - auto binary_reader = dynamic_cast(reader); - DCHECK(binary_reader); - *out = std::make_shared(binary_reader->GetBuilderChunks()); - return Status::OK(); +// ---------------------------------------------------------------------- +// Binary, direct to dictionary-encoded + +// Some ugly hacks here for now to handle binary dictionaries casted to their +// logical type +std::shared_ptr ShallowCast(const Array& arr, + const std::shared_ptr& new_type) { + std::shared_ptr<::arrow::ArrayData> new_data = arr.data()->Copy(); + new_data->type = new_type; + if (new_type->id() == ::arrow::Type::DICTIONARY) { + // Cast dictionary, too + const auto& dict_type = static_cast(*new_type); + new_data->dictionary = ShallowCast(*new_data->dictionary, dict_type.value_type()); + } + return MakeArray(new_data); } -// ---------------------------------------------------------------------- -// Direct to dictionary-encoded +std::shared_ptr CastChunksTo( + const ChunkedArray& data, const std::shared_ptr& logical_value_type) { + std::vector> string_chunks; + for (int i = 0; i < data.num_chunks(); ++i) { + string_chunks.push_back(ShallowCast(*data.chunk(i), logical_value_type)); + } + return std::make_shared(string_chunks); +} -Status TransferDictionary(RecordReader* reader, Datum* out) { +Status TransferDictionary(RecordReader* reader, + const std::shared_ptr& logical_value_type, + std::shared_ptr* out) { auto dict_reader = dynamic_cast(reader); DCHECK(dict_reader); *out = dict_reader->GetResult(); + + const auto& dict_type = static_cast(*(*out)->type()); + if (!logical_value_type->Equals(*dict_type.value_type())) { + *out = CastChunksTo(**out, + ::arrow::dictionary(dict_type.index_type(), logical_value_type)); + } + return Status::OK(); +} + +Status TransferBinary(RecordReader* reader, + const std::shared_ptr& logical_value_type, + bool read_dictionary, std::shared_ptr* out) { + if (read_dictionary) { + return TransferDictionary(reader, logical_value_type, out); + } + auto binary_reader = dynamic_cast(reader); + DCHECK(binary_reader); + *out = std::make_shared(binary_reader->GetBuilderChunks()); + if (!logical_value_type->Equals(*(*out)->type())) { + *out = CastChunksTo(**out, logical_value_type); + } return Status::OK(); } @@ -1247,14 +1314,14 @@ static inline void RawBytesToDecimalBytes(const 
uint8_t* value, int32_t byte_wid } template -Status ConvertToDecimal128(const Array& array, const std::shared_ptr<::arrow::DataType>&, +Status ConvertToDecimal128(const Array& array, const std::shared_ptr&, MemoryPool* pool, std::shared_ptr*) { return Status::NotImplemented("not implemented"); } template <> Status ConvertToDecimal128(const Array& array, - const std::shared_ptr<::arrow::DataType>& type, + const std::shared_ptr& type, MemoryPool* pool, std::shared_ptr* out) { const auto& fixed_size_binary_array = static_cast(array); @@ -1302,7 +1369,7 @@ Status ConvertToDecimal128(const Array& array, template <> Status ConvertToDecimal128(const Array& array, - const std::shared_ptr<::arrow::DataType>& type, + const std::shared_ptr& type, MemoryPool* pool, std::shared_ptr* out) { const auto& binary_array = static_cast(array); const int64_t length = binary_array.length(); @@ -1352,8 +1419,7 @@ template ::value || std::is_same::value>::type> static Status DecimalIntegerTransfer(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, - Datum* out) { + const std::shared_ptr& type, Datum* out) { DCHECK_EQ(type->id(), ::arrow::Type::DECIMAL); const int64_t length = reader->values_written(); @@ -1410,7 +1476,7 @@ struct TransferFunctor< (std::is_same::value || std::is_same::value)>::type> { Status operator()(RecordReader* reader, MemoryPool* pool, - const std::shared_ptr<::arrow::DataType>& type, Datum* out) { + const std::shared_ptr& type, Datum* out) { DCHECK_EQ(type->id(), ::arrow::Type::DECIMAL); auto binary_reader = dynamic_cast(reader); @@ -1432,7 +1498,7 @@ struct TransferFunctor< #define TRANSFER_DATA(ArrowType, ParquetType) \ TransferFunctor func; \ - RETURN_NOT_OK(func(record_reader_.get(), pool_, field_->type(), &result)); + RETURN_NOT_OK(func(record_reader_.get(), pool_, value_type, &result)); #define TRANSFER_CASE(ENUM, ArrowType, ParquetType) \ case ::arrow::Type::ENUM: { \ @@ -1442,14 +1508,14 @@ struct TransferFunctor< #define TRANSFER_INT32(ENUM, ArrowType) \ case ::arrow::Type::ENUM: { \ Status s = TransferInt(record_reader_.get(), pool_, \ - field_->type(), &result); \ + value_type, &result); \ RETURN_NOT_OK(s); \ } break; #define TRANSFER_INT64(ENUM, ArrowType) \ case ::arrow::Type::ENUM: { \ Status s = TransferInt(record_reader_.get(), pool_, \ - field_->type(), &result); \ + value_type, &result); \ RETURN_NOT_OK(s); \ } break; @@ -1474,12 +1540,10 @@ Status PrimitiveImpl::NextBatch(int64_t records_to_read, return ::arrow::Status::IOError(e.what()); } + std::shared_ptr value_type = field_->type(); + Datum result; - switch (field_->type()->id()) { - case ::arrow::Type::DICTIONARY: { - RETURN_NOT_OK(TransferDictionary(record_reader_.get(), &result)); - break; - } + switch (value_type->id()) { case ::arrow::Type::NA: { result = std::make_shared<::arrow::NullArray>(record_reader_->values_written()); break; @@ -1488,7 +1552,7 @@ Status PrimitiveImpl::NextBatch(int64_t records_to_read, case ::arrow::Type::INT64: case ::arrow::Type::FLOAT: case ::arrow::Type::DOUBLE: - result = TransferZeroCopy(record_reader_.get(), field_->type()); + result = TransferZeroCopy(record_reader_.get(), value_type); break; case ::arrow::Type::BOOL: RETURN_NOT_OK(TransferBool(record_reader_.get(), pool_, &result)); @@ -1498,26 +1562,30 @@ Status PrimitiveImpl::NextBatch(int64_t records_to_read, TRANSFER_INT32(UINT16, ::arrow::UInt16Type); TRANSFER_INT32(INT16, ::arrow::Int16Type); TRANSFER_INT32(UINT32, ::arrow::UInt32Type); + TRANSFER_INT64(UINT64, ::arrow::UInt64Type); 
TRANSFER_INT32(DATE32, ::arrow::Date32Type); TRANSFER_INT32(TIME32, ::arrow::Time32Type); TRANSFER_INT64(TIME64, ::arrow::Time64Type); case ::arrow::Type::DATE64: - RETURN_NOT_OK(TransferDate64(record_reader_.get(), pool_, field_->type(), &result)); + RETURN_NOT_OK(TransferDate64(record_reader_.get(), pool_, value_type, &result)); break; case ::arrow::Type::FIXED_SIZE_BINARY: case ::arrow::Type::BINARY: - case ::arrow::Type::STRING: - RETURN_NOT_OK(TransferBinary(record_reader_.get(), &result)); - break; + case ::arrow::Type::STRING: { + std::shared_ptr out; + RETURN_NOT_OK( + TransferBinary(record_reader_.get(), value_type, read_dictionary_, &out)); + result = out; + } break; case ::arrow::Type::DECIMAL: { switch (descr_->physical_type()) { case ::parquet::Type::INT32: { RETURN_NOT_OK(DecimalIntegerTransfer(record_reader_.get(), pool_, - field_->type(), &result)); + value_type, &result)); } break; case ::parquet::Type::INT64: { RETURN_NOT_OK(DecimalIntegerTransfer(record_reader_.get(), pool_, - field_->type(), &result)); + value_type, &result)); } break; case ::parquet::Type::BYTE_ARRAY: { TRANSFER_DATA(::arrow::Decimal128Type, ByteArrayType); @@ -1533,17 +1601,17 @@ Status PrimitiveImpl::NextBatch(int64_t records_to_read, } break; case ::arrow::Type::TIMESTAMP: { const ::arrow::TimestampType& timestamp_type = - static_cast<::arrow::TimestampType&>(*field_->type()); + static_cast<::arrow::TimestampType&>(*value_type); switch (timestamp_type.unit()) { case ::arrow::TimeUnit::MILLI: case ::arrow::TimeUnit::MICRO: { - result = TransferZeroCopy(record_reader_.get(), field_->type()); + result = TransferZeroCopy(record_reader_.get(), value_type); } break; case ::arrow::TimeUnit::NANO: { if (descr_->physical_type() == ::parquet::Type::INT96) { TRANSFER_DATA(::arrow::TimestampType, Int96Type); } else { - result = TransferZeroCopy(record_reader_.get(), field_->type()); + result = TransferZeroCopy(record_reader_.get(), value_type); } } break; default: @@ -1552,7 +1620,7 @@ Status PrimitiveImpl::NextBatch(int64_t records_to_read, } break; default: return Status::NotImplemented("No support for reading columns of type ", - field_->type()->ToString()); + value_type->ToString()); } // Nest nested types diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index 97e93b9c6e6..349131482d6 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -156,8 +156,14 @@ ArrowReaderProperties default_arrow_reader_properties(); // arrays class PARQUET_EXPORT FileReader { public: - FileReader(::arrow::MemoryPool* pool, std::unique_ptr reader, - const ArrowReaderProperties& properties = default_arrow_reader_properties()); + static ::arrow::Status Make(::arrow::MemoryPool* pool, + std::unique_ptr reader, + const ArrowReaderProperties& properties, + std::unique_ptr* out); + + static ::arrow::Status Make(::arrow::MemoryPool* pool, + std::unique_ptr reader, + std::unique_ptr* out); // Since the distribution of columns amongst a Parquet file's row groups may // be uneven (the number of values in each column chunk can be different), we @@ -261,6 +267,9 @@ class PARQUET_EXPORT FileReader { virtual ~FileReader(); private: + FileReader(::arrow::MemoryPool* pool, std::unique_ptr reader, + const ArrowReaderProperties& properties); + friend ColumnChunkReader; friend RowGroupReader; diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index f77bf38f9e2..1648f0e4a17 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -29,6 
+29,7 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" +#include "parquet/arrow/reader.h" #include "parquet/arrow/writer.h" #include "parquet/exception.h" #include "parquet/properties.h" @@ -424,8 +425,13 @@ Status NodeToFieldInternal(const Node& node, return Status::OK(); } +std::shared_ptr ToDictionary32(const Field& field) { + auto new_ty = ::arrow::dictionary(::arrow::int32(), field.type()); + return field.WithType(new_ty); +} + Status FromParquetSchema( - const SchemaDescriptor* parquet_schema, + const SchemaDescriptor* parquet_schema, const ArrowReaderProperties& properties, const std::shared_ptr& key_value_metadata, std::shared_ptr<::arrow::Schema>* out) { const GroupNode& schema_node = *parquet_schema->group_node(); @@ -435,13 +441,13 @@ Status FromParquetSchema( for (int i = 0; i < num_fields; i++) { RETURN_NOT_OK(NodeToField(*schema_node.field(i), &fields[i])); } - *out = std::make_shared<::arrow::Schema>(fields, key_value_metadata); return Status::OK(); } Status FromParquetSchema( const SchemaDescriptor* parquet_schema, const std::vector& column_indices, + const ArrowReaderProperties& properties, const std::shared_ptr& key_value_metadata, std::shared_ptr<::arrow::Schema>* out) { // TODO(wesm): Consider adding an arrow::Schema name attribute, which comes @@ -456,10 +462,12 @@ Status FromParquetSchema( std::unordered_set included_leaf_nodes(num_columns); for (int i = 0; i < num_columns; i++) { const ColumnDescriptor* column_desc = parquet_schema->Column(column_indices[i]); - included_leaf_nodes.insert(column_desc->schema_node().get()); + const Node* node = column_desc->schema_node().get(); + + included_leaf_nodes.insert(node); const Node* column_root = parquet_schema->GetColumnRoot(column_indices[i]); - auto insertion = top_nodes.insert(column_root); - if (insertion.second) { + auto it = top_nodes.insert(column_root); + if (it.second) { base_nodes.push_back(column_root); } } @@ -479,13 +487,15 @@ Status FromParquetSchema( Status FromParquetSchema(const SchemaDescriptor* parquet_schema, const std::vector& column_indices, + const ArrowReaderProperties& properties, std::shared_ptr<::arrow::Schema>* out) { - return FromParquetSchema(parquet_schema, column_indices, nullptr, out); + return FromParquetSchema(parquet_schema, column_indices, properties, nullptr, out); } Status FromParquetSchema(const SchemaDescriptor* parquet_schema, + const ArrowReaderProperties& properties, std::shared_ptr<::arrow::Schema>* out) { - return FromParquetSchema(parquet_schema, nullptr, out); + return FromParquetSchema(parquet_schema, properties, nullptr, out); } Status ListToNode(const std::shared_ptr<::arrow::ListType>& type, const std::string& name, diff --git a/cpp/src/parquet/arrow/schema.h b/cpp/src/parquet/arrow/schema.h index 52fb843e6c6..0ad86af1347 100644 --- a/cpp/src/parquet/arrow/schema.h +++ b/cpp/src/parquet/arrow/schema.h @@ -40,6 +40,7 @@ class WriterProperties; namespace arrow { +class ArrowReaderProperties; class ArrowWriterProperties; PARQUET_EXPORT @@ -57,38 +58,46 @@ ::arrow::Status NodeToField(const schema::Node& node, PARQUET_EXPORT ::arrow::Status FromParquetSchema( const SchemaDescriptor* parquet_schema, const std::vector& column_indices, + const ArrowReaderProperties& properties, const std::shared_ptr& key_value_metadata, std::shared_ptr<::arrow::Schema>* out); // Without indices PARQUET_EXPORT ::arrow::Status FromParquetSchema( - const SchemaDescriptor* parquet_schema, + const SchemaDescriptor* parquet_schema, const ArrowReaderProperties& properties, const 
std::shared_ptr& key_value_metadata, std::shared_ptr<::arrow::Schema>* out); // Without metadata -::arrow::Status PARQUET_EXPORT FromParquetSchema(const SchemaDescriptor* parquet_schema, - const std::vector& column_indices, - std::shared_ptr<::arrow::Schema>* out); +PARQUET_EXPORT +::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema, + const std::vector& column_indices, + const ArrowReaderProperties& properties, + std::shared_ptr<::arrow::Schema>* out); // Without metadata or indices -::arrow::Status PARQUET_EXPORT FromParquetSchema(const SchemaDescriptor* parquet_schema, - std::shared_ptr<::arrow::Schema>* out); - -::arrow::Status PARQUET_EXPORT FieldToNode(const std::shared_ptr<::arrow::Field>& field, - const WriterProperties& properties, - const ArrowWriterProperties& arrow_properties, - schema::NodePtr* out); - -::arrow::Status PARQUET_EXPORT -ToParquetSchema(const ::arrow::Schema* arrow_schema, const WriterProperties& properties, - const ArrowWriterProperties& arrow_properties, - std::shared_ptr* out); - -::arrow::Status PARQUET_EXPORT ToParquetSchema(const ::arrow::Schema* arrow_schema, - const WriterProperties& properties, - std::shared_ptr* out); +PARQUET_EXPORT +::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema, + const ArrowReaderProperties& properties, + std::shared_ptr<::arrow::Schema>* out); + +PARQUET_EXPORT +::arrow::Status FieldToNode(const std::shared_ptr<::arrow::Field>& field, + const WriterProperties& properties, + const ArrowWriterProperties& arrow_properties, + schema::NodePtr* out); + +PARQUET_EXPORT +::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema, + const WriterProperties& properties, + const ArrowWriterProperties& arrow_properties, + std::shared_ptr* out); + +PARQUET_EXPORT +::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema, + const WriterProperties& properties, + std::shared_ptr* out); PARQUET_EXPORT int32_t DecimalSize(int32_t precision); diff --git a/cpp/src/parquet/arrow/test-util.h b/cpp/src/parquet/arrow/test-util.h index 8760d91f2a9..cc4774fa03e 100644 --- a/cpp/src/parquet/arrow/test-util.h +++ b/cpp/src/parquet/arrow/test-util.h @@ -39,6 +39,7 @@ using internal::RecordReader; namespace arrow { using ::arrow::Array; +using ::arrow::ChunkedArray; using ::arrow::Status; template @@ -429,11 +430,16 @@ Status MakeEmptyListsArray(int64_t size, std::shared_ptr* out_array) { return Status::OK(); } +std::shared_ptr<::arrow::Table> MakeSimpleTable( + const std::shared_ptr& values, bool nullable) { + auto schema = ::arrow::schema({::arrow::field("col", values->type(), nullable)}); + return ::arrow::Table::Make(schema, {values}); +} + std::shared_ptr<::arrow::Table> MakeSimpleTable(const std::shared_ptr& values, bool nullable) { auto carr = std::make_shared<::arrow::ChunkedArray>(values); - auto schema = ::arrow::schema({::arrow::field("col", values->type(), nullable)}); - return ::arrow::Table::Make(schema, {carr}); + return MakeSimpleTable(carr, nullable); } template diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index fd40319fde8..f2096d706e9 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -34,6 +34,7 @@ #include "arrow/util/logging.h" +#include "parquet/arrow/reader.h" #include "parquet/arrow/schema.h" #include "parquet/column_writer.h" #include "parquet/deprecated_io.h" @@ -1079,6 +1080,7 @@ class FileWriter::Impl { int current_column_idx = row_group_writer_->current_column(); std::shared_ptr<::arrow::Schema> 
arrow_schema; RETURN_NOT_OK(FromParquetSchema(writer_->schema(), {current_column_idx - 1}, + default_arrow_reader_properties(), writer_->key_value_metadata(), &arrow_schema)); ArrowColumnWriter arrow_writer(&column_write_context_, column_writer, diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index d919c4fdd0c..ea47176d093 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -1262,8 +1262,11 @@ class ByteArrayDictionaryRecordReader : public TypedRecordReader, } } - void WriteNewDictionary() { + void MaybeWriteNewDictionary() { if (this->new_dictionary_) { + /// If there is a new dictionary, we may need to flush the builder, then + /// insert the new dictionary values + FlushBuilder(); auto decoder = checked_cast(this->current_decoder_); decoder->InsertDictionary(&builder_); this->new_dictionary_ = false; @@ -1273,10 +1276,7 @@ class ByteArrayDictionaryRecordReader : public TypedRecordReader, void ReadValuesDense(int64_t values_to_read) override { int64_t num_decoded = 0; if (current_encoding_ == Encoding::RLE_DICTIONARY) { - /// If there is a new dictionary, we may need to flush the builder, then - /// insert the new dictionary values - FlushBuilder(); - WriteNewDictionary(); + MaybeWriteNewDictionary(); auto decoder = checked_cast(this->current_decoder_); num_decoded = decoder->DecodeIndices(static_cast(values_to_read), &builder_); } else { @@ -1292,10 +1292,7 @@ class ByteArrayDictionaryRecordReader : public TypedRecordReader, void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override { int64_t num_decoded = 0; if (current_encoding_ == Encoding::RLE_DICTIONARY) { - /// If there is a new dictionary, we may need to flush the builder, then - /// insert the new dictionary values - FlushBuilder(); - WriteNewDictionary(); + MaybeWriteNewDictionary(); auto decoder = checked_cast(this->current_decoder_); num_decoded = decoder->DecodeIndicesSpaced( static_cast(values_to_read), static_cast(null_count), diff --git a/cpp/src/parquet/encoding-test.cc b/cpp/src/parquet/encoding-test.cc index 6340510d2b7..799b81dc2d2 100644 --- a/cpp/src/parquet/encoding-test.cc +++ b/cpp/src/parquet/encoding-test.cc @@ -540,5 +540,13 @@ TEST_F(DictEncoding, CheckDecodeIndicesSpaced) { } } +TEST_F(DictEncoding, CheckDecodeIndicesNoNulls) { + InitTestCase(/*null_probability=*/0.0); + auto builder = CreateDictBuilder(); + dict_decoder_->InsertDictionary(builder.get()); + auto actual_num_values = dict_decoder_->DecodeIndices(num_values_, builder.get()); + CheckDict(actual_num_values, *builder); +} + } // namespace test } // namespace parquet diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 1f865e24cf6..54908228211 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -831,7 +831,8 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { dictionary_(AllocateBuffer(pool, 0)), dictionary_length_(0), byte_array_data_(AllocateBuffer(pool, 0)), - byte_array_offsets_(AllocateBuffer(pool, 0)) {} + byte_array_offsets_(AllocateBuffer(pool, 0)), + indices_scratch_space_(AllocateBuffer(pool, 0)) {} // Perform type-specific initiatialization void SetDict(TypedDecoder* dictionary) override; @@ -877,8 +878,8 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { if (num_values > 0) { // TODO(wesm): Refactor to batch reads for improved memory use. 
It is not // trivial because the null_count is relative to the entire bitmap - PARQUET_THROW_NOT_OK(indices_scratch_space_->Resize(num_values * sizeof(int16_t), - /*shrink_to_fit=*/false)); + PARQUET_THROW_NOT_OK(indices_scratch_space_->TypedResize( + num_values, /*shrink_to_fit=*/false)); } auto indices_buffer = @@ -911,8 +912,8 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { // TODO(wesm): Refactor to batch reads for improved memory use. This is // relatively simple here because we don't have to do any bookkeeping of // nulls - PARQUET_THROW_NOT_OK(indices_scratch_space_->Resize(num_values * sizeof(int16_t), - /*shrink_to_fit=*/false)); + PARQUET_THROW_NOT_OK(indices_scratch_space_->TypedResize( + num_values, /*shrink_to_fit=*/false)); } auto indices_buffer = reinterpret_cast(indices_scratch_space_->mutable_data()); diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 0faec10cf67..f24ff85cc14 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -633,19 +633,19 @@ struct type_traits { }; template -struct DataType { +struct PhysicalType { using c_type = typename type_traits::value_type; static constexpr Type::type type_num = TYPE; }; -using BooleanType = DataType; -using Int32Type = DataType; -using Int64Type = DataType; -using Int96Type = DataType; -using FloatType = DataType; -using DoubleType = DataType; -using ByteArrayType = DataType; -using FLBAType = DataType; +using BooleanType = PhysicalType; +using Int32Type = PhysicalType; +using Int64Type = PhysicalType; +using Int96Type = PhysicalType; +using FloatType = PhysicalType; +using DoubleType = PhysicalType; +using ByteArrayType = PhysicalType; +using FLBAType = PhysicalType; template inline std::string format_fwf(int width) { From 2b424409b5f02a4af4ed937ec9e96c980fed3ee9 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 25 Jul 2019 21:08:11 -0500 Subject: [PATCH 07/11] Fix release builds and Python --- cpp/src/parquet/column_reader.cc | 6 +++--- python/pyarrow/_parquet.pxd | 7 +++++++ python/pyarrow/_parquet.pyx | 3 ++- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index ea47176d093..28dd5aca6d7 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -1267,7 +1267,7 @@ class ByteArrayDictionaryRecordReader : public TypedRecordReader, /// If there is a new dictionary, we may need to flush the builder, then /// insert the new dictionary values FlushBuilder(); - auto decoder = checked_cast(this->current_decoder_); + auto decoder = dynamic_cast(this->current_decoder_); decoder->InsertDictionary(&builder_); this->new_dictionary_ = false; } @@ -1277,7 +1277,7 @@ class ByteArrayDictionaryRecordReader : public TypedRecordReader, int64_t num_decoded = 0; if (current_encoding_ == Encoding::RLE_DICTIONARY) { MaybeWriteNewDictionary(); - auto decoder = checked_cast(this->current_decoder_); + auto decoder = dynamic_cast(this->current_decoder_); num_decoded = decoder->DecodeIndices(static_cast(values_to_read), &builder_); } else { num_decoded = this->current_decoder_->DecodeArrowNonNull( @@ -1293,7 +1293,7 @@ class ByteArrayDictionaryRecordReader : public TypedRecordReader, int64_t num_decoded = 0; if (current_encoding_ == Encoding::RLE_DICTIONARY) { MaybeWriteNewDictionary(); - auto decoder = checked_cast(this->current_decoder_); + auto decoder = dynamic_cast(this->current_decoder_); num_decoded = decoder->DecodeIndicesSpaced( static_cast(values_to_read), 
static_cast(null_count), valid_bits_->mutable_data(), values_written_, &builder_); diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index 6a11fc89cec..270bb619f75 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -358,6 +358,11 @@ cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: cdef extern from "parquet/arrow/reader.h" namespace "parquet::arrow" nogil: + cdef cppclass ArrowReaderProperties: + pass + + ArrowReaderProperties default_arrow_reader_properties() + CStatus OpenFile(const shared_ptr[RandomAccessFile]& file, CMemoryPool* allocator, const ReaderProperties& properties, @@ -389,11 +394,13 @@ cdef extern from "parquet/arrow/reader.h" namespace "parquet::arrow" nogil: cdef extern from "parquet/arrow/schema.h" namespace "parquet::arrow" nogil: CStatus FromParquetSchema( const SchemaDescriptor* parquet_schema, + const ArrowReaderProperties& properties, const shared_ptr[const CKeyValueMetadata]& key_value_metadata, shared_ptr[CSchema]* out) CStatus ToParquetSchema( const CSchema* arrow_schema, + const ArrowReaderProperties& properties, const shared_ptr[const CKeyValueMetadata]& key_value_metadata, shared_ptr[SchemaDescriptor]* out) diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index eb74dea852b..1944ab6653b 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -743,7 +743,8 @@ cdef class ParquetSchema: with nogil: check_status(FromParquetSchema( - self.schema, self.parent._metadata.key_value_metadata(), + self.schema, default_arrow_reader_properties(), + self.parent._metadata.key_value_metadata(), &sp_arrow_schema)) return pyarrow_wrap_schema(sp_arrow_schema) From 04c950078d58236fe909126f0f2588847b6ba865 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 25 Jul 2019 21:44:21 -0500 Subject: [PATCH 08/11] Fix API usages --- cpp/src/arrow/util/rle-encoding.h | 4 ++-- .../parquet/arrow/reader-writer-benchmark.cc | 22 ++++++++++++------- cpp/src/parquet/column_reader.cc | 2 +- cpp/src/parquet/encoding.cc | 6 ++--- 4 files changed, 20 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/util/rle-encoding.h b/cpp/src/arrow/util/rle-encoding.h index e2210686b55..4b5da6fae4d 100644 --- a/cpp/src/arrow/util/rle-encoding.h +++ b/cpp/src/arrow/util/rle-encoding.h @@ -114,7 +114,7 @@ class RleDecoder { /// Gets a batch of values. Returns the number of decoded elements. 
template - int GetBatch(int batch_size, T* values); + int GetBatch(T* values, int batch_size); /// Like GetBatch but add spacing for null entries template @@ -287,7 +287,7 @@ inline bool RleDecoder::Get(T* val) { } template -inline int RleDecoder::GetBatch(int batch_size, T* values) { +inline int RleDecoder::GetBatch(T* values, int batch_size) { DCHECK_GE(bit_width_, 0); int values_read = 0; diff --git a/cpp/src/parquet/arrow/reader-writer-benchmark.cc b/cpp/src/parquet/arrow/reader-writer-benchmark.cc index 239d707e231..59c238b1855 100644 --- a/cpp/src/parquet/arrow/reader-writer-benchmark.cc +++ b/cpp/src/parquet/arrow/reader-writer-benchmark.cc @@ -179,9 +179,11 @@ static void BM_ReadColumn(::benchmark::State& state) { while (state.KeepRunning()) { auto reader = ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer)); - FileReader filereader(::arrow::default_memory_pool(), std::move(reader)); + std::unique_ptr arrow_reader; + EXIT_NOT_OK(FileReader::Make(::arrow::default_memory_pool(), std::move(reader), + &arrow_reader)); std::shared_ptr<::arrow::Table> table; - EXIT_NOT_OK(filereader.ReadTable(&table)); + EXIT_NOT_OK(arrow_reader->ReadTable(&table)); } SetBytesProcessed(state); } @@ -212,14 +214,16 @@ static void BM_ReadIndividualRowGroups(::benchmark::State& state) { while (state.KeepRunning()) { auto reader = ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer)); - FileReader filereader(::arrow::default_memory_pool(), std::move(reader)); + std::unique_ptr arrow_reader; + EXIT_NOT_OK(FileReader::Make(::arrow::default_memory_pool(), std::move(reader), + &arrow_reader)); std::vector> tables; - for (int i = 0; i < filereader.num_row_groups(); i++) { + for (int i = 0; i < arrow_reader->num_row_groups(); i++) { // Only read the even numbered RowGroups if ((i % 2) == 0) { std::shared_ptr<::arrow::Table> table; - EXIT_NOT_OK(filereader.RowGroup(i)->ReadTable(&table)); + EXIT_NOT_OK(arrow_reader->RowGroup(i)->ReadTable(&table)); tables.push_back(table); } } @@ -245,11 +249,13 @@ static void BM_ReadMultipleRowGroups(::benchmark::State& state) { while (state.KeepRunning()) { auto reader = ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer)); - FileReader filereader(::arrow::default_memory_pool(), std::move(reader)); + std::unique_ptr arrow_reader; + EXIT_NOT_OK(FileReader::Make(::arrow::default_memory_pool(), std::move(reader), + &arrow_reader)); std::vector> tables; std::vector rgs; - for (int i = 0; i < filereader.num_row_groups(); i++) { + for (int i = 0; i < arrow_reader->num_row_groups(); i++) { // Only read the even numbered RowGroups if ((i % 2) == 0) { rgs.push_back(i); @@ -257,7 +263,7 @@ static void BM_ReadMultipleRowGroups(::benchmark::State& state) { } std::shared_ptr<::arrow::Table> table; - EXIT_NOT_OK(filereader.ReadRowGroups(rgs, &table)); + EXIT_NOT_OK(arrow_reader->ReadRowGroups(rgs, &table)); } SetBytesProcessed(state); } diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 28dd5aca6d7..4562d7b8845 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -87,7 +87,7 @@ int LevelDecoder::Decode(int batch_size, int16_t* levels) { int num_values = std::min(num_values_remaining_, batch_size); if (encoding_ == Encoding::RLE) { - num_decoded = rle_decoder_->GetBatch(num_values, levels); + num_decoded = rle_decoder_->GetBatch(levels, num_values); } else { num_decoded = bit_packed_decoder_->GetBatch(bit_width_, levels, num_values); } diff --git 
a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 54908228211..875bcd95289 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -917,7 +917,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { } auto indices_buffer = reinterpret_cast(indices_scratch_space_->mutable_data()); - if (num_values != idx_decoder_.GetBatch(num_values, indices_buffer)) { + if (num_values != idx_decoder_.GetBatch(indices_buffer, num_values)) { ParquetException::EofException(); } auto binary_builder = checked_cast<::arrow::BinaryDictionaryBuilder*>(builder); @@ -1090,7 +1090,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, if (is_valid) { int32_t batch_size = std::min(buffer_size, num_values - values_decoded - null_count); - int num_indices = idx_decoder_.GetBatch(batch_size, indices_buffer); + int num_indices = idx_decoder_.GetBatch(indices_buffer, batch_size); int i = 0; while (true) { @@ -1138,7 +1138,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, while (values_decoded < num_values) { int32_t batch_size = std::min(buffer_size, num_values - values_decoded); - int num_indices = idx_decoder_.GetBatch(batch_size, indices_buffer); + int num_indices = idx_decoder_.GetBatch(indices_buffer, batch_size); if (num_indices == 0) break; for (int i = 0; i < num_indices; ++i) { const auto& val = dict_values[indices_buffer[i]]; From e78f32b287073c76637b66cea2364bd872a3396e Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 26 Jul 2019 16:07:23 -0500 Subject: [PATCH 09/11] Fix MSVC issues [skip ci] --- cpp/src/arrow/util/rle-encoding-test.cc | 3 ++- cpp/src/arrow/util/rle-encoding.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/util/rle-encoding-test.cc b/cpp/src/arrow/util/rle-encoding-test.cc index f049828359b..e5a755aebcc 100644 --- a/cpp/src/arrow/util/rle-encoding-test.cc +++ b/cpp/src/arrow/util/rle-encoding-test.cc @@ -494,7 +494,8 @@ void CheckRoundTripSpaced(const Array& data, int bit_width) { RleDecoder decoder(buffer.data(), encoded_size, bit_width); std::vector values_read(num_values); - if (num_values != decoder.GetBatchSpaced(num_values, data.null_count(), + if (num_values != decoder.GetBatchSpaced(num_values, + static_cast(data.null_count()), data.null_bitmap_data(), data.offset(), values_read.data())) { FAIL(); diff --git a/cpp/src/arrow/util/rle-encoding.h b/cpp/src/arrow/util/rle-encoding.h index 4b5da6fae4d..99616021d6c 100644 --- a/cpp/src/arrow/util/rle-encoding.h +++ b/cpp/src/arrow/util/rle-encoding.h @@ -348,7 +348,7 @@ inline int RleDecoder::GetBatchSpaced(int batch_size, int null_count, bit_reader.Next(); } - std::fill(out, out + repeat_batch, current_value_); + std::fill(out, out + repeat_batch, static_cast(current_value_)); out += repeat_batch; values_read += repeat_batch; } else if (literal_count_ > 0) { From d551b3de68b2ccb2f49bc3620a6172b5f31d6f68 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 26 Jul 2019 16:36:52 -0500 Subject: [PATCH 10/11] Add missing Status checks now that wrapper gone --- cpp/src/arrow/util/rle-encoding-test.cc | 7 +++---- cpp/src/parquet/encoding.cc | 22 +++++++++++----------- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/util/rle-encoding-test.cc b/cpp/src/arrow/util/rle-encoding-test.cc index e5a755aebcc..1ea6394ddc7 100644 --- a/cpp/src/arrow/util/rle-encoding-test.cc +++ b/cpp/src/arrow/util/rle-encoding-test.cc @@ -494,10 +494,9 @@ void CheckRoundTripSpaced(const Array& data, int bit_width) { 
RleDecoder decoder(buffer.data(), encoded_size, bit_width); std::vector values_read(num_values); - if (num_values != decoder.GetBatchSpaced(num_values, - static_cast(data.null_count()), - data.null_bitmap_data(), data.offset(), - values_read.data())) { + if (num_values != decoder.GetBatchSpaced( + num_values, static_cast(data.null_count()), + data.null_bitmap_data(), data.offset(), values_read.data())) { FAIL(); } diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 875bcd95289..ef42e88ccb2 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -747,7 +747,7 @@ class PlainByteArrayDecoder : public PlainDecoder, int64_t valid_bits_offset, BuilderType* builder, int* values_decoded) { num_values = std::min(num_values, num_values_); - builder->Reserve(num_values); + RETURN_NOT_OK(builder->Reserve(num_values)); ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); int increment; int i = 0; @@ -761,12 +761,12 @@ class PlainByteArrayDecoder : public PlainDecoder, if (data_size < increment) { ParquetException::EofException(); } - builder->Append(data + sizeof(uint32_t), len); + RETURN_NOT_OK(builder->Append(data + sizeof(uint32_t), len)); data += increment; data_size -= increment; bytes_decoded += increment; } else { - builder->AppendNull(); + RETURN_NOT_OK(builder->AppendNull()); } bit_reader.Next(); ++i; @@ -783,7 +783,7 @@ class PlainByteArrayDecoder : public PlainDecoder, ::arrow::Status DecodeArrowNonNull(int num_values, BuilderType* builder, int* values_decoded) { num_values = std::min(num_values, num_values_); - builder->Reserve(num_values); + RETURN_NOT_OK(builder->Reserve(num_values)); int i = 0; const uint8_t* data = data_; int64_t data_size = len_; @@ -793,7 +793,7 @@ class PlainByteArrayDecoder : public PlainDecoder, uint32_t len = arrow::util::SafeLoadAs(data); int increment = static_cast(sizeof(uint32_t) + len); if (data_size < increment) ParquetException::EofException(); - builder->Append(data + sizeof(uint32_t), len); + RETURN_NOT_OK(builder->Append(data + sizeof(uint32_t), len)); data += increment; data_size -= increment; bytes_decoded += increment; @@ -1077,7 +1077,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, int* out_num_values) { constexpr int32_t buffer_size = 1024; int32_t indices_buffer[buffer_size]; - builder->Reserve(num_values); + RETURN_NOT_OK(builder->Reserve(num_values)); ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); auto dict_values = reinterpret_cast(dictionary_->data()); @@ -1097,10 +1097,10 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, // Consume all indices if (is_valid) { const auto& val = dict_values[indices_buffer[i]]; - builder->Append(val.ptr, val.len); + RETURN_NOT_OK(builder->Append(val.ptr, val.len)); ++i; } else { - builder->AppendNull(); + RETURN_NOT_OK(builder->AppendNull()); --null_count; } ++values_decoded; @@ -1113,7 +1113,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, bit_reader.Next(); } } else { - builder->AppendNull(); + RETURN_NOT_OK(builder->AppendNull()); --null_count; ++values_decoded; } @@ -1132,7 +1132,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, constexpr int32_t buffer_size = 2048; int32_t indices_buffer[buffer_size]; int values_decoded = 0; - builder->Reserve(num_values); + RETURN_NOT_OK(builder->Reserve(num_values)); auto dict_values = reinterpret_cast(dictionary_->data()); @@ -1142,7 +1142,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, if 
(num_indices == 0) break; for (int i = 0; i < num_indices; ++i) { const auto& val = dict_values[indices_buffer[i]]; - builder->Append(val.ptr, val.len); + RETURN_NOT_OK(builder->Append(val.ptr, val.len)); } values_decoded += num_indices; } From 0269629713f457c2bbcf8df662db0f04e571cac1 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 26 Jul 2019 17:58:43 -0500 Subject: [PATCH 11/11] Add missing documentation --- cpp/src/parquet/arrow/schema.h | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/parquet/arrow/schema.h b/cpp/src/parquet/arrow/schema.h index 0ad86af1347..477597e5d50 100644 --- a/cpp/src/parquet/arrow/schema.h +++ b/cpp/src/parquet/arrow/schema.h @@ -52,6 +52,7 @@ ::arrow::Status NodeToField(const schema::Node& node, /// \param column_indices indices of leaf nodes in the Parquet schema tree. The order of /// appearance matters for the converted schema. Repeated indices are ignored /// except for the first one +/// \param properties reader options for FileReader /// \param key_value_metadata optional metadata, can be nullptr /// \param out the corresponding arrow schema /// \return Status::OK() on a successful conversion.
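
Taken end to end, the series lets a caller opt into reading BYTE_ARRAY columns directly as Arrow dictionary arrays. A minimal usage sketch against the FileReader::Make factory and ArrowReaderProperties introduced above; the helper name, include choices, and error handling are illustrative assumptions, not part of the patch:

    #include <memory>
    #include <string>

    #include "arrow/api.h"
    #include "parquet/arrow/reader.h"
    #include "parquet/file_reader.h"

    // Illustrative helper: open a Parquet file and read column 0 directly
    // as dictionary-encoded using the factory added in this series.
    ::arrow::Status ReadColumnZeroAsDictionary(const std::string& path,
                                               std::shared_ptr<::arrow::Table>* out) {
      auto properties = ::parquet::arrow::default_arrow_reader_properties();
      properties.set_read_dictionary(0, true);  // column 0 -> DictionaryArray chunks

      std::unique_ptr<::parquet::arrow::FileReader> reader;
      ::arrow::Status st = ::parquet::arrow::FileReader::Make(
          ::arrow::default_memory_pool(),
          ::parquet::ParquetFileReader::OpenFile(path, /*memory_map=*/false),
          properties, &reader);
      if (!st.ok()) return st;
      // Each row group may carry its own dictionary page, so the column comes
      // back as one DictionaryArray chunk per row group (see the test changes).
      return reader->ReadTable(out);
    }

Because dictionary pages can differ across row groups, callers should expect the column as a ChunkedArray of DictionaryArray chunks rather than a single unified dictionary.
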