From c871ea971deade55abd8b1b77bbaad4b18b36054 Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Thu, 8 Aug 2019 18:02:13 -0500
Subject: [PATCH 01/28] Refactor WriteBatch/WriteBatchSpaced to utilize helper
 functions

[skip ci]
---
 cpp/src/parquet/column_writer.cc | 374 ++++++++++++++-----------------
 1 file changed, 164 insertions(+), 210 deletions(-)

diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index fa16234e6ec..9c25604a91e 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -630,6 +630,18 @@ void ColumnWriterImpl::FlushBufferedDataPages() {
 // ----------------------------------------------------------------------
 // TypedColumnWriter
 
+template <typename Action>
+inline void DoInBatches(int64_t total, int64_t batch_size, Action&& action) {
+  int64_t num_batches = static_cast<int>(total / batch_size);
+  for (int round = 0; round < num_batches; round++) {
+    action(round * batch_size, batch_size);
+  }
+  // Write the remaining values
+  if (total % batch_size > 0) {
+    action(num_batches * batch_size, total % batch_size);
+  }
+}
+
 template <typename DType>
 class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter<DType> {
  public:
@@ -653,11 +665,44 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter<
   int64_t Close() override { return ColumnWriterImpl::Close(); }
 
   void WriteBatch(int64_t num_values, const int16_t* def_levels,
-                  const int16_t* rep_levels, const T* values) override;
+                  const int16_t* rep_levels, const T* values) override {
+    // We check for DataPage limits only after we have inserted the values. If a user
+    // writes a large number of values, the DataPage size can be much above the limit.
+    // The purpose of this chunking is to bound this. Even if a user writes large number
+    // of values, the chunking will ensure the AddDataPage() is called at a reasonable
+    // pagesize limit
+    int64_t value_offset = 0;
+    auto WriteBatch = [&](int64_t offset, int64_t batch_size) {
+      int64_t values_to_write =
+          WriteLevels(batch_size, def_levels + offset, rep_levels + offset);
+      // PARQUET-780
+      if (values_to_write > 0) {
+        DCHECK_NE(nullptr, values);
+      }
+      WriteValues(values + value_offset, values_to_write, batch_size - values_to_write);
+      CommitWriteAndCheckLimits(batch_size, values_to_write);
+      value_offset += values_to_write;
+    };
+    DoInBatches(num_values, properties_->write_batch_size(), WriteBatch);
+  }
 
   void WriteBatchSpaced(int64_t num_values, const int16_t* def_levels,
                         const int16_t* rep_levels, const uint8_t* valid_bits,
-                        int64_t valid_bits_offset, const T* values) override;
+                        int64_t valid_bits_offset, const T* values) override {
+    // Like WriteBatch, but for spaced values
+    int64_t value_offset = 0;
+    auto WriteBatch = [&](int64_t offset, int64_t batch_size) {
+      int64_t batch_num_values = 0;
+      int64_t batch_num_spaced_values = 0;
+      WriteLevelsSpaced(batch_size, def_levels + offset, rep_levels + offset,
+                        &batch_num_values, &batch_num_spaced_values);
+      WriteValuesSpaced(values + value_offset, batch_num_values, batch_num_spaced_values,
+                        valid_bits, valid_bits_offset + value_offset);
+      CommitWriteAndCheckLimits(batch_size, batch_num_spaced_values);
+      value_offset += batch_num_spaced_values;
+    };
+    DoInBatches(num_values, properties_->write_batch_size(), WriteBatch);
+  }
 
   Status WriteArrow(const int16_t* def_levels, const int16_t* rep_levels,
                     int64_t num_levels, const ::arrow::Array& array,
@@ -689,7 +734,24 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter<
   // Checks if the Dictionary Page size limit is reached
   // If the limit is reached, the Dictionary and Data Pages are serialized
   // The encoding is switched to PLAIN
-  void CheckDictionarySizeLimit();
+  //
+  // Only one Dictionary Page is written.
+  // Fallback to PLAIN if dictionary page limit is reached.
+  void CheckDictionarySizeLimit() {
+    // We have to dynamic cast here because TypedEncoder<DType> as some compilers
+    // don't want to cast through virtual inheritance
+    auto dict_encoder = dynamic_cast<DictEncoder<DType>*>(current_encoder_.get());
+    if (dict_encoder->dict_encoded_size() >= properties_->dictionary_pagesize_limit()) {
+      WriteDictionaryPage();
+      // Serialize the buffered Dictionary Indicies
+      FlushBufferedDataPages();
+      fallback_ = true;
+      // Only PLAIN encoding is supported for fallback in V1
+      current_encoder_ = MakeEncoder(DType::type_num, Encoding::PLAIN, false, descr_,
+                                     properties_->memory_pool());
+      encoding_ = Encoding::PLAIN;
+    }
+  }
 
   EncodedStatistics GetPageStatistics() override {
     EncodedStatistics result;
@@ -729,234 +791,126 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter<
   std::shared_ptr<TypedStats> page_statistics_;
   std::shared_ptr<TypedStats> chunk_statistics_;
 
-  inline int64_t WriteMiniBatch(int64_t num_values, const int16_t* def_levels,
-                                const int16_t* rep_levels, const T* values);
-
-  inline int64_t WriteMiniBatchSpaced(int64_t num_values, const int16_t* def_levels,
-                                      const int16_t* rep_levels,
-                                      const uint8_t* valid_bits,
-                                      int64_t valid_bits_offset, const T* values,
-                                      int64_t* num_spaced_written);
-
-  // Write values to a temporary buffer before they are encoded into pages
-  void WriteValues(int64_t num_values, const T* values) {
-    dynamic_cast<ValueEncoderType*>(current_encoder_.get())
-        ->Put(values, static_cast<int>(num_values));
-  }
-
-  void WriteValuesSpaced(int64_t num_values, const uint8_t* valid_bits,
-                         int64_t valid_bits_offset, const T* values) {
-    dynamic_cast<ValueEncoderType*>(current_encoder_.get())
-        ->PutSpaced(values, static_cast<int>(num_values), valid_bits, valid_bits_offset);
-  }
-};
-
-// Only one Dictionary Page is written.
-// Fallback to PLAIN if dictionary page limit is reached.
-template -void TypedColumnWriterImpl::CheckDictionarySizeLimit() { - // We have to dynamic cast here because TypedEncoder as some compilers - // don't want to cast through virtual inheritance - auto dict_encoder = dynamic_cast*>(current_encoder_.get()); - if (dict_encoder->dict_encoded_size() >= properties_->dictionary_pagesize_limit()) { - WriteDictionaryPage(); - // Serialize the buffered Dictionary Indicies - FlushBufferedDataPages(); - fallback_ = true; - // Only PLAIN encoding is supported for fallback in V1 - current_encoder_ = MakeEncoder(DType::type_num, Encoding::PLAIN, false, descr_, - properties_->memory_pool()); - encoding_ = Encoding::PLAIN; - } -} - -// ---------------------------------------------------------------------- -// Instantiate templated classes - -template -int64_t TypedColumnWriterImpl::WriteMiniBatch(int64_t num_values, - const int16_t* def_levels, - const int16_t* rep_levels, - const T* values) { - int64_t values_to_write = 0; - // If the field is required and non-repeated, there are no definition levels - if (descr_->max_definition_level() > 0) { - for (int64_t i = 0; i < num_values; ++i) { - if (def_levels[i] == descr_->max_definition_level()) { - ++values_to_write; + int64_t WriteLevels(int64_t num_values, const int16_t* def_levels, + const int16_t* rep_levels) { + int64_t values_to_write = 0; + // If the field is required and non-repeated, there are no definition levels + if (descr_->max_definition_level() > 0) { + for (int64_t i = 0; i < num_values; ++i) { + if (def_levels[i] == descr_->max_definition_level()) { + ++values_to_write; + } } - } - - WriteDefinitionLevels(num_values, def_levels); - } else { - // Required field, write all values - values_to_write = num_values; - } - // Not present for non-repeated fields - if (descr_->max_repetition_level() > 0) { - // A row could include more than one value - // Count the occasions where we start a new row - for (int64_t i = 0; i < num_values; ++i) { - if (rep_levels[i] == 0) { - rows_written_++; - } + WriteDefinitionLevels(num_values, def_levels); + } else { + // Required field, write all values + values_to_write = num_values; } - WriteRepetitionLevels(num_values, rep_levels); - } else { - // Each value is exactly one row - rows_written_ += static_cast(num_values); - } - - // PARQUET-780 - if (values_to_write > 0) { - DCHECK(nullptr != values) << "Values ptr cannot be NULL"; - } - - WriteValues(values_to_write, values); - - if (page_statistics_ != nullptr) { - page_statistics_->Update(values, values_to_write, num_values - values_to_write); - } - - num_buffered_values_ += num_values; - num_buffered_encoded_values_ += values_to_write; - - if (current_encoder_->EstimatedDataEncodedSize() >= properties_->data_pagesize()) { - AddDataPage(); - } - if (has_dictionary_ && !fallback_) { - CheckDictionarySizeLimit(); - } - - return values_to_write; -} + // Not present for non-repeated fields + if (descr_->max_repetition_level() > 0) { + // A row could include more than one value + // Count the occasions where we start a new row + for (int64_t i = 0; i < num_values; ++i) { + if (rep_levels[i] == 0) { + rows_written_++; + } + } -template -int64_t TypedColumnWriterImpl::WriteMiniBatchSpaced( - int64_t num_levels, const int16_t* def_levels, const int16_t* rep_levels, - const uint8_t* valid_bits, int64_t valid_bits_offset, const T* values, - int64_t* num_spaced_written) { - int64_t values_to_write = 0; - int64_t spaced_values_to_write = 0; - // If the field is required and non-repeated, there are no definition levels - if 
(descr_->max_definition_level() > 0) { - // Minimal definition level for which spaced values are written - int16_t min_spaced_def_level = descr_->max_definition_level(); - if (descr_->schema_node()->is_optional()) { - min_spaced_def_level--; + WriteRepetitionLevels(num_values, rep_levels); + } else { + // Each value is exactly one row + rows_written_ += static_cast(num_values); } - for (int64_t i = 0; i < num_levels; ++i) { - if (def_levels[i] == descr_->max_definition_level()) { - ++values_to_write; + return values_to_write; + } + + void WriteLevelsSpaced(int64_t num_levels, const int16_t* def_levels, + const int16_t* rep_levels, int64_t* out_values_to_write, + int64_t* out_spaced_values_to_write) { + int64_t values_to_write = 0; + int64_t spaced_values_to_write = 0; + // If the field is required and non-repeated, there are no definition levels + if (descr_->max_definition_level() > 0) { + // Minimal definition level for which spaced values are written + int16_t min_spaced_def_level = descr_->max_definition_level(); + if (descr_->schema_node()->is_optional()) { + min_spaced_def_level--; } - if (def_levels[i] >= min_spaced_def_level) { - ++spaced_values_to_write; + for (int64_t i = 0; i < num_levels; ++i) { + if (def_levels[i] == descr_->max_definition_level()) { + ++values_to_write; + } + if (def_levels[i] >= min_spaced_def_level) { + ++spaced_values_to_write; + } } - } - - WriteDefinitionLevels(num_levels, def_levels); - } else { - // Required field, write all values - values_to_write = num_levels; - spaced_values_to_write = num_levels; - } - // Not present for non-repeated fields - if (descr_->max_repetition_level() > 0) { - // A row could include more than one value - // Count the occasions where we start a new row - for (int64_t i = 0; i < num_levels; ++i) { - if (rep_levels[i] == 0) { - rows_written_++; - } + WriteDefinitionLevels(num_levels, def_levels); + } else { + // Required field, write all values + values_to_write = num_levels; + spaced_values_to_write = num_levels; } - WriteRepetitionLevels(num_levels, rep_levels); - } else { - // Each value is exactly one row - rows_written_ += static_cast(num_levels); - } + // Not present for non-repeated fields + if (descr_->max_repetition_level() > 0) { + // A row could include more than one value + // Count the occasions where we start a new row + for (int64_t i = 0; i < num_levels; ++i) { + if (rep_levels[i] == 0) { + rows_written_++; + } + } - if (descr_->schema_node()->is_optional()) { - WriteValuesSpaced(spaced_values_to_write, valid_bits, valid_bits_offset, values); - } else { - WriteValues(values_to_write, values); - } - *num_spaced_written = spaced_values_to_write; + WriteRepetitionLevels(num_levels, rep_levels); + } else { + // Each value is exactly one row + rows_written_ += static_cast(num_levels); + } - if (page_statistics_ != nullptr) { - page_statistics_->UpdateSpaced(values, valid_bits, valid_bits_offset, values_to_write, - spaced_values_to_write - values_to_write); + *out_values_to_write = values_to_write; + *out_spaced_values_to_write = spaced_values_to_write; } - num_buffered_values_ += num_levels; - num_buffered_encoded_values_ += values_to_write; + void CommitWriteAndCheckLimits(int64_t num_levels, int64_t num_values) { + num_buffered_values_ += num_levels; + num_buffered_encoded_values_ += num_values; - if (current_encoder_->EstimatedDataEncodedSize() >= properties_->data_pagesize()) { - AddDataPage(); - } - if (has_dictionary_ && !fallback_) { - CheckDictionarySizeLimit(); + if 
(current_encoder_->EstimatedDataEncodedSize() >= properties_->data_pagesize()) { + AddDataPage(); + } + if (has_dictionary_ && !fallback_) { + CheckDictionarySizeLimit(); + } } - return values_to_write; -} - -template -void TypedColumnWriterImpl::WriteBatch(int64_t num_values, - const int16_t* def_levels, - const int16_t* rep_levels, - const T* values) { - // We check for DataPage limits only after we have inserted the values. If a user - // writes a large number of values, the DataPage size can be much above the limit. - // The purpose of this chunking is to bound this. Even if a user writes large number - // of values, the chunking will ensure the AddDataPage() is called at a reasonable - // pagesize limit - int64_t write_batch_size = properties_->write_batch_size(); - int num_batches = static_cast(num_values / write_batch_size); - int64_t num_remaining = num_values % write_batch_size; - int64_t value_offset = 0; - for (int round = 0; round < num_batches; round++) { - int64_t offset = round * write_batch_size; - int64_t num_values = WriteMiniBatch(write_batch_size, &def_levels[offset], - &rep_levels[offset], &values[value_offset]); - value_offset += num_values; + void WriteValues(const T* values, int64_t num_values, int64_t num_nulls) { + dynamic_cast(current_encoder_.get()) + ->Put(values, static_cast(num_values)); + if (page_statistics_ != nullptr) { + page_statistics_->Update(values, num_values, num_nulls); + } } - // Write the remaining values - int64_t offset = num_batches * write_batch_size; - WriteMiniBatch(num_remaining, &def_levels[offset], &rep_levels[offset], - &values[value_offset]); -} -template -void TypedColumnWriterImpl::WriteBatchSpaced( - int64_t num_values, const int16_t* def_levels, const int16_t* rep_levels, - const uint8_t* valid_bits, int64_t valid_bits_offset, const T* values) { - // We check for DataPage limits only after we have inserted the values. If a user - // writes a large number of values, the DataPage size can be much above the limit. - // The purpose of this chunking is to bound this. 
Even if a user writes large number - // of values, the chunking will ensure the AddDataPage() is called at a reasonable - // pagesize limit - int64_t write_batch_size = properties_->write_batch_size(); - int num_batches = static_cast(num_values / write_batch_size); - int64_t num_remaining = num_values % write_batch_size; - int64_t num_spaced_written = 0; - int64_t values_offset = 0; - for (int round = 0; round < num_batches; round++) { - int64_t offset = round * write_batch_size; - WriteMiniBatchSpaced(write_batch_size, &def_levels[offset], &rep_levels[offset], - valid_bits, valid_bits_offset + values_offset, - values + values_offset, &num_spaced_written); - values_offset += num_spaced_written; + void WriteValuesSpaced(const T* values, int64_t num_values, int64_t num_spaced_values, + const uint8_t* valid_bits, int64_t valid_bits_offset) { + if (descr_->schema_node()->is_optional()) { + dynamic_cast(current_encoder_.get()) + ->PutSpaced(values, static_cast(num_spaced_values), valid_bits, + valid_bits_offset); + } else { + dynamic_cast(current_encoder_.get()) + ->Put(values, static_cast(num_values)); + } + if (page_statistics_ != nullptr) { + const int64_t num_nulls = num_spaced_values - num_values; + page_statistics_->UpdateSpaced(values, valid_bits, valid_bits_offset, num_values, + num_nulls); + } } - // Write the remaining values - int64_t offset = num_batches * write_batch_size; - WriteMiniBatchSpaced(num_remaining, &def_levels[offset], &rep_levels[offset], - valid_bits, valid_bits_offset + values_offset, - values + values_offset, &num_spaced_written); -} +}; // ---------------------------------------------------------------------- // Direct Arrow write path From c4d7dc27927ab7c3c3f3981112ce56cae034e0c3 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 9 Aug 2019 11:24:48 -0500 Subject: [PATCH 02/28] Refactor and add Arrow encoder stubs --- cpp/src/parquet/column_writer.cc | 29 +++++- cpp/src/parquet/encoding.cc | 171 ++++++++++++++----------------- cpp/src/parquet/encoding.h | 24 ++++- 3 files changed, 122 insertions(+), 102 deletions(-) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 9c25604a91e..fa0ea233ded 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1344,12 +1344,31 @@ Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_level int64_t num_levels, const ::arrow::Array& array, ArrowWriteContext* ctx) { - switch (array.type()->id()) { - WRITE_SERIALIZE_CASE(BINARY, BinaryType, ByteArrayType) - WRITE_SERIALIZE_CASE(STRING, BinaryType, ByteArrayType) - default: - ARROW_UNSUPPORTED(); + if (array.type()->id() != ::arrow::Type::BINARY && + array.type()->id() != ::arrow::Type::STRING) { + ARROW_UNSUPPORTED(); } + const auto& data = checked_cast<::arrow::BinaryArray&>(array); + + // Like WriteBatch, but for spaced values + int64_t value_offset = 0; + auto WriteBatch = [&](int64_t offset, int64_t batch_size) { + int64_t batch_num_values = 0; + int64_t batch_num_spaced_values = 0; + WriteLevelsSpaced(batch_size, def_levels + offset, rep_levels + offset, + &batch_num_values, &batch_num_spaced_values); + + dynamic_cast(current_encoder_.get()) + ->Put(*data.Slice(offset, batch_size)); + + // TODO(wesm): Update page statistics + + CommitWriteAndCheckLimits(batch_size, batch_num_spaced_values); + value_offset += batch_num_spaced_values; + }; + PARQUET_CATCH_NOT_OK( + DoInBatches(num_values, properties_->write_batch_size(), WriteBatch)); + return Status::OK(); } // 
---------------------------------------------------------------------- diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 57d3dd73869..5e3a38ddbb5 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -75,11 +75,23 @@ class PlainEncoder : public EncoderImpl, virtual public TypedEncoder { public: using T = typename DType::c_type; - explicit PlainEncoder(const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + explicit PlainEncoder(const ColumnDescriptor* descr, ::arrow::MemoryPool* pool) + : EncoderImpl(descr, Encoding::PLAIN, pool) { + values_sink_ = CreateOutputStream(pool); + } - int64_t EstimatedDataEncodedSize() override; - std::shared_ptr FlushValues() override; + int64_t EstimatedDataEncodedSize() override { + int64_t position = -1; + PARQUET_THROW_NOT_OK(values_sink_->Tell(&position)); + return position; + } + + std::shared_ptr FlushValues() override { + std::shared_ptr buffer; + PARQUET_THROW_NOT_OK(values_sink_->Finish(&buffer)); + values_sink_ = CreateOutputStream(this->pool_); + return buffer; + } void Put(const T* buffer, int num_values) override; @@ -87,27 +99,6 @@ class PlainEncoder : public EncoderImpl, virtual public TypedEncoder { std::shared_ptr<::arrow::io::BufferOutputStream> values_sink_; }; -template -PlainEncoder::PlainEncoder(const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool) - : EncoderImpl(descr, Encoding::PLAIN, pool) { - values_sink_ = CreateOutputStream(pool); -} -template -int64_t PlainEncoder::EstimatedDataEncodedSize() { - int64_t position = -1; - PARQUET_THROW_NOT_OK(values_sink_->Tell(&position)); - return position; -} - -template -std::shared_ptr PlainEncoder::FlushValues() { - std::shared_ptr buffer; - PARQUET_THROW_NOT_OK(values_sink_->Finish(&buffer)); - values_sink_ = CreateOutputStream(this->pool_); - return buffer; -} - template void PlainEncoder::Put(const T* buffer, int num_values) { PARQUET_THROW_NOT_OK(values_sink_->Write(reinterpret_cast(buffer), @@ -145,6 +136,9 @@ class PlainByteArrayEncoder : public PlainEncoder, public: using BASE = PlainEncoder; using BASE::PlainEncoder; + using BASE::Put; + + void Put(const ::arrow::BinaryArray& values) override { return; } }; class PlainFLBAEncoder : public PlainEncoder, virtual public FLBAEncoder { @@ -275,6 +269,9 @@ struct DictEncoderTraits { using MemoTableType = ::arrow::internal::BinaryMemoTable; }; +// Initially 1024 elements +static constexpr int32_t kInitialHashTableSize = 1 << 10; + /// See the dictionary encoding section of https://github.com/Parquet/parquet-format. /// The encoding supports streaming encoding. Values are encoded as they are added while /// the dictionary is being constructed. At any time, the buffered values can be @@ -287,9 +284,10 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder { public: typedef typename DType::c_type T; - explicit DictEncoderImpl( - const ColumnDescriptor* desc, - ::arrow::MemoryPool* allocator = ::arrow::default_memory_pool()); + explicit DictEncoderImpl(const ColumnDescriptor* desc, ::arrow::MemoryPool* pool) + : EncoderImpl(desc, Encoding::PLAIN_DICTIONARY, pool), + dict_encoded_size_(0), + memo_table_(pool, kInitialHashTableSize) {} ~DictEncoderImpl() override { DCHECK(buffered_indices_.empty()); } @@ -315,20 +313,56 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder { /// Returns a conservative estimate of the number of bytes needed to encode the buffered /// indices. 
Used to size the buffer passed to WriteIndices(). - int64_t EstimatedDataEncodedSize() override; + int64_t EstimatedDataEncodedSize() override { + // Note: because of the way RleEncoder::CheckBufferFull() is called, we have to + // reserve + // an extra "RleEncoder::MinBufferSize" bytes. These extra bytes won't be used + // but not reserving them would cause the encoder to fail. + return 1 + + ::arrow::util::RleEncoder::MaxBufferSize( + bit_width(), static_cast(buffered_indices_.size())) + + ::arrow::util::RleEncoder::MinBufferSize(bit_width()); + } /// The minimum bit width required to encode the currently buffered indices. - int bit_width() const override; + int bit_width() const override { + if (ARROW_PREDICT_FALSE(num_entries() == 0)) return 0; + if (ARROW_PREDICT_FALSE(num_entries() == 1)) return 1; + return BitUtil::Log2(num_entries()); + } /// Encode value. Note that this does not actually write any data, just /// buffers the value's index to be written later. inline void Put(const T& value); - void Put(const T* values, int num_values) override; - std::shared_ptr FlushValues() override; + void Put(const T* src, int num_values) override { + for (int32_t i = 0; i < num_values; i++) { + Put(src[i]); + } + } void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits, - int64_t valid_bits_offset) override; + int64_t valid_bits_offset) override { + ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, + num_values); + for (int32_t i = 0; i < num_values; i++) { + if (valid_bits_reader.IsSet()) { + Put(src[i]); + } + valid_bits_reader.Next(); + } + } + + void PutIndices(const int32_t* indices, int length) override { return; } + + std::shared_ptr FlushValues() override { + std::shared_ptr buffer = + AllocateBuffer(this->pool_, EstimatedDataEncodedSize()); + int result_size = WriteIndices(buffer->mutable_data(), + static_cast(EstimatedDataEncodedSize())); + PARQUET_THROW_NOT_OK(buffer->Resize(result_size, false)); + return std::move(buffer); + } /// Writes out the encoded dictionary to buffer. buffer must be preallocated to /// dict_encoded_size() bytes. @@ -350,66 +384,6 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder { MemoTableType memo_table_; }; -// Initially 1024 elements -static constexpr int32_t INITIAL_HASH_TABLE_SIZE = 1 << 10; - -template -DictEncoderImpl::DictEncoderImpl(const ColumnDescriptor* desc, - ::arrow::MemoryPool* pool) - : EncoderImpl(desc, Encoding::PLAIN_DICTIONARY, pool), - dict_encoded_size_(0), - memo_table_(pool, INITIAL_HASH_TABLE_SIZE) {} - -template -int64_t DictEncoderImpl::EstimatedDataEncodedSize() { - // Note: because of the way RleEncoder::CheckBufferFull() is called, we have to - // reserve - // an extra "RleEncoder::MinBufferSize" bytes. These extra bytes won't be used - // but not reserving them would cause the encoder to fail. 
- return 1 + - ::arrow::util::RleEncoder::MaxBufferSize( - bit_width(), static_cast(buffered_indices_.size())) + - ::arrow::util::RleEncoder::MinBufferSize(bit_width()); -} - -template -int DictEncoderImpl::bit_width() const { - if (ARROW_PREDICT_FALSE(num_entries() == 0)) return 0; - if (ARROW_PREDICT_FALSE(num_entries() == 1)) return 1; - return BitUtil::Log2(num_entries()); -} - -template -std::shared_ptr DictEncoderImpl::FlushValues() { - std::shared_ptr buffer = - AllocateBuffer(this->pool_, EstimatedDataEncodedSize()); - int result_size = - WriteIndices(buffer->mutable_data(), static_cast(EstimatedDataEncodedSize())); - PARQUET_THROW_NOT_OK(buffer->Resize(result_size, false)); - return std::move(buffer); -} - -template -void DictEncoderImpl::Put(const T* src, int num_values) { - for (int32_t i = 0; i < num_values; i++) { - Put(src[i]); - } -} - -template -void DictEncoderImpl::PutSpaced(const T* src, int num_values, - const uint8_t* valid_bits, - int64_t valid_bits_offset) { - ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, - num_values); - for (int32_t i = 0; i < num_values; i++) { - if (valid_bits_reader.IsSet()) { - Put(src[i]); - } - valid_bits_reader.Next(); - } -} - template void DictEncoderImpl::WriteDict(uint8_t* buffer) { // For primitive types, only a memcpy @@ -479,11 +453,16 @@ inline void DictEncoderImpl::Put(const FixedLenByteArray& v) { buffered_indices_.push_back(memo_index); } -class DictByteArrayEncoder : public DictEncoderImpl, - virtual public ByteArrayEncoder { +class DictByteArrayEncoderImpl : public DictEncoderImpl, + virtual public DictByteArrayEncoder { public: using BASE = DictEncoderImpl; using BASE::DictEncoderImpl; + using BASE::Put; + + void Put(const ::arrow::BinaryArray& values) override { return; } + + void PutDictionary(const ::arrow::BinaryArray& values) override { return; } }; class DictFLBAEncoder : public DictEncoderImpl, virtual public FLBAEncoder { @@ -511,7 +490,7 @@ std::unique_ptr MakeEncoder(Type::type type_num, Encoding::type encodin case Type::DOUBLE: return std::unique_ptr(new DictEncoderImpl(descr, pool)); case Type::BYTE_ARRAY: - return std::unique_ptr(new DictByteArrayEncoder(descr, pool)); + return std::unique_ptr(new DictByteArrayEncoderImpl(descr, pool)); case Type::FIXED_LEN_BYTE_ARRAY: return std::unique_ptr(new DictFLBAEncoder(descr, pool)); default: diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index 5aa1fed74b6..395d6b77546 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -29,6 +29,7 @@ namespace arrow { class ArrayBuilder; +class BinaryArray; class BinaryDictionary32Builder; namespace internal { @@ -105,6 +106,13 @@ class DictEncoder : virtual public TypedEncoder { virtual void WriteDict(uint8_t* buffer) = 0; virtual int num_entries() const = 0; + + /// \brief EXPERIMENTAL: Append dictionary indices into the + /// encoder. 
It is assumed that the indices reference pre-existing + /// dictionary values + /// \param[in] indices the dictionary index values + /// \param[in] length the number of values being inserted + virtual void PutIndices(const int32_t* indices, int length) = 0; }; // ---------------------------------------------------------------------- @@ -204,9 +212,23 @@ using Int64Encoder = TypedEncoder; using Int96Encoder = TypedEncoder; using FloatEncoder = TypedEncoder; using DoubleEncoder = TypedEncoder; -class ByteArrayEncoder : virtual public TypedEncoder {}; +class ByteArrayEncoder : virtual public TypedEncoder { + public: + using TypedEncoder::Put; + virtual void Put(const ::arrow::BinaryArray& values) = 0; +}; + class FLBAEncoder : virtual public TypedEncoder {}; +class DictByteArrayEncoder : virtual public DictEncoder, + virtual public ByteArrayEncoder { + public: + /// \brief EXPERIMENTAL: Append dictionary into encoder, inserting + /// indices separately + /// \param[in] values the dictionary values + virtual void PutDictionary(const ::arrow::BinaryArray& values) = 0; +}; + class BooleanDecoder : virtual public TypedDecoder { public: using TypedDecoder::Decode; From 245f445793bcc26dd5d8bca31cf0730b4005377c Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 9 Aug 2019 14:03:15 -0500 Subject: [PATCH 03/28] ByteArray statistics specializations [skip ci] --- cpp/src/parquet/encoding.h | 22 +++++++++++----------- cpp/src/parquet/statistics.cc | 33 +++++++++++++++++++++++++++++++-- cpp/src/parquet/statistics.h | 30 +++++++++++++++++++++++------- 3 files changed, 65 insertions(+), 20 deletions(-) diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index 395d6b77546..7074637c92e 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -60,7 +60,7 @@ class Encoder { // // TODO(wesm): Encode interface API is temporary template -class TypedEncoder : virtual public Encoder { +class TypedEncoder : public Encoder { public: typedef typename DType::c_type T; @@ -87,7 +87,7 @@ class TypedEncoder : virtual public Encoder { // Base class for dictionary encoders template -class DictEncoder : virtual public TypedEncoder { +class DictEncoder : public TypedEncoder { public: /// Writes out any buffered indices to buffer preceded by the bit width of this data. /// Returns the number of bytes written. 
@@ -133,7 +133,7 @@ class Decoder { }; template -class TypedDecoder : virtual public Decoder { +class TypedDecoder : public Decoder { public: using T = typename DType::c_type; @@ -173,7 +173,7 @@ class TypedDecoder : virtual public Decoder { }; template -class DictDecoder : virtual public TypedDecoder { +class DictDecoder : public TypedDecoder { public: virtual void SetDict(TypedDecoder* dictionary) = 0; @@ -201,7 +201,7 @@ class DictDecoder : virtual public TypedDecoder { // ---------------------------------------------------------------------- // TypedEncoder specializations, traits, and factory functions -class BooleanEncoder : virtual public TypedEncoder { +class BooleanEncoder : public TypedEncoder { public: using TypedEncoder::Put; virtual void Put(const std::vector& src, int num_values) = 0; @@ -212,15 +212,15 @@ using Int64Encoder = TypedEncoder; using Int96Encoder = TypedEncoder; using FloatEncoder = TypedEncoder; using DoubleEncoder = TypedEncoder; -class ByteArrayEncoder : virtual public TypedEncoder { +class ByteArrayEncoder : public TypedEncoder { public: using TypedEncoder::Put; virtual void Put(const ::arrow::BinaryArray& values) = 0; }; -class FLBAEncoder : virtual public TypedEncoder {}; +class FLBAEncoder : public TypedEncoder {}; -class DictByteArrayEncoder : virtual public DictEncoder, +class DictByteArrayEncoder : public DictEncoder, virtual public ByteArrayEncoder { public: /// \brief EXPERIMENTAL: Append dictionary into encoder, inserting @@ -229,7 +229,7 @@ class DictByteArrayEncoder : virtual public DictEncoder, virtual void PutDictionary(const ::arrow::BinaryArray& values) = 0; }; -class BooleanDecoder : virtual public TypedDecoder { +class BooleanDecoder : public TypedDecoder { public: using TypedDecoder::Decode; virtual int Decode(uint8_t* buffer, int max_values) = 0; @@ -241,7 +241,7 @@ using Int96Decoder = TypedDecoder; using FloatDecoder = TypedDecoder; using DoubleDecoder = TypedDecoder; -class ByteArrayDecoder : virtual public TypedDecoder { +class ByteArrayDecoder : public TypedDecoder { public: using TypedDecoder::DecodeSpaced; @@ -260,7 +260,7 @@ class ByteArrayDecoder : virtual public TypedDecoder { ::arrow::internal::ChunkedBinaryBuilder* builder) = 0; }; -class FLBADecoder : virtual public TypedDecoder { +class FLBADecoder : public TypedDecoder { public: using TypedDecoder::DecodeSpaced; diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index 16abc152cf4..56d403dff1a 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -177,6 +177,35 @@ class TypedComparatorImpl : public TypedComparator { int type_length_; }; +template +class ByteArrayComparatorImpl : public TypedComparatorImpl, + virtual public ByteArrayComparator { + public: + using BASE = TypedComparatorImpl; + using BASE::BASE; + using BASE::GetMinMax; + + void GetMinMax(const ::arrow::BinaryArray& values, ByteArray* out_min, + ByteArray* out_max) override { + ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, + length); + T min = values[0]; + T max = values[0]; + for (int64_t i = 0; i < length; i++) { + if (valid_bits_reader.IsSet()) { + if (CompareInline(values[i], min)) { + min = values[i]; + } else if (CompareInline(max, values[i])) { + max = values[i]; + } + } + valid_bits_reader.Next(); + } + *out_min = min; + *out_max = max; + } +}; + std::shared_ptr Comparator::Make(Type::type physical_type, SortOrder::type sort_order, int type_length) { @@ -195,7 +224,7 @@ std::shared_ptr Comparator::Make(Type::type 
physical_type, case Type::DOUBLE: return std::make_shared>(); case Type::BYTE_ARRAY: - return std::make_shared>(); + return std::make_shared(); case Type::FIXED_LEN_BYTE_ARRAY: return std::make_shared>(type_length); default: @@ -210,7 +239,7 @@ std::shared_ptr Comparator::Make(Type::type physical_type, case Type::INT96: return std::make_shared>(); case Type::BYTE_ARRAY: - return std::make_shared>(); + return std::make_shared>(); case Type::FIXED_LEN_BYTE_ARRAY: return std::make_shared>(type_length); default: diff --git a/cpp/src/parquet/statistics.h b/cpp/src/parquet/statistics.h index 402b3c38923..948bb27a69f 100644 --- a/cpp/src/parquet/statistics.h +++ b/cpp/src/parquet/statistics.h @@ -26,6 +26,12 @@ #include "parquet/platform.h" #include "parquet/types.h" +namespace arrow { + +class BinaryArray; + +} // namespace arrow + namespace parquet { class ColumnDescriptor; @@ -100,6 +106,14 @@ class TypedComparator : public Comparator { int64_t valid_bits_offset, T* out_min, T* out_max) = 0; }; +class ByteArrayComparator : public TypedComparator { + public: + using TypedComparator::GetMinMax; + + virtual void GetMinMax(const ::arrow::BinaryArray& values, + ByteArray* out_min, ByteArray* out_max) = 0; +}; + // ---------------------------------------------------------------------- /// \brief Structure represented encoded statistics to be written to @@ -289,24 +303,26 @@ class TypedStatistics : public Statistics { /// \brief Batch statistics update with supplied validity bitmap virtual void UpdateSpaced(const T* values, const uint8_t* valid_bits, - int64_t valid_bits_spaced, int64_t num_not_null, + int64_t valid_bits_offset, int64_t num_not_null, int64_t num_null) = 0; /// \brief Set min and max values to particular values virtual void SetMinMax(const T& min, const T& max) = 0; }; -#ifndef ARROW_NO_DEPRECATED_API -// TODO(wesm): Remove after Arrow 0.14.0 -using RowGroupStatistics = Statistics; -#endif - using BoolStatistics = TypedStatistics; using Int32Statistics = TypedStatistics; using Int64Statistics = TypedStatistics; using FloatStatistics = TypedStatistics; using DoubleStatistics = TypedStatistics; -using ByteArrayStatistics = TypedStatistics; + +class ByteArrayStatistics : public TypedStatistics { + public: + using TypedStatistics::Update; + + virtual void Update(const ::arrow::BinaryArray& values) = 0; +}; + using FLBAStatistics = TypedStatistics; } // namespace parquet From 882e4341fbfccddedb57c0f874cd247dea2c54c6 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 11 Aug 2019 11:35:01 -0500 Subject: [PATCH 04/28] TypedComparator/TypedStatistics augmentations for arrow::BinaryArray --- cpp/src/parquet/column_writer-test.cc | 4 +- cpp/src/parquet/column_writer.cc | 14 +- cpp/src/parquet/encoding.h | 22 +-- cpp/src/parquet/metadata.cc | 4 +- cpp/src/parquet/statistics-test.cc | 60 +++--- cpp/src/parquet/statistics.cc | 255 +++++++++++++++----------- cpp/src/parquet/statistics.h | 127 ++++++------- cpp/src/parquet/types.h | 84 +++++++++ 8 files changed, 347 insertions(+), 223 deletions(-) diff --git a/cpp/src/parquet/column_writer-test.cc b/cpp/src/parquet/column_writer-test.cc index dd0d65aa5cd..fcc8344ac06 100644 --- a/cpp/src/parquet/column_writer-test.cc +++ b/cpp/src/parquet/column_writer-test.cc @@ -218,7 +218,7 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { void ReadAndCompare(Compression::type compression, int64_t num_rows) { this->SetupValuesOut(num_rows); this->ReadColumnFully(compression); - auto comparator = TypedComparator::Make(this->descr_); + auto comparator = 
MakeComparator(this->descr_); for (size_t i = 0; i < this->values_.size(); i++) { if (comparator->Compare(this->values_[i], this->values_out_[i]) || comparator->Compare(this->values_out_[i], this->values_[i])) { @@ -310,7 +310,7 @@ void TestPrimitiveWriter::ReadAndCompare(Compression::type compressio this->SetupValuesOut(num_rows); this->ReadColumnFully(compression); - auto comparator = TypedComparator::Make(Type::INT96, SortOrder::SIGNED); + auto comparator = MakeComparator(Type::INT96, SortOrder::SIGNED); for (size_t i = 0; i < this->values_.size(); i++) { if (comparator->Compare(this->values_[i], this->values_out_[i]) || comparator->Compare(this->values_out_[i], this->values_[i])) { diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index fa0ea233ded..5aefd2b8068 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -657,8 +657,8 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< if (properties->statistics_enabled(descr_->path()) && (SortOrder::UNKNOWN != descr_->sort_order())) { - page_statistics_ = TypedStats::Make(descr_, allocator_); - chunk_statistics_ = TypedStats::Make(descr_, allocator_); + page_statistics_ = MakeStatistics(descr_, allocator_); + chunk_statistics_ = MakeStatistics(descr_, allocator_); } } @@ -786,7 +786,7 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< private: using ValueEncoderType = typename EncodingTraits::Encoder; - using TypedStats = TypedStatistics; + using TypedStats = typename TypeClasses::Statistics; std::unique_ptr current_encoder_; std::shared_ptr page_statistics_; std::shared_ptr chunk_statistics_; @@ -1348,7 +1348,7 @@ Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_level array.type()->id() != ::arrow::Type::STRING) { ARROW_UNSUPPORTED(); } - const auto& data = checked_cast<::arrow::BinaryArray&>(array); + // const auto& data = checked_cast(array); // Like WriteBatch, but for spaced values int64_t value_offset = 0; @@ -1358,8 +1358,8 @@ Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_level WriteLevelsSpaced(batch_size, def_levels + offset, rep_levels + offset, &batch_num_values, &batch_num_spaced_values); - dynamic_cast(current_encoder_.get()) - ->Put(*data.Slice(offset, batch_size)); + // dynamic_cast(current_encoder_.get()) + // ->Put(*data.Slice(offset, batch_size)); // TODO(wesm): Update page statistics @@ -1367,7 +1367,7 @@ Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_level value_offset += batch_num_spaced_values; }; PARQUET_CATCH_NOT_OK( - DoInBatches(num_values, properties_->write_batch_size(), WriteBatch)); + DoInBatches(array.length(), properties_->write_batch_size(), WriteBatch)); return Status::OK(); } diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index 7074637c92e..395d6b77546 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -60,7 +60,7 @@ class Encoder { // // TODO(wesm): Encode interface API is temporary template -class TypedEncoder : public Encoder { +class TypedEncoder : virtual public Encoder { public: typedef typename DType::c_type T; @@ -87,7 +87,7 @@ class TypedEncoder : public Encoder { // Base class for dictionary encoders template -class DictEncoder : public TypedEncoder { +class DictEncoder : virtual public TypedEncoder { public: /// Writes out any buffered indices to buffer preceded by the bit width of this data. /// Returns the number of bytes written. 
@@ -133,7 +133,7 @@ class Decoder { }; template -class TypedDecoder : public Decoder { +class TypedDecoder : virtual public Decoder { public: using T = typename DType::c_type; @@ -173,7 +173,7 @@ class TypedDecoder : public Decoder { }; template -class DictDecoder : public TypedDecoder { +class DictDecoder : virtual public TypedDecoder { public: virtual void SetDict(TypedDecoder* dictionary) = 0; @@ -201,7 +201,7 @@ class DictDecoder : public TypedDecoder { // ---------------------------------------------------------------------- // TypedEncoder specializations, traits, and factory functions -class BooleanEncoder : public TypedEncoder { +class BooleanEncoder : virtual public TypedEncoder { public: using TypedEncoder::Put; virtual void Put(const std::vector& src, int num_values) = 0; @@ -212,15 +212,15 @@ using Int64Encoder = TypedEncoder; using Int96Encoder = TypedEncoder; using FloatEncoder = TypedEncoder; using DoubleEncoder = TypedEncoder; -class ByteArrayEncoder : public TypedEncoder { +class ByteArrayEncoder : virtual public TypedEncoder { public: using TypedEncoder::Put; virtual void Put(const ::arrow::BinaryArray& values) = 0; }; -class FLBAEncoder : public TypedEncoder {}; +class FLBAEncoder : virtual public TypedEncoder {}; -class DictByteArrayEncoder : public DictEncoder, +class DictByteArrayEncoder : virtual public DictEncoder, virtual public ByteArrayEncoder { public: /// \brief EXPERIMENTAL: Append dictionary into encoder, inserting @@ -229,7 +229,7 @@ class DictByteArrayEncoder : public DictEncoder, virtual void PutDictionary(const ::arrow::BinaryArray& values) = 0; }; -class BooleanDecoder : public TypedDecoder { +class BooleanDecoder : virtual public TypedDecoder { public: using TypedDecoder::Decode; virtual int Decode(uint8_t* buffer, int max_values) = 0; @@ -241,7 +241,7 @@ using Int96Decoder = TypedDecoder; using FloatDecoder = TypedDecoder; using DoubleDecoder = TypedDecoder; -class ByteArrayDecoder : public TypedDecoder { +class ByteArrayDecoder : virtual public TypedDecoder { public: using TypedDecoder::DecodeSpaced; @@ -260,7 +260,7 @@ class ByteArrayDecoder : public TypedDecoder { ::arrow::internal::ChunkedBinaryBuilder* builder) = 0; }; -class FLBADecoder : public TypedDecoder { +class FLBADecoder : virtual public TypedDecoder { public: using TypedDecoder::DecodeSpaced; diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 5410dc8367c..c8718f07d62 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -81,14 +81,14 @@ static std::shared_ptr MakeTypedColumnStats( const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) { // If ColumnOrder is defined, return max_value and min_value if (descr->column_order().get_order() == ColumnOrder::TYPE_DEFINED_ORDER) { - return TypedStatistics::Make( + return MakeStatistics( descr, metadata.statistics.min_value, metadata.statistics.max_value, metadata.num_values - metadata.statistics.null_count, metadata.statistics.null_count, metadata.statistics.distinct_count, metadata.statistics.__isset.max_value || metadata.statistics.__isset.min_value); } // Default behavior - return TypedStatistics::Make( + return MakeStatistics( descr, metadata.statistics.min, metadata.statistics.max, metadata.num_values - metadata.statistics.null_count, metadata.statistics.null_count, metadata.statistics.distinct_count, diff --git a/cpp/src/parquet/statistics-test.cc b/cpp/src/parquet/statistics-test.cc index fa1caa96d31..ccc2d248f2d 100644 --- a/cpp/src/parquet/statistics-test.cc +++ 
b/cpp/src/parquet/statistics-test.cc @@ -62,8 +62,7 @@ static FLBA FLBAFromString(const std::string& s) { } TEST(Comparison, SignedByteArray) { - auto comparator = - TypedComparator::Make(Type::BYTE_ARRAY, SortOrder::SIGNED); + auto comparator = MakeComparator(Type::BYTE_ARRAY, SortOrder::SIGNED); std::string s1 = "12345"; std::string s2 = "12345678"; @@ -82,8 +81,7 @@ TEST(Comparison, SignedByteArray) { TEST(Comparison, UnsignedByteArray) { // Check if UTF-8 is compared using unsigned correctly - auto comparator = - TypedComparator::Make(Type::BYTE_ARRAY, SortOrder::UNSIGNED); + auto comparator = MakeComparator(Type::BYTE_ARRAY, SortOrder::UNSIGNED); std::string s1 = "arrange"; std::string s2 = "arrangement"; @@ -107,8 +105,8 @@ TEST(Comparison, UnsignedByteArray) { TEST(Comparison, SignedFLBA) { int size = 10; - auto comparator = TypedComparator::Make(Type::FIXED_LEN_BYTE_ARRAY, - SortOrder::SIGNED, size); + auto comparator = + MakeComparator(Type::FIXED_LEN_BYTE_ARRAY, SortOrder::SIGNED, size); std::string s1 = "Anti123456"; std::string s2 = "Bunkd123456"; @@ -125,8 +123,8 @@ TEST(Comparison, SignedFLBA) { TEST(Comparison, UnsignedFLBA) { int size = 10; - auto comparator = TypedComparator::Make(Type::FIXED_LEN_BYTE_ARRAY, - SortOrder::UNSIGNED, size); + auto comparator = + MakeComparator(Type::FIXED_LEN_BYTE_ARRAY, SortOrder::UNSIGNED, size); std::string s1 = "Anti123456"; std::string s2 = "Bunkd123456"; @@ -146,7 +144,7 @@ TEST(Comparison, SignedInt96) { parquet::Int96 aa{{1, 41, 14}}, bb{{1, 41, 14}}; parquet::Int96 aaa{{1, 41, static_cast(-14)}}, bbb{{1, 41, 42}}; - auto comparator = TypedComparator::Make(Type::INT96, SortOrder::SIGNED); + auto comparator = MakeComparator(Type::INT96, SortOrder::SIGNED); ASSERT_TRUE(comparator->Compare(a, b)); ASSERT_TRUE(!comparator->Compare(aa, bb) && !comparator->Compare(bb, aa)); @@ -158,7 +156,7 @@ TEST(Comparison, UnsignedInt96) { parquet::Int96 aa{{1, 41, 14}}, bb{{1, 41, static_cast(-14)}}; parquet::Int96 aaa, bbb; - auto comparator = TypedComparator::Make(Type::INT96, SortOrder::UNSIGNED); + auto comparator = MakeComparator(Type::INT96, SortOrder::UNSIGNED); ASSERT_TRUE(comparator->Compare(a, b)); ASSERT_TRUE(comparator->Compare(aa, bb)); @@ -197,7 +195,7 @@ TEST(Comparison, SignedInt64) { NodePtr node = PrimitiveNode::Make("SignedInt64", Repetition::REQUIRED, Type::INT64); ColumnDescriptor descr(node, 0, 0); - auto comparator = TypedComparator::Make(&descr); + auto comparator = MakeComparator(&descr); ASSERT_TRUE(comparator->Compare(a, b)); ASSERT_TRUE(!comparator->Compare(aa, bb) && !comparator->Compare(bb, aa)); @@ -214,7 +212,7 @@ TEST(Comparison, UnsignedInt64) { ColumnDescriptor descr(node, 0, 0); ASSERT_EQ(SortOrder::UNSIGNED, descr.sort_order()); - auto comparator = TypedComparator::Make(&descr); + auto comparator = MakeComparator(&descr); ASSERT_TRUE(comparator->Compare(a, b)); ASSERT_TRUE(!comparator->Compare(aa, bb) && !comparator->Compare(bb, aa)); @@ -231,7 +229,7 @@ TEST(Comparison, UnsignedInt32) { ColumnDescriptor descr(node, 0, 0); ASSERT_EQ(SortOrder::UNSIGNED, descr.sort_order()); - auto comparator = TypedComparator::Make(&descr); + auto comparator = MakeComparator(&descr); ASSERT_TRUE(comparator->Compare(a, b)); ASSERT_TRUE(!comparator->Compare(aa, bb) && !comparator->Compare(bb, aa)); @@ -253,7 +251,6 @@ template class TestStatistics : public PrimitiveTypedTest { public: using T = typename TestType::c_type; - using TypedStats = TypedStatistics; std::vector GetDeepCopy( const std::vector&); // allocates new memory for 
FLBA/ByteArray @@ -264,15 +261,16 @@ class TestStatistics : public PrimitiveTypedTest { void TestMinMaxEncode() { this->GenerateData(1000); - auto statistics1 = TypedStats::Make(this->schema_.Column(0)); + auto statistics1 = MakeStatistics(this->schema_.Column(0)); statistics1->Update(this->values_ptr_, this->values_.size(), 0); std::string encoded_min = statistics1->EncodeMin(); std::string encoded_max = statistics1->EncodeMax(); - auto statistics2 = TypedStats::Make(this->schema_.Column(0), encoded_min, encoded_max, - this->values_.size(), 0, 0, true); + auto statistics2 = + MakeStatistics(this->schema_.Column(0), encoded_min, encoded_max, + this->values_.size(), 0, 0, true); - auto statistics3 = TypedStats::Make(this->schema_.Column(0)); + auto statistics3 = MakeStatistics(this->schema_.Column(0)); std::vector valid_bits( BitUtil::BytesForBits(static_cast(this->values_.size())) + 1, 255); statistics3->UpdateSpaced(this->values_ptr_, valid_bits.data(), 0, @@ -293,7 +291,7 @@ class TestStatistics : public PrimitiveTypedTest { void TestReset() { this->GenerateData(1000); - auto statistics = TypedStats::Make(this->schema_.Column(0)); + auto statistics = MakeStatistics(this->schema_.Column(0)); statistics->Update(this->values_ptr_, this->values_.size(), 0); ASSERT_EQ(this->values_.size(), statistics->num_values()); @@ -308,17 +306,17 @@ class TestStatistics : public PrimitiveTypedTest { int num_null[2]; random_numbers(2, 42, 0, 100, num_null); - auto statistics1 = TypedStats::Make(this->schema_.Column(0)); + auto statistics1 = MakeStatistics(this->schema_.Column(0)); this->GenerateData(1000); statistics1->Update(this->values_ptr_, this->values_.size() - num_null[0], num_null[0]); - auto statistics2 = TypedStats::Make(this->schema_.Column(0)); + auto statistics2 = MakeStatistics(this->schema_.Column(0)); this->GenerateData(1000); statistics2->Update(this->values_ptr_, this->values_.size() - num_null[1], num_null[1]); - auto total = TypedStats::Make(this->schema_.Column(0)); + auto total = MakeStatistics(this->schema_.Column(0)); total->Merge(*statistics1); total->Merge(*statistics2); @@ -332,7 +330,7 @@ class TestStatistics : public PrimitiveTypedTest { this->GenerateData(num_values); // compute statistics for the whole batch - auto expected_stats = TypedStats::Make(this->schema_.Column(0)); + auto expected_stats = MakeStatistics(this->schema_.Column(0)); expected_stats->Update(this->values_ptr_, num_values - null_count, null_count); auto sink = CreateOutputStream(); @@ -456,7 +454,7 @@ template <> void TestStatistics::TestMinMaxEncode() { this->GenerateData(1000); // Test that we encode min max strings correctly - auto statistics1 = TypedStatistics::Make(this->schema_.Column(0)); + auto statistics1 = MakeStatistics(this->schema_.Column(0)); statistics1->Update(this->values_ptr_, this->values_.size(), 0); std::string encoded_min = statistics1->EncodeMin(); std::string encoded_max = statistics1->EncodeMax(); @@ -470,8 +468,8 @@ void TestStatistics::TestMinMaxEncode() { statistics1->max().len)); auto statistics2 = - TypedStatistics::Make(this->schema_.Column(0), encoded_min, - encoded_max, this->values_.size(), 0, 0, true); + MakeStatistics(this->schema_.Column(0), encoded_min, encoded_max, + this->values_.size(), 0, 0, true); ASSERT_EQ(encoded_min, statistics2->EncodeMin()); ASSERT_EQ(encoded_max, statistics2->EncodeMax()); @@ -873,7 +871,7 @@ TEST(TestStatisticsSortOrderFloatNaN, NaNValues) { } // Test values - auto nan_stats = TypedStatistics::Make(&descr); + auto nan_stats = 
MakeStatistics(&descr); nan_stats->Update(&values[0], NUM_VALUES, 0); float min = nan_stats->min(); float max = nan_stats->max(); @@ -881,7 +879,7 @@ TEST(TestStatisticsSortOrderFloatNaN, NaNValues) { ASSERT_EQ(max, 3.0f); // Test all NaNs - auto all_nan_stats = TypedStatistics::Make(&descr); + auto all_nan_stats = MakeStatistics(&descr); all_nan_stats->Update(&nan_values[0], NUM_VALUES, 0); min = all_nan_stats->min(); max = all_nan_stats->max(); @@ -925,7 +923,7 @@ TEST(TestStatisticsSortOrderFloatNaN, NaNValuesSpaced) { std::vector valid_bits(BitUtil::BytesForBits(NUM_VALUES) + 1, 255); // Test values - auto nan_stats = TypedStatistics::Make(&descr); + auto nan_stats = MakeStatistics(&descr); nan_stats->UpdateSpaced(&values[0], valid_bits.data(), 0, NUM_VALUES, 0); float min = nan_stats->min(); float max = nan_stats->max(); @@ -933,7 +931,7 @@ TEST(TestStatisticsSortOrderFloatNaN, NaNValuesSpaced) { ASSERT_EQ(max, 3.0f); // Test all NaNs - auto all_nan_stats = TypedStatistics::Make(&descr); + auto all_nan_stats = MakeStatistics(&descr); all_nan_stats->UpdateSpaced(&nan_values[0], valid_bits.data(), 0, NUM_VALUES, 0); min = all_nan_stats->min(); max = all_nan_stats->max(); @@ -968,7 +966,7 @@ TEST(TestStatisticsSortOrderDoubleNaN, NaNValues) { NodePtr node = PrimitiveNode::Make("nan_double", Repetition::OPTIONAL, Type::DOUBLE); ColumnDescriptor descr(node, 1, 1); - auto nan_stats = TypedStatistics::Make(&descr); + auto nan_stats = MakeStatistics(&descr); double values[NUM_VALUES] = {std::nan(""), std::nan(""), -3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 4.0}; double* values_ptr = &values[0]; diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index 56d403dff1a..d1ef804f2a8 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -20,6 +20,9 @@ #include #include +#include "arrow/array.h" +#include "arrow/type.h" +#include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" #include "parquet/encoding.h" @@ -30,6 +33,7 @@ using arrow::default_memory_pool; using arrow::MemoryPool; +using arrow::internal::checked_cast; namespace parquet { @@ -126,14 +130,14 @@ struct CompareHelper { } }; -template -class TypedComparatorImpl : public TypedComparator { +template +class TypedComparatorImpl : virtual public TypedComparator { public: typedef typename DType::c_type T; explicit TypedComparatorImpl(int type_length = -1) : type_length_(type_length) {} - bool CompareInline(const T& a, const T& b) { + bool CompareInline(const T& a, const T& b) const { return CompareHelper::Compare(type_length_, a, b); } @@ -173,38 +177,56 @@ class TypedComparatorImpl : public TypedComparator { *out_max = max; } + void GetMinMax(const ::arrow::Array& values, T* out_min, T* out_max) override; + private: int type_length_; }; -template -class ByteArrayComparatorImpl : public TypedComparatorImpl, - virtual public ByteArrayComparator { - public: - using BASE = TypedComparatorImpl; - using BASE::BASE; - using BASE::GetMinMax; +template +void TypedComparatorImpl::GetMinMax(const ::arrow::Array& values, + typename DType::c_type* out_min, + typename DType::c_type* out_max) { + ParquetException::NYI(values.type()->ToString()); +} - void GetMinMax(const ::arrow::BinaryArray& values, ByteArray* out_min, - ByteArray* out_max) override { - ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, - length); - T min = values[0]; - T max = values[0]; - for (int64_t i = 0; i < length; i++) { - if (valid_bits_reader.IsSet()) { - if (CompareInline(values[i], min)) { - 
min = values[i]; - } else if (CompareInline(max, values[i])) { - max = values[i]; - } +template +void GetMinMaxBinaryHelper( + const TypedComparatorImpl& comparator, + const ::arrow::Array& values, ByteArray* out_min, ByteArray* out_max) { + const auto& data = checked_cast(values); + ::arrow::internal::BitmapReader valid_bits_reader(data.null_bitmap_data(), + data.offset(), data.length()); + ByteArray min = data.GetView(0); + ByteArray max = data.GetView(0); + for (int64_t i = 0; i < data.length(); i++) { + ByteArray val = data.GetView(i); + if (valid_bits_reader.IsSet()) { + if (comparator.CompareInline(val, min)) { + min = val; + } else if (comparator.CompareInline(max, val)) { + max = val; } - valid_bits_reader.Next(); } - *out_min = min; - *out_max = max; + valid_bits_reader.Next(); } -}; + *out_min = min; + *out_max = max; +} + +template <> +void TypedComparatorImpl::GetMinMax(const ::arrow::Array& values, + ByteArray* out_min, + ByteArray* out_max) { + GetMinMaxBinaryHelper(*this, values, out_min, out_max); +} + +template <> +void TypedComparatorImpl::GetMinMax(const ::arrow::Array& values, + ByteArray* out_min, + ByteArray* out_max) { + GetMinMaxBinaryHelper(*this, values, out_min, out_max); +} std::shared_ptr Comparator::Make(Type::type physical_type, SortOrder::type sort_order, @@ -212,36 +234,36 @@ std::shared_ptr Comparator::Make(Type::type physical_type, if (SortOrder::SIGNED == sort_order) { switch (physical_type) { case Type::BOOLEAN: - return std::make_shared>(); + return std::make_shared>(); case Type::INT32: - return std::make_shared>(); + return std::make_shared>(); case Type::INT64: - return std::make_shared>(); + return std::make_shared>(); case Type::INT96: - return std::make_shared>(); + return std::make_shared>(); case Type::FLOAT: - return std::make_shared>(); + return std::make_shared>(); case Type::DOUBLE: - return std::make_shared>(); + return std::make_shared>(); case Type::BYTE_ARRAY: - return std::make_shared(); + return std::make_shared>(); case Type::FIXED_LEN_BYTE_ARRAY: - return std::make_shared>(type_length); + return std::make_shared>(type_length); default: ParquetException::NYI("Signed Compare not implemented"); } } else if (SortOrder::UNSIGNED == sort_order) { switch (physical_type) { case Type::INT32: - return std::make_shared>(); + return std::make_shared>(); case Type::INT64: - return std::make_shared>(); + return std::make_shared>(); case Type::INT96: - return std::make_shared>(); + return std::make_shared>(); case Type::BYTE_ARRAY: - return std::make_shared>(); + return std::make_shared>(); case Type::FIXED_LEN_BYTE_ARRAY: - return std::make_shared>(type_length); + return std::make_shared>(type_length); default: ParquetException::NYI("Unsigned Compare not implemented"); } @@ -257,6 +279,59 @@ std::shared_ptr Comparator::Make(const ColumnDescriptor* descr) { // ---------------------------------------------------------------------- +template +struct StatsHelper { + bool CanHaveNaN() { return false; } + + inline int64_t GetValueBeginOffset(const T* values, int64_t count) { return 0; } + + inline int64_t GetValueEndOffset(const T* values, int64_t count) { return count; } + + inline bool IsNaN(const T value) { return false; } +}; + +template +struct StatsHelper::value>::type> { + bool CanHaveNaN() { return true; } + + inline int64_t GetValueBeginOffset(const T* values, int64_t count) { + // Skip NaNs + for (int64_t i = 0; i < count; i++) { + if (!std::isnan(values[i])) { + return i; + } + } + return count; + } + + inline int64_t GetValueEndOffset(const 
T* values, int64_t count) { + // Skip NaNs + for (int64_t i = (count - 1); i >= 0; i--) { + if (!std::isnan(values[i])) { + return (i + 1); + } + } + return 0; + } + + inline bool IsNaN(const T value) { return std::isnan(value); } +}; + +template +void SetNaN(T* value) { + // no-op +} + +template <> +void SetNaN(float* value) { + *value = std::nanf(""); +} + +template <> +void SetNaN(double* value) { + *value = std::nan(""); +} + template class TypedStatisticsImpl : public TypedStatistics { public: @@ -334,6 +409,25 @@ class TypedStatisticsImpl : public TypedStatistics { void UpdateSpaced(const T* values, const uint8_t* valid_bits, int64_t valid_bits_spaced, int64_t num_not_null, int64_t num_null) override; + void Update(const ::arrow::Array& values) override { + IncrementNullCount(values.null_count()); + IncrementNumValues(values.length() - values.null_count()); + + // TODO: support distinct count? + if (values.null_count() == values.length()) { + return; + } + + StatsHelper helper; + if (helper.CanHaveNaN()) { + ParquetException::NYI("No NaN handling for Arrow arrays yet"); + } + + T batch_min, batch_max; + comparator_->GetMinMax(values, &batch_min, &batch_max); + SetMinMax(batch_min, batch_max); + } + const T& min() const override { return min_; } const T& max() const override { return max_; } @@ -422,55 +516,6 @@ inline void TypedStatisticsImpl::Copy(const ByteArray& src, ByteA *dst = ByteArray(src.len, buffer->data()); } -template -struct StatsHelper { - inline int64_t GetValueBeginOffset(const T* values, int64_t count) { return 0; } - - inline int64_t GetValueEndOffset(const T* values, int64_t count) { return count; } - - inline bool IsNaN(const T value) { return false; } -}; - -template -struct StatsHelper::value>::type> { - inline int64_t GetValueBeginOffset(const T* values, int64_t count) { - // Skip NaNs - for (int64_t i = 0; i < count; i++) { - if (!std::isnan(values[i])) { - return i; - } - } - return count; - } - - inline int64_t GetValueEndOffset(const T* values, int64_t count) { - // Skip NaNs - for (int64_t i = (count - 1); i >= 0; i--) { - if (!std::isnan(values[i])) { - return (i + 1); - } - } - return 0; - } - - inline bool IsNaN(const T value) { return std::isnan(value); } -}; - -template -void SetNaN(T* value) { - // no-op -} - -template <> -void SetNaN(float* value) { - *value = std::nanf(""); -} - -template <> -void SetNaN(double* value) { - *value = std::nan(""); -} - template void TypedStatisticsImpl::Update(const T* values, int64_t num_not_null, int64_t num_null) { @@ -490,7 +535,7 @@ void TypedStatisticsImpl::Update(const T* values, int64_t num_not_null, int64_t end_offset = helper.GetValueEndOffset(values, num_not_null); // All values are NaN - if (end_offset < begin_offset) { + if (helper.CanHaveNaN() && end_offset < begin_offset) { // Set min/max to NaNs in this case. 
// Don't set has_min_max flag since // these values must be over-written by valid stats later @@ -526,23 +571,25 @@ void TypedStatisticsImpl::UpdateSpaced(const T* values, const uint8_t* va ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, length); StatsHelper helper; - for (; i < length; i++) { - // PARQUET-1225: Handle NaNs - if (valid_bits_reader.IsSet() && !helper.IsNaN(values[i])) { - break; + if (helper.CanHaveNaN()) { + for (; i < length; i++) { + // PARQUET-1225: Handle NaNs + if (valid_bits_reader.IsSet() && !helper.IsNaN(values[i])) { + break; + } + valid_bits_reader.Next(); } - valid_bits_reader.Next(); - } - // All are NaNs and stats are not set yet - if ((i == length) && helper.IsNaN(values[i - 1])) { - // Don't set has_min_max flag since - // these values must be over-written by valid stats later - if (!has_min_max_) { - SetNaN(&min_); - SetNaN(&max_); + // All are NaNs and stats are not set yet + if ((i == length) && helper.IsNaN(values[i - 1])) { + // Don't set has_min_max flag since + // these values must be over-written by valid stats later + if (!has_min_max_) { + SetNaN(&min_); + SetNaN(&max_); + } + return; } - return; } // Find min and max values from remaining non-NaN values diff --git a/cpp/src/parquet/statistics.h b/cpp/src/parquet/statistics.h index 948bb27a69f..bb91f31d225 100644 --- a/cpp/src/parquet/statistics.h +++ b/cpp/src/parquet/statistics.h @@ -28,6 +28,7 @@ namespace arrow { +class Array; class BinaryArray; } // namespace arrow @@ -69,19 +70,6 @@ class TypedComparator : public Comparator { public: using T = typename DType::c_type; - /// \brief Typed version of Comparator::Make - static std::shared_ptr> Make(Type::type physical_type, - SortOrder::type sort_order, - int type_length = -1) { - return std::static_pointer_cast>( - Comparator::Make(physical_type, sort_order, type_length)); - } - - /// \brief Typed version of Comparator::Make - static std::shared_ptr> Make(const ColumnDescriptor* descr) { - return std::static_pointer_cast>(Comparator::Make(descr)); - } - /// \brief Scalar comparison of two elements, return true if first /// is strictly less than the second virtual bool Compare(const T& a, const T& b) = 0; @@ -90,6 +78,11 @@ class TypedComparator : public Comparator { /// elements without any nulls virtual void GetMinMax(const T* values, int64_t length, T* out_min, T* out_max) = 0; + /// \brief Compute minimum and maximum elements from an Arrow array. 
Only + /// valid for certain Parquet Type / Arrow Type combinations, like BYTE_ARRAY + /// / arrow::BinaryArray + virtual void GetMinMax(const ::arrow::Array& values, T* out_min, T* out_max) = 0; + /// \brief Compute maximum and minimum elements in a batch of /// elements with accompanying bitmap indicating which elements are /// included (bit set) and excluded (bit not set) @@ -106,13 +99,21 @@ class TypedComparator : public Comparator { int64_t valid_bits_offset, T* out_min, T* out_max) = 0; }; -class ByteArrayComparator : public TypedComparator { - public: - using TypedComparator::GetMinMax; +/// \brief Typed version of Comparator::Make +template +inline std::shared_ptr::Comparator> MakeComparator( + Type::type physical_type, SortOrder::type sort_order, int type_length = -1) { + return std::dynamic_pointer_cast::Comparator>( + Comparator::Make(physical_type, sort_order, type_length)); +} - virtual void GetMinMax(const ::arrow::BinaryArray& values, - ByteArray* out_min, ByteArray* out_max) = 0; -}; +/// \brief Typed version of Comparator::Make +template +inline std::shared_ptr::Comparator> MakeComparator( + const ColumnDescriptor* descr) { + return std::dynamic_pointer_cast::Comparator>( + Comparator::Make(descr)); +} // ---------------------------------------------------------------------- @@ -256,39 +257,6 @@ class TypedStatistics : public Statistics { public: using T = typename DType::c_type; - /// \brief Typed version of Statistics::Make - static std::shared_ptr> Make( - const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { - return std::static_pointer_cast>( - Statistics::Make(descr, pool)); - } - - /// \brief Create Statistics initialized to a particular state - /// \param[in] min the minimum value - /// \param[in] max the minimum value - /// \param[in] num_values number of values - /// \param[in] null_count number of null values - /// \param[in] distinct_count number of distinct values - static std::shared_ptr> Make(const T& min, const T& max, - int64_t num_values, - int64_t null_count, - int64_t distinct_count) { - return std::static_pointer_cast>(Statistics::Make( - DType::type_num, &min, &max, num_values, null_count, distinct_count)); - } - - /// \brief Typed version of Statistics::Make - static std::shared_ptr> Make( - const ColumnDescriptor* descr, const std::string& encoded_min, - const std::string& encoded_max, int64_t num_values, int64_t null_count, - int64_t distinct_count, bool has_min_max, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { - return std::static_pointer_cast>( - Statistics::Make(descr, encoded_min, encoded_max, num_values, null_count, - distinct_count, has_min_max, pool)); - } - /// \brief The current minimum value virtual const T& min() const = 0; @@ -306,23 +274,50 @@ class TypedStatistics : public Statistics { int64_t valid_bits_offset, int64_t num_not_null, int64_t num_null) = 0; + /// \brief EXPERIMENTAL: Update statistics with an Arrow array without + /// conversion to a primitive Parquet C type. 
Only implemented for certain + /// Parquet type / Arrow type combinations like BYTE_ARRAY / + /// arrow::BinaryArray + virtual void Update(const ::arrow::Array& values) = 0; + /// \brief Set min and max values to particular values virtual void SetMinMax(const T& min, const T& max) = 0; }; -using BoolStatistics = TypedStatistics; -using Int32Statistics = TypedStatistics; -using Int64Statistics = TypedStatistics; -using FloatStatistics = TypedStatistics; -using DoubleStatistics = TypedStatistics; - -class ByteArrayStatistics : public TypedStatistics { - public: - using TypedStatistics::Update; - - virtual void Update(const ::arrow::BinaryArray& values) = 0; -}; - -using FLBAStatistics = TypedStatistics; +/// \brief Typed version of Statistics::Make +template +inline std::shared_ptr::Statistics> MakeStatistics( + const ColumnDescriptor* descr, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { + return std::dynamic_pointer_cast::Statistics>( + Statistics::Make(descr, pool)); +} + +/// \brief Create Statistics initialized to a particular state +/// \param[in] min the minimum value +/// \param[in] max the minimum value +/// \param[in] num_values number of values +/// \param[in] null_count number of null values +/// \param[in] distinct_count number of distinct values +template +inline std::shared_ptr::Statistics> MakeStatistics( + const typename DType::c_type& min, const typename DType::c_type& max, + int64_t num_values, int64_t null_count, int64_t distinct_count) { + return std::dynamic_pointer_cast::Statistics>( + Statistics::Make(DType::type_num, &min, &max, num_values, null_count, + distinct_count)); +} + +/// \brief Typed version of Statistics::Make +template +inline std::shared_ptr::Statistics> MakeStatistics( + const ColumnDescriptor* descr, const std::string& encoded_min, + const std::string& encoded_max, int64_t num_values, int64_t null_count, + int64_t distinct_count, bool has_min_max, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { + return std::dynamic_pointer_cast::Statistics>( + Statistics::Make(descr, encoded_min, encoded_max, num_values, null_count, + distinct_count, has_min_max, pool)); +} } // namespace parquet diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index bc456ea24a8..04ae8afa30a 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -493,6 +493,10 @@ class ColumnOrder { struct ByteArray { ByteArray() : len(0), ptr(NULLPTR) {} ByteArray(uint32_t len, const uint8_t* ptr) : len(len), ptr(ptr) {} + + ByteArray(::arrow::util::string_view view) // NOLINT implicit conversion + : ByteArray(static_cast(view.size()), + reinterpret_cast(view.data())) {} uint32_t len; const uint8_t* ptr; }; @@ -646,6 +650,86 @@ using DoubleType = PhysicalType; using ByteArrayType = PhysicalType; using FLBAType = PhysicalType; +// Type forward declarations for TypeClasses +class Comparator; +class Statistics; + +template +class TypedComparator; + +template +class TypedStatistics; + +using BoolComparator = TypedComparator; +using Int32Comparator = TypedComparator; +using Int64Comparator = TypedComparator; +using Int96Comparator = TypedComparator; +using FloatComparator = TypedComparator; +using DoubleComparator = TypedComparator; +using FLBAComparator = TypedComparator; +using ByteArrayComparator = TypedComparator; + +using BoolStatistics = TypedStatistics; +using Int32Statistics = TypedStatistics; +using Int64Statistics = TypedStatistics; +using FloatStatistics = TypedStatistics; +using DoubleStatistics = TypedStatistics; +using 
FLBAStatistics = TypedStatistics; +using ByteArrayStatistics = TypedStatistics; + +template +struct TypeClasses {}; + +template <> +struct TypeClasses { + using Comparator = BoolComparator; + using Statistics = BoolStatistics; +}; + +template <> +struct TypeClasses { + using Comparator = Int32Comparator; + using Statistics = Int32Statistics; +}; + +template <> +struct TypeClasses { + using Comparator = Int64Comparator; + using Statistics = Int64Statistics; +}; + +template <> +struct TypeClasses { + using Comparator = Int96Comparator; + + // unused + using Statistics = TypedStatistics; +}; + +template <> +struct TypeClasses { + using Comparator = FloatComparator; + using Statistics = FloatStatistics; +}; + +template <> +struct TypeClasses { + using Comparator = DoubleComparator; + using Statistics = DoubleStatistics; +}; + +template <> +struct TypeClasses { + using Comparator = ByteArrayComparator; + using Statistics = ByteArrayStatistics; +}; + +template <> +struct TypeClasses { + using Comparator = FLBAComparator; + using Statistics = FLBAStatistics; +}; + template inline std::string format_fwf(int width) { std::stringstream ss; From 1264e10ccd23799a5874a3fd84b99ab22f29f2d0 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 11 Aug 2019 16:28:47 -0500 Subject: [PATCH 05/28] More direct encoding implementation stubs --- cpp/src/arrow/array/builder_dict.cc | 3 + cpp/src/parquet/column_writer.cc | 93 ++------ cpp/src/parquet/encoding-test.cc | 54 ++++- cpp/src/parquet/encoding.cc | 327 ++++++++++++++++++---------- cpp/src/parquet/encoding.h | 57 ++--- cpp/src/parquet/statistics-test.cc | 19 ++ 6 files changed, 323 insertions(+), 230 deletions(-) diff --git a/cpp/src/arrow/array/builder_dict.cc b/cpp/src/arrow/array/builder_dict.cc index 6c0e651efcb..bf5cbdd7051 100644 --- a/cpp/src/arrow/array/builder_dict.cc +++ b/cpp/src/arrow/array/builder_dict.cc @@ -202,6 +202,9 @@ class internal::DictionaryMemoTable::DictionaryMemoTableImpl { template enable_if_memoize InsertValues(const DType&, const ArrayType& array) { + if (array.null_count() > 0) { + return Status::Invalid("Cannot yet insert dictionary values containing nulls"); + } for (int64_t i = 0; i < array.length(); ++i) { ARROW_IGNORE_EXPR(impl_->GetOrInsert(array.GetView(i))); } diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 5aefd2b8068..799abb42a5f 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -672,7 +672,7 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< // of values, the chunking will ensure the AddDataPage() is called at a reasonable // pagesize limit int64_t value_offset = 0; - auto WriteBatch = [&](int64_t offset, int64_t batch_size) { + auto WriteChunk = [&](int64_t offset, int64_t batch_size) { int64_t values_to_write = WriteLevels(batch_size, def_levels + offset, rep_levels + offset); // PARQUET-780 @@ -683,7 +683,7 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< CommitWriteAndCheckLimits(batch_size, values_to_write); value_offset += values_to_write; }; - DoInBatches(num_values, properties_->write_batch_size(), WriteBatch); + DoInBatches(num_values, properties_->write_batch_size(), WriteChunk); } void WriteBatchSpaced(int64_t num_values, const int16_t* def_levels, @@ -691,7 +691,7 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< int64_t valid_bits_offset, const T* values) override { // Like WriteBatch, but for spaced values int64_t value_offset 
= 0; - auto WriteBatch = [&](int64_t offset, int64_t batch_size) { + auto WriteChunk = [&](int64_t offset, int64_t batch_size) { int64_t batch_num_values = 0; int64_t batch_num_spaced_values = 0; WriteLevelsSpaced(batch_size, def_levels + offset, rep_levels + offset, @@ -701,7 +701,7 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< CommitWriteAndCheckLimits(batch_size, batch_num_spaced_values); value_offset += batch_num_spaced_values; }; - DoInBatches(num_values, properties_->write_batch_size(), WriteBatch); + DoInBatches(num_values, properties_->write_batch_size(), WriteChunk); } Status WriteArrow(const int16_t* def_levels, const int16_t* rep_levels, @@ -1009,6 +1009,16 @@ Status WriteArrowZeroCopy(const ::arrow::Array& array, int64_t num_levels, // ---------------------------------------------------------------------- // Write Arrow to BooleanType +template <> +struct SerializeFunctor { + Status Serialize(const ::arrow::BooleanArray& data, ArrowWriteContext*, bool* out) { + for (int i = 0; i < data.length(); i++) { + *out++ = data.Value(i); + } + return Status::OK(); + } +}; + template <> Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, const int16_t* rep_levels, @@ -1018,27 +1028,8 @@ Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, if (array.type_id() != ::arrow::Type::BOOL) { ARROW_UNSUPPORTED(); } - bool* buffer = nullptr; - RETURN_NOT_OK(ctx->GetScratchData(array.length(), &buffer)); - - const auto& data = static_cast(array); - const uint8_t* values = nullptr; - // The values buffer may be null if the array is empty (ARROW-2744) - if (data.values() != nullptr) { - values = reinterpret_cast(data.values()->data()); - } else { - DCHECK_EQ(data.length(), 0); - } - - int buffer_idx = 0; - int64_t offset = array.offset(); - for (int i = 0; i < data.length(); i++) { - if (data.IsValid(i)) { - buffer[buffer_idx++] = BitUtil::GetBit(values, offset + i); - } - } - PARQUET_CATCH_NOT_OK(WriteBatch(num_levels, def_levels, rep_levels, buffer)); - return Status::OK(); + return WriteArrowSerialize( + array, num_levels, def_levels, rep_levels, ctx, this); } // ---------------------------------------------------------------------- @@ -1305,39 +1296,6 @@ Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, // ---------------------------------------------------------------------- // Write Arrow to BYTE_ARRAY -template -struct SerializeFunctor> { - Status Serialize(const ::arrow::BinaryArray& array, ArrowWriteContext*, - ByteArray* out) { - // In the case of an array consisting of only empty strings or all null, - // array.data() points already to a nullptr, thus array.data()->data() will - // segfault. 
- const uint8_t* values = nullptr; - if (array.value_data()) { - values = reinterpret_cast(array.value_data()->data()); - DCHECK(values != nullptr); - } - - // Slice offset is accounted for in raw_value_offsets - const int32_t* value_offset = array.raw_value_offsets(); - if (array.null_count() == 0) { - // no nulls, just dump the data - for (int64_t i = 0; i < array.length(); i++) { - out[i] = - ByteArray(value_offset[i + 1] - value_offset[i], values + value_offset[i]); - } - } else { - for (int64_t i = 0; i < array.length(); i++) { - if (array.IsValid(i)) { - out[i] = - ByteArray(value_offset[i + 1] - value_offset[i], values + value_offset[i]); - } - } - } - return Status::OK(); - } -}; - template <> Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, const int16_t* rep_levels, @@ -1348,26 +1306,21 @@ Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_level array.type()->id() != ::arrow::Type::STRING) { ARROW_UNSUPPORTED(); } - // const auto& data = checked_cast(array); - - // Like WriteBatch, but for spaced values - int64_t value_offset = 0; - auto WriteBatch = [&](int64_t offset, int64_t batch_size) { + auto WriteChunk = [&](int64_t offset, int64_t batch_size) { int64_t batch_num_values = 0; int64_t batch_num_spaced_values = 0; WriteLevelsSpaced(batch_size, def_levels + offset, rep_levels + offset, &batch_num_values, &batch_num_spaced_values); - // dynamic_cast(current_encoder_.get()) - // ->Put(*data.Slice(offset, batch_size)); - - // TODO(wesm): Update page statistics - + std::shared_ptr<::arrow::Array> data_slice = array.Slice(offset, batch_size); + current_encoder_->Put(*data_slice); + if (page_statistics_ != nullptr) { + page_statistics_->Update(*data_slice); + } CommitWriteAndCheckLimits(batch_size, batch_num_spaced_values); - value_offset += batch_num_spaced_values; }; PARQUET_CATCH_NOT_OK( - DoInBatches(array.length(), properties_->write_batch_size(), WriteBatch)); + DoInBatches(array.length(), properties_->write_batch_size(), WriteChunk)); return Status::OK(); } diff --git a/cpp/src/parquet/encoding-test.cc b/cpp/src/parquet/encoding-test.cc index 9497534c823..3cb43b6bd2d 100644 --- a/cpp/src/parquet/encoding-test.cc +++ b/cpp/src/parquet/encoding-test.cc @@ -54,7 +54,7 @@ TEST(VectorBooleanTest, TestEncodeDecode) { int nbytes = static_cast(BitUtil::BytesForBits(nvalues)); std::vector draws; - ::arrow::random_is_valid(nvalues, 0.5 /* null prob */, &draws, 0 /* seed */); + arrow::random_is_valid(nvalues, 0.5 /* null prob */, &draws, 0 /* seed */); std::unique_ptr encoder = MakeTypedEncoder(Encoding::PLAIN); @@ -75,7 +75,7 @@ TEST(VectorBooleanTest, TestEncodeDecode) { ASSERT_EQ(nvalues, values_decoded); for (int i = 0; i < nvalues; ++i) { - ASSERT_EQ(draws[i], ::arrow::BitUtil::GetBit(decode_data, i)) << i; + ASSERT_EQ(draws[i], arrow::BitUtil::GetBit(decode_data, i)) << i; } } @@ -260,7 +260,7 @@ class TestDictionaryEncoding : public TestEncodingBase { static constexpr int TYPE = Type::type_num; void CheckRoundtrip() { - std::vector valid_bits(::arrow::BitUtil::BytesForBits(num_values_) + 1, 255); + std::vector valid_bits(arrow::BitUtil::BytesForBits(num_values_) + 1, 255); auto base_encoder = MakeEncoder(Type::type_num, Encoding::PLAIN, true, descr_.get()); auto encoder = @@ -327,8 +327,8 @@ TEST(TestDictionaryEncoding, CannotDictDecodeBoolean) { class TestArrowBuilderDecoding : public ::testing::Test { public: - using DenseBuilder = ::arrow::internal::ChunkedBinaryBuilder; - using DictBuilder = ::arrow::BinaryDictionary32Builder; + using DenseBuilder 
= arrow::internal::ChunkedBinaryBuilder; + using DictBuilder = arrow::BinaryDictionary32Builder; void SetUp() override { null_probabilities_ = {0.0, 0.5, 1.0}; } void TearDown() override {} @@ -343,7 +343,7 @@ class TestArrowBuilderDecoding : public ::testing::Test { constexpr int repeat = 100; constexpr int64_t min_length = 2; constexpr int64_t max_length = 10; - ::arrow::random::RandomArrayGenerator rag(0); + arrow::random::RandomArrayGenerator rag(0); expected_dense_ = rag.BinaryWithRepeats(repeat * num_unique, num_unique, min_length, max_length, null_probability); @@ -356,7 +356,7 @@ class TestArrowBuilderDecoding : public ::testing::Test { ASSERT_OK(builder->Finish(&expected_dict_)); // Initialize input_data_ for the encoder from the expected_array_ values - const auto& binary_array = static_cast(*expected_dense_); + const auto& binary_array = static_cast(*expected_dense_); input_data_.resize(binary_array.length()); for (int64_t i = 0; i < binary_array.length(); ++i) { @@ -383,7 +383,7 @@ class TestArrowBuilderDecoding : public ::testing::Test { template void CheckDense(int actual_num_values, Builder& builder) { ASSERT_EQ(actual_num_values, num_values_); - ::arrow::ArrayVector actual_vec; + arrow::ArrayVector actual_vec; ASSERT_OK(builder.Finish(&actual_vec)); ASSERT_EQ(actual_vec.size(), 1); ASSERT_ARRAYS_EQUAL(*actual_vec[0], *expected_dense_); @@ -392,7 +392,7 @@ class TestArrowBuilderDecoding : public ::testing::Test { template void CheckDict(int actual_num_values, Builder& builder) { ASSERT_EQ(actual_num_values, num_values_); - std::shared_ptr<::arrow::Array> actual; + std::shared_ptr actual; ASSERT_OK(builder.Finish(&actual)); ASSERT_ARRAYS_EQUAL(*actual, *expected_dict_); } @@ -439,8 +439,8 @@ class TestArrowBuilderDecoding : public ::testing::Test { protected: std::vector null_probabilities_; - std::shared_ptr<::arrow::Array> expected_dict_; - std::shared_ptr<::arrow::Array> expected_dense_; + std::shared_ptr expected_dict_; + std::shared_ptr expected_dense_; int num_values_; int null_count_; std::vector input_data_; @@ -480,6 +480,38 @@ TEST_F(PlainEncoding, CheckDecodeArrowNonNullDictBuilder) { this->CheckDecodeArrowNonNullUsingDictBuilder(); } +TEST(PlainEncoding, ArrowBinaryDirectPut) { + // Implemented as part of ARROW-3246 + auto encoder = MakeTypedEncoder(Encoding::PLAIN); + auto decoder = MakeTypedDecoder(Encoding::PLAIN); + + const int64_t size = 1000; + const int64_t min_length = 0; + const int64_t max_length = 10; + const double null_probability = 0.1; + + arrow::random::RandomArrayGenerator rag(0); + auto values = rag.String(size, min_length, max_length, null_probability); + auto i32_values = rag.Int32(size, 0, 10, null_probability); + ASSERT_NO_THROW(encoder->Put(*values)); + + // Type checked + ASSERT_THROW(encoder->Put(*i32_values), ParquetException); + + auto buf = encoder->FlushValues(); + decoder->SetData(size, buf->data(), static_cast(buf->size())); + + arrow::StringBuilder builder; + ASSERT_EQ(size, + decoder->DecodeArrow(size, values->null_count(), values->null_bitmap_data(), + values->offset(), &builder)); + + std::shared_ptr result; + ASSERT_OK(builder.Finish(&result)); + + arrow::AssertArraysEqual(*values, *result); +} + class DictEncoding : public TestArrowBuilderDecoding { public: void SetupEncoderDecoder() override { diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 5e3a38ddbb5..41448976822 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -46,8 +46,7 @@ constexpr int64_t kInMemoryDefaultCapacity 
= 1024; class EncoderImpl : virtual public Encoder { public: - EncoderImpl(const ColumnDescriptor* descr, Encoding::type encoding, - ::arrow::MemoryPool* pool) + EncoderImpl(const ColumnDescriptor* descr, Encoding::type encoding, MemoryPool* pool) : descr_(descr), encoding_(encoding), pool_(pool), @@ -55,13 +54,13 @@ class EncoderImpl : virtual public Encoder { Encoding::type encoding() const override { return encoding_; } - ::arrow::MemoryPool* memory_pool() const override { return pool_; } + MemoryPool* memory_pool() const override { return pool_; } protected: // For accessing type-specific metadata, like FIXED_LEN_BYTE_ARRAY const ColumnDescriptor* descr_; const Encoding::type encoding_; - ::arrow::MemoryPool* pool_; + MemoryPool* pool_; /// Type length from descr int type_length_; @@ -75,7 +74,7 @@ class PlainEncoder : public EncoderImpl, virtual public TypedEncoder { public: using T = typename DType::c_type; - explicit PlainEncoder(const ColumnDescriptor* descr, ::arrow::MemoryPool* pool) + explicit PlainEncoder(const ColumnDescriptor* descr, MemoryPool* pool) : EncoderImpl(descr, Encoding::PLAIN, pool) { values_sink_ = CreateOutputStream(pool); } @@ -95,8 +94,39 @@ class PlainEncoder : public EncoderImpl, virtual public TypedEncoder { void Put(const T* buffer, int num_values) override; + void Put(const arrow::Array& values) override; + + void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits, + int64_t valid_bits_offset) override { + std::shared_ptr buffer; + PARQUET_THROW_NOT_OK(arrow::AllocateResizableBuffer(this->memory_pool(), + num_values * sizeof(T), &buffer)); + int32_t num_valid_values = 0; + arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, + num_values); + T* data = reinterpret_cast(buffer->mutable_data()); + for (int32_t i = 0; i < num_values; i++) { + if (valid_bits_reader.IsSet()) { + data[num_valid_values++] = src[i]; + } + valid_bits_reader.Next(); + } + Put(data, num_valid_values); + } + protected: - std::shared_ptr<::arrow::io::BufferOutputStream> values_sink_; + std::shared_ptr values_sink_; + + void WriteByteArray(const ByteArray& val) { + // Write the result to the output stream + PARQUET_THROW_NOT_OK(values_sink_->Write(reinterpret_cast(&val.len), + sizeof(uint32_t))); + if (val.len > 0) { + DCHECK(nullptr != val.ptr) << "Value ptr cannot be NULL"; + } + PARQUET_THROW_NOT_OK( + values_sink_->Write(reinterpret_cast(val.ptr), val.len)); + } }; template @@ -108,14 +138,29 @@ void PlainEncoder::Put(const T* buffer, int num_values) { template <> inline void PlainEncoder::Put(const ByteArray* src, int num_values) { for (int i = 0; i < num_values; ++i) { - // Write the result to the output stream - PARQUET_THROW_NOT_OK(values_sink_->Write( - reinterpret_cast(&src[i].len), sizeof(uint32_t))); - if (src[i].len > 0) { - DCHECK(nullptr != src[i].ptr) << "Value ptr cannot be NULL"; + WriteByteArray(src[i]); + } +} + +template +void PlainEncoder::Put(const arrow::Array& values) { + ParquetException::NYI(values.type()->ToString()); +} + +template <> +void PlainEncoder::Put(const arrow::Array& values) { + const auto& data = checked_cast(values); + if (data.null_count() == 0) { + // no nulls, just dump the data + for (int64_t i = 0; i < data.length(); i++) { + WriteByteArray(ByteArray(data.GetView(i))); + } + } else { + for (int64_t i = 0; i < data.length(); i++) { + if (data.IsValid(i)) { + WriteByteArray(ByteArray(data.GetView(i))); + } } - PARQUET_THROW_NOT_OK( - values_sink_->Write(reinterpret_cast(src[i].ptr), src[i].len)); } } 
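A minimal usage sketch of the direct Arrow write path implemented above, mirroring the ArrowBinaryDirectPut test added earlier in this patch. The EncodePlainBinary helper name is hypothetical; MakeTypedEncoder, Put(const ::arrow::Array&) and FlushValues are APIs this series adds or already provides, and the caller is assumed to supply an arrow::BinaryArray or arrow::StringArray:

  // Sketch only: encode a binary Arrow array with the PLAIN encoder and return
  // the serialized values. Non-binary input raises a ParquetException (see the
  // type check added later in this series).
  #include "parquet/encoding.h"

  std::shared_ptr<::arrow::Buffer> EncodePlainBinary(const ::arrow::Array& values) {
    auto encoder =
        parquet::MakeTypedEncoder<parquet::ByteArrayType>(parquet::Encoding::PLAIN);
    encoder->Put(values);           // writes length-prefixed values to the sink
    return encoder->FlushValues();  // hand back the PLAIN-encoded buffer
  }
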
@@ -131,16 +176,6 @@ inline void PlainEncoder::Put(const FixedLenByteArray* src, int num_va } } -class PlainByteArrayEncoder : public PlainEncoder, - virtual public ByteArrayEncoder { - public: - using BASE = PlainEncoder; - using BASE::PlainEncoder; - using BASE::Put; - - void Put(const ::arrow::BinaryArray& values) override { return; } -}; - class PlainFLBAEncoder : public PlainEncoder, virtual public FLBAEncoder { public: using BASE = PlainEncoder; @@ -151,9 +186,8 @@ class PlainBooleanEncoder : public EncoderImpl, virtual public TypedEncoder, virtual public BooleanEncoder { public: - explicit PlainBooleanEncoder( - const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); + explicit PlainBooleanEncoder(const ColumnDescriptor* descr, + MemoryPool* pool = arrow::default_memory_pool()); int64_t EstimatedDataEncodedSize() override; std::shared_ptr FlushValues() override; @@ -161,11 +195,33 @@ class PlainBooleanEncoder : public EncoderImpl, void Put(const bool* src, int num_values) override; void Put(const std::vector& src, int num_values) override; + void PutSpaced(const bool* src, int num_values, const uint8_t* valid_bits, + int64_t valid_bits_offset) override { + std::shared_ptr buffer; + PARQUET_THROW_NOT_OK(arrow::AllocateResizableBuffer(this->memory_pool(), + num_values * sizeof(T), &buffer)); + int32_t num_valid_values = 0; + arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, + num_values); + T* data = reinterpret_cast(buffer->mutable_data()); + for (int32_t i = 0; i < num_values; i++) { + if (valid_bits_reader.IsSet()) { + data[num_valid_values++] = src[i]; + } + valid_bits_reader.Next(); + } + Put(data, num_valid_values); + } + + void Put(const arrow::Array& values) override { + ParquetException::NYI("Direct Arrow to Boolean writes not implemented"); + } + private: int bits_available_; - std::unique_ptr<::arrow::BitUtil::BitWriter> bit_writer_; + std::unique_ptr bit_writer_; std::shared_ptr bits_buffer_; - std::shared_ptr<::arrow::io::BufferOutputStream> values_sink_; + std::shared_ptr values_sink_; template void PutImpl(const SequenceType& src, int num_values); @@ -211,8 +267,7 @@ void PlainBooleanEncoder::PutImpl(const SequenceType& src, int num_values) { } } -PlainBooleanEncoder::PlainBooleanEncoder(const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool) +PlainBooleanEncoder::PlainBooleanEncoder(const ColumnDescriptor* descr, MemoryPool* pool) : EncoderImpl(descr, Encoding::PLAIN, pool), bits_available_(kInMemoryDefaultCapacity * 8), bits_buffer_(AllocateBuffer(pool, kInMemoryDefaultCapacity)) { @@ -256,17 +311,17 @@ void PlainBooleanEncoder::Put(const std::vector& src, int num_values) { template struct DictEncoderTraits { using c_type = typename DType::c_type; - using MemoTableType = ::arrow::internal::ScalarMemoTable; + using MemoTableType = arrow::internal::ScalarMemoTable; }; template <> struct DictEncoderTraits { - using MemoTableType = ::arrow::internal::BinaryMemoTable; + using MemoTableType = arrow::internal::BinaryMemoTable; }; template <> struct DictEncoderTraits { - using MemoTableType = ::arrow::internal::BinaryMemoTable; + using MemoTableType = arrow::internal::BinaryMemoTable; }; // Initially 1024 elements @@ -284,7 +339,7 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder { public: typedef typename DType::c_type T; - explicit DictEncoderImpl(const ColumnDescriptor* desc, ::arrow::MemoryPool* pool) + explicit DictEncoderImpl(const ColumnDescriptor* desc, MemoryPool* pool) : 
EncoderImpl(desc, Encoding::PLAIN_DICTIONARY, pool), dict_encoded_size_(0), memo_table_(pool, kInitialHashTableSize) {} @@ -299,7 +354,7 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder { ++buffer; --buffer_len; - ::arrow::util::RleEncoder encoder(buffer, buffer_len, bit_width()); + arrow::util::RleEncoder encoder(buffer, buffer_len, bit_width()); for (int index : buffered_indices_) { if (!encoder.Put(index)) return -1; } @@ -319,9 +374,9 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder { // an extra "RleEncoder::MinBufferSize" bytes. These extra bytes won't be used // but not reserving them would cause the encoder to fail. return 1 + - ::arrow::util::RleEncoder::MaxBufferSize( + arrow::util::RleEncoder::MaxBufferSize( bit_width(), static_cast(buffered_indices_.size())) + - ::arrow::util::RleEncoder::MinBufferSize(bit_width()); + arrow::util::RleEncoder::MinBufferSize(bit_width()); } /// The minimum bit width required to encode the currently buffered indices. @@ -343,8 +398,8 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder { void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits, int64_t valid_bits_offset) override { - ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, - num_values); + arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, + num_values); for (int32_t i = 0; i < num_values; i++) { if (valid_bits_reader.IsSet()) { Put(src[i]); @@ -353,7 +408,12 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder { } } - void PutIndices(const int32_t* indices, int length) override { return; } + void Put(const arrow::Array& values) override; + void PutDictionary(const arrow::Array& values) override; + + void PutIndices(const int32_t* indices, int length) override { + buffered_indices_.insert(std::end(buffered_indices_), indices, indices + length); + } std::shared_ptr FlushValues() override { std::shared_ptr buffer = @@ -394,7 +454,7 @@ void DictEncoderImpl::WriteDict(uint8_t* buffer) { // ByteArray and FLBA already have the dictionary encoded in their data heaps template <> void DictEncoderImpl::WriteDict(uint8_t* buffer) { - memo_table_.VisitValues(0, [&buffer](const ::arrow::util::string_view& v) { + memo_table_.VisitValues(0, [&buffer](const arrow::util::string_view& v) { uint32_t len = static_cast(v.length()); memcpy(buffer, &len, sizeof(len)); buffer += sizeof(len); @@ -405,7 +465,7 @@ void DictEncoderImpl::WriteDict(uint8_t* buffer) { template <> void DictEncoderImpl::WriteDict(uint8_t* buffer) { - memo_table_.VisitValues(0, [&](const ::arrow::util::string_view& v) { + memo_table_.VisitValues(0, [&](const arrow::util::string_view& v) { DCHECK_EQ(v.length(), static_cast(type_length_)); memcpy(buffer, v.data(), type_length_); buffer += type_length_; @@ -453,30 +513,55 @@ inline void DictEncoderImpl::Put(const FixedLenByteArray& v) { buffered_indices_.push_back(memo_index); } -class DictByteArrayEncoderImpl : public DictEncoderImpl, - virtual public DictByteArrayEncoder { - public: - using BASE = DictEncoderImpl; - using BASE::DictEncoderImpl; - using BASE::Put; +template +void DictEncoderImpl::Put(const arrow::Array& values) { + ParquetException::NYI(values.type()->ToString()); +} - void Put(const ::arrow::BinaryArray& values) override { return; } +template <> +void DictEncoderImpl::Put(const arrow::Array& values) { + const auto& data = checked_cast(values); + if (data.null_count() == 0) { + // no nulls, just dump the data + 
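+    // GetView() returns a string_view into the array's value buffer, so the
+    // implicit ByteArray(string_view) constructor added to parquet/types.h in
+    // this series wraps the pointer and length without copying.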
for (int64_t i = 0; i < data.length(); i++) { + Put(ByteArray(data.GetView(i))); + } + } else { + for (int64_t i = 0; i < data.length(); i++) { + if (data.IsValid(i)) { + Put(ByteArray(data.GetView(i))); + } + } + } +} - void PutDictionary(const ::arrow::BinaryArray& values) override { return; } -}; +template +void DictEncoderImpl::PutDictionary(const arrow::Array& values) { + ParquetException::NYI(values.type()->ToString()); +} -class DictFLBAEncoder : public DictEncoderImpl, virtual public FLBAEncoder { - public: - using BASE = DictEncoderImpl; - using BASE::DictEncoderImpl; -}; +template <> +void DictEncoderImpl::PutDictionary(const arrow::Array& values) { + const auto& data = checked_cast(values); + if (data.null_count() > 0) { + throw ParquetException("Inserted binary dictionary cannot cannot contain nulls"); + } + for (int64_t i = 0; i < data.length(); i++) { + auto v = data.GetView(i); + dict_encoded_size_ += static_cast(v.size() + sizeof(uint32_t)); + ARROW_IGNORE_EXPR( + memo_table_.GetOrInsert(v.data(), static_cast(v.size()), + /*on_found=*/[](int32_t memo_index) {}, + /*on_not_found=*/[](int32_t memo_index) {})); + } +} // ---------------------------------------------------------------------- // Encoder and decoder factory functions std::unique_ptr MakeEncoder(Type::type type_num, Encoding::type encoding, bool use_dictionary, const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool) { + MemoryPool* pool) { if (use_dictionary) { switch (type_num) { case Type::INT32: @@ -490,9 +575,9 @@ std::unique_ptr MakeEncoder(Type::type type_num, Encoding::type encodin case Type::DOUBLE: return std::unique_ptr(new DictEncoderImpl(descr, pool)); case Type::BYTE_ARRAY: - return std::unique_ptr(new DictByteArrayEncoderImpl(descr, pool)); + return std::unique_ptr(new DictEncoderImpl(descr, pool)); case Type::FIXED_LEN_BYTE_ARRAY: - return std::unique_ptr(new DictFLBAEncoder(descr, pool)); + return std::unique_ptr(new DictEncoderImpl(descr, pool)); default: DCHECK(false) << "Encoder not implemented"; break; @@ -512,9 +597,9 @@ std::unique_ptr MakeEncoder(Type::type type_num, Encoding::type encodin case Type::DOUBLE: return std::unique_ptr(new PlainEncoder(descr, pool)); case Type::BYTE_ARRAY: - return std::unique_ptr(new PlainByteArrayEncoder(descr, pool)); + return std::unique_ptr(new PlainEncoder(descr, pool)); case Type::FIXED_LEN_BYTE_ARRAY: - return std::unique_ptr(new PlainFLBAEncoder(descr, pool)); + return std::unique_ptr(new PlainEncoder(descr, pool)); default: DCHECK(false) << "Encoder not implemented"; break; @@ -644,7 +729,7 @@ class PlainBooleanDecoder : public DecoderImpl, int Decode(bool* buffer, int max_values) override; private: - std::unique_ptr<::arrow::BitUtil::BitReader> bit_reader_; + std::unique_ptr bit_reader_; }; PlainBooleanDecoder::PlainBooleanDecoder(const ColumnDescriptor* descr) @@ -658,7 +743,7 @@ void PlainBooleanDecoder::SetData(int num_values, const uint8_t* data, int len) int PlainBooleanDecoder::Decode(uint8_t* buffer, int max_values) { max_values = std::min(max_values, num_values_); bool val; - ::arrow::internal::BitmapWriter bit_writer(buffer, 0, max_values); + arrow::internal::BitmapWriter bit_writer(buffer, 0, max_values); for (int i = 0; i < max_values; ++i) { if (!bit_reader_->GetValue(1, &val)) { ParquetException::EofException(); @@ -691,7 +776,7 @@ class PlainByteArrayDecoder : public PlainDecoder, int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - ::arrow::BinaryDictionary32Builder* builder) 
override { + arrow::BinaryDictionary32Builder* builder) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, valid_bits_offset, builder, &result)); @@ -700,7 +785,15 @@ class PlainByteArrayDecoder : public PlainDecoder, int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - ::arrow::internal::ChunkedBinaryBuilder* builder) override { + arrow::internal::ChunkedBinaryBuilder* builder) override { + int result = 0; + PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, + valid_bits_offset, builder, &result)); + return result; + } + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, arrow::BinaryBuilder* builder) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, valid_bits_offset, builder, &result)); @@ -708,14 +801,14 @@ class PlainByteArrayDecoder : public PlainDecoder, } int DecodeArrowNonNull(int num_values, - ::arrow::BinaryDictionary32Builder* builder) override { + arrow::BinaryDictionary32Builder* builder) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); return result; } int DecodeArrowNonNull(int num_values, - ::arrow::internal::ChunkedBinaryBuilder* builder) override { + arrow::internal::ChunkedBinaryBuilder* builder) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); return result; @@ -723,12 +816,12 @@ class PlainByteArrayDecoder : public PlainDecoder, private: template - ::arrow::Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, BuilderType* builder, - int* values_decoded) { + arrow::Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, BuilderType* builder, + int* values_decoded) { num_values = std::min(num_values, num_values_); RETURN_NOT_OK(builder->Reserve(num_values)); - ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); + arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); int increment; int i = 0; const uint8_t* data = data_; @@ -756,12 +849,12 @@ class PlainByteArrayDecoder : public PlainDecoder, len_ -= bytes_decoded; num_values_ -= num_values; *values_decoded = num_values; - return ::arrow::Status::OK(); + return arrow::Status::OK(); } template - ::arrow::Status DecodeArrowNonNull(int num_values, BuilderType* builder, - int* values_decoded) { + arrow::Status DecodeArrowNonNull(int num_values, BuilderType* builder, + int* values_decoded) { num_values = std::min(num_values, num_values_); RETURN_NOT_OK(builder->Reserve(num_values)); int i = 0; @@ -784,7 +877,7 @@ class PlainByteArrayDecoder : public PlainDecoder, len_ -= bytes_decoded; num_values_ -= num_values; *values_decoded = num_values; - return ::arrow::Status::OK(); + return arrow::Status::OK(); } }; @@ -806,7 +899,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { // dictionary is not guaranteed to persist in memory after this call so the // dictionary decoder needs to copy the data out if necessary. 
explicit DictDecoderImpl(const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) + MemoryPool* pool = arrow::default_memory_pool()) : DecoderImpl(descr, Encoding::RLE_DICTIONARY), dictionary_(AllocateBuffer(pool, 0)), dictionary_length_(0), @@ -823,7 +916,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { uint8_t bit_width = *data; ++data; --len; - idx_decoder_ = ::arrow::util::RleDecoder(data, len, bit_width); + idx_decoder_ = arrow::util::RleDecoder(data, len, bit_width); } int Decode(T* buffer, int num_values) override { @@ -849,11 +942,11 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { return num_values; } - void InsertDictionary(::arrow::ArrayBuilder* builder) override; + void InsertDictionary(arrow::ArrayBuilder* builder) override; int DecodeIndicesSpaced(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - ::arrow::ArrayBuilder* builder) override { + arrow::ArrayBuilder* builder) override { num_values = std::min(num_values, num_values_); if (num_values > 0) { // TODO(wesm): Refactor to batch reads for improved memory use. It is not @@ -872,20 +965,20 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { /// XXX(wesm): Cannot append "valid bits" directly to the builder std::vector valid_bytes(num_values); - ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); + arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); for (int64_t i = 0; i < num_values; ++i) { valid_bytes[i] = static_cast(bit_reader.IsSet()); bit_reader.Next(); } - auto binary_builder = checked_cast<::arrow::BinaryDictionary32Builder*>(builder); + auto binary_builder = checked_cast(builder); PARQUET_THROW_NOT_OK( binary_builder->AppendIndices(indices_buffer, num_values, valid_bytes.data())); num_values_ -= num_values; return num_values; } - int DecodeIndices(int num_values, ::arrow::ArrayBuilder* builder) override { + int DecodeIndices(int num_values, arrow::ArrayBuilder* builder) override { num_values = std::min(num_values, num_values_); num_values = std::min(num_values, num_values_); if (num_values > 0) { @@ -900,7 +993,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { if (num_values != idx_decoder_.GetBatch(indices_buffer, num_values)) { ParquetException::EofException(); } - auto binary_builder = checked_cast<::arrow::BinaryDictionary32Builder*>(builder); + auto binary_builder = checked_cast(builder); PARQUET_THROW_NOT_OK(binary_builder->AppendIndices(indices_buffer, num_values)); num_values_ -= num_values; return num_values; @@ -935,7 +1028,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { // BinaryDictionary32Builder std::shared_ptr indices_scratch_space_; - ::arrow::util::RleDecoder idx_decoder_; + arrow::util::RleDecoder idx_decoder_; }; template @@ -998,17 +1091,17 @@ inline void DictDecoderImpl::SetDict(TypedDecoder* dictionar } template -void DictDecoderImpl::InsertDictionary(::arrow::ArrayBuilder* builder) { +void DictDecoderImpl::InsertDictionary(arrow::ArrayBuilder* builder) { ParquetException::NYI("InsertDictionary only implemented for BYTE_ARRAY types"); } template <> -void DictDecoderImpl::InsertDictionary(::arrow::ArrayBuilder* builder) { - auto binary_builder = checked_cast<::arrow::BinaryDictionary32Builder*>(builder); +void DictDecoderImpl::InsertDictionary(arrow::ArrayBuilder* builder) { + auto binary_builder = checked_cast(builder); 
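+  // Once the dictionary values are registered with the builder, later
+  // DecodeIndices / DecodeIndicesSpaced calls can append raw indices via
+  // AppendIndices without re-materializing the values.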
// Make an BinaryArray referencing the internal dictionary data - auto arr = std::make_shared<::arrow::BinaryArray>( - dictionary_length_, byte_array_offsets_, byte_array_data_); + auto arr = std::make_shared(dictionary_length_, byte_array_offsets_, + byte_array_data_); PARQUET_THROW_NOT_OK(binary_builder->InsertMemoValues(*arr)); } @@ -1020,7 +1113,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - ::arrow::BinaryDictionary32Builder* builder) override { + arrow::BinaryDictionary32Builder* builder) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, valid_bits_offset, builder, &result)); @@ -1029,7 +1122,15 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, - ::arrow::internal::ChunkedBinaryBuilder* builder) override { + arrow::internal::ChunkedBinaryBuilder* builder) override { + int result = 0; + PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, + valid_bits_offset, builder, &result)); + return result; + } + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, arrow::BinaryBuilder* builder) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits, valid_bits_offset, builder, &result)); @@ -1037,14 +1138,14 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, } int DecodeArrowNonNull(int num_values, - ::arrow::BinaryDictionary32Builder* builder) override { + arrow::BinaryDictionary32Builder* builder) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); return result; } int DecodeArrowNonNull(int num_values, - ::arrow::internal::ChunkedBinaryBuilder* builder) override { + arrow::internal::ChunkedBinaryBuilder* builder) override { int result = 0; PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result)); return result; @@ -1052,13 +1153,13 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, private: template - ::arrow::Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, - int64_t valid_bits_offset, BuilderType* builder, - int* out_num_values) { + arrow::Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, BuilderType* builder, + int* out_num_values) { constexpr int32_t buffer_size = 1024; int32_t indices_buffer[buffer_size]; RETURN_NOT_OK(builder->Reserve(num_values)); - ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); + arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); auto dict_values = reinterpret_cast(dictionary_->data()); @@ -1099,16 +1200,16 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, } } if (values_decoded != num_values) { - return ::arrow::Status::IOError("Expected to dictionary-decode ", num_values, - " but only able to decode ", values_decoded); + return arrow::Status::IOError("Expected to dictionary-decode ", num_values, + " but only able to decode ", values_decoded); } *out_num_values = values_decoded; - return ::arrow::Status::OK(); + return arrow::Status::OK(); } template - ::arrow::Status DecodeArrowNonNull(int num_values, BuilderType* builder, - int* out_num_values) { + arrow::Status DecodeArrowNonNull(int num_values, BuilderType* builder, + int* out_num_values) { 
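+    // Same batched index decoding as DecodeArrow above, minus the
+    // validity-bitmap bookkeeping, since every slot is known to be non-null.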
constexpr int32_t buffer_size = 2048; int32_t indices_buffer[buffer_size]; int values_decoded = 0; @@ -1130,7 +1231,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, ParquetException::EofException(); } *out_num_values = values_decoded; - return ::arrow::Status::OK(); + return arrow::Status::OK(); } }; @@ -1149,7 +1250,7 @@ class DeltaBitPackDecoder : public DecoderImpl, virtual public TypedDecodernum_values_ = num_values; - decoder_ = ::arrow::BitUtil::BitReader(data, len); + decoder_ = arrow::BitUtil::BitReader(data, len); values_current_block_ = 0; values_current_mini_block_ = 0; } @@ -1221,8 +1322,8 @@ class DeltaBitPackDecoder : public DecoderImpl, virtual public TypedDecoder { public: - explicit DeltaLengthByteArrayDecoder( - const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) + explicit DeltaLengthByteArrayDecoder(const ColumnDescriptor* descr, + MemoryPool* pool = arrow::default_memory_pool()) : DecoderImpl(descr, Encoding::DELTA_LENGTH_BYTE_ARRAY), len_decoder_(nullptr, pool) {} @@ -1282,9 +1382,8 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl, class DeltaByteArrayDecoder : public DecoderImpl, virtual public TypedDecoder { public: - explicit DeltaByteArrayDecoder( - const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) + explicit DeltaByteArrayDecoder(const ColumnDescriptor* descr, + MemoryPool* pool = arrow::default_memory_pool()) : DecoderImpl(descr, Encoding::DELTA_BYTE_ARRAY), prefix_len_decoder_(nullptr, pool), suffix_decoder_(nullptr, pool), @@ -1366,7 +1465,7 @@ namespace detail { std::unique_ptr MakeDictDecoder(Type::type type_num, const ColumnDescriptor* descr, - ::arrow::MemoryPool* pool) { + MemoryPool* pool) { switch (type_num) { case Type::BOOLEAN: ParquetException::NYI("Dictionary encoding not implemented for boolean type"); diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index 395d6b77546..84be806fc7e 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -28,8 +28,10 @@ namespace arrow { +class Array; class ArrayBuilder; class BinaryArray; +class BinaryBuilder; class BinaryDictionary32Builder; namespace internal { @@ -52,7 +54,9 @@ class Encoder { virtual std::shared_ptr FlushValues() = 0; virtual Encoding::type encoding() const = 0; - virtual ::arrow::MemoryPool* memory_pool() const = 0; + virtual void Put(const ::arrow::Array& values) = 0; + + virtual MemoryPool* memory_pool() const = 0; }; // Base class for value encoders. 
Since encoders may or not have state (e.g., @@ -64,25 +68,12 @@ class TypedEncoder : virtual public Encoder { public: typedef typename DType::c_type T; + using Encoder::Put; + virtual void Put(const T* src, int num_values) = 0; virtual void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits, - int64_t valid_bits_offset) { - std::shared_ptr buffer; - PARQUET_THROW_NOT_OK(::arrow::AllocateResizableBuffer( - this->memory_pool(), num_values * sizeof(T), &buffer)); - int32_t num_valid_values = 0; - ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, - num_values); - T* data = reinterpret_cast(buffer->mutable_data()); - for (int32_t i = 0; i < num_values; i++) { - if (valid_bits_reader.IsSet()) { - data[num_valid_values++] = src[i]; - } - valid_bits_reader.Next(); - } - Put(data, num_valid_values); - } + int64_t valid_bits_offset) = 0; }; // Base class for dictionary encoders @@ -107,12 +98,19 @@ class DictEncoder : virtual public TypedEncoder { virtual int num_entries() const = 0; - /// \brief EXPERIMENTAL: Append dictionary indices into the - /// encoder. It is assumed that the indices reference pre-existing - /// dictionary values + /// \brief EXPERIMENTAL: Append dictionary indices into the encoder. It is + /// assumed, without any boundschecking that the indices reference + /// pre-existing dictionary values /// \param[in] indices the dictionary index values /// \param[in] length the number of values being inserted virtual void PutIndices(const int32_t* indices, int length) = 0; + + /// \brief EXPERIMENTAL: Append dictionary into encoder, inserting indices + /// separately. Currently throws exception if the current dictionary memo is + /// non-empty + /// \param[in] values the dictionary values. Only valid for certain + /// Parquet/Arrow type combinations, like BYTE_ARRAY/BinaryArray + virtual void PutDictionary(const ::arrow::Array& values) = 0; }; // ---------------------------------------------------------------------- @@ -212,23 +210,9 @@ using Int64Encoder = TypedEncoder; using Int96Encoder = TypedEncoder; using FloatEncoder = TypedEncoder; using DoubleEncoder = TypedEncoder; -class ByteArrayEncoder : virtual public TypedEncoder { - public: - using TypedEncoder::Put; - virtual void Put(const ::arrow::BinaryArray& values) = 0; -}; - +using ByteArrayEncoder = TypedEncoder; class FLBAEncoder : virtual public TypedEncoder {}; -class DictByteArrayEncoder : virtual public DictEncoder, - virtual public ByteArrayEncoder { - public: - /// \brief EXPERIMENTAL: Append dictionary into encoder, inserting - /// indices separately - /// \param[in] values the dictionary values - virtual void PutDictionary(const ::arrow::BinaryArray& values) = 0; -}; - class BooleanDecoder : virtual public TypedDecoder { public: using TypedDecoder::Decode; @@ -252,6 +236,9 @@ class ByteArrayDecoder : virtual public TypedDecoder { virtual int DecodeArrowNonNull(int num_values, ::arrow::BinaryDictionary32Builder* builder) = 0; + virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, ::arrow::BinaryBuilder* builder) = 0; + virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, ::arrow::internal::ChunkedBinaryBuilder* builder) = 0; diff --git a/cpp/src/parquet/statistics-test.cc b/cpp/src/parquet/statistics-test.cc index ccc2d248f2d..84150d1a8bf 100644 --- a/cpp/src/parquet/statistics-test.cc +++ b/cpp/src/parquet/statistics-test.cc @@ -831,6 +831,25 @@ 
TYPED_TEST(TestStatisticsSortOrder, MinMax) { ASSERT_NO_FATAL_FAILURE(this->VerifyParquetStats()); } +TEST(TestByteArrayStatisticsFromArrow, Basics) { + // Part of ARROW-3246. Replicating TestStatisticsSortOrder test but via Arrow + + auto values = ArrayFromJSON(::arrow::utf8(), + u8"[\"c123\", \"b123\", \"a123\", null, " + "null, \"f123\", \"g123\", \"h123\", \"i123\", \"ü123\"]"); + + const auto& typed_values = static_cast(*values); + + NodePtr node = PrimitiveNode::Make("field", Repetition::REQUIRED, Type::BYTE_ARRAY, + ConvertedType::UTF8); + ColumnDescriptor descr(node, 0, 0); + auto stats = MakeStatistics(&descr); + ASSERT_NO_FATAL_FAILURE(stats->Update(*values)); + + ASSERT_EQ(ByteArray(typed_values.GetView(2)), stats->min()); + ASSERT_EQ(ByteArray(typed_values.GetView(9)), stats->max()); +} + // Ensure UNKNOWN sort order is handled properly using TestStatisticsSortOrderFLBA = TestStatisticsSortOrder; From 57a45e0ce09c3c5130246d2671f2b649b0ef6657 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 11 Aug 2019 16:44:42 -0500 Subject: [PATCH 06/28] Direct binary put works --- cpp/src/parquet/encoding-test.cc | 2 +- cpp/src/parquet/encoding.cc | 9 +++++++++ cpp/src/parquet/encoding.h | 2 +- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/encoding-test.cc b/cpp/src/parquet/encoding-test.cc index 3cb43b6bd2d..3ad118097c3 100644 --- a/cpp/src/parquet/encoding-test.cc +++ b/cpp/src/parquet/encoding-test.cc @@ -480,7 +480,7 @@ TEST_F(PlainEncoding, CheckDecodeArrowNonNullDictBuilder) { this->CheckDecodeArrowNonNullUsingDictBuilder(); } -TEST(PlainEncoding, ArrowBinaryDirectPut) { +TEST(PlainEncodingAdHoc, ArrowBinaryDirectPut) { // Implemented as part of ARROW-3246 auto encoder = MakeTypedEncoder(Encoding::PLAIN); auto decoder = MakeTypedDecoder(Encoding::PLAIN); diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 41448976822..c7467c5625f 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -147,8 +147,15 @@ void PlainEncoder::Put(const arrow::Array& values) { ParquetException::NYI(values.type()->ToString()); } +void AssertBinary(const arrow::Array& values) { + if (dynamic_cast(&values) == nullptr) { + throw ParquetException("Only BinaryArray and subclasses supported"); + } +} + template <> void PlainEncoder::Put(const arrow::Array& values) { + AssertBinary(values); const auto& data = checked_cast(values); if (data.null_count() == 0) { // no nulls, just dump the data @@ -520,6 +527,7 @@ void DictEncoderImpl::Put(const arrow::Array& values) { template <> void DictEncoderImpl::Put(const arrow::Array& values) { + AssertBinary(values); const auto& data = checked_cast(values); if (data.null_count() == 0) { // no nulls, just dump the data @@ -542,6 +550,7 @@ void DictEncoderImpl::PutDictionary(const arrow::Array& values) { template <> void DictEncoderImpl::PutDictionary(const arrow::Array& values) { + AssertBinary(values); const auto& data = checked_cast(values); if (data.null_count() > 0) { throw ParquetException("Inserted binary dictionary cannot cannot contain nulls"); diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index 84be806fc7e..8c9aa9cffd9 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -211,7 +211,7 @@ using Int96Encoder = TypedEncoder; using FloatEncoder = TypedEncoder; using DoubleEncoder = TypedEncoder; using ByteArrayEncoder = TypedEncoder; -class FLBAEncoder : virtual public TypedEncoder {}; +using FLBAEncoder = TypedEncoder; class BooleanDecoder : 
virtual public TypedDecoder { public: From edc9f8473aa2bbc6f66318a794bf85c49f51cb7e Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 11 Aug 2019 21:52:42 -0500 Subject: [PATCH 07/28] Fix unit tests [skip ci] --- cpp/src/arrow/testing/gtest_util.cc | 4 +- .../parquet/arrow/arrow-reader-writer-test.cc | 9 ++- cpp/src/parquet/column_writer.cc | 9 ++- cpp/src/parquet/statistics.cc | 56 ++++++++++++++----- 4 files changed, 56 insertions(+), 22 deletions(-) diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index 49447642bec..687e0873d6a 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -49,7 +49,9 @@ static void PrintChunkedArray(const ChunkedArray& carr, std::stringstream* ss) { for (int i = 0; i < carr.num_chunks(); ++i) { auto c1 = carr.chunk(i); *ss << "Chunk " << i << std::endl; - ARROW_EXPECT_OK(::arrow::PrettyPrint(*c1, 0, ss)); + ::arrow::PrettyPrintOptions options(2); + options.window = 100; + ARROW_EXPECT_OK(::arrow::PrettyPrint(*c1, options, ss)); *ss << std::endl; } } diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc index 1d50bcb9220..fd11ed087b4 100644 --- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc @@ -439,9 +439,8 @@ void CheckSimpleRoundtrip(const std::shared_ptr& table, int64_t row_group std::shared_ptr
result; DoSimpleRoundtrip(table, false /* use_threads */, row_group_size, {}, &result, arrow_properties); - ASSERT_NO_FATAL_FAILURE( - ::arrow::AssertSchemaEqual(*table->schema(), *result->schema())); - ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*table, *result, false)); + ::arrow::AssertSchemaEqual(*table->schema(), *result->schema()); + ::arrow::AssertTablesEqual(*table, *result, false); } static std::shared_ptr MakeSimpleSchema(const DataType& type, @@ -751,8 +750,8 @@ TYPED_TEST(TestParquetIO, SingleEmptyListsColumnReadWrite) { TYPED_TEST(TestParquetIO, SingleNullableListNullableColumnReadWrite) { std::shared_ptr
table; - ASSERT_NO_FATAL_FAILURE(this->PrepareListTable(SMALL_SIZE, true, true, 10, &table)); - ASSERT_NO_FATAL_FAILURE(this->CheckRoundTrip(table)); + this->PrepareListTable(SMALL_SIZE, true, true, 10, &table); + this->CheckRoundTrip(table); } TYPED_TEST(TestParquetIO, SingleRequiredListNullableColumnReadWrite) { diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 799abb42a5f..e19b64db1d6 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1306,21 +1306,24 @@ Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_level array.type()->id() != ::arrow::Type::STRING) { ARROW_UNSUPPORTED(); } + int64_t value_offset = 0; auto WriteChunk = [&](int64_t offset, int64_t batch_size) { int64_t batch_num_values = 0; int64_t batch_num_spaced_values = 0; WriteLevelsSpaced(batch_size, def_levels + offset, rep_levels + offset, &batch_num_values, &batch_num_spaced_values); - std::shared_ptr<::arrow::Array> data_slice = array.Slice(offset, batch_size); + std::shared_ptr<::arrow::Array> data_slice = + array.Slice(value_offset, batch_num_spaced_values); current_encoder_->Put(*data_slice); if (page_statistics_ != nullptr) { page_statistics_->Update(*data_slice); } - CommitWriteAndCheckLimits(batch_size, batch_num_spaced_values); + CommitWriteAndCheckLimits(batch_size, batch_num_values); + value_offset += batch_num_spaced_values; }; PARQUET_CATCH_NOT_OK( - DoInBatches(array.length(), properties_->write_batch_size(), WriteChunk)); + DoInBatches(num_levels, properties_->write_batch_size(), WriteChunk)); return Status::OK(); } diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index d1ef804f2a8..4bada835741 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -161,9 +161,18 @@ class TypedComparatorImpl : virtual public TypedComparator { int64_t valid_bits_offset, T* out_min, T* out_max) override { ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, length); - T min = values[0]; - T max = values[0]; - for (int64_t i = 0; i < length; i++) { + + // Find the first non-null value + int64_t first_non_null = 0; + while (!valid_bits_reader.IsSet()) { + ++first_non_null; + valid_bits_reader.Next(); + } + + T min = values[first_non_null]; + T max = values[first_non_null]; + valid_bits_reader.Next(); + for (int64_t i = first_non_null + 1; i < length; i++) { if (valid_bits_reader.IsSet()) { if (CompareInline(values[i], min)) { min = values[i]; @@ -195,20 +204,41 @@ void GetMinMaxBinaryHelper( const TypedComparatorImpl& comparator, const ::arrow::Array& values, ByteArray* out_min, ByteArray* out_max) { const auto& data = checked_cast(values); - ::arrow::internal::BitmapReader valid_bits_reader(data.null_bitmap_data(), - data.offset(), data.length()); - ByteArray min = data.GetView(0); - ByteArray max = data.GetView(0); - for (int64_t i = 0; i < data.length(); i++) { - ByteArray val = data.GetView(i); - if (valid_bits_reader.IsSet()) { + + ByteArray min, max; + if (data.null_count() > 0) { + ::arrow::internal::BitmapReader valid_bits_reader(data.null_bitmap_data(), + data.offset(), data.length()); + + int64_t first_non_null = 0; + while (!valid_bits_reader.IsSet()) { + ++first_non_null; + valid_bits_reader.Next(); + } + min = data.GetView(first_non_null); + max = data.GetView(first_non_null); + for (int64_t i = first_non_null; i < data.length(); i++) { + ByteArray val = data.GetView(i); + if (valid_bits_reader.IsSet()) { + if (comparator.CompareInline(val, min)) { + min = 
val; + } else if (comparator.CompareInline(max, val)) { + max = val; + } + } + valid_bits_reader.Next(); + } + } else { + min = data.GetView(0); + max = data.GetView(0); + for (int64_t i = 0; i < data.length(); i++) { + ByteArray val = data.GetView(i); if (comparator.CompareInline(val, min)) { min = val; } else if (comparator.CompareInline(max, val)) { max = val; } } - valid_bits_reader.Next(); } *out_min = min; *out_max = max; @@ -568,10 +598,10 @@ void TypedStatisticsImpl::UpdateSpaced(const T* values, const uint8_t* va // As (num_not_null != 0) there must be one int64_t length = num_null + num_not_null; int64_t i = 0; - ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, - length); StatsHelper helper; if (helper.CanHaveNaN()) { + ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, + length); for (; i < length; i++) { // PARQUET-1225: Handle NaNs if (valid_bits_reader.IsSet() && !helper.IsNaN(values[i])) { From d21ebd852b97ac7679936bdc6da90e1f29eb9780 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 12 Aug 2019 12:09:15 -0500 Subject: [PATCH 08/28] Get all direct put unit tests passing [skip ci] --- cpp/src/arrow/pretty_print.cc | 15 +++++++ cpp/src/arrow/pretty_print.h | 4 ++ cpp/src/arrow/testing/gtest_util.cc | 8 ++-- cpp/src/parquet/column_reader.cc | 4 +- cpp/src/parquet/encoding-test.cc | 68 +++++++++++++++++++++++------ cpp/src/parquet/encoding.cc | 31 +++++++------ cpp/src/parquet/encoding.h | 5 ++- 7 files changed, 101 insertions(+), 34 deletions(-) diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index 88bd5470a80..1daaa91de38 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -536,6 +536,21 @@ Status PrettyPrint(const RecordBatch& batch, int indent, std::ostream* sink) { return Status::OK(); } +Status PrettyPrint(const RecordBatch& batch, const PrettyPrintOptions& options, + std::ostream* sink) { + for (int i = 0; i < batch.num_columns(); ++i) { + const std::string& name = batch.column_name(i); + PrettyPrintOptions column_options = options; + column_options.indent += 2; + + (*sink) << name << ": "; + RETURN_NOT_OK(PrettyPrint(*batch.column(i), column_options, sink)); + (*sink) << "\n"; + } + (*sink) << std::flush; + return Status::OK(); +} + Status PrettyPrint(const Table& table, const PrettyPrintOptions& options, std::ostream* sink) { RETURN_NOT_OK(PrettyPrint(*table.schema(), options, sink)); diff --git a/cpp/src/arrow/pretty_print.h b/cpp/src/arrow/pretty_print.h index 5740341a67d..70caa8cfa87 100644 --- a/cpp/src/arrow/pretty_print.h +++ b/cpp/src/arrow/pretty_print.h @@ -61,6 +61,10 @@ struct PrettyPrintOptions { ARROW_EXPORT Status PrettyPrint(const RecordBatch& batch, int indent, std::ostream* sink); +ARROW_EXPORT +Status PrettyPrint(const RecordBatch& batch, const PrettyPrintOptions& options, + std::ostream* sink); + /// \brief Print human-readable representation of Table ARROW_EXPORT Status PrettyPrint(const Table& table, const PrettyPrintOptions& options, diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index 687e0873d6a..2c54aa0aef7 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -49,8 +49,7 @@ static void PrintChunkedArray(const ChunkedArray& carr, std::stringstream* ss) { for (int i = 0; i < carr.num_chunks(); ++i) { auto c1 = carr.chunk(i); *ss << "Chunk " << i << std::endl; - ::arrow::PrettyPrintOptions options(2); - options.window = 100; + 
::arrow::PrettyPrintOptions options(/*indent=*/2); ARROW_EXPECT_OK(::arrow::PrettyPrint(*c1, options, ss)); *ss << std::endl; } @@ -61,8 +60,9 @@ void AssertTsEqual(const T& expected, const T& actual) { if (!expected.Equals(actual)) { std::stringstream pp_expected; std::stringstream pp_actual; - ARROW_EXPECT_OK(PrettyPrint(expected, 0, &pp_expected)); - ARROW_EXPECT_OK(PrettyPrint(actual, 0, &pp_actual)); + ::arrow::PrettyPrintOptions options(/*indent=*/2); + ARROW_EXPECT_OK(PrettyPrint(expected, options, &pp_expected)); + ARROW_EXPECT_OK(PrettyPrint(actual, options, &pp_actual)); FAIL() << "Got: \n" << pp_actual.str() << "\nExpected: \n" << pp_expected.str(); } } diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 925a4ff80ab..0fd3a4c28dd 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -1234,7 +1234,7 @@ class ByteArrayChunkedRecordReader : public TypedRecordReader, int64_t num_decoded = this->current_decoder_->DecodeArrow( static_cast(values_to_read), static_cast(null_count), valid_bits_->mutable_data(), values_written_, builder_.get()); - DCHECK_EQ(num_decoded, values_to_read); + DCHECK_EQ(num_decoded, values_to_read - null_count); ResetValues(); } @@ -1310,7 +1310,7 @@ class ByteArrayDictionaryRecordReader : public TypedRecordReader, /// Flush values since they have been copied into the builder ResetValues(); } - DCHECK_EQ(num_decoded, values_to_read); + DCHECK_EQ(num_decoded, values_to_read - null_count); } private: diff --git a/cpp/src/parquet/encoding-test.cc b/cpp/src/parquet/encoding-test.cc index 3ad118097c3..c67dc5dd19c 100644 --- a/cpp/src/parquet/encoding-test.cc +++ b/cpp/src/parquet/encoding-test.cc @@ -382,7 +382,7 @@ class TestArrowBuilderDecoding : public ::testing::Test { template void CheckDense(int actual_num_values, Builder& builder) { - ASSERT_EQ(actual_num_values, num_values_); + ASSERT_EQ(actual_num_values, num_values_ - null_count_); arrow::ArrayVector actual_vec; ASSERT_OK(builder.Finish(&actual_vec)); ASSERT_EQ(actual_vec.size(), 1); @@ -391,7 +391,7 @@ class TestArrowBuilderDecoding : public ::testing::Test { template void CheckDict(int actual_num_values, Builder& builder) { - ASSERT_EQ(actual_num_values, num_values_); + ASSERT_EQ(actual_num_values, num_values_ - null_count_); std::shared_ptr actual; ASSERT_OK(builder.Finish(&actual)); ASSERT_ARRAYS_EQUAL(*actual, *expected_dict_); @@ -480,35 +480,77 @@ TEST_F(PlainEncoding, CheckDecodeArrowNonNullDictBuilder) { this->CheckDecodeArrowNonNullUsingDictBuilder(); } -TEST(PlainEncodingAdHoc, ArrowBinaryDirectPut) { +TEST(EncodingArrow, PlainBinaryDirectPut) { // Implemented as part of ARROW-3246 - auto encoder = MakeTypedEncoder(Encoding::PLAIN); - auto decoder = MakeTypedDecoder(Encoding::PLAIN); - const int64_t size = 1000; + const int64_t size = 50; const int64_t min_length = 0; const int64_t max_length = 10; const double null_probability = 0.1; - arrow::random::RandomArrayGenerator rag(0); auto values = rag.String(size, min_length, max_length, null_probability); - auto i32_values = rag.Int32(size, 0, 10, null_probability); + + auto encoder = MakeTypedEncoder(Encoding::PLAIN); + auto decoder = MakeTypedDecoder(Encoding::PLAIN); + ASSERT_NO_THROW(encoder->Put(*values)); + auto buf = encoder->FlushValues(); + + int64_t num_values = values->length() - values->null_count(); + decoder->SetData(num_values, buf->data(), static_cast(buf->size())); + + arrow::StringBuilder builder; + ASSERT_EQ(num_values, + decoder->DecodeArrow(values->length(), 
values->null_count(), + values->null_bitmap_data(), values->offset(), &builder)); + + std::shared_ptr result; + ASSERT_OK(builder.Finish(&result)); + arrow::AssertArraysEqual(*values, *result); // Type checked + auto i32_values = rag.Int32(size, 0, 10, null_probability); ASSERT_THROW(encoder->Put(*i32_values), ParquetException); +} + +TEST(EncodingArrow, DictBinaryDirectPut) { + // Implemented as part of ARROW-3246 + + const int64_t size = 50; + const int64_t min_length = 0; + const int64_t max_length = 10; + const double null_probability = 0.1; + arrow::random::RandomArrayGenerator rag(0); + auto values = rag.String(size, min_length, max_length, null_probability); + + auto owned_encoder = MakeTypedEncoder(Encoding::PLAIN, + /*use_dictionary=*/true); + auto owned_decoder = MakeDictDecoder(); + auto encoder = dynamic_cast*>(owned_encoder.get()); + auto decoder = dynamic_cast(owned_decoder.get()); + + ASSERT_NO_THROW(encoder->Put(*values)); auto buf = encoder->FlushValues(); - decoder->SetData(size, buf->data(), static_cast(buf->size())); + + auto dict_buf = AllocateBuffer(default_memory_pool(), encoder->dict_encoded_size()); + encoder->WriteDict(dict_buf->mutable_data()); + + auto values_decoder = MakeTypedDecoder(Encoding::PLAIN); + values_decoder->SetData(encoder->num_entries(), dict_buf->data(), + static_cast(dict_buf->size())); + + int64_t num_values = values->length() - values->null_count(); + decoder->SetData(num_values, buf->data(), static_cast(buf->size())); + owned_decoder->SetDict(values_decoder.get()); arrow::StringBuilder builder; - ASSERT_EQ(size, - decoder->DecodeArrow(size, values->null_count(), values->null_bitmap_data(), - values->offset(), &builder)); + ASSERT_EQ(num_values, + decoder->DecodeArrow(values->length(), values->null_count(), + values->null_bitmap_data(), values->offset(), &builder)); std::shared_ptr result; ASSERT_OK(builder.Finish(&result)); - arrow::AssertArraysEqual(*values, *result); } diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index c7467c5625f..252586a0b90 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -827,8 +827,7 @@ class PlainByteArrayDecoder : public PlainDecoder, template arrow::Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, BuilderType* builder, - int* values_decoded) { - num_values = std::min(num_values, num_values_); + int* out_values_decoded) { RETURN_NOT_OK(builder->Reserve(num_values)); arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); int increment; @@ -836,7 +835,8 @@ class PlainByteArrayDecoder : public PlainDecoder, const uint8_t* data = data_; int64_t data_size = len_; int bytes_decoded = 0; - while (i < num_values) { + int64_t values_decoded = 0; + while (i < num_values && values_decoded < num_values_) { if (bit_reader.IsSet()) { uint32_t len = arrow::util::SafeLoadAs(data); increment = static_cast(sizeof(uint32_t) + len); @@ -847,6 +847,7 @@ class PlainByteArrayDecoder : public PlainDecoder, data += increment; data_size -= increment; bytes_decoded += increment; + ++values_decoded; } else { RETURN_NOT_OK(builder->AppendNull()); } @@ -856,8 +857,8 @@ class PlainByteArrayDecoder : public PlainDecoder, data_ += bytes_decoded; len_ -= bytes_decoded; - num_values_ -= num_values; - *values_decoded = num_values; + num_values_ -= values_decoded; + *out_values_decoded = values_decoded; return arrow::Status::OK(); } @@ -956,7 +957,6 @@ class DictDecoderImpl : public DecoderImpl, virtual public 
DictDecoder { int DecodeIndicesSpaced(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, arrow::ArrayBuilder* builder) override { - num_values = std::min(num_values, num_values_); if (num_values > 0) { // TODO(wesm): Refactor to batch reads for improved memory use. It is not // trivial because the null_count is relative to the entire bitmap @@ -983,8 +983,8 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder { auto binary_builder = checked_cast(builder); PARQUET_THROW_NOT_OK( binary_builder->AppendIndices(indices_buffer, num_values, valid_bytes.data())); - num_values_ -= num_values; - return num_values; + num_values_ -= num_values - null_count; + return num_values - null_count; } int DecodeIndices(int num_values, arrow::ArrayBuilder* builder) override { @@ -1167,19 +1167,21 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, int* out_num_values) { constexpr int32_t buffer_size = 1024; int32_t indices_buffer[buffer_size]; + RETURN_NOT_OK(builder->Reserve(num_values)); arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values); auto dict_values = reinterpret_cast(dictionary_->data()); int values_decoded = 0; - while (values_decoded < num_values) { + int num_appended = 0; + while (num_appended < num_values) { bool is_valid = bit_reader.IsSet(); bit_reader.Next(); if (is_valid) { int32_t batch_size = - std::min(buffer_size, num_values - values_decoded - null_count); + std::min(buffer_size, num_values - num_appended - null_count); int num_indices = idx_decoder_.GetBatch(indices_buffer, batch_size); int i = 0; @@ -1189,11 +1191,12 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, const auto& val = dict_values[indices_buffer[i]]; RETURN_NOT_OK(builder->Append(val.ptr, val.len)); ++i; + ++values_decoded; } else { RETURN_NOT_OK(builder->AppendNull()); --null_count; } - ++values_decoded; + ++num_appended; if (i == num_indices) { // Do not advance the bit_reader if we have fulfilled the decode // request @@ -1205,12 +1208,12 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, } else { RETURN_NOT_OK(builder->AppendNull()); --null_count; - ++values_decoded; + ++num_appended; } } - if (values_decoded != num_values) { + if (num_values != num_appended) { return arrow::Status::IOError("Expected to dictionary-decode ", num_values, - " but only able to decode ", values_decoded); + " but only able to decode ", num_appended); } *out_num_values = values_decoded; return arrow::Status::OK(); diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index 8c9aa9cffd9..622545a3343 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -229,6 +229,7 @@ class ByteArrayDecoder : virtual public TypedDecoder { public: using TypedDecoder::DecodeSpaced; + /// \brief Returns number of encoded values decoded virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, ::arrow::BinaryDictionary32Builder* builder) = 0; @@ -236,9 +237,11 @@ class ByteArrayDecoder : virtual public TypedDecoder { virtual int DecodeArrowNonNull(int num_values, ::arrow::BinaryDictionary32Builder* builder) = 0; + /// \brief Returns number of encoded values decoded virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset, ::arrow::BinaryBuilder* builder) = 0; + /// \brief Returns number of encoded values decoded virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, int64_t 
valid_bits_offset, ::arrow::internal::ChunkedBinaryBuilder* builder) = 0; @@ -340,7 +343,7 @@ std::unique_ptr MakeDictDecoder(Type::type type_num, template std::unique_ptr> MakeDictDecoder( - const ColumnDescriptor* descr, + const ColumnDescriptor* descr = NULLPTR, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { using OutType = DictDecoder; auto decoder = detail::MakeDictDecoder(DType::type_num, descr, pool); From 5aaf2817bee5d746d8faf67314c509c575d8aeb7 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 12 Aug 2019 14:19:20 -0500 Subject: [PATCH 09/28] Temp [skip ci] --- cpp/src/parquet/encoding-test.cc | 73 ++++++++++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/encoding-test.cc b/cpp/src/parquet/encoding-test.cc index c67dc5dd19c..f51cb06ae1c 100644 --- a/cpp/src/parquet/encoding-test.cc +++ b/cpp/src/parquet/encoding-test.cc @@ -480,7 +480,7 @@ TEST_F(PlainEncoding, CheckDecodeArrowNonNullDictBuilder) { this->CheckDecodeArrowNonNullUsingDictBuilder(); } -TEST(EncodingArrow, PlainBinaryDirectPut) { +TEST(PlainEncodingAdHoc, ArrowBinaryDirectPut) { // Implemented as part of ARROW-3246 const int64_t size = 50; @@ -513,9 +513,31 @@ TEST(EncodingArrow, PlainBinaryDirectPut) { ASSERT_THROW(encoder->Put(*i32_values), ParquetException); } -TEST(EncodingArrow, DictBinaryDirectPut) { - // Implemented as part of ARROW-3246 +void GetBinaryDictDecoder(DictEncoder* encoder, + std::shared_ptr* out_values, + std::shared_ptr* out_dict, + std::unique_ptr* out_decoder) { + auto decoder = MakeDictDecoder(); + auto buf = encoder->FlushValues(); + auto dict_buf = AllocateBuffer(default_memory_pool(), encoder->dict_encoded_size()); + encoder->WriteDict(dict_buf->mutable_data()); + + auto values_decoder = MakeTypedDecoder(Encoding::PLAIN); + values_decoder->SetData(encoder->num_entries(), dict_buf->data(), + static_cast(dict_buf->size())); + + int64_t num_values = values->length() - values->null_count(); + decoder->SetData(num_values, buf->data(), static_cast(buf->size())); + decoder->SetDict(values_decoder.get()); + + *out_values = buf; + *out_dict = dict_buf; + *out_decoder = std::unique_ptr( + dynamic_cast(decoder.release())); +} +TEST(DictEncodingAdHoc, ArrowBinaryDirectPut) { + // Implemented as part of ARROW-3246 const int64_t size = 50; const int64_t min_length = 0; const int64_t max_length = 10; @@ -525,12 +547,55 @@ TEST(EncodingArrow, DictBinaryDirectPut) { auto owned_encoder = MakeTypedEncoder(Encoding::PLAIN, /*use_dictionary=*/true); - auto owned_decoder = MakeDictDecoder(); auto encoder = dynamic_cast*>(owned_encoder.get()); auto decoder = dynamic_cast(owned_decoder.get()); ASSERT_NO_THROW(encoder->Put(*values)); + + auto buf = encoder->FlushValues(); + + auto dict_buf = AllocateBuffer(default_memory_pool(), encoder->dict_encoded_size()); + encoder->WriteDict(dict_buf->mutable_data()); + + auto values_decoder = MakeTypedDecoder(Encoding::PLAIN); + values_decoder->SetData(encoder->num_entries(), dict_buf->data(), + static_cast(dict_buf->size())); + + int64_t num_values = values->length() - values->null_count(); + decoder->SetData(num_values, buf->data(), static_cast(buf->size())); + owned_decoder->SetDict(values_decoder.get()); + + arrow::StringBuilder builder; + ASSERT_EQ(num_values, + decoder->DecodeArrow(values->length(), values->null_count(), + values->null_bitmap_data(), values->offset(), &builder)); + + std::shared_ptr result; + ASSERT_OK(builder.Finish(&result)); + arrow::AssertArraysEqual(*values, *result); +} + 
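The test that follows drives PutDictionary and PutIndices directly, which splits dictionary encoding into its two halves: the value memo and the index stream. As a point of reference, here is a minimal, self-contained sketch of that memoization in plain C++; the names are illustrative only and are not the parquet-cpp API.

#include <cstdint>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

// Each distinct value is assigned an index in order of first appearance; the
// value stream is rewritten as indices and the dictionary is emitted once.
std::pair<std::vector<std::string>, std::vector<int32_t>> NaiveDictEncode(
    const std::vector<std::string>& values) {
  std::vector<std::string> dictionary;
  std::vector<int32_t> indices;
  std::unordered_map<std::string, int32_t> memo;
  for (const std::string& v : values) {
    auto it = memo.find(v);
    if (it == memo.end()) {
      // First occurrence: grow the dictionary and remember its index.
      it = memo.emplace(v, static_cast<int32_t>(dictionary.size())).first;
      dictionary.push_back(v);
    }
    indices.push_back(it->second);
  }
  return {dictionary, indices};
}

In those terms, PutDictionary seeds the memo and the dictionary up front, which is why it may only be called on an empty encoder, while PutIndices appends to the index stream directly and trusts the caller that every index refers to an existing dictionary entry.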
+TEST(DictEncodingAdHoc, PutDictionaryPutIndices) { + // Part of ARROW-3246 + auto dict_values = arrow::ArrayFromJSON(arrow::binary(), + "[\"foo\", \"bar\", \"baz\"]"); + + auto owned_encoder = MakeTypedEncoder(Encoding::PLAIN, + /*use_dictionary=*/true); + auto owned_decoder = MakeDictDecoder(); + + auto encoder = dynamic_cast*>(owned_encoder.get()); + auto decoder = dynamic_cast(owned_decoder.get()); + + ASSERT_NO_THROW(encoder->PutDictionary(*dict_values)); + + std::vector indices = {0, 1, 2}; + ASSERT_NO_THROW(encoder->PutIndices(indices.data(), + static_cast(indices.length()))); + ASSERT_NO_THROW(encoder->PutIndices(indices.data(), + static_cast(indices.length()))); + auto buf = encoder->FlushValues(); auto dict_buf = AllocateBuffer(default_memory_pool(), encoder->dict_encoded_size()); From 8cc1bcfa9cde5e36b849dba84f2cc8599892d7aa Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 12 Aug 2019 14:50:52 -0500 Subject: [PATCH 10/28] Unit test for PutDictionary, PutIndices --- cpp/src/parquet/encoding-test.cc | 70 +++++++++++++------------------- cpp/src/parquet/encoding.cc | 25 +++++++++++- cpp/src/parquet/encoding.h | 6 +-- 3 files changed, 54 insertions(+), 47 deletions(-) diff --git a/cpp/src/parquet/encoding-test.cc b/cpp/src/parquet/encoding-test.cc index f51cb06ae1c..7f57bc03936 100644 --- a/cpp/src/parquet/encoding-test.cc +++ b/cpp/src/parquet/encoding-test.cc @@ -513,7 +513,7 @@ TEST(PlainEncodingAdHoc, ArrowBinaryDirectPut) { ASSERT_THROW(encoder->Put(*i32_values), ParquetException); } -void GetBinaryDictDecoder(DictEncoder* encoder, +void GetBinaryDictDecoder(DictEncoder* encoder, int64_t num_values, std::shared_ptr* out_values, std::shared_ptr* out_dict, std::unique_ptr* out_decoder) { @@ -522,13 +522,12 @@ void GetBinaryDictDecoder(DictEncoder* encoder, auto dict_buf = AllocateBuffer(default_memory_pool(), encoder->dict_encoded_size()); encoder->WriteDict(dict_buf->mutable_data()); - auto values_decoder = MakeTypedDecoder(Encoding::PLAIN); - values_decoder->SetData(encoder->num_entries(), dict_buf->data(), - static_cast(dict_buf->size())); + auto dict_decoder = MakeTypedDecoder(Encoding::PLAIN); + dict_decoder->SetData(encoder->num_entries(), dict_buf->data(), + static_cast(dict_buf->size())); - int64_t num_values = values->length() - values->null_count(); decoder->SetData(num_values, buf->data(), static_cast(buf->size())); - decoder->SetDict(values_decoder.get()); + decoder->SetDict(dict_decoder.get()); *out_values = buf; *out_dict = dict_buf; @@ -549,22 +548,13 @@ TEST(DictEncodingAdHoc, ArrowBinaryDirectPut) { /*use_dictionary=*/true); auto encoder = dynamic_cast*>(owned_encoder.get()); - auto decoder = dynamic_cast(owned_decoder.get()); ASSERT_NO_THROW(encoder->Put(*values)); - auto buf = encoder->FlushValues(); - - auto dict_buf = AllocateBuffer(default_memory_pool(), encoder->dict_encoded_size()); - encoder->WriteDict(dict_buf->mutable_data()); - - auto values_decoder = MakeTypedDecoder(Encoding::PLAIN); - values_decoder->SetData(encoder->num_entries(), dict_buf->data(), - static_cast(dict_buf->size())); - + std::unique_ptr decoder; + std::shared_ptr buf, dict_buf; int64_t num_values = values->length() - values->null_count(); - decoder->SetData(num_values, buf->data(), static_cast(buf->size())); - owned_decoder->SetDict(values_decoder.get()); + GetBinaryDictDecoder(encoder, num_values, &buf, &dict_buf, &decoder); arrow::StringBuilder builder; ASSERT_EQ(num_values, @@ -578,45 +568,41 @@ TEST(DictEncodingAdHoc, ArrowBinaryDirectPut) { TEST(DictEncodingAdHoc, 
PutDictionaryPutIndices) { // Part of ARROW-3246 - auto dict_values = arrow::ArrayFromJSON(arrow::binary(), - "[\"foo\", \"bar\", \"baz\"]"); + auto dict_values = arrow::ArrayFromJSON(arrow::binary(), "[\"foo\", \"bar\", \"baz\"]"); + auto indices = arrow::ArrayFromJSON(arrow::int32(), "[0, 1, 2]"); + auto indices_nulls = arrow::ArrayFromJSON(arrow::int32(), "[null, 0, 1, null, 2]"); + + auto expected = arrow::ArrayFromJSON(arrow::binary(), + "[\"foo\", \"bar\", \"baz\", null, " + "\"foo\", \"bar\", null, \"baz\"]"); auto owned_encoder = MakeTypedEncoder(Encoding::PLAIN, /*use_dictionary=*/true); auto owned_decoder = MakeDictDecoder(); auto encoder = dynamic_cast*>(owned_encoder.get()); - auto decoder = dynamic_cast(owned_decoder.get()); ASSERT_NO_THROW(encoder->PutDictionary(*dict_values)); - std::vector indices = {0, 1, 2}; - ASSERT_NO_THROW(encoder->PutIndices(indices.data(), - static_cast(indices.length()))); - ASSERT_NO_THROW(encoder->PutIndices(indices.data(), - static_cast(indices.length()))); + // Trying to call PutDictionary again throws + ASSERT_THROW(encoder->PutDictionary(*dict_values), ParquetException); - auto buf = encoder->FlushValues(); + ASSERT_NO_THROW(encoder->PutIndices(*indices)); + ASSERT_NO_THROW(encoder->PutIndices(*indices_nulls)); - auto dict_buf = AllocateBuffer(default_memory_pool(), encoder->dict_encoded_size()); - encoder->WriteDict(dict_buf->mutable_data()); - - auto values_decoder = MakeTypedDecoder(Encoding::PLAIN); - values_decoder->SetData(encoder->num_entries(), dict_buf->data(), - static_cast(dict_buf->size())); - - int64_t num_values = values->length() - values->null_count(); - decoder->SetData(num_values, buf->data(), static_cast(buf->size())); - owned_decoder->SetDict(values_decoder.get()); + std::unique_ptr decoder; + std::shared_ptr buf, dict_buf; + int64_t num_values = expected->length() - expected->null_count(); + GetBinaryDictDecoder(encoder, num_values, &buf, &dict_buf, &decoder); - arrow::StringBuilder builder; - ASSERT_EQ(num_values, - decoder->DecodeArrow(values->length(), values->null_count(), - values->null_bitmap_data(), values->offset(), &builder)); + arrow::BinaryBuilder builder; + ASSERT_EQ(num_values, decoder->DecodeArrow(expected->length(), expected->null_count(), + expected->null_bitmap_data(), + expected->offset(), &builder)); std::shared_ptr result; ASSERT_OK(builder.Finish(&result)); - arrow::AssertArraysEqual(*values, *result); + arrow::AssertArraysEqual(*expected, *result); } class DictEncoding : public TestArrowBuilderDecoding { diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 252586a0b90..b6a381f68fb 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -418,8 +418,25 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder { void Put(const arrow::Array& values) override; void PutDictionary(const arrow::Array& values) override; - void PutIndices(const int32_t* indices, int length) override { - buffered_indices_.insert(std::end(buffered_indices_), indices, indices + length); + void PutIndices(const arrow::Array& data) override { + if (data.type()->id() != arrow::Type::INT32) { + throw ParquetException("Only int32 indices currently supported"); + } + const auto& indices = checked_cast(data); + auto values = indices.raw_values(); + if (indices.null_count() > 0) { + arrow::internal::BitmapReader valid_bits_reader(indices.null_bitmap_data(), + indices.offset(), indices.length()); + for (int64_t i = 0; i < indices.length(); ++i) { + if (valid_bits_reader.IsSet()) 
{ + buffered_indices_.push_back(values[i]); + } + valid_bits_reader.Next(); + } + } else { + buffered_indices_.insert(std::end(buffered_indices_), values, + values + indices.length()); + } } std::shared_ptr FlushValues() override { @@ -551,6 +568,10 @@ void DictEncoderImpl::PutDictionary(const arrow::Array& values) { template <> void DictEncoderImpl::PutDictionary(const arrow::Array& values) { AssertBinary(values); + if (this->num_entries() > 0) { + throw ParquetException("Can only call PutDictionary on an empty DictEncoder"); + } + const auto& data = checked_cast(values); if (data.null_count() > 0) { throw ParquetException("Inserted binary dictionary cannot cannot contain nulls"); diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index 622545a3343..bca152caf63 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -101,9 +101,9 @@ class DictEncoder : virtual public TypedEncoder { /// \brief EXPERIMENTAL: Append dictionary indices into the encoder. It is /// assumed, without any boundschecking that the indices reference /// pre-existing dictionary values - /// \param[in] indices the dictionary index values - /// \param[in] length the number of values being inserted - virtual void PutIndices(const int32_t* indices, int length) = 0; + /// \param[in] indices the dictionary index values. Only Int32Array currently + /// supported + virtual void PutIndices(const ::arrow::Array& indices) = 0; /// \brief EXPERIMENTAL: Append dictionary into encoder, inserting indices /// separately. Currently throws exception if the current dictionary memo is From 0a293ee277a94b12e4121a1cfb596a180dfe3b0b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 12 Aug 2019 16:09:26 -0500 Subject: [PATCH 11/28] More scaffolding [skip ci] --- cpp/src/parquet/arrow/writer.cc | 22 ------ cpp/src/parquet/column_writer.cc | 117 +++++++++++++++++++++++-------- 2 files changed, 86 insertions(+), 53 deletions(-) diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index fb437f14320..e47e6a1f2c5 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -444,28 +444,6 @@ class FileWriterImpl : public FileWriter { Status WriteColumnChunk(const std::shared_ptr& data, int64_t offset, int64_t size) override { - // DictionaryArrays are not yet handled with a fast path. To still support - // writing them as a workaround, we convert them back to their non-dictionary - // representation. - if (data->type()->id() == ::arrow::Type::DICTIONARY) { - const ::arrow::DictionaryType& dict_type = - static_cast(*data->type()); - - // TODO(ARROW-1648): Remove this special handling once we require an Arrow - // version that has this fixed. 
- if (dict_type.value_type()->id() == ::arrow::Type::NA) { - auto null_array = std::make_shared<::arrow::NullArray>(data->length()); - return WriteColumnChunk(*null_array); - } - - FunctionContext ctx(this->memory_pool()); - ::arrow::compute::Datum cast_input(data); - ::arrow::compute::Datum cast_output; - RETURN_NOT_OK( - Cast(&ctx, cast_input, dict_type.value_type(), CastOptions(), &cast_output)); - return WriteColumnChunk(cast_output.chunked_array(), offset, size); - } - ColumnWriter* column_writer; PARQUET_CATCH_NOT_OK(column_writer = row_group_writer_->NextColumn()); diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index e19b64db1d6..532889f7abd 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -680,7 +680,8 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< DCHECK_NE(nullptr, values); } WriteValues(values + value_offset, values_to_write, batch_size - values_to_write); - CommitWriteAndCheckLimits(batch_size, values_to_write); + CommitWriteAndCheckPageLimit(batch_size, values_to_write); + CheckDictionarySizeLimit(); value_offset += values_to_write; }; DoInBatches(num_values, properties_->write_batch_size(), WriteChunk); @@ -698,7 +699,8 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< &batch_num_values, &batch_num_spaced_values); WriteValuesSpaced(values + value_offset, batch_num_values, batch_num_spaced_values, valid_bits, valid_bits_offset + value_offset); - CommitWriteAndCheckLimits(batch_size, batch_num_spaced_values); + CommitWriteAndCheckPageLimit(batch_size, batch_num_spaced_values); + CheckDictionarySizeLimit(); value_offset += batch_num_spaced_values; }; DoInBatches(num_values, properties_->write_batch_size(), WriteChunk); @@ -717,6 +719,13 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< return current_encoder_->FlushValues(); } + // Internal function to handle direct writing of arrow::DictionaryArray, + // since the standard logic concerning dictionary size limits and fallback to + // plain encoding is circumvented + Status WriteArrowDictionary(const int16_t* def_levels, const int16_t* rep_levels, + int64_t num_levels, const ::arrow::Array& array, + ArrowWriteContext* context); + void WriteDictionaryPage() override { // We have to dynamic cast here because of TypedEncoder as // some compilers don't want to cast through virtual inheritance @@ -731,28 +740,6 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< total_bytes_written_ += pager_->WriteDictionaryPage(page); } - // Checks if the Dictionary Page size limit is reached - // If the limit is reached, the Dictionary and Data Pages are serialized - // The encoding is switched to PLAIN - // - // Only one Dictionary Page is written. - // Fallback to PLAIN if dictionary page limit is reached. 
- void CheckDictionarySizeLimit() { - // We have to dynamic cast here because TypedEncoder as some compilers - // don't want to cast through virtual inheritance - auto dict_encoder = dynamic_cast*>(current_encoder_.get()); - if (dict_encoder->dict_encoded_size() >= properties_->dictionary_pagesize_limit()) { - WriteDictionaryPage(); - // Serialize the buffered Dictionary Indicies - FlushBufferedDataPages(); - fallback_ = true; - // Only PLAIN encoding is supported for fallback in V1 - current_encoder_ = MakeEncoder(DType::type_num, Encoding::PLAIN, false, descr_, - properties_->memory_pool()); - encoding_ = Encoding::PLAIN; - } - } - EncodedStatistics GetPageStatistics() override { EncodedStatistics result; if (page_statistics_) result = page_statistics_->Encode(); @@ -791,6 +778,12 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< std::shared_ptr page_statistics_; std::shared_ptr chunk_statistics_; + // If writing a sequence of arrow::DictionaryArray to the writer, we keep the + // dictionary passed to DictEncoder::PutDictionary so we can check + // subsequent array chunks to see either if materialization is required (in + // which case we call back to the dense write path) + std::shared_ptr preserved_dictionary_; + int64_t WriteLevels(int64_t num_values, const int16_t* def_levels, const int16_t* rep_levels) { int64_t values_to_write = 0; @@ -874,15 +867,40 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< *out_spaced_values_to_write = spaced_values_to_write; } - void CommitWriteAndCheckLimits(int64_t num_levels, int64_t num_values) { + void CommitWriteAndCheckPageLimit(int64_t num_levels, int64_t num_values) { num_buffered_values_ += num_levels; num_buffered_encoded_values_ += num_values; if (current_encoder_->EstimatedDataEncodedSize() >= properties_->data_pagesize()) { AddDataPage(); } - if (has_dictionary_ && !fallback_) { - CheckDictionarySizeLimit(); + } + + // Checks if the Dictionary Page size limit is reached + // If the limit is reached, the Dictionary and Data Pages are serialized + // The encoding is switched to PLAIN + // + // Only one Dictionary Page is written. + // Fallback to PLAIN if dictionary page limit is reached. 
+ void CheckDictionarySizeLimit() { + if (!has_dictionary_ || fallback_) { + // Either not using dictionary encoding, or we have already fallen back + // to PLAIN encoding because the size threshold was reached + return; + } + + // We have to dynamic cast here because TypedEncoder as some compilers + // don't want to cast through virtual inheritance + auto dict_encoder = dynamic_cast*>(current_encoder_.get()); + if (dict_encoder->dict_encoded_size() >= properties_->dictionary_pagesize_limit()) { + WriteDictionaryPage(); + // Serialize the buffered Dictionary Indicies + FlushBufferedDataPages(); + fallback_ = true; + // Only PLAIN encoding is supported for fallback in V1 + current_encoder_ = MakeEncoder(DType::type_num, Encoding::PLAIN, false, descr_, + properties_->memory_pool()); + encoding_ = Encoding::PLAIN; } } @@ -912,6 +930,16 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< } }; +template +Status TypedColumnWriterImpl::WriteArrowDictionary(const int16_t* def_levels, + const int16_t* rep_levels, + int64_t num_levels, + const ::arrow::Array& array, + ArrowWriteContext* ctx) { + return Status::NotImplemented("Dictionary writes not implemented for ", + descr_->ToString()); +} + // ---------------------------------------------------------------------- // Direct Arrow write path @@ -1296,32 +1324,59 @@ Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, // ---------------------------------------------------------------------- // Write Arrow to BYTE_ARRAY +Status MaterializeDictionary(const Array& array, std::shared_ptr* out) { + const ::arrow::DictionaryType& dict_type = + static_cast(*data->type()); + + // TODO(ARROW-1648): Remove this special handling once we require an Arrow + // version that has this fixed. 
+ if (dict_type.value_type()->id() == ::arrow::Type::NA) { + *out = std::make_shared<::arrow::NullArray>(data->length()); + return Status::OK(); + } + + FunctionContext ctx(this->memory_pool()); + ::arrow::compute::Datum cast_output; + RETURN_NOT_OK(Cast(&ctx, data, dict_type.value_type(), CastOptions(), &cast_output)); + *out = cast_output.make_array(); + return Status::OK(); +} + +template <> +Status TypedColumnWriterImpl::WriteArrowDictionary( + const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels, + const ::arrow::Array& array, ArrowWriteContext* ctx) {} + template <> Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels, const ::arrow::Array& array, ArrowWriteContext* ctx) { - if (array.type()->id() != ::arrow::Type::BINARY && - array.type()->id() != ::arrow::Type::STRING) { + if (array.type()->id() != arrow::Type::DICTIONARY) { + return WriteArrowDictionary(def_levels, rep_levels, num_levels, array, ctx); + } else if (array.type()->id() != arrow::Type::BINARY && + array.type()->id() != arrow::Type::STRING) { ARROW_UNSUPPORTED(); } + int64_t value_offset = 0; auto WriteChunk = [&](int64_t offset, int64_t batch_size) { int64_t batch_num_values = 0; int64_t batch_num_spaced_values = 0; WriteLevelsSpaced(batch_size, def_levels + offset, rep_levels + offset, &batch_num_values, &batch_num_spaced_values); - std::shared_ptr<::arrow::Array> data_slice = array.Slice(value_offset, batch_num_spaced_values); current_encoder_->Put(*data_slice); if (page_statistics_ != nullptr) { page_statistics_->Update(*data_slice); } - CommitWriteAndCheckLimits(batch_size, batch_num_values); + CommitWriteAndCheckPageLimit(batch_size, batch_num_values); + CheckDictionarySizeLimit(); value_offset += batch_num_spaced_values; }; + PARQUET_CATCH_NOT_OK( DoInBatches(num_levels, properties_->write_batch_size(), WriteChunk)); return Status::OK(); From 138053158c46d955fa338dd6260f52907cc5f387 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 12 Aug 2019 19:43:27 -0500 Subject: [PATCH 12/28] Closer to full dictionary write, NA test failing --- cpp/src/arrow/testing/gtest_util.cc | 1 + .../parquet/arrow/arrow-reader-writer-test.cc | 2 +- cpp/src/parquet/arrow/writer.cc | 22 +- cpp/src/parquet/column_writer.cc | 374 +++++++++++------- cpp/src/parquet/encoding.cc | 47 ++- 5 files changed, 279 insertions(+), 167 deletions(-) diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index 2c54aa0aef7..c5911dcf475 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -61,6 +61,7 @@ void AssertTsEqual(const T& expected, const T& actual) { std::stringstream pp_expected; std::stringstream pp_actual; ::arrow::PrettyPrintOptions options(/*indent=*/2); + options.window = 50; ARROW_EXPECT_OK(PrettyPrint(expected, options, &pp_expected)); ARROW_EXPECT_OK(PrettyPrint(actual, options, &pp_actual)); FAIL() << "Got: \n" << pp_actual.str() << "\nExpected: \n" << pp_expected.str(); diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc index fd11ed087b4..7763b436fa9 100644 --- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc @@ -671,7 +671,7 @@ TYPED_TEST(TestParquetIO, SingleColumnOptionalReadWrite) { // This also tests max_definition_level = 1 std::shared_ptr values; - ASSERT_OK(NullableArray(SMALL_SIZE, 10, kDefaultSeed, &values)); + ASSERT_OK(NullableArray(SMALL_SIZE / 
8, 10, kDefaultSeed, &values)); std::shared_ptr schema = MakeSimpleSchema(*values->type(), Repetition::OPTIONAL); diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index e47e6a1f2c5..5c55426477f 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -25,7 +25,6 @@ #include "arrow/array.h" #include "arrow/buffer-builder.h" -#include "arrow/compute/api.h" #include "arrow/table.h" #include "arrow/type.h" #include "arrow/visitor_inline.h" @@ -43,6 +42,7 @@ using arrow::BinaryArray; using arrow::BooleanArray; using arrow::ChunkedArray; using arrow::Decimal128Array; +using arrow::DictionaryArray; using arrow::Field; using arrow::FixedSizeBinaryArray; using Int16BufferBuilder = arrow::TypedBufferBuilder; @@ -55,10 +55,6 @@ using arrow::Status; using arrow::Table; using arrow::TimeUnit; -using arrow::compute::Cast; -using arrow::compute::CastOptions; -using arrow::compute::FunctionContext; - using parquet::ParquetFileWriter; using parquet::ParquetVersion; using parquet::schema::GroupNode; @@ -89,6 +85,21 @@ class LevelBuilder { return Status::OK(); } + Status Visit(const DictionaryArray& array) { + // Only currently handle DictionaryArray where the dictionary is a + // primitive type + if (array.dict_type()->value_type()->num_children() > 0) { + return Status::NotImplemented( + "Writing DictionaryArray with nested dictionary " + "type not yet supported"); + } + array_offsets_.push_back(static_cast(array.offset())); + valid_bitmaps_.push_back(array.null_bitmap_data()); + null_counts_.push_back(array.null_count()); + values_array_ = std::make_shared(array.data()); + return Status::OK(); + } + Status Visit(const ListArray& array) { array_offsets_.push_back(static_cast(array.offset())); valid_bitmaps_.push_back(array.null_bitmap_data()); @@ -113,7 +124,6 @@ class LevelBuilder { NOT_IMPLEMENTED_VISIT(FixedSizeList) NOT_IMPLEMENTED_VISIT(Struct) NOT_IMPLEMENTED_VISIT(Union) - NOT_IMPLEMENTED_VISIT(Dictionary) NOT_IMPLEMENTED_VISIT(Extension) #undef NOT_IMPLEMENTED_VISIT diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 532889f7abd..ab7a06caa4e 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -26,6 +26,7 @@ #include "arrow/array.h" #include "arrow/buffer-builder.h" +#include "arrow/compute/api.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit-stream-utils.h" @@ -46,11 +47,12 @@ namespace parquet { -using ::arrow::Status; -using ::arrow::internal::checked_cast; +using arrow::Status; +using arrow::compute::Datum; +using arrow::internal::checked_cast; -using BitWriter = ::arrow::BitUtil::BitWriter; -using RleEncoder = ::arrow::util::RleEncoder; +using BitWriter = arrow::BitUtil::BitWriter; +using RleEncoder = arrow::util::RleEncoder; LevelEncoder::LevelEncoder() {} LevelEncoder::~LevelEncoder() {} @@ -135,7 +137,7 @@ class SerializedPageWriter : public PageWriter { public: SerializedPageWriter(const std::shared_ptr& sink, Compression::type codec, ColumnChunkMetaDataBuilder* metadata, - MemoryPool* pool = ::arrow::default_memory_pool()) + MemoryPool* pool = arrow::default_memory_pool()) : sink_(sink), metadata_(metadata), pool_(pool), @@ -282,7 +284,7 @@ class SerializedPageWriter : public PageWriter { std::unique_ptr thrift_serializer_; // Compression codec to use. - std::unique_ptr<::arrow::util::Codec> compressor_; + std::unique_ptr compressor_; }; // This implementation of the PageWriter writes to the final sink on Close . 
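The substantive change in this patch is the DictionaryArray handling in LevelBuilder above, together with the direct dictionary write path fleshed out in column_writer.cc below: dictionary columns now reach the column writer without first being cast to their dense form. A minimal sketch of the kind of column that exercises this path follows; validity is carried by the indices, the value type must be one the direct path supports (binary or utf8), and the factory signatures shown reflect the Arrow C++ API of this era, so treat them as illustrative rather than authoritative.

#include <memory>

#include "arrow/array.h"
#include "arrow/status.h"
#include "arrow/testing/gtest_util.h"  // ArrayFromJSON, as used by the tests in this series
#include "arrow/type.h"

// Builds a small dictionary-encoded string column. Nulls live in the index
// array, which is where LevelBuilder reads null_count()/null_bitmap_data().
std::shared_ptr<arrow::Array> MakeDictionaryColumnForWriter() {
  auto dictionary = arrow::ArrayFromJSON(arrow::utf8(), R"(["foo", "bar", "baz"])");
  auto indices = arrow::ArrayFromJSON(arrow::int32(), "[0, null, 2, 2, 1]");
  std::shared_ptr<arrow::Array> out;
  arrow::Status st = arrow::DictionaryArray::FromArrays(
      arrow::dictionary(arrow::int32(), arrow::utf8()), indices, dictionary, &out);
  return st.ok() ? out : nullptr;
}

When such a column is written, the typed column writer is expected to hand the dictionary to DictEncoder::PutDictionary once and stream each chunk's indices through PutIndices, falling back to materializing dense values only if dictionary encoding is disabled, the value type is unsupported, or a later chunk arrives with a different dictionary.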
@@ -290,7 +292,7 @@ class BufferedPageWriter : public PageWriter { public: BufferedPageWriter(const std::shared_ptr& sink, Compression::type codec, ColumnChunkMetaDataBuilder* metadata, - MemoryPool* pool = ::arrow::default_memory_pool()) + MemoryPool* pool = arrow::default_memory_pool()) : final_sink_(sink), metadata_(metadata) { in_memory_sink_ = CreateOutputStream(pool); pager_ = std::unique_ptr( @@ -332,7 +334,7 @@ class BufferedPageWriter : public PageWriter { private: std::shared_ptr final_sink_; ColumnChunkMetaDataBuilder* metadata_; - std::shared_ptr<::arrow::io::BufferOutputStream> in_memory_sink_; + std::shared_ptr in_memory_sink_; std::unique_ptr pager_; }; @@ -479,8 +481,8 @@ class ColumnWriterImpl { // Flag to infer if dictionary encoding has fallen back to PLAIN bool fallback_; - ::arrow::BufferBuilder definition_levels_sink_; - ::arrow::BufferBuilder repetition_levels_sink_; + arrow::BufferBuilder definition_levels_sink_; + arrow::BufferBuilder repetition_levels_sink_; std::shared_ptr definition_levels_rle_; std::shared_ptr repetition_levels_rle_; @@ -642,6 +644,33 @@ inline void DoInBatches(int64_t total, int64_t batch_size, Action&& action) { } } +bool DictionaryDirectWriteSupported(const arrow::Array& array) { + const arrow::DictionaryType& dict_type = + static_cast(*array.type()); + auto id = dict_type.value_type()->id(); + return id == arrow::Type::BINARY || id == arrow::Type::STRING; +} + +Status MaterializeDictionary(const arrow::Array& array, MemoryPool* pool, + std::shared_ptr* out) { + const arrow::DictionaryType& dict_type = + static_cast(*array.type()); + + // TODO(ARROW-1648): Remove this special handling once we require an Arrow + // version that has this fixed. + if (dict_type.value_type()->id() == arrow::Type::NA) { + *out = std::make_shared(array.length()); + return Status::OK(); + } + + arrow::compute::FunctionContext ctx(pool); + Datum cast_output; + RETURN_NOT_OK(arrow::compute::Cast(&ctx, Datum(array.data()), dict_type.value_type(), + arrow::compute::CastOptions(), &cast_output)); + *out = cast_output.make_array(); + return Status::OK(); +} + template class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter { public: @@ -707,8 +736,14 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< } Status WriteArrow(const int16_t* def_levels, const int16_t* rep_levels, - int64_t num_levels, const ::arrow::Array& array, - ArrowWriteContext* context) override; + int64_t num_levels, const arrow::Array& array, + ArrowWriteContext* ctx) override { + if (array.type()->id() == arrow::Type::DICTIONARY) { + return WriteArrowDictionary(def_levels, rep_levels, num_levels, array, ctx); + } else { + return WriteArrowDense(def_levels, rep_levels, num_levels, array, ctx); + } + } int64_t EstimatedBufferedValueBytes() const override { return current_encoder_->EstimatedDataEncodedSize(); @@ -723,9 +758,13 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< // since the standard logic concerning dictionary size limits and fallback to // plain encoding is circumvented Status WriteArrowDictionary(const int16_t* def_levels, const int16_t* rep_levels, - int64_t num_levels, const ::arrow::Array& array, + int64_t num_levels, const arrow::Array& array, ArrowWriteContext* context); + Status WriteArrowDense(const int16_t* def_levels, const int16_t* rep_levels, + int64_t num_levels, const arrow::Array& array, + ArrowWriteContext* context); + void WriteDictionaryPage() override { // We have to dynamic cast 
here because of TypedEncoder as // some compilers don't want to cast through virtual inheritance @@ -876,6 +915,17 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< } } + void FallbackToPlainEncoding() { + WriteDictionaryPage(); + // Serialize the buffered Dictionary Indicies + FlushBufferedDataPages(); + fallback_ = true; + // Only PLAIN encoding is supported for fallback in V1 + current_encoder_ = MakeEncoder(DType::type_num, Encoding::PLAIN, false, descr_, + properties_->memory_pool()); + encoding_ = Encoding::PLAIN; + } + // Checks if the Dictionary Page size limit is reached // If the limit is reached, the Dictionary and Data Pages are serialized // The encoding is switched to PLAIN @@ -893,14 +943,7 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< // don't want to cast through virtual inheritance auto dict_encoder = dynamic_cast*>(current_encoder_.get()); if (dict_encoder->dict_encoded_size() >= properties_->dictionary_pagesize_limit()) { - WriteDictionaryPage(); - // Serialize the buffered Dictionary Indicies - FlushBufferedDataPages(); - fallback_ = true; - // Only PLAIN encoding is supported for fallback in V1 - current_encoder_ = MakeEncoder(DType::type_num, Encoding::PLAIN, false, descr_, - properties_->memory_pool()); - encoding_ = Encoding::PLAIN; + FallbackToPlainEncoding(); } } @@ -934,10 +977,72 @@ template Status TypedColumnWriterImpl::WriteArrowDictionary(const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels, - const ::arrow::Array& array, + const arrow::Array& array, ArrowWriteContext* ctx) { - return Status::NotImplemented("Dictionary writes not implemented for ", - descr_->ToString()); + // If this is the first time writing a DictionaryArray, then there's + // a few possible paths to take: + // + // - If dictionary encoding is not enabled, convert to densely + // encoded and call WriteArrow + // - Dictionary encoding enabled + // - If this is the first time this is called, then we call + // PutDictionary into the encoder and then PutIndices on each + // chunk. We store the dictionary that was written in + // preserved_dictionary_ so that subsequent calls to this method + // can make sure the dictionary has not changed + // - On subsequent calls, we have to check whether the dictionary + // has changed. 
If it has, then we trigger the varying + // dictionary path and materialize each chunk and then call + // WriteArrow with that + auto WriteDense = [&] { + std::shared_ptr dense_array; + RETURN_NOT_OK(MaterializeDictionary(array, properties_->memory_pool(), &dense_array)); + return WriteArrowDense(def_levels, rep_levels, num_levels, *dense_array, ctx); + }; + + if (current_encoder_->encoding() != Encoding::PLAIN_DICTIONARY || + !DictionaryDirectWriteSupported(array)) { + // No longer dictionary-encoding for whatever reason, maybe we + // never were or we decided to stop + return WriteDense(); + } + + auto dict_encoder = checked_cast*>(current_encoder_.get()); + const auto& data = checked_cast(array); + std::shared_ptr dictionary = data.dictionary(); + std::shared_ptr indices = data.indices(); + + int64_t value_offset = 0; + auto WriteIndicesChunk = [&](int64_t offset, int64_t batch_size) { + int64_t batch_num_values = 0; + int64_t batch_num_spaced_values = 0; + WriteLevelsSpaced(batch_size, def_levels + offset, rep_levels + offset, + &batch_num_values, &batch_num_spaced_values); + dict_encoder->PutIndices(*indices->Slice(value_offset, batch_num_spaced_values)); + CommitWriteAndCheckPageLimit(batch_size, batch_num_values); + value_offset += batch_num_spaced_values; + }; + + // Handle seeing dictionary for the first time + if (!preserved_dictionary_) { + // It's a new dictionary. Call PutDictionary and keep track of it + PARQUET_CATCH_NOT_OK(dict_encoder->PutDictionary(*dictionary)); + + // TODO(wesm): If some dictionary values are unobserved, then the + // statistics will be inaccurate. Do we care enough to fix it? + if (page_statistics_ != nullptr) { + PARQUET_CATCH_NOT_OK(page_statistics_->Update(*dictionary)); + } + preserved_dictionary_ = dictionary; + } else if (!dictionary->Equals(*preserved_dictionary_)) { + // Dictionary has changed + PARQUET_CATCH_NOT_OK(FallbackToPlainEncoding()); + return WriteDense(); + } + + PARQUET_CATCH_NOT_OK( + DoInBatches(num_levels, properties_->write_batch_size(), WriteIndicesChunk)); + return Status::OK(); } // ---------------------------------------------------------------------- @@ -946,7 +1051,7 @@ Status TypedColumnWriterImpl::WriteArrowDictionary(const int16_t* def_lev template struct SerializeFunctor { using ArrowCType = typename ArrowType::c_type; - using ArrayType = typename ::arrow::TypeTraits::ArrayType; + using ArrayType = typename arrow::TypeTraits::ArrayType; using ParquetCType = typename ParquetType::c_type; Status Serialize(const ArrayType& array, ArrowWriteContext*, ParquetCType* out) { const ArrowCType* input = array.raw_values(); @@ -962,15 +1067,15 @@ struct SerializeFunctor { }; template -inline Status SerializeData(const ::arrow::Array& array, ArrowWriteContext* ctx, +inline Status SerializeData(const arrow::Array& array, ArrowWriteContext* ctx, typename ParquetType::c_type* out) { - using ArrayType = typename ::arrow::TypeTraits::ArrayType; + using ArrayType = typename arrow::TypeTraits::ArrayType; SerializeFunctor functor; return functor.Serialize(checked_cast(array), ctx, out); } template -Status WriteArrowSerialize(const ::arrow::Array& array, int64_t num_levels, +Status WriteArrowSerialize(const arrow::Array& array, int64_t num_levels, const int16_t* def_levels, const int16_t* rep_levels, ArrowWriteContext* ctx, TypedColumnWriter* writer) { @@ -995,12 +1100,12 @@ Status WriteArrowSerialize(const ::arrow::Array& array, int64_t num_levels, } template -Status WriteArrowZeroCopy(const ::arrow::Array& array, int64_t num_levels, 
+Status WriteArrowZeroCopy(const arrow::Array& array, int64_t num_levels, const int16_t* def_levels, const int16_t* rep_levels, ArrowWriteContext* ctx, TypedColumnWriter* writer) { using T = typename ParquetType::c_type; - const auto& data = static_cast(array); + const auto& data = static_cast(array); const T* values = nullptr; // The values buffer may be null if the array is empty (ARROW-2744) if (data.values() != nullptr) { @@ -1018,13 +1123,13 @@ Status WriteArrowZeroCopy(const ::arrow::Array& array, int64_t num_levels, return Status::OK(); } -#define WRITE_SERIALIZE_CASE(ArrowEnum, ArrowType, ParquetType) \ - case ::arrow::Type::ArrowEnum: \ - return WriteArrowSerialize( \ +#define WRITE_SERIALIZE_CASE(ArrowEnum, ArrowType, ParquetType) \ + case arrow::Type::ArrowEnum: \ + return WriteArrowSerialize( \ array, num_levels, def_levels, rep_levels, ctx, this); #define WRITE_ZERO_COPY_CASE(ArrowEnum, ArrowType, ParquetType) \ - case ::arrow::Type::ArrowEnum: \ + case arrow::Type::ArrowEnum: \ return WriteArrowZeroCopy(array, num_levels, def_levels, rep_levels, \ ctx, this); @@ -1038,8 +1143,8 @@ Status WriteArrowZeroCopy(const ::arrow::Array& array, int64_t num_levels, // Write Arrow to BooleanType template <> -struct SerializeFunctor { - Status Serialize(const ::arrow::BooleanArray& data, ArrowWriteContext*, bool* out) { +struct SerializeFunctor { + Status Serialize(const arrow::BooleanArray& data, ArrowWriteContext*, bool* out) { for (int i = 0; i < data.length(); i++) { *out++ = data.Value(i); } @@ -1048,15 +1153,15 @@ struct SerializeFunctor { }; template <> -Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, - const int16_t* rep_levels, - int64_t num_levels, - const ::arrow::Array& array, - ArrowWriteContext* ctx) { - if (array.type_id() != ::arrow::Type::BOOL) { +Status TypedColumnWriterImpl::WriteArrowDense(const int16_t* def_levels, + const int16_t* rep_levels, + int64_t num_levels, + const arrow::Array& array, + ArrowWriteContext* ctx) { + if (array.type_id() != arrow::Type::BOOL) { ARROW_UNSUPPORTED(); } - return WriteArrowSerialize( + return WriteArrowSerialize( array, num_levels, def_levels, rep_levels, ctx, this); } @@ -1064,8 +1169,8 @@ Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, // Write Arrow types to INT32 template <> -struct SerializeFunctor { - Status Serialize(const ::arrow::Date64Array& array, ArrowWriteContext*, int32_t* out) { +struct SerializeFunctor { + Status Serialize(const arrow::Date64Array& array, ArrowWriteContext*, int32_t* out) { const int64_t* input = array.raw_values(); for (int i = 0; i < array.length(); i++) { *out++ = static_cast(*input++ / 86400000); @@ -1075,11 +1180,11 @@ struct SerializeFunctor { }; template <> -struct SerializeFunctor { - Status Serialize(const ::arrow::Time32Array& array, ArrowWriteContext*, int32_t* out) { +struct SerializeFunctor { + Status Serialize(const arrow::Time32Array& array, ArrowWriteContext*, int32_t* out) { const int32_t* input = array.raw_values(); - const auto& type = static_cast(*array.type()); - if (type.unit() == ::arrow::TimeUnit::SECOND) { + const auto& type = static_cast(*array.type()); + if (type.unit() == arrow::TimeUnit::SECOND) { for (int i = 0; i < array.length(); i++) { out[i] = input[i] * 1000; } @@ -1091,13 +1196,13 @@ struct SerializeFunctor { }; template <> -Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, - const int16_t* rep_levels, - int64_t num_levels, - const ::arrow::Array& array, - ArrowWriteContext* ctx) { +Status 
TypedColumnWriterImpl::WriteArrowDense(const int16_t* def_levels, + const int16_t* rep_levels, + int64_t num_levels, + const arrow::Array& array, + ArrowWriteContext* ctx) { switch (array.type()->id()) { - case ::arrow::Type::NA: { + case arrow::Type::NA: { PARQUET_CATCH_NOT_OK(WriteBatch(num_levels, def_levels, rep_levels, nullptr)); } break; WRITE_SERIALIZE_CASE(INT8, Int8Type, Int32Type) @@ -1122,21 +1227,21 @@ Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, for (int64_t i = 0; i < array.length(); i++) ConversionFunction(input[i], &out[i]); template <> -struct SerializeFunctor { - Status Serialize(const ::arrow::TimestampArray& array, ArrowWriteContext*, Int96* out) { +struct SerializeFunctor { + Status Serialize(const arrow::TimestampArray& array, ArrowWriteContext*, Int96* out) { const int64_t* input = array.raw_values(); - const auto& type = static_cast(*array.type()); + const auto& type = static_cast(*array.type()); switch (type.unit()) { - case ::arrow::TimeUnit::NANO: + case arrow::TimeUnit::NANO: INT96_CONVERT_LOOP(internal::NanosecondsToImpalaTimestamp); break; - case ::arrow::TimeUnit::MICRO: + case arrow::TimeUnit::MICRO: INT96_CONVERT_LOOP(internal::MicrosecondsToImpalaTimestamp); break; - case ::arrow::TimeUnit::MILLI: + case arrow::TimeUnit::MILLI: INT96_CONVERT_LOOP(internal::MillisecondsToImpalaTimestamp); break; - case ::arrow::TimeUnit::SECOND: + case arrow::TimeUnit::SECOND: INT96_CONVERT_LOOP(internal::SecondsToImpalaTimestamp); break; } @@ -1171,15 +1276,15 @@ static std::pair kTimestampCoercionFactors[4][4] = { {COERCE_MULTIPLY, 1}}}; template <> -struct SerializeFunctor { - Status Serialize(const ::arrow::TimestampArray& array, ArrowWriteContext* ctx, +struct SerializeFunctor { + Status Serialize(const arrow::TimestampArray& array, ArrowWriteContext* ctx, int64_t* out) { - const auto& source_type = static_cast(*array.type()); + const auto& source_type = static_cast(*array.type()); auto source_unit = source_type.unit(); const int64_t* values = array.raw_values(); - ::arrow::TimeUnit::type target_unit = ctx->properties->coerce_timestamps_unit(); - auto target_type = ::arrow::timestamp(target_unit); + arrow::TimeUnit::type target_unit = ctx->properties->coerce_timestamps_unit(); + auto target_type = arrow::timestamp(target_unit); bool truncation_allowed = ctx->properties->truncated_timestamps_allowed(); auto DivideBy = [&](const int64_t factor) { @@ -1215,15 +1320,15 @@ struct SerializeFunctor { #undef COERCE_INVALID #undef COERCE_MULTIPLY -Status WriteTimestamps(const ::arrow::Array& values, int64_t num_levels, +Status WriteTimestamps(const arrow::Array& values, int64_t num_levels, const int16_t* def_levels, const int16_t* rep_levels, ArrowWriteContext* ctx, TypedColumnWriter* writer) { - const auto& source_type = static_cast(*values.type()); + const auto& source_type = static_cast(*values.type()); auto WriteCoerce = [&](const ArrowWriterProperties* properties) { ArrowWriteContext temp_ctx = *ctx; temp_ctx.properties = properties; - return WriteArrowSerialize( + return WriteArrowSerialize( values, num_levels, def_levels, rep_levels, &temp_ctx, writer); }; @@ -1237,21 +1342,21 @@ Status WriteTimestamps(const ::arrow::Array& values, int64_t num_levels, return WriteCoerce(ctx->properties); } } else if (writer->properties()->version() == ParquetVersion::PARQUET_1_0 && - source_type.unit() == ::arrow::TimeUnit::NANO) { + source_type.unit() == arrow::TimeUnit::NANO) { // Absent superseding user instructions, when writing Parquet version 1.0 files, // 
timestamps in nanoseconds are coerced to microseconds std::shared_ptr properties = (ArrowWriterProperties::Builder()) - .coerce_timestamps(::arrow::TimeUnit::MICRO) + .coerce_timestamps(arrow::TimeUnit::MICRO) ->disallow_truncated_timestamps() ->build(); return WriteCoerce(properties.get()); - } else if (source_type.unit() == ::arrow::TimeUnit::SECOND) { + } else if (source_type.unit() == arrow::TimeUnit::SECOND) { // Absent superseding user instructions, timestamps in seconds are coerced to // milliseconds std::shared_ptr properties = (ArrowWriterProperties::Builder()) - .coerce_timestamps(::arrow::TimeUnit::MILLI) + .coerce_timestamps(arrow::TimeUnit::MILLI) ->build(); return WriteCoerce(properties.get()); } else { @@ -1262,13 +1367,13 @@ Status WriteTimestamps(const ::arrow::Array& values, int64_t num_levels, } template <> -Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, - const int16_t* rep_levels, - int64_t num_levels, - const ::arrow::Array& array, - ArrowWriteContext* ctx) { +Status TypedColumnWriterImpl::WriteArrowDense(const int16_t* def_levels, + const int16_t* rep_levels, + int64_t num_levels, + const arrow::Array& array, + ArrowWriteContext* ctx) { switch (array.type()->id()) { - case ::arrow::Type::TIMESTAMP: + case arrow::Type::TIMESTAMP: return WriteTimestamps(array, num_levels, def_levels, rep_levels, ctx, this); WRITE_ZERO_COPY_CASE(INT64, Int64Type, Int64Type) WRITE_SERIALIZE_CASE(UINT32, UInt32Type, Int64Type) @@ -1280,15 +1385,15 @@ Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, } template <> -Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, - const int16_t* rep_levels, - int64_t num_levels, - const ::arrow::Array& array, - ArrowWriteContext* ctx) { - if (array.type_id() != ::arrow::Type::TIMESTAMP) { +Status TypedColumnWriterImpl::WriteArrowDense(const int16_t* def_levels, + const int16_t* rep_levels, + int64_t num_levels, + const arrow::Array& array, + ArrowWriteContext* ctx) { + if (array.type_id() != arrow::Type::TIMESTAMP) { ARROW_UNSUPPORTED(); } - return WriteArrowSerialize( + return WriteArrowSerialize( array, num_levels, def_levels, rep_levels, ctx, this); } @@ -1296,12 +1401,12 @@ Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, // Floating point types template <> -Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, - const int16_t* rep_levels, - int64_t num_levels, - const ::arrow::Array& array, - ArrowWriteContext* ctx) { - if (array.type_id() != ::arrow::Type::FLOAT) { +Status TypedColumnWriterImpl::WriteArrowDense(const int16_t* def_levels, + const int16_t* rep_levels, + int64_t num_levels, + const arrow::Array& array, + ArrowWriteContext* ctx) { + if (array.type_id() != arrow::Type::FLOAT) { ARROW_UNSUPPORTED(); } return WriteArrowZeroCopy(array, num_levels, def_levels, rep_levels, ctx, @@ -1309,12 +1414,12 @@ Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, } template <> -Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, - const int16_t* rep_levels, - int64_t num_levels, - const ::arrow::Array& array, - ArrowWriteContext* ctx) { - if (array.type_id() != ::arrow::Type::DOUBLE) { +Status TypedColumnWriterImpl::WriteArrowDense(const int16_t* def_levels, + const int16_t* rep_levels, + int64_t num_levels, + const arrow::Array& array, + ArrowWriteContext* ctx) { + if (array.type_id() != arrow::Type::DOUBLE) { ARROW_UNSUPPORTED(); } return WriteArrowZeroCopy(array, num_levels, def_levels, rep_levels, ctx, @@ -1324,39 +1429,14 @@ 
Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, // ---------------------------------------------------------------------- // Write Arrow to BYTE_ARRAY -Status MaterializeDictionary(const Array& array, std::shared_ptr* out) { - const ::arrow::DictionaryType& dict_type = - static_cast(*data->type()); - - // TODO(ARROW-1648): Remove this special handling once we require an Arrow - // version that has this fixed. - if (dict_type.value_type()->id() == ::arrow::Type::NA) { - *out = std::make_shared<::arrow::NullArray>(data->length()); - return Status::OK(); - } - - FunctionContext ctx(this->memory_pool()); - ::arrow::compute::Datum cast_output; - RETURN_NOT_OK(Cast(&ctx, data, dict_type.value_type(), CastOptions(), &cast_output)); - *out = cast_output.make_array(); - return Status::OK(); -} - template <> -Status TypedColumnWriterImpl::WriteArrowDictionary( - const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels, - const ::arrow::Array& array, ArrowWriteContext* ctx) {} - -template <> -Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, - const int16_t* rep_levels, - int64_t num_levels, - const ::arrow::Array& array, - ArrowWriteContext* ctx) { - if (array.type()->id() != arrow::Type::DICTIONARY) { - return WriteArrowDictionary(def_levels, rep_levels, num_levels, array, ctx); - } else if (array.type()->id() != arrow::Type::BINARY && - array.type()->id() != arrow::Type::STRING) { +Status TypedColumnWriterImpl::WriteArrowDense(const int16_t* def_levels, + const int16_t* rep_levels, + int64_t num_levels, + const arrow::Array& array, + ArrowWriteContext* ctx) { + if (array.type()->id() != arrow::Type::BINARY && + array.type()->id() != arrow::Type::STRING) { ARROW_UNSUPPORTED(); } @@ -1366,7 +1446,7 @@ Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_level int64_t batch_num_spaced_values = 0; WriteLevelsSpaced(batch_size, def_levels + offset, rep_levels + offset, &batch_num_values, &batch_num_spaced_values); - std::shared_ptr<::arrow::Array> data_slice = + std::shared_ptr data_slice = array.Slice(value_offset, batch_num_spaced_values); current_encoder_->Put(*data_slice); if (page_statistics_ != nullptr) { @@ -1387,8 +1467,8 @@ Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_level template struct SerializeFunctor> { - Status Serialize(const ::arrow::FixedSizeBinaryArray& array, ArrowWriteContext*, + arrow::enable_if_fixed_size_binary> { + Status Serialize(const arrow::FixedSizeBinaryArray& array, ArrowWriteContext*, FLBA* out) { if (array.null_count() == 0) { // no nulls, just dump the data @@ -1408,17 +1488,17 @@ struct SerializeFunctor -Status WriteArrowSerialize( - const ::arrow::Array& array, int64_t num_levels, const int16_t* def_levels, +Status WriteArrowSerialize( + const arrow::Array& array, int64_t num_levels, const int16_t* def_levels, const int16_t* rep_levels, ArrowWriteContext* ctx, TypedColumnWriter* writer) { - const auto& data = static_cast(array); + const auto& data = static_cast(array); const int64_t length = data.length(); FLBA* buffer; RETURN_NOT_OK(ctx->GetScratchData(num_levels, &buffer)); - const auto& decimal_type = static_cast(*data.type()); + const auto& decimal_type = static_cast(*data.type()); const int32_t offset = decimal_type.byte_width() - internal::DecimalSize(decimal_type.precision()); @@ -1436,8 +1516,8 @@ Status WriteArrowSerialize( // todo(advancedxy): use a writeBatch to avoid this step for (int64_t i = 0, j = 0; i < length; ++i, j += 2) { auto unsigned_64_bit = 
reinterpret_cast(data.GetValue(i)); - big_endian_values[j] = ::arrow::BitUtil::ToBigEndian(unsigned_64_bit[1]); - big_endian_values[j + 1] = ::arrow::BitUtil::ToBigEndian(unsigned_64_bit[0]); + big_endian_values[j] = arrow::BitUtil::ToBigEndian(unsigned_64_bit[1]); + big_endian_values[j + 1] = arrow::BitUtil::ToBigEndian(unsigned_64_bit[0]); buffer[i] = FixedLenByteArray( reinterpret_cast(&big_endian_values[j]) + offset); } @@ -1445,8 +1525,8 @@ Status WriteArrowSerialize( for (int64_t i = 0, buffer_idx = 0, j = 0; i < length; ++i) { if (data.IsValid(i)) { auto unsigned_64_bit = reinterpret_cast(data.GetValue(i)); - big_endian_values[j] = ::arrow::BitUtil::ToBigEndian(unsigned_64_bit[1]); - big_endian_values[j + 1] = ::arrow::BitUtil::ToBigEndian(unsigned_64_bit[0]); + big_endian_values[j] = arrow::BitUtil::ToBigEndian(unsigned_64_bit[1]); + big_endian_values[j + 1] = arrow::BitUtil::ToBigEndian(unsigned_64_bit[0]); buffer[buffer_idx++] = FixedLenByteArray( reinterpret_cast(&big_endian_values[j]) + offset); j += 2; @@ -1458,11 +1538,11 @@ Status WriteArrowSerialize( } template <> -Status TypedColumnWriterImpl::WriteArrow(const int16_t* def_levels, - const int16_t* rep_levels, - int64_t num_levels, - const ::arrow::Array& array, - ArrowWriteContext* ctx) { +Status TypedColumnWriterImpl::WriteArrowDense(const int16_t* def_levels, + const int16_t* rep_levels, + int64_t num_levels, + const arrow::Array& array, + ArrowWriteContext* ctx) { switch (array.type()->id()) { WRITE_SERIALIZE_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryType, FLBAType) WRITE_SERIALIZE_CASE(DECIMAL, Decimal128Type, FLBAType) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index b6a381f68fb..fc7acf5114d 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -334,11 +334,13 @@ struct DictEncoderTraits { // Initially 1024 elements static constexpr int32_t kInitialHashTableSize = 1 << 10; -/// See the dictionary encoding section of https://github.com/Parquet/parquet-format. -/// The encoding supports streaming encoding. Values are encoded as they are added while -/// the dictionary is being constructed. At any time, the buffered values can be -/// written out with the current dictionary size. More values can then be added to -/// the encoder, including new dictionary entries. +/// See the dictionary encoding section of +/// https://github.com/Parquet/parquet-format. The encoding supports +/// streaming encoding. Values are encoded as they are added while the +/// dictionary is being constructed. At any time, the buffered values +/// can be written out with the current dictionary size. More values +/// can then be added to the encoder, including new dictionary +/// entries. 
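/// A rough illustration of that streaming pattern (hypothetical driver code:
/// Put, FlushValues and dict_encoded_size are real calls exercised elsewhere in
/// this patch series, while the variables and the ordering shown here are only
/// assumed for the sketch):
///
///   DictEncoder<ByteArrayType>* encoder = ...;   // e.g. obtained via MakeTypedEncoder
///   encoder->Put(*first_chunk);                  // arrow::Array; entries memoized as they arrive
///   auto run1 = encoder->FlushValues();          // buffered indices for the dictionary so far
///   encoder->Put(*second_chunk);                 // may introduce brand-new dictionary entries
///   auto run2 = encoder->FlushValues();
///   int64_t dict_bytes = encoder->dict_encoded_size();  // grows as entries are added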
template class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder { using MemoTableType = typename DictEncoderTraits::MemoTableType; @@ -418,24 +420,43 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder { void Put(const arrow::Array& values) override; void PutDictionary(const arrow::Array& values) override; - void PutIndices(const arrow::Array& data) override { - if (data.type()->id() != arrow::Type::INT32) { - throw ParquetException("Only int32 indices currently supported"); - } - const auto& indices = checked_cast(data); + template + void PutIndicesTyped(const arrow::Array& data) { + using T = typename ArrowType::c_type; + using ArrayType = typename arrow::TypeTraits::ArrayType; + const auto& indices = checked_cast(data); auto values = indices.raw_values(); + buffered_indices_.reserve( + buffered_indices_.size() + + static_cast(indices.length() - indices.null_count())); if (indices.null_count() > 0) { arrow::internal::BitmapReader valid_bits_reader(indices.null_bitmap_data(), indices.offset(), indices.length()); for (int64_t i = 0; i < indices.length(); ++i) { if (valid_bits_reader.IsSet()) { - buffered_indices_.push_back(values[i]); + buffered_indices_.push_back(static_cast(values[i])); } valid_bits_reader.Next(); } } else { - buffered_indices_.insert(std::end(buffered_indices_), values, - values + indices.length()); + for (int64_t i = 0; i < indices.length(); ++i) { + buffered_indices_.push_back(static_cast(values[i])); + } + } + } + + void PutIndices(const arrow::Array& data) override { + switch (data.type()->id()) { + case arrow::Type::INT8: + return PutIndicesTyped(data); + case arrow::Type::INT16: + return PutIndicesTyped(data); + case arrow::Type::INT32: + return PutIndicesTyped(data); + case arrow::Type::INT64: + return PutIndicesTyped(data); + default: + throw ParquetException("Dictionary indices were not signed integer"); } } From de9d0a5aea7d3416c33431058b4692deebbb464f Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 12 Aug 2019 20:01:06 -0500 Subject: [PATCH 13/28] Fix null dictionary test, unit tests passing again [skip ci] --- cpp/src/parquet/arrow/arrow-reader-writer-test.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc index 7763b436fa9..354a0fd52f2 100644 --- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc @@ -1168,8 +1168,13 @@ TEST_F(TestNullParquetIO, NullListColumn) { } TEST_F(TestNullParquetIO, NullDictionaryColumn) { + std::shared_ptr null_bitmap; + ASSERT_OK( + ::arrow::AllocateBitmap(::arrow::default_memory_pool(), SMALL_SIZE, &null_bitmap)); + std::memset(null_bitmap->mutable_data(), 0, null_bitmap->size()); + std::shared_ptr indices = - std::make_shared<::arrow::Int8Array>(SMALL_SIZE, nullptr, nullptr, SMALL_SIZE); + std::make_shared<::arrow::Int8Array>(SMALL_SIZE, nullptr, null_bitmap, SMALL_SIZE); std::shared_ptr<::arrow::DictionaryType> dict_type = std::make_shared<::arrow::DictionaryType>(::arrow::int8(), ::arrow::null()); From 28268d62431f6431727d20dd5e68f33e6a037f3f Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 12 Aug 2019 21:19:05 -0500 Subject: [PATCH 14/28] Add unit test for writing changing dictionaries --- .../parquet/arrow/arrow-reader-writer-test.cc | 49 ++++++++++++++++--- cpp/src/parquet/column_writer.cc | 2 +- 2 files changed, 43 insertions(+), 8 deletions(-) diff --git 
a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc index 354a0fd52f2..f44c0a4d25a 100644 --- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc @@ -343,9 +343,11 @@ void WriteTableToBuffer(const std::shared_ptr
& table, int64_t row_group_s const std::shared_ptr& arrow_properties, std::shared_ptr* out) { auto sink = CreateOutputStream(); + + auto write_props = WriterProperties::Builder().write_batch_size(100)->build(); + ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), sink, - row_group_size, default_writer_properties(), - arrow_properties)); + row_group_size, write_props, arrow_properties)); ASSERT_OK_NO_THROW(sink->Finish(out)); } @@ -2807,7 +2809,7 @@ class TestArrowReadDictionary : public ::testing::TestWithParam { ::arrow::AssertTablesEqual(expected, *actual, /*same_chunk_layout=*/false); } - static std::vector null_probabilites() { return {0.0, 0.5, 1}; } + static std::vector null_probabilities() { return {0.0, 0.5, 1}; } protected: std::shared_ptr dense_values_; @@ -2817,7 +2819,7 @@ class TestArrowReadDictionary : public ::testing::TestWithParam { ArrowReaderProperties properties_; }; -void AsDictionaryEncoded(const Array& arr, std::shared_ptr* out) { +void AsDictionary32Encoded(const Array& arr, std::shared_ptr* out) { ::arrow::StringDictionary32Builder builder(default_memory_pool()); const auto& string_array = static_cast(arr); ASSERT_OK(builder.AppendArray(string_array)); @@ -2830,7 +2832,7 @@ TEST_P(TestArrowReadDictionary, ReadWholeFileDict) { std::vector> chunks(kNumRowGroups); const int64_t chunk_size = expected_dense_->num_rows() / kNumRowGroups; for (int i = 0; i < kNumRowGroups; ++i) { - AsDictionaryEncoded(*dense_values_->Slice(chunk_size * i, chunk_size), &chunks[i]); + AsDictionary32Encoded(*dense_values_->Slice(chunk_size * i, chunk_size), &chunks[i]); } auto ex_table = MakeSimpleTable(std::make_shared(chunks), /*nullable=*/true); @@ -2844,8 +2846,41 @@ TEST_P(TestArrowReadDictionary, ReadWholeFileDense) { INSTANTIATE_TEST_CASE_P( ReadDictionary, TestArrowReadDictionary, - ::testing::ValuesIn(TestArrowReadDictionary::null_probabilites())); + ::testing::ValuesIn(TestArrowReadDictionary::null_probabilities())); -} // namespace arrow +TEST(TestArrowWriteDictionaries, ChangingDictionaries) { + constexpr int num_unique = 50; + constexpr int repeat = 10000; + constexpr int64_t min_length = 2; + constexpr int64_t max_length = 20; + ::arrow::random::RandomArrayGenerator rag(0); + auto values = rag.BinaryWithRepeats(repeat * num_unique, num_unique, min_length, + max_length, /*null_probability=*/0.1); + auto expected = MakeSimpleTable(values, /*nullable=*/true); + + const int num_chunks = 10; + std::vector> chunks(num_chunks); + const int64_t chunk_size = values->length() / num_chunks; + for (int i = 0; i < num_chunks; ++i) { + AsDictionary32Encoded(*values->Slice(chunk_size * i, chunk_size), &chunks[i]); + } + + std::shared_ptr buffer; + ASSERT_NO_FATAL_FAILURE(WriteTableToBuffer(expected, + /*row_group_size=*/values->length() / 2, + default_arrow_writer_properties(), &buffer)); + auto arrow_properties = default_arrow_reader_properties(); + + std::unique_ptr reader; + FileReaderBuilder builder; + ASSERT_OK_NO_THROW(builder.Open(std::make_shared(buffer))); + ASSERT_OK(builder.properties(arrow_properties)->Build(&reader)); + + std::shared_ptr
actual; + ASSERT_OK_NO_THROW(reader->ReadTable(&actual)); + ::arrow::AssertTablesEqual(*expected, *actual, /*same_chunk_layout=*/false); +} + +} // namespace arrow } // namespace parquet diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index ab7a06caa4e..51c952ed703 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1007,7 +1007,7 @@ Status TypedColumnWriterImpl::WriteArrowDictionary(const int16_t* def_lev return WriteDense(); } - auto dict_encoder = checked_cast*>(current_encoder_.get()); + auto dict_encoder = dynamic_cast*>(current_encoder_.get()); const auto& data = checked_cast(array); std::shared_ptr dictionary = data.dictionary(); std::shared_ptr indices = data.indices(); From 580a0ca9c975c350608e730700f126de2fa59e56 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 13 Aug 2019 11:56:37 -0500 Subject: [PATCH 15/28] Add failing unit test for Arrow store schema option [skip ci] --- .../parquet/arrow/arrow-reader-writer-test.cc | 66 +++++++++++-------- cpp/src/parquet/properties.h | 23 +++++-- 2 files changed, 59 insertions(+), 30 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc index f44c0a4d25a..c32ca16118c 100644 --- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc @@ -370,37 +370,39 @@ void AssertChunkedEqual(const ChunkedArray& expected, const ChunkedArray& actual } } -void DoConfiguredRoundtrip( - const std::shared_ptr
& table, int64_t row_group_size, - std::shared_ptr
* out, - const std::shared_ptr<::parquet::WriterProperties>& parquet_properties = - ::parquet::default_writer_properties(), - const std::shared_ptr& arrow_properties = - default_arrow_writer_properties()) { +void DoRoundtrip(const std::shared_ptr
& table, int64_t row_group_size, + std::shared_ptr
* out, + const std::shared_ptr<::parquet::WriterProperties>& writer_properties = + ::parquet::default_writer_properties(), + const std::shared_ptr& arrow_writer_properties = + default_arrow_writer_properties(), + const ArrowReaderProperties& arrow_reader_properties = + default_arrow_reader_properties()) { std::shared_ptr buffer; auto sink = CreateOutputStream(); ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), sink, - row_group_size, parquet_properties, arrow_properties)); + row_group_size, writer_properties, + arrow_writer_properties)); ASSERT_OK_NO_THROW(sink->Finish(&buffer)); std::unique_ptr reader; - ASSERT_OK_NO_THROW(OpenFile(std::make_shared(buffer), - ::arrow::default_memory_pool(), &reader)); + FileReaderBuilder builder; + ASSERT_OK_NO_THROW(builder.Open(std::make_shared(buffer))); + ASSERT_OK(builder.properties(arrow_reader_properties)->Build(&reader)); ASSERT_OK_NO_THROW(reader->ReadTable(out)); } void CheckConfiguredRoundtrip( const std::shared_ptr
& input_table, const std::shared_ptr
& expected_table = nullptr, - const std::shared_ptr<::parquet::WriterProperties>& parquet_properties = + const std::shared_ptr<::parquet::WriterProperties>& writer_properties = ::parquet::default_writer_properties(), - const std::shared_ptr& arrow_properties = + const std::shared_ptr& arrow_writer_properties = default_arrow_writer_properties()) { std::shared_ptr
actual_table; - ASSERT_NO_FATAL_FAILURE(DoConfiguredRoundtrip(input_table, input_table->num_rows(), - &actual_table, parquet_properties, - arrow_properties)); + ASSERT_NO_FATAL_FAILURE(DoRoundtrip(input_table, input_table->num_rows(), &actual_table, + writer_properties, arrow_writer_properties)); if (expected_table) { ASSERT_NO_FATAL_FAILURE( ::arrow::AssertSchemaEqual(*actual_table->schema(), *expected_table->schema())); @@ -2854,7 +2856,7 @@ TEST(TestArrowWriteDictionaries, ChangingDictionaries) { constexpr int64_t min_length = 2; constexpr int64_t max_length = 20; ::arrow::random::RandomArrayGenerator rag(0); - auto values = rag.BinaryWithRepeats(repeat * num_unique, num_unique, min_length, + auto values = rag.StringWithRepeats(repeat * num_unique, num_unique, min_length, max_length, /*null_probability=*/0.1); auto expected = MakeSimpleTable(values, /*nullable=*/true); @@ -2865,20 +2867,32 @@ TEST(TestArrowWriteDictionaries, ChangingDictionaries) { AsDictionary32Encoded(*values->Slice(chunk_size * i, chunk_size), &chunks[i]); } - std::shared_ptr buffer; - ASSERT_NO_FATAL_FAILURE(WriteTableToBuffer(expected, - /*row_group_size=*/values->length() / 2, - default_arrow_writer_properties(), &buffer)); + auto dict_table = MakeSimpleTable(std::make_shared(chunks), + /*nullable=*/true); + + std::shared_ptr
actual; + DoRoundtrip(dict_table, /*row_group_size=*/values->length() / 2, &actual); + ::arrow::AssertTablesEqual(*expected, *actual, /*same_chunk_layout=*/false); +} + +TEST(TestArrowWriteDictionaries, AutoReadAsDictionary) { + constexpr int num_unique = 50; + constexpr int repeat = 100; + constexpr int64_t min_length = 2; + constexpr int64_t max_length = 20; + ::arrow::random::RandomArrayGenerator rag(0); + auto values = rag.BinaryWithRepeats(repeat * num_unique, num_unique, min_length, + max_length, /*null_probability=*/0.1); + std::shared_ptr dict_values; + AsDictionary32Encoded(*values, &dict_values); - auto arrow_properties = default_arrow_reader_properties(); + auto expected = MakeSimpleTable(dict_values, /*nullable=*/true); - std::unique_ptr reader; - FileReaderBuilder builder; - ASSERT_OK_NO_THROW(builder.Open(std::make_shared(buffer))); - ASSERT_OK(builder.properties(arrow_properties)->Build(&reader)); + auto arrow_writer_properties = ArrowWriterProperties::Builder().store_schema()->build(); std::shared_ptr
actual; - ASSERT_OK_NO_THROW(reader->ReadTable(&actual)); + DoRoundtrip(expected, values->length(), &actual, default_writer_properties(), + arrow_writer_properties); ::arrow::AssertTablesEqual(*expected, *actual, /*same_chunk_layout=*/false); } diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index b7e55f0cc96..209969a0054 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -482,7 +482,8 @@ class PARQUET_EXPORT ArrowWriterProperties { : write_timestamps_as_int96_(false), coerce_timestamps_enabled_(false), coerce_timestamps_unit_(::arrow::TimeUnit::SECOND), - truncated_timestamps_allowed_(false) {} + truncated_timestamps_allowed_(false), + store_schema_(false) {} virtual ~Builder() {} Builder* disable_deprecated_int96_timestamps() { @@ -511,10 +512,18 @@ class PARQUET_EXPORT ArrowWriterProperties { return this; } + /// \brief EXPERIMENTAL: Write binary serialized Arrow schema to the file, + /// to enable certain read options (like "read_dictionary") to be set + /// automatically + Builder* store_schema() { + store_schema_ = true; + return this; + } + std::shared_ptr build() { return std::shared_ptr(new ArrowWriterProperties( write_timestamps_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_, - truncated_timestamps_allowed_)); + truncated_timestamps_allowed_, store_schema_)); } private: @@ -523,6 +532,8 @@ class PARQUET_EXPORT ArrowWriterProperties { bool coerce_timestamps_enabled_; ::arrow::TimeUnit::type coerce_timestamps_unit_; bool truncated_timestamps_allowed_; + + bool store_schema_; }; bool support_deprecated_int96_timestamps() const { return write_timestamps_as_int96_; } @@ -534,20 +545,24 @@ class PARQUET_EXPORT ArrowWriterProperties { bool truncated_timestamps_allowed() const { return truncated_timestamps_allowed_; } + bool store_schema() const { return store_schema_; } + private: explicit ArrowWriterProperties(bool write_nanos_as_int96, bool coerce_timestamps_enabled, ::arrow::TimeUnit::type coerce_timestamps_unit, - bool truncated_timestamps_allowed) + bool truncated_timestamps_allowed, bool store_schema) : write_timestamps_as_int96_(write_nanos_as_int96), coerce_timestamps_enabled_(coerce_timestamps_enabled), coerce_timestamps_unit_(coerce_timestamps_unit), - truncated_timestamps_allowed_(truncated_timestamps_allowed) {} + truncated_timestamps_allowed_(truncated_timestamps_allowed), + store_schema_(store_schema) {} const bool write_timestamps_as_int96_; const bool coerce_timestamps_enabled_; const ::arrow::TimeUnit::type coerce_timestamps_unit_; const bool truncated_timestamps_allowed_; + const bool store_schema_; }; /// \brief State object used for writing Arrow data directly to a Parquet From 7705fdbc86cf6f3769059a839610fe37d1f7fe9e Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 13 Aug 2019 13:25:55 -0500 Subject: [PATCH 16/28] Automatically read dictionary fields by serializing the Arrow schema with the store_schema option [skip ci] --- cpp/src/arrow/ipc/writer.h | 3 +- cpp/src/parquet/arrow/reader.cc | 7 +-- cpp/src/parquet/arrow/reader_internal.cc | 61 +++++++++++++++++++++++- cpp/src/parquet/arrow/reader_internal.h | 5 ++ cpp/src/parquet/arrow/writer.cc | 35 ++++++++++++-- 5 files changed, 101 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/ipc/writer.h b/cpp/src/arrow/ipc/writer.h index e70827ed380..3b1fece7127 100644 --- a/cpp/src/arrow/ipc/writer.h +++ b/cpp/src/arrow/ipc/writer.h @@ -24,6 +24,7 @@ #include #include +#include "arrow/ipc/dictionary.h" // IWYU pragma: export #include 
"arrow/ipc/message.h" #include "arrow/ipc/options.h" #include "arrow/result.h" @@ -49,8 +50,6 @@ class OutputStream; namespace ipc { -class DictionaryMemo; - /// \class RecordBatchWriter /// \brief Abstract interface for writing a stream of record batches class ARROW_EXPORT RecordBatchWriter { diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 3d1ad76c8f0..4451276d6b3 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -106,8 +106,9 @@ class FileReaderImpl : public FileReader { : pool_(pool), reader_(std::move(reader)), reader_properties_(properties) {} Status Init() { - return BuildSchemaManifest(reader_->metadata()->schema(), reader_properties_, - &manifest_); + return BuildSchemaManifest(reader_->metadata()->schema(), + reader_->metadata()->key_value_metadata(), + reader_properties_, &manifest_); } std::vector AllRowGroups() { @@ -777,7 +778,7 @@ Status FileReaderImpl::ReadRowGroups(const std::vector& row_groups, } } - auto result_schema = ::arrow::schema(fields, reader_->metadata()->key_value_metadata()); + auto result_schema = ::arrow::schema(fields, manifest_.schema_metadata); *out = Table::Make(result_schema, columns); return (*out)->Validate(); END_PARQUET_CATCH_EXCEPTIONS diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index bfc40940e3b..20aafb35e34 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -31,6 +32,8 @@ #include "arrow/array.h" #include "arrow/builder.h" #include "arrow/compute/kernel.h" +#include "arrow/io/memory.h" +#include "arrow/ipc/reader.h" #include "arrow/status.h" #include "arrow/table.h" #include "arrow/type.h" @@ -533,9 +536,51 @@ Status NodeToSchemaField(const Node& node, int16_t max_def_level, int16_t max_re } } +Status GetOriginSchema(const std::shared_ptr& metadata, + std::shared_ptr* clean_metadata, + std::shared_ptr<::arrow::Schema>* out) { + if (metadata == nullptr) { + *out = nullptr; + *clean_metadata = nullptr; + return Status::OK(); + } + + static const std::string kArrowSchemaKey = "ARROW:schema"; + int schema_index = metadata->FindKey(kArrowSchemaKey); + if (schema_index == -1) { + *out = nullptr; + *clean_metadata = metadata; + return Status::OK(); + } + + // The original Arrow schema was serialized using the store_schema option. 
We + // deserialize it here and use it to inform read options such as + // dictionary-encoded fields + auto schema_buf = std::make_shared(metadata->value(schema_index)); + + ::arrow::ipc::DictionaryMemo dict_memo; + ::arrow::io::BufferReader input(schema_buf); + RETURN_NOT_OK(::arrow::ipc::ReadSchema(&input, &dict_memo, out)); + + // Copy the metadata without the schema key + auto new_metadata = ::arrow::key_value_metadata({}); + new_metadata->reserve(metadata->size() - 1); + for (int64_t i = 0; i < metadata->size(); ++i) { + if (i == schema_index) continue; + new_metadata->Append(metadata->key(i), metadata->value(i)); + } + *clean_metadata = new_metadata; + return Status::OK(); +} + Status BuildSchemaManifest(const SchemaDescriptor* schema, + const std::shared_ptr& metadata, const ArrowReaderProperties& properties, SchemaManifest* manifest) { + std::shared_ptr<::arrow::Schema> origin_schema; + RETURN_NOT_OK( + GetOriginSchema(metadata, &manifest->schema_metadata, &manifest->origin_schema)); + SchemaTreeContext ctx; ctx.manifest = manifest; ctx.properties = properties; @@ -544,8 +589,20 @@ Status BuildSchemaManifest(const SchemaDescriptor* schema, manifest->descr = schema; manifest->schema_fields.resize(schema_node.field_count()); for (int i = 0; i < static_cast(schema_node.field_count()); ++i) { + SchemaField* out_field = &manifest->schema_fields[i]; RETURN_NOT_OK(NodeToSchemaField(*schema_node.field(i), 0, 0, &ctx, - /*parent=*/nullptr, &manifest->schema_fields[i])); + /*parent=*/nullptr, out_field)); + + // TODO(wesm): as follow up to ARROW-3246, we should really pass the origin + // schema (if any) through all functions in the schema reconstruction, but + // I'm being lazy and just setting dictionary fields at the top level for + // now + if (manifest->origin_schema) { + auto origin_field = manifest->origin_schema->field(i); + if (!origin_field->Equals(*out_field->field)) { + out_field->field = out_field->field->WithType(origin_field->type()); + } + } } return Status::OK(); } @@ -555,7 +612,7 @@ Status FromParquetSchema( const std::shared_ptr& key_value_metadata, std::shared_ptr<::arrow::Schema>* out) { SchemaManifest manifest; - RETURN_NOT_OK(BuildSchemaManifest(schema, properties, &manifest)); + RETURN_NOT_OK(BuildSchemaManifest(schema, key_value_metadata, properties, &manifest)); std::vector> fields(manifest.schema_fields.size()); for (int i = 0; i < static_cast(fields.size()); i++) { fields[i] = manifest.schema_fields[i].field; diff --git a/cpp/src/parquet/arrow/reader_internal.h b/cpp/src/parquet/arrow/reader_internal.h index 4568e421474..d8f08524681 100644 --- a/cpp/src/parquet/arrow/reader_internal.h +++ b/cpp/src/parquet/arrow/reader_internal.h @@ -38,6 +38,8 @@ class Array; class ChunkedArray; class DataType; class Field; +class KeyValueMetadata; +class Schema; } // namespace arrow @@ -138,6 +140,8 @@ struct PARQUET_EXPORT SchemaField { struct SchemaManifest { const SchemaDescriptor* descr; + std::shared_ptr<::arrow::Schema> origin_schema; + std::shared_ptr schema_metadata; std::vector schema_fields; std::unordered_map column_index_to_field; @@ -185,6 +189,7 @@ struct SchemaManifest { PARQUET_EXPORT Status BuildSchemaManifest(const SchemaDescriptor* schema, + const std::shared_ptr& metadata, const ArrowReaderProperties& properties, SchemaManifest* manifest); diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index 5c55426477f..007254aa160 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -19,12 +19,14 @@ #include 
#include +#include #include #include #include #include "arrow/array.h" #include "arrow/buffer-builder.h" +#include "arrow/ipc/writer.h" #include "arrow/table.h" #include "arrow/type.h" #include "arrow/visitor_inline.h" @@ -421,8 +423,8 @@ class FileWriterImpl : public FileWriter { closed_(false) {} Status Init() { - return BuildSchemaManifest(writer_->schema(), default_arrow_reader_properties(), - &schema_manifest_); + return BuildSchemaManifest(writer_->schema(), /*schema_metadata=*/nullptr, + default_arrow_reader_properties(), &schema_manifest_); } Status NewRowGroup(int64_t chunk_size) override { @@ -551,6 +553,30 @@ Status FileWriter::Open(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool return Open(schema, pool, sink, properties, default_arrow_writer_properties(), writer); } +Status GetSchemaMetadata(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool, + const ArrowWriterProperties& properties, + std::shared_ptr* out) { + if (!properties.store_schema()) { + *out = nullptr; + return Status::OK(); + } + + static const std::string kArrowSchemaKey = "ARROW:schema"; + std::shared_ptr result; + if (schema.metadata()) { + result = schema.metadata()->Copy(); + } else { + result = ::arrow::key_value_metadata({}); + } + + ::arrow::ipc::DictionaryMemo dict_memo; + std::shared_ptr serialized; + RETURN_NOT_OK(::arrow::ipc::SerializeSchema(schema, &dict_memo, pool, &serialized)); + result->Append(kArrowSchemaKey, serialized->ToString()); + *out = result; + return Status::OK(); +} + Status FileWriter::Open(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool, const std::shared_ptr<::arrow::io::OutputStream>& sink, const std::shared_ptr& properties, @@ -562,8 +588,11 @@ Status FileWriter::Open(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool auto schema_node = std::static_pointer_cast(parquet_schema->schema_root()); + std::shared_ptr metadata; + RETURN_NOT_OK(GetSchemaMetadata(schema, pool, *arrow_properties, &metadata)); + std::unique_ptr base_writer = - ParquetFileWriter::Open(sink, schema_node, properties, schema.metadata()); + ParquetFileWriter::Open(sink, schema_node, properties, metadata); auto schema_ptr = std::make_shared<::arrow::Schema>(schema); return Make(pool, std::move(base_writer), schema_ptr, arrow_properties, writer); From 7d663d5aca0d809633629aaa2b1a7f8d17181b9c Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 13 Aug 2019 13:42:52 -0500 Subject: [PATCH 17/28] Store schema when writing from Python, add unit test to exhibit direct dictionary reads --- cpp/src/parquet/arrow/reader_internal.cc | 19 +++++++++------ python/pyarrow/_parquet.pxd | 1 + python/pyarrow/_parquet.pyx | 5 ++++ python/pyarrow/tests/test_parquet.py | 30 ++++++++++++++++++++++++ 4 files changed, 48 insertions(+), 7 deletions(-) diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index 20aafb35e34..fd7cef739a3 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -562,14 +562,19 @@ Status GetOriginSchema(const std::shared_ptr& metadata, ::arrow::io::BufferReader input(schema_buf); RETURN_NOT_OK(::arrow::ipc::ReadSchema(&input, &dict_memo, out)); - // Copy the metadata without the schema key - auto new_metadata = ::arrow::key_value_metadata({}); - new_metadata->reserve(metadata->size() - 1); - for (int64_t i = 0; i < metadata->size(); ++i) { - if (i == schema_index) continue; - new_metadata->Append(metadata->key(i), metadata->value(i)); + if (metadata->size() > 1) { + // Copy the metadata 
without the schema key + auto new_metadata = ::arrow::key_value_metadata({}); + new_metadata->reserve(metadata->size() - 1); + for (int64_t i = 0; i < metadata->size(); ++i) { + if (i == schema_index) continue; + new_metadata->Append(metadata->key(i), metadata->value(i)); + } + *clean_metadata = new_metadata; + } else { + // No other keys, let metadata be null + *clean_metadata = nullptr; } - *clean_metadata = new_metadata; return Status::OK(); } diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index 82ca9fbb33e..97e73cb6468 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -365,6 +365,7 @@ cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: Builder* coerce_timestamps(TimeUnit unit) Builder* allow_truncated_timestamps() Builder* disallow_truncated_timestamps() + Builder* store_schema() shared_ptr[ArrowWriterProperties] build() c_bool support_deprecated_int96_timestamps() diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index d51e7fb3c6c..82da6b572d8 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -1226,6 +1226,11 @@ cdef class ParquetWriter: properties = properties_builder.build() cdef ArrowWriterProperties.Builder arrow_properties_builder + + # Store the original Arrow schema so things like dictionary types can + # be automatically reconstructed + arrow_properties_builder.store_schema() + self._set_int96_support(&arrow_properties_builder) self._set_coerce_timestamps(&arrow_properties_builder) self._set_allow_truncated_timestamps(&arrow_properties_builder) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index b143dd4bada..5fa86add77c 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -2981,6 +2981,36 @@ def test_parquet_file_too_small(tempdir): pq.read_table(path) +def test_dictionary_array_automatically_read(): + # ARROW-3246 + + # Make a large dictionary, a little over 4MB of data + dict_length = 4000 + dict_values = pa.array([('x' * 1000 + '_{}'.format(i)) + for i in range(dict_length)]) + + num_chunks = 10 + chunk_size = 100 + chunks = [] + for i in range(num_chunks): + indices = np.random.randint(0, dict_length, + size=chunk_size).astype(np.int32) + chunks.append(pa.DictionaryArray.from_arrays(pa.array(indices), + dict_values)) + + table = pa.table([pa.chunked_array(chunks)], names=['f0']) + + bio = pa.BufferOutputStream() + pq.write_table(table, bio) + contents = bio.getvalue() + result = pq.read_table(pa.BufferReader(contents)) + + assert result.equals(table) + + # The only key in the metadata was the Arrow schema key + assert result.schema.metadata is None + + @pytest.mark.pandas def test_multi_dataset_metadata(tempdir): filenames = ["ARROW-1983-dataset.0", "ARROW-1983-dataset.1"] From f26d7da80279f7126530431e18411a9b9dd7230c Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 13 Aug 2019 14:45:28 -0500 Subject: [PATCH 18/28] Fix up Python unit tests given schema serialization --- cpp/src/parquet/arrow/reader_internal.cc | 8 ++++-- python/pyarrow/tests/test_parquet.py | 36 +++++++++++++----------- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index fd7cef739a3..fad163aa07a 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -604,8 +604,12 @@ Status BuildSchemaManifest(const SchemaDescriptor* schema, // now if 
(manifest->origin_schema) { auto origin_field = manifest->origin_schema->field(i); - if (!origin_field->Equals(*out_field->field)) { - out_field->field = out_field->field->WithType(origin_field->type()); + auto current_type = out_field->field->type(); + if (origin_field->type()->id() == ::arrow::Type::DICTIONARY) { + if (current_type->id() != ::arrow::Type::DICTIONARY) { + out_field->field = out_field->field->WithType( + ::arrow::dictionary(::arrow::int32(), current_type)); + } } } } diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 5fa86add77c..fc620c1eea7 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -159,10 +159,10 @@ def alltypes_sample(size=10000, seed=0, categorical=False): @pytest.mark.pandas @pytest.mark.parametrize('chunk_size', [None, 1000]) -def test_pandas_parquet_2_0_rountrip(tempdir, chunk_size): +def test_pandas_parquet_2_0_roundtrip(tempdir, chunk_size): df = alltypes_sample(size=10000, categorical=True) - filename = tempdir / 'pandas_rountrip.parquet' + filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df) assert arrow_table.schema.pandas_metadata is not None @@ -173,7 +173,7 @@ def test_pandas_parquet_2_0_rountrip(tempdir, chunk_size): assert arrow_table.schema.metadata == table_read.schema.metadata - df_read = table_read.to_pandas(categories=['str_category']) + df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read, check_categorical=False) @@ -297,7 +297,7 @@ def test_datetime_timezone_tzinfo(): def test_pandas_parquet_custom_metadata(tempdir): df = alltypes_sample(size=10000) - filename = tempdir / 'pandas_rountrip.parquet' + filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df) assert b'pandas' in arrow_table.schema.metadata @@ -321,7 +321,7 @@ def test_pandas_parquet_column_multiindex(tempdir): names=['level_1', 'level_2'] ) - filename = tempdir / 'pandas_rountrip.parquet' + filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df) assert arrow_table.schema.pandas_metadata is not None @@ -333,10 +333,10 @@ def test_pandas_parquet_column_multiindex(tempdir): @pytest.mark.pandas -def test_pandas_parquet_2_0_rountrip_read_pandas_no_index_written(tempdir): +def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written(tempdir): df = alltypes_sample(size=10000) - filename = tempdir / 'pandas_rountrip.parquet' + filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df, preserve_index=False) js = arrow_table.schema.pandas_metadata assert not js['index_columns'] @@ -357,7 +357,7 @@ def test_pandas_parquet_2_0_rountrip_read_pandas_no_index_written(tempdir): @pytest.mark.pandas -def test_pandas_parquet_1_0_rountrip(tempdir): +def test_pandas_parquet_1_0_roundtrip(tempdir): size = 10000 np.random.seed(0) df = pd.DataFrame({ @@ -376,7 +376,7 @@ def test_pandas_parquet_1_0_rountrip(tempdir): 'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None], 'empty_str': [''] * size }) - filename = tempdir / 'pandas_rountrip.parquet' + filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df) _write_table(arrow_table, filename, version='1.0') table_read = _read_table(filename) @@ -415,7 +415,7 @@ def test_pandas_column_selection(tempdir): 'uint8': np.arange(size, dtype=np.uint8), 'uint16': np.arange(size, dtype=np.uint16) }) - filename = tempdir / 'pandas_rountrip.parquet' + filename = tempdir / 
'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df) _write_table(arrow_table, filename) table_read = _read_table(filename, columns=['uint8']) @@ -567,7 +567,7 @@ def test_pandas_parquet_configuration_options(tempdir): 'float64': np.arange(size, dtype=np.float64), 'bool': np.random.randn(size) > 0 }) - filename = tempdir / 'pandas_rountrip.parquet' + filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df) for use_dictionary in [True, False]: @@ -883,7 +883,7 @@ def test_validate_schema_write_table(tempdir): def test_column_of_arrays(tempdir): df, schema = dataframe_with_arrays() - filename = tempdir / 'pandas_rountrip.parquet' + filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df, schema=schema) _write_table(arrow_table, filename, version="2.0", coerce_timestamps='ms') table_read = _read_table(filename) @@ -914,7 +914,7 @@ def test_coerce_timestamps(tempdir): df = pd.DataFrame(arrays) schema = pa.schema(fields) - filename = tempdir / 'pandas_rountrip.parquet' + filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df, schema=schema) _write_table(arrow_table, filename, version="2.0", coerce_timestamps='us') @@ -967,7 +967,7 @@ def test_coerce_timestamps_truncated(tempdir): def test_column_of_lists(tempdir): df, schema = dataframe_with_lists(parquet_compatible=True) - filename = tempdir / 'pandas_rountrip.parquet' + filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df, schema=schema) _write_table(arrow_table, filename, version='2.0') table_read = _read_table(filename) @@ -1888,8 +1888,12 @@ def test_read_schema(tempdir): table = pa.Table.from_pandas(df) _write_table(table, data_path) - assert table.schema.equals(pq.read_schema(data_path)) - assert table.schema.equals(pq.read_schema(data_path, memory_map=True)) + read1 = pq.read_schema(data_path) + read2 = pq.read_schema(data_path, memory_map=True) + assert table.schema.equals(read1, check_metadata=False) + assert table.schema.equals(read2, check_metadata=False) + + assert table.schema.metadata[b'pandas'] == read1.metadata[b'pandas'] def _filter_partition(df, part_keys): From 3425da4f224ef9e9b100a9b68470e106f3b76d96 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 14 Aug 2019 13:28:29 -0500 Subject: [PATCH 19/28] Revert change causing ASAN failure [skip ci] --- cpp/src/parquet/arrow/arrow-reader-writer-test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc index c32ca16118c..ca3fff796cc 100644 --- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc @@ -675,7 +675,7 @@ TYPED_TEST(TestParquetIO, SingleColumnOptionalReadWrite) { // This also tests max_definition_level = 1 std::shared_ptr values; - ASSERT_OK(NullableArray(SMALL_SIZE / 8, 10, kDefaultSeed, &values)); + ASSERT_OK(NullableArray(SMALL_SIZE, 10, kDefaultSeed, &values)); std::shared_ptr schema = MakeSimpleSchema(*values->type(), Repetition::OPTIONAL); From 5dc00b1ff2464d8a90d33013b016dd6219d43028 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 14 Aug 2019 14:00:08 -0500 Subject: [PATCH 20/28] Fix MSVC compilation warnings [skip ci] --- cpp/src/parquet/encoding-test.cc | 21 +++++++++++++-------- cpp/src/parquet/encoding.cc | 6 +++--- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/cpp/src/parquet/encoding-test.cc b/cpp/src/parquet/encoding-test.cc 
index 7f57bc03936..b55d815ce78 100644 --- a/cpp/src/parquet/encoding-test.cc +++ b/cpp/src/parquet/encoding-test.cc @@ -496,13 +496,15 @@ TEST(PlainEncodingAdHoc, ArrowBinaryDirectPut) { ASSERT_NO_THROW(encoder->Put(*values)); auto buf = encoder->FlushValues(); - int64_t num_values = values->length() - values->null_count(); + int num_values = static_cast(values->length() - values->null_count()); decoder->SetData(num_values, buf->data(), static_cast(buf->size())); arrow::StringBuilder builder; ASSERT_EQ(num_values, - decoder->DecodeArrow(values->length(), values->null_count(), - values->null_bitmap_data(), values->offset(), &builder)); + decoder->DecodeArrow(static_cast(values->length()), + static_cast(values->null_count()), + values->null_bitmap_data(), + values->offset(), &builder)); std::shared_ptr result; ASSERT_OK(builder.Finish(&result)); @@ -526,7 +528,8 @@ void GetBinaryDictDecoder(DictEncoder* encoder, int64_t num_value dict_decoder->SetData(encoder->num_entries(), dict_buf->data(), static_cast(dict_buf->size())); - decoder->SetData(num_values, buf->data(), static_cast(buf->size())); + decoder->SetData(static_cast(num_values), buf->data(), + static_cast(buf->size())); decoder->SetDict(dict_decoder.get()); *out_values = buf; @@ -553,12 +556,13 @@ TEST(DictEncodingAdHoc, ArrowBinaryDirectPut) { std::unique_ptr decoder; std::shared_ptr buf, dict_buf; - int64_t num_values = values->length() - values->null_count(); + int num_values = static_cast(values->length() - values->null_count()); GetBinaryDictDecoder(encoder, num_values, &buf, &dict_buf, &decoder); arrow::StringBuilder builder; ASSERT_EQ(num_values, - decoder->DecodeArrow(values->length(), values->null_count(), + decoder->DecodeArrow(static_cast(values->length()), + static_cast(values->null_count()), values->null_bitmap_data(), values->offset(), &builder)); std::shared_ptr result; @@ -592,11 +596,12 @@ TEST(DictEncodingAdHoc, PutDictionaryPutIndices) { std::unique_ptr decoder; std::shared_ptr buf, dict_buf; - int64_t num_values = expected->length() - expected->null_count(); + int num_values = static_cast(expected->length() - expected->null_count()); GetBinaryDictDecoder(encoder, num_values, &buf, &dict_buf, &decoder); arrow::BinaryBuilder builder; - ASSERT_EQ(num_values, decoder->DecodeArrow(expected->length(), expected->null_count(), + ASSERT_EQ(num_values, decoder->DecodeArrow(static_cast(expected->length()), + static_cast(expected->null_count()), expected->null_bitmap_data(), expected->offset(), &builder)); diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index fc7acf5114d..86ab972c724 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -434,13 +434,13 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder { indices.offset(), indices.length()); for (int64_t i = 0; i < indices.length(); ++i) { if (valid_bits_reader.IsSet()) { - buffered_indices_.push_back(static_cast(values[i])); + buffered_indices_.push_back(static_cast(values[i])); } valid_bits_reader.Next(); } } else { for (int64_t i = 0; i < indices.length(); ++i) { - buffered_indices_.push_back(static_cast(values[i])); + buffered_indices_.push_back(static_cast(values[i])); } } } @@ -877,7 +877,7 @@ class PlainByteArrayDecoder : public PlainDecoder, const uint8_t* data = data_; int64_t data_size = len_; int bytes_decoded = 0; - int64_t values_decoded = 0; + int values_decoded = 0; while (i < num_values && values_decoded < num_values_) { if (bit_reader.IsSet()) { uint32_t len = arrow::util::SafeLoadAs(data); 
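The loop-condition change above is what repairs decoding when the last entries of a
batch are null: with `values_decoded < num_values_` still in the condition, the loop
exited as soon as every non-null value had been consumed, so trailing nulls never
reached the builder. A minimal sketch of the corrected shape (member and builder
names abbreviated; assumes a BinaryBuilder-style `builder` and a validity
`bit_reader` like the ones in the surrounding class, with size checks omitted):

    int values_decoded = 0;
    for (int i = 0; i < num_values; ++i) {  // iterate positions, not decoded values
      if (bit_reader.IsSet()) {
        uint32_t len = arrow::util::SafeLoadAs<uint32_t>(data);
        PARQUET_THROW_NOT_OK(builder->Append(data + sizeof(uint32_t), len));
        data += sizeof(uint32_t) + len;
        ++values_decoded;
      } else {
        PARQUET_THROW_NOT_OK(builder->AppendNull());  // still runs for trailing nulls
      }
      bit_reader.Next();
    }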
From 8f4cd44636c0234e4b799bdce93030e90eafdfd4 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 14 Aug 2019 14:12:29 -0500 Subject: [PATCH 21/28] Fix DecodeArrow bug which only occurred when there are nulls at the end of the data [skip ci] --- cpp/src/arrow/testing/gtest_util.cc | 11 ++++++++++- cpp/src/arrow/testing/gtest_util.h | 4 +++- cpp/src/parquet/encoding-test.cc | 1 + cpp/src/parquet/encoding.cc | 2 +- 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index c5911dcf475..8d753cdf595 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -68,9 +68,18 @@ void AssertTsEqual(const T& expected, const T& actual) { } } -void AssertArraysEqual(const Array& expected, const Array& actual) { +void AssertArraysEqual(const Array& expected, const Array& actual, + bool verbose) { std::stringstream diff; if (!expected.Equals(actual, EqualOptions().diff_sink(&diff))) { + if (verbose) { + ::arrow::PrettyPrintOptions options(/*indent=*/2); + options.window = 50; + diff << "Expected:\n"; + ARROW_EXPECT_OK(PrettyPrint(expected, options, &diff)); + diff << "\nActual:\n"; + ARROW_EXPECT_OK(PrettyPrint(actual, options, &diff)); + } FAIL() << diff.str(); } } diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index f378b808c54..e070dc7d612 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -140,7 +140,9 @@ using ArrayVector = std::vector>; #define ASSERT_ARRAYS_EQUAL(lhs, rhs) AssertArraysEqual((lhs), (rhs)) #define ASSERT_BATCHES_EQUAL(lhs, rhs) AssertBatchesEqual((lhs), (rhs)) -ARROW_EXPORT void AssertArraysEqual(const Array& expected, const Array& actual); +// If verbose is true, then the arrays will be pretty printed +ARROW_EXPORT void AssertArraysEqual(const Array& expected, const Array& actual, + bool verbose = false); ARROW_EXPORT void AssertBatchesEqual(const RecordBatch& expected, const RecordBatch& actual); ARROW_EXPORT void AssertChunkedEqual(const ChunkedArray& expected, diff --git a/cpp/src/parquet/encoding-test.cc b/cpp/src/parquet/encoding-test.cc index b55d815ce78..b68b33ab326 100644 --- a/cpp/src/parquet/encoding-test.cc +++ b/cpp/src/parquet/encoding-test.cc @@ -508,6 +508,7 @@ TEST(PlainEncodingAdHoc, ArrowBinaryDirectPut) { std::shared_ptr result; ASSERT_OK(builder.Finish(&result)); + ASSERT_EQ(50, result->length()); arrow::AssertArraysEqual(*values, *result); // Type checked diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 86ab972c724..93f72c7fccc 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -878,7 +878,7 @@ class PlainByteArrayDecoder : public PlainDecoder, int64_t data_size = len_; int bytes_decoded = 0; int values_decoded = 0; - while (i < num_values && values_decoded < num_values_) { + while (i < num_values) { if (bit_reader.IsSet()) { uint32_t len = arrow::util::SafeLoadAs(data); increment = static_cast(sizeof(uint32_t) + len); From 3f45fef0af6970c68ecbf2857c31cdab1bae1d8c Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 14 Aug 2019 14:26:35 -0500 Subject: [PATCH 22/28] Check more random seeds, fix warnings --- cpp/src/arrow/testing/gtest_util.cc | 3 +- cpp/src/parquet/encoding-test.cc | 52 ++++++++++++++++------------- cpp/src/parquet/encoding.cc | 1 - 3 files changed, 30 insertions(+), 26 deletions(-) diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index 
8d753cdf595..3928b07aa6f 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -68,8 +68,7 @@ void AssertTsEqual(const T& expected, const T& actual) { } } -void AssertArraysEqual(const Array& expected, const Array& actual, - bool verbose) { +void AssertArraysEqual(const Array& expected, const Array& actual, bool verbose) { std::stringstream diff; if (!expected.Equals(actual, EqualOptions().diff_sink(&diff))) { if (verbose) { diff --git a/cpp/src/parquet/encoding-test.cc b/cpp/src/parquet/encoding-test.cc index b68b33ab326..d1dc44b2606 100644 --- a/cpp/src/parquet/encoding-test.cc +++ b/cpp/src/parquet/encoding-test.cc @@ -486,34 +486,40 @@ TEST(PlainEncodingAdHoc, ArrowBinaryDirectPut) { const int64_t size = 50; const int64_t min_length = 0; const int64_t max_length = 10; - const double null_probability = 0.1; - arrow::random::RandomArrayGenerator rag(0); - auto values = rag.String(size, min_length, max_length, null_probability); + const double null_probability = 0.25; - auto encoder = MakeTypedEncoder(Encoding::PLAIN); - auto decoder = MakeTypedDecoder(Encoding::PLAIN); + auto CheckSeed = [&](int seed) { + arrow::random::RandomArrayGenerator rag(seed); + auto values = rag.String(size, min_length, max_length, null_probability); - ASSERT_NO_THROW(encoder->Put(*values)); - auto buf = encoder->FlushValues(); + auto encoder = MakeTypedEncoder(Encoding::PLAIN); + auto decoder = MakeTypedDecoder(Encoding::PLAIN); - int num_values = static_cast(values->length() - values->null_count()); - decoder->SetData(num_values, buf->data(), static_cast(buf->size())); + ASSERT_NO_THROW(encoder->Put(*values)); + auto buf = encoder->FlushValues(); - arrow::StringBuilder builder; - ASSERT_EQ(num_values, - decoder->DecodeArrow(static_cast(values->length()), - static_cast(values->null_count()), - values->null_bitmap_data(), - values->offset(), &builder)); + int num_values = static_cast(values->length() - values->null_count()); + decoder->SetData(num_values, buf->data(), static_cast(buf->size())); - std::shared_ptr result; - ASSERT_OK(builder.Finish(&result)); - ASSERT_EQ(50, result->length()); - arrow::AssertArraysEqual(*values, *result); + arrow::StringBuilder builder; + ASSERT_EQ(num_values, decoder->DecodeArrow(static_cast(values->length()), + static_cast(values->null_count()), + values->null_bitmap_data(), + values->offset(), &builder)); + + std::shared_ptr result; + ASSERT_OK(builder.Finish(&result)); + ASSERT_EQ(50, result->length()); + arrow::AssertArraysEqual(*values, *result); - // Type checked - auto i32_values = rag.Int32(size, 0, 10, null_probability); - ASSERT_THROW(encoder->Put(*i32_values), ParquetException); + // Type checked + auto i32_values = rag.Int32(size, 0, 10, null_probability); + ASSERT_THROW(encoder->Put(*i32_values), ParquetException); + }; + + for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) { + CheckSeed(seed); + } } void GetBinaryDictDecoder(DictEncoder* encoder, int64_t num_values, @@ -530,7 +536,7 @@ void GetBinaryDictDecoder(DictEncoder* encoder, int64_t num_value static_cast(dict_buf->size())); decoder->SetData(static_cast(num_values), buf->data(), - static_cast(buf->size())); + static_cast(buf->size())); decoder->SetDict(dict_decoder.get()); *out_values = buf; diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 93f72c7fccc..7f411d941ab 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -422,7 +422,6 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder { template void 
PutIndicesTyped(const arrow::Array& data) { - using T = typename ArrowType::c_type; using ArrayType = typename arrow::TypeTraits::ArrayType; const auto& indices = checked_cast(data); auto values = indices.raw_values(); From 91555b645e3f1a43e6cf0b4d8ef5ce70cd79972a Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 14 Aug 2019 16:31:58 -0500 Subject: [PATCH 23/28] Fix another new MSVC warning --- cpp/src/parquet/encoding-test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/encoding-test.cc b/cpp/src/parquet/encoding-test.cc index d1dc44b2606..ccd456afce2 100644 --- a/cpp/src/parquet/encoding-test.cc +++ b/cpp/src/parquet/encoding-test.cc @@ -484,8 +484,8 @@ TEST(PlainEncodingAdHoc, ArrowBinaryDirectPut) { // Implemented as part of ARROW-3246 const int64_t size = 50; - const int64_t min_length = 0; - const int64_t max_length = 10; + const int32_t min_length = 0; + const int32_t max_length = 10; const double null_probability = 0.25; auto CheckSeed = [&](int seed) { From 92cf4e063e69e2a736c5d752df533502ec5fe621 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 15 Aug 2019 09:18:01 -0500 Subject: [PATCH 24/28] Code review feedback --- cpp/CMakeLists.txt | 2 +- cpp/src/parquet/column_writer.cc | 31 +++++++++++++++++++------- cpp/src/parquet/encoding.cc | 38 +++++++++++++------------------- 3 files changed, 39 insertions(+), 32 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 0367e3fedc2..0e5365a508b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -267,7 +267,7 @@ if((ARROW_BUILD_TESTS OR ARROW_BUILD_INTEGRATION) AND NOT ARROW_JSON) message(FATAL_ERROR "JSON parsing of arrays is required for Arrow tests") endif() -if(ARROW_FLIGHT OR ARROW_BUILD_TESTS) +if(ARROW_FLIGHT OR ARROW_PARQUET OR ARROW_BUILD_TESTS) set(ARROW_IPC ON) endif() diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 51c952ed703..507584070e2 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -651,8 +651,8 @@ bool DictionaryDirectWriteSupported(const arrow::Array& array) { return id == arrow::Type::BINARY || id == arrow::Type::STRING; } -Status MaterializeDictionary(const arrow::Array& array, MemoryPool* pool, - std::shared_ptr* out) { +Status ConvertDictionaryToDense(const arrow::Array& array, MemoryPool* pool, + std::shared_ptr* out) { const arrow::DictionaryType& dict_type = static_cast(*array.type()); @@ -710,8 +710,11 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< } WriteValues(values + value_offset, values_to_write, batch_size - values_to_write); CommitWriteAndCheckPageLimit(batch_size, values_to_write); - CheckDictionarySizeLimit(); value_offset += values_to_write; + + // Dictionary size checked separately from data page size since we + // circumvent this check when writing arrow::DictionaryArray directly + CheckDictionarySizeLimit(); }; DoInBatches(num_values, properties_->write_batch_size(), WriteChunk); } @@ -729,8 +732,11 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< WriteValuesSpaced(values + value_offset, batch_num_values, batch_num_spaced_values, valid_bits, valid_bits_offset + value_offset); CommitWriteAndCheckPageLimit(batch_size, batch_num_spaced_values); - CheckDictionarySizeLimit(); value_offset += batch_num_spaced_values; + + // Dictionary size checked separately from data page size since we + // circumvent this check when writing arrow::DictionaryArray directly + 
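The dictionary-size check that each write chunk now performs feeds a one-way fallback: once the accumulated dictionary crosses the configured page-size limit, the writer emits its single dictionary page, flushes the buffered index pages, and switches to PLAIN for everything that follows. A toy model of that state machine, with invented names and none of the writer's real interface, is sketched here for orientation.

#include <cstdint>
#include <string>
#include <unordered_set>

class DictionaryFallbackSketch {
 public:
  explicit DictionaryFallbackSketch(int64_t dictionary_pagesize_limit)
      : limit_(dictionary_pagesize_limit) {}

  void Write(const std::string& value) {
    if (!fallback_) {
      // Values are hashed into the dictionary while in dictionary mode.
      if (dictionary_.insert(value).second) {
        dictionary_size_ += static_cast<int64_t>(value.size());
      }
      if (dictionary_size_ >= limit_) {
        // The real writer would now write the dictionary page, flush the
        // buffered data pages, and permanently switch to PLAIN encoding.
        fallback_ = true;
      }
    }
    // In fallback mode values would be PLAIN-encoded directly.
  }

  bool plain_encoding() const { return fallback_; }

 private:
  int64_t limit_;
  int64_t dictionary_size_ = 0;
  bool fallback_ = false;
  std::unordered_set<std::string> dictionary_;
};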
CheckDictionarySizeLimit(); }; DoInBatches(num_values, properties_->write_batch_size(), WriteChunk); } @@ -973,6 +979,10 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< } }; +static inline bool IsDictionaryEncoding(Encoding::type encoding) { + return encoding == Encoding::PLAIN_DICTIONARY; +} + template Status TypedColumnWriterImpl::WriteArrowDictionary(const int16_t* def_levels, const int16_t* rep_levels, @@ -996,14 +1006,19 @@ Status TypedColumnWriterImpl::WriteArrowDictionary(const int16_t* def_lev // WriteArrow with that auto WriteDense = [&] { std::shared_ptr dense_array; - RETURN_NOT_OK(MaterializeDictionary(array, properties_->memory_pool(), &dense_array)); + RETURN_NOT_OK( + ConvertDictionaryToDense(array, properties_->memory_pool(), &dense_array)); return WriteArrowDense(def_levels, rep_levels, num_levels, *dense_array, ctx); }; - if (current_encoder_->encoding() != Encoding::PLAIN_DICTIONARY || + if (!IsDictionaryEncoding(current_encoder_->encoding()) || !DictionaryDirectWriteSupported(array)) { - // No longer dictionary-encoding for whatever reason, maybe we - // never were or we decided to stop + // No longer dictionary-encoding for whatever reason, maybe we never were + // or we decided to stop. Note that WriteArrow can be invoked multiple + // times with both dense and dictionary-encoded versions of the same data + // without a problem. Any dense data will be hashed to indices until the + // dictionary page limit is reached, at which everything (dictionary and + // dense) will fall back to plain encoding return WriteDense(); } diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 7f411d941ab..9dd440815cf 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -114,10 +114,7 @@ class PlainEncoder : public EncoderImpl, virtual public TypedEncoder { Put(data, num_valid_values); } - protected: - std::shared_ptr values_sink_; - - void WriteByteArray(const ByteArray& val) { + void Put(const ByteArray& val) { // Write the result to the output stream PARQUET_THROW_NOT_OK(values_sink_->Write(reinterpret_cast(&val.len), sizeof(uint32_t))); @@ -127,6 +124,9 @@ class PlainEncoder : public EncoderImpl, virtual public TypedEncoder { PARQUET_THROW_NOT_OK( values_sink_->Write(reinterpret_cast(val.ptr), val.len)); } + + protected: + std::shared_ptr values_sink_; }; template @@ -138,7 +138,7 @@ void PlainEncoder::Put(const T* buffer, int num_values) { template <> inline void PlainEncoder::Put(const ByteArray* src, int num_values) { for (int i = 0; i < num_values; ++i) { - WriteByteArray(src[i]); + Put(src[i]); } } @@ -153,24 +153,29 @@ void AssertBinary(const arrow::Array& values) { } } -template <> -void PlainEncoder::Put(const arrow::Array& values) { +template +void PutBinaryArray(const arrow::Array& values, EncoderType* encoder) { AssertBinary(values); const auto& data = checked_cast(values); if (data.null_count() == 0) { // no nulls, just dump the data for (int64_t i = 0; i < data.length(); i++) { - WriteByteArray(ByteArray(data.GetView(i))); + encoder->Put(ByteArray(data.GetView(i))); } } else { for (int64_t i = 0; i < data.length(); i++) { if (data.IsValid(i)) { - WriteByteArray(ByteArray(data.GetView(i))); + encoder->Put(ByteArray(data.GetView(i))); } } } } +template <> +void PlainEncoder::Put(const arrow::Array& values) { + PutBinaryArray(values, this); +} + template <> inline void PlainEncoder::Put(const FixedLenByteArray* src, int num_values) { for (int i = 0; i < num_values; ++i) { @@ -564,20 +569,7 @@ 
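ConvertDictionaryToDense (renamed above from MaterializeDictionary) is what the WriteDense path relies on when a dictionary array cannot be written directly. For readers unfamiliar with what "converting to dense" means, here is a hand-rolled sketch for a string dictionary with int32 indices; it is illustrative only, uses the public DictionaryArray/StringBuilder APIs rather than the internal helper, and the function name is invented.

#include <memory>
#include <arrow/api.h>

// Decode a dictionary-encoded string array into a plain StringArray by
// looking each index up in the dictionary. Nulls stay nulls.
arrow::Status DictionaryToDenseStrings(const arrow::DictionaryArray& dict_array,
                                       std::shared_ptr<arrow::Array>* out) {
  const auto& dict =
      static_cast<const arrow::StringArray&>(*dict_array.dictionary());
  const auto& indices =
      static_cast<const arrow::Int32Array&>(*dict_array.indices());
  arrow::StringBuilder builder;
  for (int64_t i = 0; i < dict_array.length(); ++i) {
    if (dict_array.IsNull(i)) {
      ARROW_RETURN_NOT_OK(builder.AppendNull());
    } else {
      ARROW_RETURN_NOT_OK(builder.Append(dict.GetView(indices.Value(i))));
    }
  }
  return builder.Finish(out);
}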
void DictEncoderImpl::Put(const arrow::Array& values) { template <> void DictEncoderImpl::Put(const arrow::Array& values) { - AssertBinary(values); - const auto& data = checked_cast(values); - if (data.null_count() == 0) { - // no nulls, just dump the data - for (int64_t i = 0; i < data.length(); i++) { - Put(ByteArray(data.GetView(i))); - } - } else { - for (int64_t i = 0; i < data.length(); i++) { - if (data.IsValid(i)) { - Put(ByteArray(data.GetView(i))); - } - } - } + PutBinaryArray(values, this); } template From 494b954acbcc72212523f5e149e3155e25c8e4e9 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 15 Aug 2019 11:57:50 -0500 Subject: [PATCH 25/28] Use other KeyValueMetadata factory function to hopefully appease MinGW --- cpp/src/parquet/arrow/reader_internal.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index fad163aa07a..0a4b4e5c0d5 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -564,7 +564,7 @@ Status GetOriginSchema(const std::shared_ptr& metadata, if (metadata->size() > 1) { // Copy the metadata without the schema key - auto new_metadata = ::arrow::key_value_metadata({}); + auto new_metadata = ::arrow::key_value_metadata({}, {}); new_metadata->reserve(metadata->size() - 1); for (int64_t i = 0; i < metadata->size(); ++i) { if (i == schema_index) continue; From 7f3a2a89f2fcea9f83ca780c63d383cfd50888d2 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 15 Aug 2019 12:01:57 -0500 Subject: [PATCH 26/28] Fix another KeyValueMetadata factory --- cpp/src/parquet/arrow/writer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index 007254aa160..0d13528d5f9 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -566,7 +566,7 @@ Status GetSchemaMetadata(const ::arrow::Schema& schema, ::arrow::MemoryPool* poo if (schema.metadata()) { result = schema.metadata()->Copy(); } else { - result = ::arrow::key_value_metadata({}); + result = ::arrow::key_value_metadata({}, {}); } ::arrow::ipc::DictionaryMemo dict_memo; From ad3bad34ae4b99553b72879def34b7ca99cffeab Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 15 Aug 2019 22:25:34 -0500 Subject: [PATCH 27/28] Address code review comments --- cpp/src/arrow/array/builder_dict.cc | 2 +- .../parquet/arrow/arrow-reader-writer-test.cc | 50 ++++++++++-- cpp/src/parquet/arrow/reader_internal.cc | 37 ++++++--- cpp/src/parquet/column_writer.cc | 29 ++++--- cpp/src/parquet/encoding.cc | 3 +- cpp/src/parquet/encoding.h | 2 +- cpp/src/parquet/statistics.h | 49 ++++++------ cpp/src/parquet/types.h | 80 ------------------- 8 files changed, 112 insertions(+), 140 deletions(-) diff --git a/cpp/src/arrow/array/builder_dict.cc b/cpp/src/arrow/array/builder_dict.cc index bf5cbdd7051..d5f1857516c 100644 --- a/cpp/src/arrow/array/builder_dict.cc +++ b/cpp/src/arrow/array/builder_dict.cc @@ -203,7 +203,7 @@ class internal::DictionaryMemoTable::DictionaryMemoTableImpl { template enable_if_memoize InsertValues(const DType&, const ArrayType& array) { if (array.null_count() > 0) { - return Status::Invalid("Cannot yet insert dictionary values containing nulls"); + return Status::Invalid("Cannot insert dictionary values containing nulls"); } for (int64_t i = 0; i < array.length(); ++i) { ARROW_IGNORE_EXPR(impl_->GetOrInsert(array.GetView(i))); diff --git 
a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc index ca3fff796cc..b4b6d5ec6a9 100644 --- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc @@ -1173,9 +1173,8 @@ TEST_F(TestNullParquetIO, NullListColumn) { TEST_F(TestNullParquetIO, NullDictionaryColumn) { std::shared_ptr null_bitmap; - ASSERT_OK( - ::arrow::AllocateBitmap(::arrow::default_memory_pool(), SMALL_SIZE, &null_bitmap)); - std::memset(null_bitmap->mutable_data(), 0, null_bitmap->size()); + ASSERT_OK(::arrow::AllocateEmptyBitmap(::arrow::default_memory_pool(), SMALL_SIZE, + &null_bitmap)); std::shared_ptr indices = std::make_shared<::arrow::Int8Array>(SMALL_SIZE, nullptr, null_bitmap, SMALL_SIZE); @@ -2881,19 +2880,54 @@ TEST(TestArrowWriteDictionaries, AutoReadAsDictionary) { constexpr int64_t min_length = 2; constexpr int64_t max_length = 20; ::arrow::random::RandomArrayGenerator rag(0); - auto values = rag.BinaryWithRepeats(repeat * num_unique, num_unique, min_length, + auto values = rag.StringWithRepeats(repeat * num_unique, num_unique, min_length, max_length, /*null_probability=*/0.1); std::shared_ptr dict_values; AsDictionary32Encoded(*values, &dict_values); auto expected = MakeSimpleTable(dict_values, /*nullable=*/true); + auto expected_dense = MakeSimpleTable(values, /*nullable=*/true); - auto arrow_writer_properties = ArrowWriterProperties::Builder().store_schema()->build(); + auto props_store_schema = ArrowWriterProperties::Builder().store_schema()->build(); + std::shared_ptr
actual, actual_dense; - std::shared_ptr<Table>
actual; DoRoundtrip(expected, values->length(), &actual, default_writer_properties(), - arrow_writer_properties); - ::arrow::AssertTablesEqual(*expected, *actual, /*same_chunk_layout=*/false); + props_store_schema); + ::arrow::AssertTablesEqual(*expected, *actual); + + auto props_no_store_schema = ArrowWriterProperties::Builder().build(); + DoRoundtrip(expected, values->length(), &actual_dense, default_writer_properties(), + props_no_store_schema); + ::arrow::AssertTablesEqual(*expected_dense, *actual_dense); +} + +TEST(TestArrowWriteDictionaries, NestedSubfield) { + // ARROW-3246: Automatic decoding of dictionary subfields left as followup + // work + auto offsets = ::arrow::ArrayFromJSON(::arrow::int32(), "[0, 0, 2, 3]"); + auto indices = ::arrow::ArrayFromJSON(::arrow::int32(), "[0, 0, 0]"); + auto dict = ::arrow::ArrayFromJSON(::arrow::utf8(), "[\"foo\"]"); + + std::shared_ptr dict_values, values; + auto dict_ty = ::arrow::dictionary(::arrow::int32(), ::arrow::utf8()); + ASSERT_OK(::arrow::DictionaryArray::FromArrays(dict_ty, indices, dict, &dict_values)); + ASSERT_OK(::arrow::ListArray::FromArrays(*offsets, *dict_values, + ::arrow::default_memory_pool(), &values)); + + auto dense_ty = ::arrow::list(::arrow::utf8()); + auto dense_values = + ::arrow::ArrayFromJSON(dense_ty, "[[], [\"foo\", \"foo\"], [\"foo\"]]"); + + auto table = MakeSimpleTable(values, /*nullable=*/true); + auto expected_table = MakeSimpleTable(dense_values, /*nullable=*/true); + + auto props_store_schema = ArrowWriterProperties::Builder().store_schema()->build(); + std::shared_ptr
actual; + DoRoundtrip(table, values->length(), &actual, default_writer_properties(), + props_store_schema); + + // The nested subfield is not automatically decoded to dictionary + ::arrow::AssertTablesEqual(*expected_table, *actual); } } // namespace arrow diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index 0a4b4e5c0d5..649f73f76be 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -494,11 +494,13 @@ Status GroupToSchemaField(const GroupNode& node, int16_t max_def_level, Status NodeToSchemaField(const Node& node, int16_t max_def_level, int16_t max_rep_level, SchemaTreeContext* ctx, const SchemaField* parent, SchemaField* out) { + /// Workhorse function for converting a Parquet schema node to an Arrow + /// type. Handles different conventions for nested data if (node.is_optional()) { ++max_def_level; } else if (node.is_repeated()) { - // Repeated fields add a definition level. This is used to distinguish - // between an empty list and a list with an item in it. + // Repeated fields add both a repetition and definition level. This is used + // to distinguish between an empty list and a list with an item in it. ++max_rep_level; ++max_def_level; } @@ -507,9 +509,19 @@ Status NodeToSchemaField(const Node& node, int16_t max_def_level, int16_t max_re // Now, walk the schema and create a ColumnDescriptor for each leaf node if (node.is_group()) { + // A nested field, but we don't know what kind yet return GroupToSchemaField(static_cast(node), max_def_level, max_rep_level, ctx, parent, out); } else { + // Either a normal flat primitive type, or a list type encoded with 1-level + // list encoding. Note that the 3-level encoding is the form recommended by + // the parquet specification, but technically we can have either + // + // required/optional $TYPE $FIELD_NAME + // + // or + // + // repeated $TYPE $FIELD_NAME const auto& primitive_node = static_cast(node); int column_index = ctx->schema->GetColumnIndex(primitive_node); std::shared_ptr type; @@ -529,6 +541,7 @@ Status NodeToSchemaField(const Node& node, int16_t max_def_level, int16_t max_re out->max_repetition_level = max_rep_level; return Status::OK(); } else { + // A normal (required/optional) primitive node return PopulateLeaf(column_index, ::arrow::field(node.name(), type, node.is_optional()), max_def_level, max_rep_level, ctx, parent, out); @@ -602,15 +615,17 @@ Status BuildSchemaManifest(const SchemaDescriptor* schema, // schema (if any) through all functions in the schema reconstruction, but // I'm being lazy and just setting dictionary fields at the top level for // now - if (manifest->origin_schema) { - auto origin_field = manifest->origin_schema->field(i); - auto current_type = out_field->field->type(); - if (origin_field->type()->id() == ::arrow::Type::DICTIONARY) { - if (current_type->id() != ::arrow::Type::DICTIONARY) { - out_field->field = out_field->field->WithType( - ::arrow::dictionary(::arrow::int32(), current_type)); - } - } + if (manifest->origin_schema == nullptr) { + continue; + } + auto origin_field = manifest->origin_schema->field(i); + auto current_type = out_field->field->type(); + if (origin_field->type()->id() != ::arrow::Type::DICTIONARY) { + continue; + } + if (current_type->id() != ::arrow::Type::DICTIONARY) { + out_field->field = + out_field->field->WithType(::arrow::dictionary(::arrow::int32(), current_type)); } } return Status::OK(); diff --git a/cpp/src/parquet/column_writer.cc 
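To make the intent of the new AutoReadAsDictionary/NestedSubfield tests and the origin-schema logic above concrete, here is a rough user-level round-trip sketch: with store_schema() enabled the Arrow schema is embedded in the file metadata, so top-level dictionary columns are read back as dictionary rather than dense. The helper name is invented and the I/O factory signatures are quoted from memory for the C++ API of this era, so they may differ between Arrow versions.

#include <memory>
#include <arrow/api.h>
#include <arrow/io/api.h>
#include <parquet/arrow/reader.h>
#include <parquet/arrow/writer.h>
#include <parquet/properties.h>

arrow::Status RoundTripWithStoredSchema(const std::shared_ptr<arrow::Table>& table,
                                        std::shared_ptr<arrow::Table>* out) {
  auto pool = arrow::default_memory_pool();

  std::shared_ptr<arrow::io::BufferOutputStream> sink;
  ARROW_RETURN_NOT_OK(arrow::io::BufferOutputStream::Create(1 << 20, pool, &sink));

  // store_schema() is what lets the reader restore dictionary types.
  auto arrow_props = parquet::ArrowWriterProperties::Builder().store_schema()->build();
  ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table, pool, sink,
                                                 /*chunk_size=*/1 << 16,
                                                 parquet::default_writer_properties(),
                                                 arrow_props));

  std::shared_ptr<arrow::Buffer> buffer;
  ARROW_RETURN_NOT_OK(sink->Finish(&buffer));

  std::unique_ptr<parquet::arrow::FileReader> reader;
  ARROW_RETURN_NOT_OK(parquet::arrow::OpenFile(
      std::make_shared<arrow::io::BufferReader>(buffer), pool, &reader));
  return reader->ReadTable(out);
}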
b/cpp/src/parquet/column_writer.cc index 507584070e2..052ca14967a 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -645,6 +645,7 @@ inline void DoInBatches(int64_t total, int64_t batch_size, Action&& action) { } bool DictionaryDirectWriteSupported(const arrow::Array& array) { + DCHECK_EQ(array.type_id(), arrow::Type::DICTIONARY); const arrow::DictionaryType& dict_type = static_cast(*array.type()); auto id = dict_type.value_type()->id(); @@ -671,6 +672,10 @@ Status ConvertDictionaryToDense(const arrow::Array& array, MemoryPool* pool, return Status::OK(); } +static inline bool IsDictionaryEncoding(Encoding::type encoding) { + return encoding == Encoding::PLAIN_DICTIONARY; +} + template class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter { public: @@ -818,7 +823,7 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< private: using ValueEncoderType = typename EncodingTraits::Encoder; - using TypedStats = typename TypeClasses::Statistics; + using TypedStats = TypedStatistics; std::unique_ptr current_encoder_; std::shared_ptr page_statistics_; std::shared_ptr chunk_statistics_; @@ -922,14 +927,16 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< } void FallbackToPlainEncoding() { - WriteDictionaryPage(); - // Serialize the buffered Dictionary Indicies - FlushBufferedDataPages(); - fallback_ = true; - // Only PLAIN encoding is supported for fallback in V1 - current_encoder_ = MakeEncoder(DType::type_num, Encoding::PLAIN, false, descr_, - properties_->memory_pool()); - encoding_ = Encoding::PLAIN; + if (IsDictionaryEncoding(current_encoder_->encoding())) { + WriteDictionaryPage(); + // Serialize the buffered Dictionary Indicies + FlushBufferedDataPages(); + fallback_ = true; + // Only PLAIN encoding is supported for fallback in V1 + current_encoder_ = MakeEncoder(DType::type_num, Encoding::PLAIN, false, descr_, + properties_->memory_pool()); + encoding_ = Encoding::PLAIN; + } } // Checks if the Dictionary Page size limit is reached @@ -979,10 +986,6 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< } }; -static inline bool IsDictionaryEncoding(Encoding::type encoding) { - return encoding == Encoding::PLAIN_DICTIONARY; -} - template Status TypedColumnWriterImpl::WriteArrowDictionary(const int16_t* def_levels, const int16_t* rep_levels, diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 9dd440815cf..cd4518ebf3f 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -148,7 +148,8 @@ void PlainEncoder::Put(const arrow::Array& values) { } void AssertBinary(const arrow::Array& values) { - if (dynamic_cast(&values) == nullptr) { + if (values.type_id() != arrow::Type::BINARY && + values.type_id() != arrow::Type::STRING) { throw ParquetException("Only BinaryArray and subclasses supported"); } } diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index bca152caf63..618fd1a4c0c 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -99,7 +99,7 @@ class DictEncoder : virtual public TypedEncoder { virtual int num_entries() const = 0; /// \brief EXPERIMENTAL: Append dictionary indices into the encoder. It is - /// assumed, without any boundschecking that the indices reference + /// assumed (without any boundschecking) that the indices reference /// pre-existing dictionary values /// \param[in] indices the dictionary index values. 
Only Int32Array currently /// supported diff --git a/cpp/src/parquet/statistics.h b/cpp/src/parquet/statistics.h index bb91f31d225..d338d48026d 100644 --- a/cpp/src/parquet/statistics.h +++ b/cpp/src/parquet/statistics.h @@ -101,18 +101,17 @@ class TypedComparator : public Comparator { /// \brief Typed version of Comparator::Make template -inline std::shared_ptr::Comparator> MakeComparator( - Type::type physical_type, SortOrder::type sort_order, int type_length = -1) { - return std::dynamic_pointer_cast::Comparator>( +std::shared_ptr> MakeComparator(Type::type physical_type, + SortOrder::type sort_order, + int type_length = -1) { + return std::static_pointer_cast>( Comparator::Make(physical_type, sort_order, type_length)); } /// \brief Typed version of Comparator::Make template -inline std::shared_ptr::Comparator> MakeComparator( - const ColumnDescriptor* descr) { - return std::dynamic_pointer_cast::Comparator>( - Comparator::Make(descr)); +std::shared_ptr> MakeComparator(const ColumnDescriptor* descr) { + return std::static_pointer_cast>(Comparator::Make(descr)); } // ---------------------------------------------------------------------- @@ -152,33 +151,33 @@ class PARQUET_EXPORT EncodedStatistics { } } - inline bool is_set() const { + bool is_set() const { return has_min || has_max || has_null_count || has_distinct_count; } - inline bool is_signed() const { return is_signed_; } + bool is_signed() const { return is_signed_; } - inline void set_is_signed(bool is_signed) { is_signed_ = is_signed; } + void set_is_signed(bool is_signed) { is_signed_ = is_signed; } - inline EncodedStatistics& set_max(const std::string& value) { + EncodedStatistics& set_max(const std::string& value) { *max_ = value; has_max = true; return *this; } - inline EncodedStatistics& set_min(const std::string& value) { + EncodedStatistics& set_min(const std::string& value) { *min_ = value; has_min = true; return *this; } - inline EncodedStatistics& set_null_count(int64_t value) { + EncodedStatistics& set_null_count(int64_t value) { null_count = value; has_null_count = true; return *this; } - inline EncodedStatistics& set_distinct_count(int64_t value) { + EncodedStatistics& set_distinct_count(int64_t value) { distinct_count = value; has_distinct_count = true; return *this; @@ -286,11 +285,10 @@ class TypedStatistics : public Statistics { /// \brief Typed version of Statistics::Make template -inline std::shared_ptr::Statistics> MakeStatistics( +std::shared_ptr> MakeStatistics( const ColumnDescriptor* descr, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { - return std::dynamic_pointer_cast::Statistics>( - Statistics::Make(descr, pool)); + return std::static_pointer_cast>(Statistics::Make(descr, pool)); } /// \brief Create Statistics initialized to a particular state @@ -300,22 +298,23 @@ inline std::shared_ptr::Statistics> MakeStatistics( /// \param[in] null_count number of null values /// \param[in] distinct_count number of distinct values template -inline std::shared_ptr::Statistics> MakeStatistics( - const typename DType::c_type& min, const typename DType::c_type& max, - int64_t num_values, int64_t null_count, int64_t distinct_count) { - return std::dynamic_pointer_cast::Statistics>( - Statistics::Make(DType::type_num, &min, &max, num_values, null_count, - distinct_count)); +std::shared_ptr> MakeStatistics(const typename DType::c_type& min, + const typename DType::c_type& max, + int64_t num_values, + int64_t null_count, + int64_t distinct_count) { + return std::static_pointer_cast>(Statistics::Make( + 
DType::type_num, &min, &max, num_values, null_count, distinct_count)); } /// \brief Typed version of Statistics::Make template -inline std::shared_ptr::Statistics> MakeStatistics( +std::shared_ptr> MakeStatistics( const ColumnDescriptor* descr, const std::string& encoded_min, const std::string& encoded_max, int64_t num_values, int64_t null_count, int64_t distinct_count, bool has_min_max, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) { - return std::dynamic_pointer_cast::Statistics>( + return std::static_pointer_cast>( Statistics::Make(descr, encoded_min, encoded_max, num_values, null_count, distinct_count, has_min_max, pool)); } diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 04ae8afa30a..30395f37ec4 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -650,86 +650,6 @@ using DoubleType = PhysicalType; using ByteArrayType = PhysicalType; using FLBAType = PhysicalType; -// Type forward declarations for TypeClasses -class Comparator; -class Statistics; - -template -class TypedComparator; - -template -class TypedStatistics; - -using BoolComparator = TypedComparator; -using Int32Comparator = TypedComparator; -using Int64Comparator = TypedComparator; -using Int96Comparator = TypedComparator; -using FloatComparator = TypedComparator; -using DoubleComparator = TypedComparator; -using FLBAComparator = TypedComparator; -using ByteArrayComparator = TypedComparator; - -using BoolStatistics = TypedStatistics; -using Int32Statistics = TypedStatistics; -using Int64Statistics = TypedStatistics; -using FloatStatistics = TypedStatistics; -using DoubleStatistics = TypedStatistics; -using FLBAStatistics = TypedStatistics; -using ByteArrayStatistics = TypedStatistics; - -template -struct TypeClasses {}; - -template <> -struct TypeClasses { - using Comparator = BoolComparator; - using Statistics = BoolStatistics; -}; - -template <> -struct TypeClasses { - using Comparator = Int32Comparator; - using Statistics = Int32Statistics; -}; - -template <> -struct TypeClasses { - using Comparator = Int64Comparator; - using Statistics = Int64Statistics; -}; - -template <> -struct TypeClasses { - using Comparator = Int96Comparator; - - // unused - using Statistics = TypedStatistics; -}; - -template <> -struct TypeClasses { - using Comparator = FloatComparator; - using Statistics = FloatStatistics; -}; - -template <> -struct TypeClasses { - using Comparator = DoubleComparator; - using Statistics = DoubleStatistics; -}; - -template <> -struct TypeClasses { - using Comparator = ByteArrayComparator; - using Statistics = ByteArrayStatistics; -}; - -template <> -struct TypeClasses { - using Comparator = FLBAComparator; - using Statistics = FLBAStatistics; -}; - template inline std::string format_fwf(int width) { std::stringstream ss; From 6b1769cb1340bd2464bd327dcff14a4a0abaef60 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 16 Aug 2019 07:00:10 -0500 Subject: [PATCH 28/28] Restore statistics aliases --- cpp/src/parquet/statistics.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cpp/src/parquet/statistics.h b/cpp/src/parquet/statistics.h index d338d48026d..30d58aafd8d 100644 --- a/cpp/src/parquet/statistics.h +++ b/cpp/src/parquet/statistics.h @@ -283,6 +283,14 @@ class TypedStatistics : public Statistics { virtual void SetMinMax(const T& min, const T& max) = 0; }; +using BoolStatistics = TypedStatistics; +using Int32Statistics = TypedStatistics; +using Int64Statistics = TypedStatistics; +using FloatStatistics = TypedStatistics; +using DoubleStatistics = 
TypedStatistics<DoubleType>; +using ByteArrayStatistics = TypedStatistics<ByteArrayType>; +using FLBAStatistics = TypedStatistics<FLBAType>; + /// \brief Typed version of Statistics::Make template <typename DType> std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
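The statistics and comparator factory changes earlier in this series replace dynamic_pointer_cast with static_pointer_cast: since the untyped factories are guaranteed to return the matching typed subclass, the RTTI check adds cost without adding safety, and a dynamic cast would only hide a mismatch by silently yielding nullptr. A minimal standalone illustration of the difference, not Parquet code:

#include <cassert>
#include <memory>

struct Base { virtual ~Base() = default; };
struct Derived : Base { int payload = 42; };

// Factory guaranteed (by construction) to return a Derived.
std::shared_ptr<Base> MakeBase() { return std::make_shared<Derived>(); }

int main() {
  // No runtime type check: valid because the factory contract guarantees it.
  auto a = std::static_pointer_cast<Derived>(MakeBase());
  // Runtime-checked: would silently yield nullptr on a mismatch.
  auto b = std::dynamic_pointer_cast<Derived>(MakeBase());
  assert(a->payload == 42);
  assert(b != nullptr && b->payload == 42);
  return 0;
}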