From ea793ac7c8a2dd1b56c38f70229342b4f644e5d3 Mon Sep 17 00:00:00 2001
From: Antoine Pitrou
Date: Thu, 24 Jun 2021 18:50:54 +0200
Subject: [PATCH] ARROW-13104: [C++] Fix unsafe cast in ByteStreamSplit implementation

---
NOTE(review): this copy was re-flowed from a whitespace-collapsed extraction in
which every "<...>" span had been stripped. The template arguments and the first
three context lines of the first hunk are reconstructed; verify against the
upstream commit before applying.

 cpp/src/parquet/encoding.cc | 49 +++++++++++++++++++++----------------
 1 file changed, 28 insertions(+), 21 deletions(-)

diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index cc1e262a96d..6e8f7ee5491 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -861,20 +861,31 @@ class ByteStreamSplitEncoder : public EncoderImpl, virtual public TypedEncoder<D
                  int64_t valid_bits_offset) override;
 
  protected:
-  ::arrow::TypedBufferBuilder<T> values_;
+  template <typename ArrowType>
+  void PutImpl(const ::arrow::Array& values) {
+    if (values.type_id() != ArrowType::type_id) {
+      throw ParquetException(std::string() + "direct put to " + ArrowType::type_name() +
+                             " from " + values.type()->ToString() + " not supported");
+    }
+    const auto& data = *values.data();
+    PutSpaced(data.GetValues<typename ArrowType::c_type>(1),
+              static_cast<int>(data.length), data.GetValues<uint8_t>(0, 0), data.offset);
+  }
 
- private:
-  void PutArrowArray(const ::arrow::Array& values);
+  ::arrow::BufferBuilder sink_;
+  int64_t num_values_in_buffer_;
 };
 
 template <typename DType>
 ByteStreamSplitEncoder<DType>::ByteStreamSplitEncoder(const ColumnDescriptor* descr,
                                                       ::arrow::MemoryPool* pool)
-    : EncoderImpl(descr, Encoding::BYTE_STREAM_SPLIT, pool), values_{pool} {}
+    : EncoderImpl(descr, Encoding::BYTE_STREAM_SPLIT, pool),
+      sink_{pool},
+      num_values_in_buffer_{0} {}
 
 template <typename DType>
 int64_t ByteStreamSplitEncoder<DType>::EstimatedDataEncodedSize() {
-  return values_.length() * sizeof(T);
+  return sink_.length();
 }
 
 template <typename DType>
@@ -882,34 +893,30 @@ std::shared_ptr<Buffer> ByteStreamSplitEncoder<DType>::FlushValues() {
   std::shared_ptr<ResizableBuffer> output_buffer =
       AllocateBuffer(this->memory_pool(), EstimatedDataEncodedSize());
   uint8_t* output_buffer_raw = output_buffer->mutable_data();
-  const size_t num_values = values_.length();
-  const uint8_t* raw_values = reinterpret_cast<const uint8_t*>(values_.data());
-  ::arrow::util::internal::ByteStreamSplitEncode<T>(raw_values, num_values,
+  const uint8_t* raw_values = sink_.data();
+  ::arrow::util::internal::ByteStreamSplitEncode<T>(raw_values, num_values_in_buffer_,
                                                     output_buffer_raw);
-  values_.Reset();
+  sink_.Reset();
+  num_values_in_buffer_ = 0;
   return std::move(output_buffer);
 }
 
 template <typename DType>
 void ByteStreamSplitEncoder<DType>::Put(const T* buffer, int num_values) {
-  if (num_values > 0) PARQUET_THROW_NOT_OK(values_.Append(buffer, num_values));
-}
-
-template <typename DType>
-void ByteStreamSplitEncoder<DType>::Put(const ::arrow::Array& values) {
-  PutArrowArray(values);
+  if (num_values > 0) {
+    PARQUET_THROW_NOT_OK(sink_.Append(buffer, num_values * sizeof(T)));
+    num_values_in_buffer_ += num_values;
+  }
 }
 
 template <>
-void ByteStreamSplitEncoder<FloatType>::PutArrowArray(const ::arrow::Array& values) {
-  DirectPutImpl<::arrow::FloatArray>(values,
-                                     reinterpret_cast<::arrow::BufferBuilder*>(&values_));
+void ByteStreamSplitEncoder<FloatType>::Put(const ::arrow::Array& values) {
+  PutImpl<::arrow::FloatType>(values);
 }
 
 template <>
-void ByteStreamSplitEncoder<DoubleType>::PutArrowArray(const ::arrow::Array& values) {
-  DirectPutImpl<::arrow::DoubleArray>(
-      values, reinterpret_cast<::arrow::BufferBuilder*>(&values_));
+void ByteStreamSplitEncoder<DoubleType>::Put(const ::arrow::Array& values) {
+  PutImpl<::arrow::DoubleType>(values);
 }
 
 template <typename DType>