diff --git a/cpp/src/arrow/util/rle_encoding.h b/cpp/src/arrow/util/rle_encoding.h index bc4ba1c727b..d5befbc8167 100644 --- a/cpp/src/arrow/util/rle_encoding.h +++ b/cpp/src/arrow/util/rle_encoding.h @@ -126,6 +126,9 @@ class RleDecoder { int GetBatchWithDict(const T* dictionary, T* values, int batch_size); /// Like GetBatchWithDict but add spacing for null entries + /// + /// Null entries will be zero-initialized in `values` to avoid leaking + /// private data. template int GetBatchWithDictSpaced(const T* dictionary, T* values, int batch_size, int null_count, const uint8_t* valid_bits, @@ -433,6 +436,8 @@ inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, T* out, int b DCHECK_GE(bit_width_, 0); int values_read = 0; int remaining_nulls = null_count; + T zero; + memset(&zero, 0, sizeof(T)); arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, batch_size); @@ -484,6 +489,7 @@ inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, T* out, int b *out = dictionary[indices[literals_read]]; literals_read++; } else { + *out = zero; skipped++; } ++out; @@ -494,6 +500,7 @@ inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, T* out, int b remaining_nulls -= skipped; } } else { + *out = zero; ++out; values_read++; remaining_nulls--; diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index e9c75025c0e..5afe0d832d6 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -738,7 +738,13 @@ Status TransferInt96(RecordReader* reader, MemoryPool* pool, RETURN_NOT_OK(::arrow::AllocateBuffer(pool, length * sizeof(int64_t), &data)); auto data_ptr = reinterpret_cast(data->mutable_data()); for (int64_t i = 0; i < length; i++) { - *data_ptr++ = Int96GetNanoSeconds(values[i]); + if (values[i].value[2] == 0) { + // Happens for null entries: avoid triggering UBSAN as that Int96 timestamp + // isn't representable as a 64-bit Unix timestamp. + *data_ptr++ = 0; + } else { + *data_ptr++ = Int96GetNanoSeconds(values[i]); + } } *out = std::make_shared(type, length, data, reader->ReleaseIsValid(), reader->null_count());