Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3,409 changes: 1,969 additions & 1,440 deletions cpp/src/arrow/util/bpacking.h

Large diffs are not rendered by default.

9 changes: 4 additions & 5 deletions cpp/src/arrow/util/hashing.h
Original file line number Diff line number Diff line change
Expand Up @@ -149,9 +149,8 @@ hash_t ComputeStringHash(const void* data, int64_t length) {
// the results
uint32_t x, y;
hash_t hx, hy;
// XXX those are unaligned accesses. Should we have a facility for that?
x = *reinterpret_cast<const uint32_t*>(p + n - 4);
y = *reinterpret_cast<const uint32_t*>(p);
x = util::SafeLoadAs<uint32_t>(p + n - 4);
y = util::SafeLoadAs<uint32_t>(p);
hx = ScalarHelper<uint32_t, AlgNum>::ComputeHash(x);
hy = ScalarHelper<uint32_t, AlgNum ^ 1>::ComputeHash(y);
return n ^ hx ^ hy;
Expand All @@ -160,8 +159,8 @@ hash_t ComputeStringHash(const void* data, int64_t length) {
// Apply the same principle as above
uint64_t x, y;
hash_t hx, hy;
x = *reinterpret_cast<const uint64_t*>(p + n - 8);
y = *reinterpret_cast<const uint64_t*>(p);
x = util::SafeLoadAs<uint64_t>(p + n - 8);
y = util::SafeLoadAs<uint64_t>(p);
hx = ScalarHelper<uint64_t, AlgNum>::ComputeHash(x);
hy = ScalarHelper<uint64_t, AlgNum ^ 1>::ComputeHash(y);
return n ^ hx ^ hy;
Expand Down
16 changes: 16 additions & 0 deletions cpp/src/arrow/util/ubsan.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,5 +49,21 @@ inline T* MakeNonNull(T* maybe_null) {
return reinterpret_cast<T*>(&internal::non_null_filler);
}

template <typename T>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm no C++ guru, but can't you make a single method for this by templating the input type too, e.g.

template <typename T, typename I = uint8_t>
inline typename std::enable_if<std::is_integral<T>::value, T>::type SafeLoad(
    const I* unaligned) {
  typename std::remove_const<T>::type ret;
  std::memcpy(&ret, unaligned, sizeof(T));
  return ret;
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is correct, I think I like how the methods are now though since it is more explicit when casting and a pass-through when not. One method could certainly delegate to the other.

inline typename std::enable_if<std::is_integral<T>::value, T>::type SafeLoadAs(
const uint8_t* unaligned) {
typename std::remove_const<T>::type ret;
std::memcpy(&ret, unaligned, sizeof(T));
return ret;
}

template <typename T>
inline typename std::enable_if<std::is_integral<T>::value, T>::type SafeLoad(
const T* unaligned) {
typename std::remove_const<T>::type ret;
std::memcpy(&ret, unaligned, sizeof(T));
return ret;
}

} // namespace util
} // namespace arrow
20 changes: 10 additions & 10 deletions cpp/src/parquet/arrow/reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ namespace arrow {

using ::arrow::BitUtil::FromBigEndian;
using ::arrow::internal::SafeLeftShift;
using ::arrow::util::SafeLoadAs;

template <typename ArrowType>
using ArrayType = typename ::arrow::TypeTraits<ArrowType>::ArrayType;
Expand Down Expand Up @@ -1212,38 +1213,37 @@ static uint64_t BytesToInteger(const uint8_t* bytes, int32_t start, int32_t stop
case 1:
return bytes[start];
case 2:
return FromBigEndian(*reinterpret_cast<const uint16_t*>(bytes + start));
return FromBigEndian(SafeLoadAs<uint16_t>(bytes + start));
case 3: {
const uint64_t first_two_bytes =
FromBigEndian(*reinterpret_cast<const uint16_t*>(bytes + start));
const uint64_t first_two_bytes = FromBigEndian(SafeLoadAs<uint16_t>(bytes + start));
const uint64_t last_byte = bytes[stop - 1];
return first_two_bytes << 8 | last_byte;
}
case 4:
return FromBigEndian(*reinterpret_cast<const uint32_t*>(bytes + start));
return FromBigEndian(SafeLoadAs<uint32_t>(bytes + start));
case 5: {
const uint64_t first_four_bytes =
FromBigEndian(*reinterpret_cast<const uint32_t*>(bytes + start));
FromBigEndian(SafeLoadAs<uint32_t>(bytes + start));
const uint64_t last_byte = bytes[stop - 1];
return first_four_bytes << 8 | last_byte;
}
case 6: {
const uint64_t first_four_bytes =
FromBigEndian(*reinterpret_cast<const uint32_t*>(bytes + start));
FromBigEndian(SafeLoadAs<uint32_t>(bytes + start));
const uint64_t last_two_bytes =
FromBigEndian(*reinterpret_cast<const uint16_t*>(bytes + start + 4));
FromBigEndian(SafeLoadAs<uint16_t>(bytes + start + 4));
return first_four_bytes << 16 | last_two_bytes;
}
case 7: {
const uint64_t first_four_bytes =
FromBigEndian(*reinterpret_cast<const uint32_t*>(bytes + start));
FromBigEndian(SafeLoadAs<uint32_t>(bytes + start));
const uint64_t second_two_bytes =
FromBigEndian(*reinterpret_cast<const uint16_t*>(bytes + start + 4));
FromBigEndian(SafeLoadAs<uint16_t>(bytes + start + 4));
const uint64_t last_byte = bytes[stop - 1];
return first_four_bytes << 24 | second_two_bytes << 8 | last_byte;
}
case 8:
return FromBigEndian(*reinterpret_cast<const uint64_t*>(bytes + start));
return FromBigEndian(SafeLoadAs<uint64_t>(bytes + start));
default: {
DCHECK(false);
return UINT64_MAX;
Expand Down
5 changes: 3 additions & 2 deletions cpp/src/parquet/arrow/writer.h
Original file line number Diff line number Diff line change
Expand Up @@ -211,8 +211,9 @@ inline void ArrowTimestampToImpalaTimestamp(const int64_t time, Int96* impala_ti
(*impala_timestamp).value[2] = (uint32_t)julian_days;

int64_t last_day_units = time % UnitPerDay;
int64_t* impala_last_day_nanos = reinterpret_cast<int64_t*>(impala_timestamp);
*impala_last_day_nanos = last_day_units * NanosecondsPerUnit;
auto last_day_nanos = last_day_units * NanosecondsPerUnit;
// Strage might be unaligned, so use mempcy instead of reinterpret_cast
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All even indexed Int96 in a vector will be unaligned (according to int64_t alignment).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point I'll open up a follow-up PR, we must not hav good test data here or UBSan isn't foolproof, or somehow I didn't run this test properly

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wait were you just commenting on my comment?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I was commenting on your comment on the "strange" part :)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you notice the typo? ("Strage")

std::memcpy(impala_timestamp, &last_day_nanos, sizeof(int64_t));
}

constexpr int64_t kSecondsInNanos = INT64_C(1000000000);
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/parquet/column_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include "arrow/util/compression.h"
#include "arrow/util/logging.h"
#include "arrow/util/rle-encoding.h"
#include "arrow/util/ubsan.h"

#include "parquet/column_page.h"
#include "parquet/encoding.h"
Expand All @@ -50,7 +51,7 @@ int LevelDecoder::SetData(Encoding::type encoding, int16_t max_level,
bit_width_ = BitUtil::Log2(max_level + 1);
switch (encoding) {
case Encoding::RLE: {
num_bytes = *reinterpret_cast<const int32_t*>(data);
num_bytes = arrow::util::SafeLoadAs<int32_t>(data);
const uint8_t* decoder_data = data + sizeof(int32_t);
if (!rle_decoder_) {
rle_decoder_.reset(
Expand Down
11 changes: 6 additions & 5 deletions cpp/src/parquet/encoding.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include "arrow/util/logging.h"
#include "arrow/util/rle-encoding.h"
#include "arrow/util/string_view.h"
#include "arrow/util/ubsan.h"

#include "parquet/exception.h"
#include "parquet/platform.h"
Expand Down Expand Up @@ -609,7 +610,7 @@ inline int DecodePlain<ByteArray>(const uint8_t* data, int64_t data_size, int nu
int bytes_decoded = 0;
int increment;
for (int i = 0; i < num_values; ++i) {
uint32_t len = out[i].len = *reinterpret_cast<const uint32_t*>(data);
uint32_t len = out[i].len = arrow::util::SafeLoadAs<uint32_t>(data);
increment = static_cast<int>(sizeof(uint32_t) + len);
if (data_size < increment) ParquetException::EofException();
out[i].ptr = data + sizeof(uint32_t);
Expand Down Expand Up @@ -719,7 +720,7 @@ class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType>,
int bytes_decoded = 0;
while (i < num_values) {
if (bit_reader.IsSet()) {
uint32_t len = *reinterpret_cast<const uint32_t*>(data);
uint32_t len = arrow::util::SafeLoadAs<uint32_t>(data);
increment = static_cast<int>(sizeof(uint32_t) + len);
if (data_size < increment) {
ParquetException::EofException();
Expand Down Expand Up @@ -752,7 +753,7 @@ class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType>,
int bytes_decoded = 0;

while (i < num_values) {
uint32_t len = *reinterpret_cast<const uint32_t*>(data);
uint32_t len = arrow::util::SafeLoadAs<uint32_t>(data);
int increment = static_cast<int>(sizeof(uint32_t) + len);
if (data_size < increment) ParquetException::EofException();
builder->Append(data + sizeof(uint32_t), len);
Expand Down Expand Up @@ -1103,7 +1104,7 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl,
virtual void SetData(int num_values, const uint8_t* data, int len) {
num_values_ = num_values;
if (len == 0) return;
int total_lengths_len = *reinterpret_cast<const int*>(data);
int total_lengths_len = arrow::util::SafeLoadAs<int32_t>(data);
data += 4;
this->len_decoder_.SetData(num_values, data, total_lengths_len);
data_ = data + total_lengths_len;
Expand Down Expand Up @@ -1145,7 +1146,7 @@ class DeltaByteArrayDecoder : public DecoderImpl,
virtual void SetData(int num_values, const uint8_t* data, int len) {
num_values_ = num_values;
if (len == 0) return;
int prefix_len_length = *reinterpret_cast<const int*>(data);
int prefix_len_length = arrow::util::SafeLoadAs<int32_t>(data);
data += 4;
len -= 4;
prefix_len_decoder_.SetData(num_values, data, prefix_len_length);
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/parquet/file_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include "arrow/io/file.h"
#include "arrow/status.h"
#include "arrow/util/logging.h"
#include "arrow/util/ubsan.h"

#include "parquet/column_reader.h"
#include "parquet/column_scanner.h"
Expand Down Expand Up @@ -179,7 +180,7 @@ class SerializedFile : public ParquetFileReader::Contents {
throw ParquetException("Invalid parquet file. Corrupt footer.");
}

uint32_t metadata_len = *reinterpret_cast<const uint32_t*>(
uint32_t metadata_len = arrow::util::SafeLoadAs<uint32_t>(
reinterpret_cast<const uint8_t*>(footer_buffer->data()) + footer_read_size -
kFooterSize);
int64_t metadata_start = file_size - kFooterSize - metadata_len;
Expand Down
4 changes: 3 additions & 1 deletion cpp/src/plasma/common.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@

#include <limits>

#include "arrow/util/ubsan.h"

#include "plasma/plasma_generated.h"

namespace fb = plasma::flatbuf;
Expand Down Expand Up @@ -64,7 +66,7 @@ uint64_t MurmurHash64A(const void* key, int len, unsigned int seed) {
const uint64_t* end = data + (len / 8);

while (data != end) {
uint64_t k = *data++;
uint64_t k = arrow::util::SafeLoad(data++);

k *= m;
k ^= k >> r;
Expand Down