Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions cpp/build-support/run_clang_format.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/python
#!/usr/bin/env python
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
Expand Down Expand Up @@ -58,8 +58,9 @@
# fi

try:
subprocess.check_output([CLANG_FORMAT, '-i'] + files_to_format,
stderr=subprocess.STDOUT)
cmd = [CLANG_FORMAT, '-i'] + files_to_format
subprocess.check_output(cmd, stderr=subprocess.STDOUT)
except Exception as e:
print(e)
print(' '.join(cmd))
raise
34 changes: 16 additions & 18 deletions cpp/src/arrow/compute/cast.cc
Original file line number Diff line number Diff line change
Expand Up @@ -261,8 +261,9 @@ void UnpackFixedSizeBinaryDictionary(FunctionContext* ctx, const Array& indices,
const FixedSizeBinaryArray& dictionary,
ArrayData* output) {
using index_c_type = typename IndexType::c_type;
const uint8_t* valid_bits = indices.null_bitmap_data();
INIT_BITSET(valid_bits, indices.offset());

internal::BitmapReader valid_bits_reader(indices.null_bitmap_data(), indices.offset(),
indices.length());

const index_c_type* in =
reinterpret_cast<const index_c_type*>(indices.data()->buffers[1]->data()) +
Expand All @@ -271,11 +272,11 @@ void UnpackFixedSizeBinaryDictionary(FunctionContext* ctx, const Array& indices,
int32_t byte_width =
static_cast<const FixedSizeBinaryType&>(*output->type).byte_width();
for (int64_t i = 0; i < indices.length(); ++i) {
if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
if (valid_bits_reader.IsSet()) {
const uint8_t* value = dictionary.Value(in[i]);
memcpy(out + i * byte_width, value, byte_width);
}
READ_NEXT_BITSET(valid_bits);
valid_bits_reader.Next();
}
}

Expand All @@ -293,8 +294,7 @@ struct CastFunctor<

// Check if values and output type match
DCHECK(values_type.Equals(*output->type))
<< "Dictionary type: " << values_type
<< " target type: " << (*output->type);
<< "Dictionary type: " << values_type << " target type: " << (*output->type);

const Array& indices = *dict_array.indices();
switch (indices.type()->id()) {
Expand Down Expand Up @@ -327,21 +327,21 @@ Status UnpackBinaryDictionary(FunctionContext* ctx, const Array& indices,
RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), output->type, &builder));
BinaryBuilder* binary_builder = static_cast<BinaryBuilder*>(builder.get());

const uint8_t* valid_bits = indices.null_bitmap_data();
INIT_BITSET(valid_bits, indices.offset());
internal::BitmapReader valid_bits_reader(indices.null_bitmap_data(), indices.offset(),
indices.length());

const index_c_type* in =
reinterpret_cast<const index_c_type*>(indices.data()->buffers[1]->data()) +
indices.offset();
for (int64_t i = 0; i < indices.length(); ++i) {
if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
if (valid_bits_reader.IsSet()) {
int32_t length;
const uint8_t* value = dictionary.GetValue(in[i], &length);
RETURN_NOT_OK(binary_builder->Append(value, length));
} else {
RETURN_NOT_OK(binary_builder->AppendNull());
}
READ_NEXT_BITSET(valid_bits);
valid_bits_reader.Next();
}

std::shared_ptr<Array> plain_array;
Expand All @@ -366,8 +366,7 @@ struct CastFunctor<T, DictionaryType,

// Check if values and output type match
DCHECK(values_type.Equals(*output->type))
<< "Dictionary type: " << values_type
<< " target type: " << (*output->type);
<< "Dictionary type: " << values_type << " target type: " << (*output->type);

const Array& indices = *dict_array.indices();
switch (indices.type()->id()) {
Expand Down Expand Up @@ -401,17 +400,17 @@ void UnpackPrimitiveDictionary(const Array& indices, const c_type* dictionary,
c_type* out) {
using index_c_type = typename IndexType::c_type;

const uint8_t* valid_bits = indices.null_bitmap_data();
INIT_BITSET(valid_bits, indices.offset());
internal::BitmapReader valid_bits_reader(indices.null_bitmap_data(), indices.offset(),
indices.length());

const index_c_type* in =
reinterpret_cast<const index_c_type*>(indices.data()->buffers[1]->data()) +
indices.offset();
for (int64_t i = 0; i < indices.length(); ++i) {
if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
if (valid_bits_reader.IsSet()) {
out[i] = dictionary[in[i]];
}
READ_NEXT_BITSET(valid_bits);
valid_bits_reader.Next();
}
}

Expand All @@ -429,8 +428,7 @@ struct CastFunctor<T, DictionaryType,

// Check if values and output type match
DCHECK(values_type.Equals(*output->type))
<< "Dictionary type: " << values_type
<< " target type: " << (*output->type);
<< "Dictionary type: " << values_type << " target type: " << (*output->type);

auto dictionary =
reinterpret_cast<const c_type*>(type.dictionary()->data()->buffers[1]->data()) +
Expand Down
21 changes: 21 additions & 0 deletions cpp/src/arrow/util/bit-util-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,27 @@ TEST(BitUtilTests, TestNextPower2) {
ASSERT_EQ(1LL << 62, NextPower2((1LL << 62) - 1));
}

TEST(BitmapReader, DoesNotReadOutOfBounds) {
uint8_t bitmap[16] = {0};

const int length = 128;

internal::BitmapReader r1(bitmap, 0, length);

// If this were to read out of bounds, valgrind would tell us
for (int i = 0; i < length; ++i) {
ASSERT_TRUE(r1.IsNotSet());
r1.Next();
}

internal::BitmapReader r2(bitmap, 5, length - 5);

for (int i = 0; i < (length - 5); ++i) {
ASSERT_TRUE(r2.IsNotSet());
r2.Next();
}
}

static inline int64_t SlowCountBits(const uint8_t* data, int64_t bit_offset,
int64_t length) {
int64_t count = 0;
Expand Down
49 changes: 49 additions & 0 deletions cpp/src/arrow/util/bit-util.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,53 @@
#endif

namespace arrow {
namespace internal {

class BitmapReader {
public:
BitmapReader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
: bitmap_(bitmap), position_(0), length_(length) {
byte_offset_ = start_offset / 8;
bit_offset_ = start_offset % 8;
current_byte_ = bitmap[byte_offset_];
}

#if defined(_MSC_VER)
// MSVC is finicky about this cast
bool IsSet() const { return (current_byte_ & (1 << bit_offset_)) != 0; }
#else
bool IsSet() const { return current_byte_ & (1 << bit_offset_); }
#endif

bool IsNotSet() const { return (current_byte_ & (1 << bit_offset_)) == 0; }

void Next() {
++bit_offset_;
++position_;
if (bit_offset_ == 8) {
bit_offset_ = 0;
++byte_offset_;
if (ARROW_PREDICT_TRUE(position_ < length_)) {
current_byte_ = bitmap_[byte_offset_];
}
}
}

private:
const uint8_t* bitmap_;
int64_t position_;
int64_t length_;

uint8_t current_byte_;
int64_t byte_offset_;
int64_t bit_offset_;
};

} // namespace internal

#ifndef ARROW_NO_DEPRECATED_API

// \deprecated Since > 0.7.0

#define INIT_BITSET(valid_bits_vector, valid_bits_index) \
int64_t byte_offset_##valid_bits_vector = (valid_bits_index) / 8; \
Expand All @@ -62,6 +109,8 @@ namespace arrow {
bitset_##valid_bits_vector = valid_bits_vector[byte_offset_##valid_bits_vector]; \
}

#endif

// TODO(wesm): The source from Impala was depending on boost::make_unsigned
//
// We add a partial stub implementation here
Expand Down
15 changes: 8 additions & 7 deletions cpp/src/arrow/util/rle-encoding.h
Original file line number Diff line number Diff line change
Expand Up @@ -352,11 +352,12 @@ inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, T* values,
DCHECK_GE(bit_width_, 0);
int values_read = 0;
int remaining_nulls = null_count;
INIT_BITSET(valid_bits, static_cast<int>(valid_bits_offset));

internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, batch_size);

while (values_read < batch_size) {
bool is_valid = (bitset_valid_bits & (1 << bit_offset_valid_bits)) != 0;
READ_NEXT_BITSET(valid_bits);
bool is_valid = bit_reader.IsSet();
bit_reader.Next();

if (is_valid) {
if ((repeat_count_ == 0) && (literal_count_ == 0)) {
Expand All @@ -369,14 +370,14 @@ inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, T* values,
repeat_count_--;

while (repeat_count_ > 0 && (values_read + repeat_batch) < batch_size) {
if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
if (bit_reader.IsSet()) {
repeat_count_--;
} else {
remaining_nulls--;
}
repeat_batch++;

READ_NEXT_BITSET(valid_bits);
bit_reader.Next();
}
std::fill(values + values_read, values + values_read + repeat_batch, value);
values_read += repeat_batch;
Expand All @@ -397,15 +398,15 @@ inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, T* values,

// Read the first bitset to the end
while (literals_read < literal_batch) {
if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
if (bit_reader.IsSet()) {
values[values_read + literals_read + skipped] =
dictionary[indices[literals_read]];
literals_read++;
} else {
skipped++;
}

READ_NEXT_BITSET(valid_bits);
bit_reader.Next();
}
literal_count_ -= literal_batch;
values_read += literal_batch + skipped;
Expand Down