From fa4786505eb628f6c347b08a687f20b13c48fc9f Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 24 Sep 2017 11:12:23 -0400 Subject: [PATCH 1/3] Add BitmapReader class to replace the bitset macros Change-Id: I3ead4b04e7ae724a8cea90ef162950fcdeaf8086 --- cpp/build-support/run_clang_format.py | 7 +++-- cpp/src/arrow/compute/cast.cc | 34 ++++++++++----------- cpp/src/arrow/util/bit-util-test.cc | 20 ++++++++++++ cpp/src/arrow/util/bit-util.h | 44 +++++++++++++++++++++++++++ cpp/src/arrow/util/rle-encoding.h | 15 ++++----- 5 files changed, 92 insertions(+), 28 deletions(-) diff --git a/cpp/build-support/run_clang_format.py b/cpp/build-support/run_clang_format.py index ac4954ca570..f1a448f5361 100755 --- a/cpp/build-support/run_clang_format.py +++ b/cpp/build-support/run_clang_format.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -58,8 +58,9 @@ # fi try: - subprocess.check_output([CLANG_FORMAT, '-i'] + files_to_format, - stderr=subprocess.STDOUT) + cmd = [CLANG_FORMAT, '-i'] + files_to_format + subprocess.check_output(cmd, stderr=subprocess.STDOUT) except Exception as e: print(e) + print(' '.join(cmd)) raise diff --git a/cpp/src/arrow/compute/cast.cc b/cpp/src/arrow/compute/cast.cc index 5283bf0a4c2..ee838fa38f4 100644 --- a/cpp/src/arrow/compute/cast.cc +++ b/cpp/src/arrow/compute/cast.cc @@ -261,8 +261,9 @@ void UnpackFixedSizeBinaryDictionary(FunctionContext* ctx, const Array& indices, const FixedSizeBinaryArray& dictionary, ArrayData* output) { using index_c_type = typename IndexType::c_type; - const uint8_t* valid_bits = indices.null_bitmap_data(); - INIT_BITSET(valid_bits, indices.offset()); + + internal::BitmapReader valid_bits_reader(indices.null_bitmap_data(), indices.offset(), + indices.length()); const index_c_type* in = reinterpret_cast(indices.data()->buffers[1]->data()) + @@ -271,11 +272,11 @@ void UnpackFixedSizeBinaryDictionary(FunctionContext* ctx, const Array& indices, int32_t byte_width = static_cast(*output->type).byte_width(); for (int64_t i = 0; i < indices.length(); ++i) { - if (bitset_valid_bits & (1 << bit_offset_valid_bits)) { + if (valid_bits_reader.IsSet()) { const uint8_t* value = dictionary.Value(in[i]); memcpy(out + i * byte_width, value, byte_width); } - READ_NEXT_BITSET(valid_bits); + valid_bits_reader.Next(); } } @@ -293,8 +294,7 @@ struct CastFunctor< // Check if values and output type match DCHECK(values_type.Equals(*output->type)) - << "Dictionary type: " << values_type - << " target type: " << (*output->type); + << "Dictionary type: " << values_type << " target type: " << (*output->type); const Array& indices = *dict_array.indices(); switch (indices.type()->id()) { @@ -327,21 +327,21 @@ Status UnpackBinaryDictionary(FunctionContext* ctx, const Array& indices, RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), output->type, &builder)); BinaryBuilder* binary_builder = static_cast(builder.get()); - const uint8_t* valid_bits = indices.null_bitmap_data(); - INIT_BITSET(valid_bits, indices.offset()); + internal::BitmapReader valid_bits_reader(indices.null_bitmap_data(), indices.offset(), + indices.length()); const index_c_type* in = reinterpret_cast(indices.data()->buffers[1]->data()) + indices.offset(); for (int64_t i = 0; i < indices.length(); ++i) { - if (bitset_valid_bits & (1 << bit_offset_valid_bits)) { + if (valid_bits_reader.IsSet()) { int32_t length; const uint8_t* value = dictionary.GetValue(in[i], &length); RETURN_NOT_OK(binary_builder->Append(value, length)); } else { RETURN_NOT_OK(binary_builder->AppendNull()); } - READ_NEXT_BITSET(valid_bits); + valid_bits_reader.Next(); } std::shared_ptr plain_array; @@ -366,8 +366,7 @@ struct CastFunctortype)) - << "Dictionary type: " << values_type - << " target type: " << (*output->type); + << "Dictionary type: " << values_type << " target type: " << (*output->type); const Array& indices = *dict_array.indices(); switch (indices.type()->id()) { @@ -401,17 +400,17 @@ void UnpackPrimitiveDictionary(const Array& indices, const c_type* dictionary, c_type* out) { using index_c_type = typename IndexType::c_type; - const uint8_t* valid_bits = indices.null_bitmap_data(); - INIT_BITSET(valid_bits, indices.offset()); + internal::BitmapReader valid_bits_reader(indices.null_bitmap_data(), indices.offset(), + indices.length()); const index_c_type* in = reinterpret_cast(indices.data()->buffers[1]->data()) + indices.offset(); for (int64_t i = 0; i < indices.length(); ++i) { - if (bitset_valid_bits & (1 << bit_offset_valid_bits)) { + if (valid_bits_reader.IsSet()) { out[i] = dictionary[in[i]]; } - READ_NEXT_BITSET(valid_bits); + valid_bits_reader.Next(); } } @@ -429,8 +428,7 @@ struct CastFunctortype)) - << "Dictionary type: " << values_type - << " target type: " << (*output->type); + << "Dictionary type: " << values_type << " target type: " << (*output->type); auto dictionary = reinterpret_cast(type.dictionary()->data()->buffers[1]->data()) + diff --git a/cpp/src/arrow/util/bit-util-test.cc b/cpp/src/arrow/util/bit-util-test.cc index d838ab9d7a7..5bd7781c735 100644 --- a/cpp/src/arrow/util/bit-util-test.cc +++ b/cpp/src/arrow/util/bit-util-test.cc @@ -72,6 +72,26 @@ TEST(BitUtilTests, TestNextPower2) { ASSERT_EQ(1LL << 62, NextPower2((1LL << 62) - 1)); } +TEST(BitmapReader, TestNextPower2) { + uint8_t bitmap[16] = {0}; + + const int length = 128; + + internal::BitmapReader r1(bitmap, 0, length); + + for (int i = 0; i < length; ++i) { + ASSERT_TRUE(r1.IsNotSet()); + r1.Next(); + } + + internal::BitmapReader r2(bitmap, 5, length - 5); + + for (int i = 0; i < (length - 5); ++i) { + ASSERT_TRUE(r2.IsNotSet()); + r2.Next(); + } +} + static inline int64_t SlowCountBits(const uint8_t* data, int64_t bit_offset, int64_t length) { int64_t count = 0; diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index b8a8efabe5f..198003b1f75 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -48,6 +48,48 @@ #endif namespace arrow { +namespace internal { + +class BitmapReader { + public: + BitmapReader(const uint8_t* bitmap, int64_t start_offset, int64_t length) + : bitmap_(bitmap), position_(0), length_(length) { + byte_offset_ = start_offset / 8; + bit_offset_ = start_offset % 8; + current_byte_ = bitmap[byte_offset_]; + } + + bool IsSet() const { return current_byte_ & (1 << bit_offset_); } + + bool IsNotSet() const { return (current_byte_ & (1 << bit_offset_)) == 0; } + + void Next() { + ++bit_offset_; + ++position_; + if (bit_offset_ == 8) { + bit_offset_ = 0; + ++byte_offset_; + if (ARROW_PREDICT_TRUE(position_ < length_)) { + current_byte_ = bitmap_[byte_offset_]; + } + } + } + + private: + const uint8_t* bitmap_; + int64_t position_; + int64_t length_; + + uint8_t current_byte_; + int64_t byte_offset_; + int64_t bit_offset_; +}; + +} // namespace internal + +#ifndef ARROW_NO_DEPRECATED_API + +// \deprecated Since > 0.7.0 #define INIT_BITSET(valid_bits_vector, valid_bits_index) \ int64_t byte_offset_##valid_bits_vector = (valid_bits_index) / 8; \ @@ -62,6 +104,8 @@ namespace arrow { bitset_##valid_bits_vector = valid_bits_vector[byte_offset_##valid_bits_vector]; \ } +#endif + // TODO(wesm): The source from Impala was depending on boost::make_unsigned // // We add a partial stub implementation here diff --git a/cpp/src/arrow/util/rle-encoding.h b/cpp/src/arrow/util/rle-encoding.h index f4c8a772efc..645c3eb120a 100644 --- a/cpp/src/arrow/util/rle-encoding.h +++ b/cpp/src/arrow/util/rle-encoding.h @@ -352,11 +352,12 @@ inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, T* values, DCHECK_GE(bit_width_, 0); int values_read = 0; int remaining_nulls = null_count; - INIT_BITSET(valid_bits, static_cast(valid_bits_offset)); + + internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, batch_size); while (values_read < batch_size) { - bool is_valid = (bitset_valid_bits & (1 << bit_offset_valid_bits)) != 0; - READ_NEXT_BITSET(valid_bits); + bool is_valid = bit_reader.IsNotSet(); + bit_reader.Next(); if (is_valid) { if ((repeat_count_ == 0) && (literal_count_ == 0)) { @@ -369,14 +370,14 @@ inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, T* values, repeat_count_--; while (repeat_count_ > 0 && (values_read + repeat_batch) < batch_size) { - if (bitset_valid_bits & (1 << bit_offset_valid_bits)) { + if (bit_reader.IsSet()) { repeat_count_--; } else { remaining_nulls--; } repeat_batch++; - READ_NEXT_BITSET(valid_bits); + bit_reader.Next(); } std::fill(values + values_read, values + values_read + repeat_batch, value); values_read += repeat_batch; @@ -397,7 +398,7 @@ inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, T* values, // Read the first bitset to the end while (literals_read < literal_batch) { - if (bitset_valid_bits & (1 << bit_offset_valid_bits)) { + if (bit_reader.IsSet()) { values[values_read + literals_read + skipped] = dictionary[indices[literals_read]]; literals_read++; @@ -405,7 +406,7 @@ inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, T* values, skipped++; } - READ_NEXT_BITSET(valid_bits); + bit_reader.Next(); } literal_count_ -= literal_batch; values_read += literal_batch + skipped; From ba58b8a52660a4104d0a989ad4049fc80d4aaad2 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 24 Sep 2017 11:27:35 -0400 Subject: [PATCH 2/3] Fix test name Change-Id: Ibdc4de87b25fbe8306e6461d2306ec6ec5e4f6f5 --- cpp/src/arrow/util/bit-util-test.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/bit-util-test.cc b/cpp/src/arrow/util/bit-util-test.cc index 5bd7781c735..a5c6cecc597 100644 --- a/cpp/src/arrow/util/bit-util-test.cc +++ b/cpp/src/arrow/util/bit-util-test.cc @@ -72,13 +72,14 @@ TEST(BitUtilTests, TestNextPower2) { ASSERT_EQ(1LL << 62, NextPower2((1LL << 62) - 1)); } -TEST(BitmapReader, TestNextPower2) { +TEST(BitmapReader, DoesNotReadOutOfBounds) { uint8_t bitmap[16] = {0}; const int length = 128; internal::BitmapReader r1(bitmap, 0, length); + // If this were to read out of bounds, valgrind would tell us for (int i = 0; i < length; ++i) { ASSERT_TRUE(r1.IsNotSet()); r1.Next(); From 6cec81cca052a5faffc73077a493484d4375637b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 24 Sep 2017 17:56:01 -0400 Subject: [PATCH 3/3] Fix RleDecoder logic with BitmapReader Change-Id: I333e73af9a840f11cbe173747bf2b7dd30691784 --- cpp/src/arrow/util/bit-util.h | 5 +++++ cpp/src/arrow/util/rle-encoding.h | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index 198003b1f75..fa0d7a4d566 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -59,7 +59,12 @@ class BitmapReader { current_byte_ = bitmap[byte_offset_]; } +#if defined(_MSC_VER) + // MSVC is finicky about this cast + bool IsSet() const { return (current_byte_ & (1 << bit_offset_)) != 0; } +#else bool IsSet() const { return current_byte_ & (1 << bit_offset_); } +#endif bool IsNotSet() const { return (current_byte_ & (1 << bit_offset_)) == 0; } diff --git a/cpp/src/arrow/util/rle-encoding.h b/cpp/src/arrow/util/rle-encoding.h index 645c3eb120a..f343b74cc78 100644 --- a/cpp/src/arrow/util/rle-encoding.h +++ b/cpp/src/arrow/util/rle-encoding.h @@ -356,7 +356,7 @@ inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary, T* values, internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, batch_size); while (values_read < batch_size) { - bool is_valid = bit_reader.IsNotSet(); + bool is_valid = bit_reader.IsSet(); bit_reader.Next(); if (is_valid) {