diff --git a/cpp/build-support/iwyu/iwyu.sh b/cpp/build-support/iwyu/iwyu.sh index 27c62dcd9b1..813f64d6bc4 100755 --- a/cpp/build-support/iwyu/iwyu.sh +++ b/cpp/build-support/iwyu/iwyu.sh @@ -25,7 +25,7 @@ IWYU_LOG=$(mktemp -t arrow-cpp-iwyu.XXXXXX) trap "rm -f $IWYU_LOG" EXIT IWYU_MAPPINGS_PATH="$ROOT/cpp/build-support/iwyu/mappings" -IWYU_ARGS="--no_fwd_decls \ +IWYU_ARGS="\ --mapping_file=$IWYU_MAPPINGS_PATH/boost-all.imp \ --mapping_file=$IWYU_MAPPINGS_PATH/boost-all-private.imp \ --mapping_file=$IWYU_MAPPINGS_PATH/boost-extra.imp \ diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index f622cd86a33..27cdd02440c 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -173,7 +173,11 @@ set(ARROW_SRCS io/slow.cc testing/util.cc util/basic_decimal.cc + util/bit_block_counter.cc util/bit_util.cc + util/bitmap.cc + util/bitmap_builders.cc + util/bitmap_ops.cc util/compression.cc util/cpu_info.cc util/decimal.cc diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc index 15212308d45..d044bafb23f 100644 --- a/cpp/src/arrow/array/array_binary_test.cc +++ b/cpp/src/arrow/array/array_binary_test.cc @@ -34,6 +34,7 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit_util.h" +#include "arrow/util/bitmap_builders.h" #include "arrow/util/checked_cast.h" #include "arrow/util/string_view.h" #include "arrow/visitor_inline.h" @@ -93,7 +94,7 @@ class TestStringArray : public ::testing::Test { length_ = static_cast(offsets_.size()) - 1; value_buf_ = Buffer::Wrap(chars_); offsets_buf_ = Buffer::Wrap(offsets_); - ASSERT_OK_AND_ASSIGN(null_bitmap_, BitUtil::BytesToBits(valid_bytes_)); + ASSERT_OK_AND_ASSIGN(null_bitmap_, internal::BytesToBits(valid_bytes_)); null_count_ = CountNulls(valid_bytes_); strings_ = std::make_shared(length_, offsets_buf_, value_buf_, diff --git a/cpp/src/arrow/array/array_dict.cc b/cpp/src/arrow/array/array_dict.cc index 1c5abf6e0d8..48e98bd11f2 100644 --- a/cpp/src/arrow/array/array_dict.cc +++ b/cpp/src/arrow/array/array_dict.cc @@ -33,6 +33,7 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit_util.h" +#include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" #include "arrow/util/int_util.h" #include "arrow/util/logging.h" diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index e51b73f0533..3057e0d03cd 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -30,6 +30,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/type.h" #include "arrow/util/bit_util.h" +#include "arrow/util/bitmap_builders.h" #include "arrow/util/checked_cast.h" namespace arrow { @@ -616,7 +617,7 @@ TEST_F(TestMapArray, BuildingStringToInt) { std::vector offsets = {0, 2, 2, 3, 3}; auto expected_keys = ArrayFromJSON(utf8(), R"(["joe", "mark", "cap"])"); auto expected_values = ArrayFromJSON(int32(), "[0, null, 8]"); - ASSERT_OK_AND_ASSIGN(auto expected_null_bitmap, BitUtil::BytesToBits({1, 0, 1, 1})); + ASSERT_OK_AND_ASSIGN(auto expected_null_bitmap, internal::BytesToBits({1, 0, 1, 1})); MapArray expected(type, 4, Buffer::Wrap(offsets), expected_keys, expected_values, expected_null_bitmap, 1); diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index fa07694420a..7e4a71ab344 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -34,6 +34,7 @@ #include "arrow/type_traits.h" #include "arrow/util/atomic_shared_ptr.h" #include "arrow/util/bit_util.h" +#include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 13cf91e9015..12e37bd8dbf 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -57,6 +57,7 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit_util.h" +#include "arrow/util/bitmap_builders.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" #include "arrow/util/macros.h" @@ -116,7 +117,7 @@ Status MakeArrayFromValidBytes(const std::vector& v, MemoryPool* pool, std::shared_ptr* out) { int64_t null_count = v.size() - std::accumulate(v.begin(), v.end(), 0); - ARROW_ASSIGN_OR_RAISE(auto null_buf, BitUtil::BytesToBits(v)); + ARROW_ASSIGN_OR_RAISE(auto null_buf, internal::BytesToBits(v)); TypedBufferBuilder value_builder(pool); for (size_t i = 0; i < v.size(); ++i) { @@ -219,7 +220,7 @@ TEST_F(TestArray, TestIsNullIsValid) { } } - ASSERT_OK_AND_ASSIGN(auto null_buf, BitUtil::BytesToBits(null_bitmap)); + ASSERT_OK_AND_ASSIGN(auto null_buf, internal::BytesToBits(null_bitmap)); std::unique_ptr arr; arr.reset(new Int32Array(null_bitmap.size(), nullptr, null_buf, null_count)); @@ -468,7 +469,7 @@ class TestPrimitiveBuilder : public TestBuilder { int64_t ex_null_count = 0; if (nullable) { - ASSERT_OK_AND_ASSIGN(ex_null_bitmap, BitUtil::BytesToBits(valid_bytes_)); + ASSERT_OK_AND_ASSIGN(ex_null_bitmap, internal::BytesToBits(valid_bytes_)); ex_null_count = CountNulls(valid_bytes_); } else { ex_null_bitmap = nullptr; @@ -590,9 +591,9 @@ void TestPrimitiveBuilder::Check(const std::unique_ptr std::shared_ptr ex_null_bitmap; int64_t ex_null_count = 0; - ASSERT_OK_AND_ASSIGN(ex_data, BitUtil::BytesToBits(draws_)); + ASSERT_OK_AND_ASSIGN(ex_data, internal::BytesToBits(draws_)); if (nullable) { - ASSERT_OK_AND_ASSIGN(ex_null_bitmap, BitUtil::BytesToBits(valid_bytes_)); + ASSERT_OK_AND_ASSIGN(ex_null_bitmap, internal::BytesToBits(valid_bytes_)); ex_null_count = CountNulls(valid_bytes_); } else { ex_null_bitmap = nullptr; @@ -2089,7 +2090,7 @@ class DecimalTest : public ::testing::TestWithParam { auto expected_data = std::make_shared(raw_bytes.data(), BYTE_WIDTH); std::shared_ptr expected_null_bitmap; - ASSERT_OK_AND_ASSIGN(expected_null_bitmap, BitUtil::BytesToBits(valid_bytes)); + ASSERT_OK_AND_ASSIGN(expected_null_bitmap, internal::BytesToBits(valid_bytes)); int64_t expected_null_count = CountNulls(valid_bytes); auto expected = std::make_shared( diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h index 0f8f258247c..aba707e8383 100644 --- a/cpp/src/arrow/array/builder_union.h +++ b/cpp/src/arrow/array/builder_union.h @@ -20,13 +20,13 @@ #include #include #include -#include #include #include "arrow/array/array_nested.h" #include "arrow/array/builder_base.h" #include "arrow/array/data.h" #include "arrow/buffer_builder.h" +#include "arrow/memory_pool.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/util/visibility.h" diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index e83bc47a889..e1f6d71dc55 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -34,6 +34,7 @@ #include "arrow/status.h" #include "arrow/type.h" #include "arrow/util/bit_util.h" +#include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" #include "arrow/visitor_inline.h" diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 3aa6db62812..6bd7f1eb450 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -28,7 +28,7 @@ #include "arrow/buffer.h" #include "arrow/status.h" #include "arrow/type.h" -#include "arrow/util/bit_util.h" +#include "arrow/util/bitmap_ops.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index afe6694b846..23f46dfc761 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -25,7 +25,6 @@ #include "arrow/buffer.h" #include "arrow/result.h" -#include "arrow/type_fwd.h" #include "arrow/util/macros.h" #include "arrow/util/visibility.h" diff --git a/cpp/src/arrow/array/diff.h b/cpp/src/arrow/array/diff.h index 00565a1f577..a405164b333 100644 --- a/cpp/src/arrow/array/diff.h +++ b/cpp/src/arrow/array/diff.h @@ -27,7 +27,6 @@ #include "arrow/result.h" #include "arrow/status.h" #include "arrow/type.h" -#include "arrow/util/bit_util.h" #include "arrow/util/visibility.h" namespace arrow { diff --git a/cpp/src/arrow/buffer.cc b/cpp/src/arrow/buffer.cc index 2f7917a11c3..6dbbac314fb 100644 --- a/cpp/src/arrow/buffer.cc +++ b/cpp/src/arrow/buffer.cc @@ -22,6 +22,7 @@ #include #include "arrow/memory_pool.h" +#include "arrow/result.h" #include "arrow/status.h" #include "arrow/util/bit_util.h" #include "arrow/util/logging.h" diff --git a/cpp/src/arrow/buffer_builder.h b/cpp/src/arrow/buffer_builder.h index 1870ee6e996..41a47c91729 100644 --- a/cpp/src/arrow/buffer_builder.h +++ b/cpp/src/arrow/buffer_builder.h @@ -27,6 +27,7 @@ #include "arrow/buffer.h" #include "arrow/status.h" #include "arrow/util/bit_util.h" +#include "arrow/util/bitmap_generate.h" #include "arrow/util/macros.h" #include "arrow/util/ubsan.h" #include "arrow/util/visibility.h" diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index f1af4184fd7..db4033863fc 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -39,6 +39,7 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit_util.h" +#include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 66c29f495a0..e3210d3ac91 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -20,7 +20,6 @@ #pragma once -#include #include #include diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc index b1828572308..c5581bfacf2 100644 --- a/cpp/src/arrow/compute/exec.cc +++ b/cpp/src/arrow/compute/exec.cc @@ -40,6 +40,7 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit_util.h" +#include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" #include "arrow/util/cpu_info.h" #include "arrow/util/logging.h" diff --git a/cpp/src/arrow/compute/exec_test.cc b/cpp/src/arrow/compute/exec_test.cc index f938a45cf01..d8615075054 100644 --- a/cpp/src/arrow/compute/exec_test.cc +++ b/cpp/src/arrow/compute/exec_test.cc @@ -38,6 +38,7 @@ #include "arrow/table.h" #include "arrow/type.h" #include "arrow/util/bit_util.h" +#include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" #include "arrow/util/cpu_info.h" #include "arrow/util/logging.h" diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 2f9f3d7a61e..bdec9999ebe 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +#include + #include "arrow/compute/api_aggregate.h" #include "arrow/compute/kernels/aggregate_internal.h" #include "arrow/compute/kernels/common.h" diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h index b14381b9483..de420017ec5 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.h +++ b/cpp/src/arrow/compute/kernels/codegen_internal.h @@ -34,6 +34,9 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit_util.h" +#include "arrow/util/bitmap_generate.h" +#include "arrow/util/bitmap_reader.h" +#include "arrow/util/bitmap_writer.h" #include "arrow/util/decimal.h" #include "arrow/util/logging.h" #include "arrow/util/optional.h" diff --git a/cpp/src/arrow/compute/kernels/common.h b/cpp/src/arrow/compute/kernels/common.h index 23557876cf2..b2e02cbec4e 100644 --- a/cpp/src/arrow/compute/kernels/common.h +++ b/cpp/src/arrow/compute/kernels/common.h @@ -39,21 +39,16 @@ #include "arrow/table.h" #include "arrow/type.h" #include "arrow/type_traits.h" -#include "arrow/util/bit_util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/util/string_view.h" -#include "arrow/visitor_inline.h" // IWYU pragma: end_exports namespace arrow { -using internal::Bitmap; -using internal::BitmapReader; using internal::checked_cast; using internal::checked_pointer_cast; -using internal::FirstTimeBitmapWriter; } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/scalar_boolean.cc b/cpp/src/arrow/compute/kernels/scalar_boolean.cc index 15ff8cbe309..89f4de08052 100644 --- a/cpp/src/arrow/compute/kernels/scalar_boolean.cc +++ b/cpp/src/arrow/compute/kernels/scalar_boolean.cc @@ -18,8 +18,14 @@ #include #include "arrow/compute/kernels/common.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/bitmap.h" +#include "arrow/util/bitmap_ops.h" namespace arrow { + +using internal::Bitmap; + namespace compute { namespace { diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc index 66182604183..9830102b6aa 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc @@ -17,10 +17,14 @@ // Implementation of casting to integer or floating point types +#include "arrow/array/array_base.h" #include "arrow/compute/kernels/common.h" #include "arrow/compute/kernels/scalar_cast_internal.h" +#include "arrow/result.h" #include "arrow/util/formatting.h" +#include "arrow/util/optional.h" #include "arrow/util/utf8.h" +#include "arrow/visitor_inline.h" namespace arrow { diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc b/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc index a793504b2dc..b2addd04e1f 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc @@ -18,11 +18,10 @@ // Implementation of casting to (or between) temporal types #include -#include -#include #include "arrow/compute/kernels/common.h" #include "arrow/compute/kernels/scalar_cast_internal.h" +#include "arrow/util/bitmap_reader.h" #include "arrow/util/time.h" #include "arrow/util/value_parsing.h" diff --git a/cpp/src/arrow/compute/kernels/scalar_compare_test.cc b/cpp/src/arrow/compute/kernels/scalar_compare_test.cc index df4306d94b0..758e10b60d8 100644 --- a/cpp/src/arrow/compute/kernels/scalar_compare_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_compare_test.cc @@ -27,13 +27,13 @@ #include "arrow/array.h" #include "arrow/compute/api.h" #include "arrow/compute/kernels/test_util.h" -#include "arrow/type.h" -#include "arrow/type_traits.h" -#include "arrow/util/checked_cast.h" - #include "arrow/testing/gtest_common.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bitmap_reader.h" +#include "arrow/util/checked_cast.h" namespace arrow { namespace compute { diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc index bdc815ab8a7..fc14bf5e1d4 100644 --- a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc +++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc @@ -19,8 +19,11 @@ #include "arrow/array/builder_primitive.h" #include "arrow/compute/api_scalar.h" #include "arrow/compute/kernels/common.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/bitmap_writer.h" #include "arrow/util/hashing.h" #include "arrow/util/optional.h" +#include "arrow/visitor_inline.h" namespace arrow { diff --git a/cpp/src/arrow/compute/kernels/vector_filter.cc b/cpp/src/arrow/compute/kernels/vector_filter.cc index 03a725d2bbf..db21d402e35 100644 --- a/cpp/src/arrow/compute/kernels/vector_filter.cc +++ b/cpp/src/arrow/compute/kernels/vector_filter.cc @@ -22,6 +22,7 @@ #include "arrow/compute/kernels/vector_selection_internal.h" #include "arrow/record_batch.h" #include "arrow/result.h" +#include "arrow/visitor_inline.h" namespace arrow { namespace compute { diff --git a/cpp/src/arrow/ipc/json_internal.cc b/cpp/src/arrow/ipc/json_internal.cc index 757d53fa7ed..f658c3391ef 100644 --- a/cpp/src/arrow/ipc/json_internal.cc +++ b/cpp/src/arrow/ipc/json_internal.cc @@ -451,7 +451,8 @@ class ArrayWriter { if (arr.IsValid(i)) { writer_->Int64(arr.Value(i)); } else { - writer_->RawNumber(null_string.data(), null_string.size()); + writer_->RawNumber(null_string.data(), + static_cast(null_string.size())); } } } @@ -466,10 +467,12 @@ class ArrayWriter { static const std::string null_string = "0"; for (int64_t i = 0; i < arr.length(); ++i) { if (arr.IsValid(i)) { - fmt(arr.Value(i), - [&](util::string_view repr) { writer_->String(repr.data(), repr.size()); }); + fmt(arr.Value(i), [&](util::string_view repr) { + writer_->String(repr.data(), static_cast(repr.size())); + }); } else { - writer_->String(null_string.data(), null_string.size()); + writer_->String(null_string.data(), + static_cast(null_string.size())); } } } diff --git a/cpp/src/arrow/ipc/json_simple_test.cc b/cpp/src/arrow/ipc/json_simple_test.cc index 2bd73215920..5b21fd598b3 100644 --- a/cpp/src/arrow/ipc/json_simple_test.cc +++ b/cpp/src/arrow/ipc/json_simple_test.cc @@ -34,6 +34,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/type.h" #include "arrow/type_traits.h" +#include "arrow/util/bitmap_builders.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" @@ -47,6 +48,7 @@ namespace ipc { namespace internal { namespace json { +using ::arrow::internal::BytesToBits; using ::arrow::internal::checked_cast; using ::arrow::internal::checked_pointer_cast; @@ -668,7 +670,7 @@ TEST(TestMap, StringToInteger) { std::vector offsets = {0, 2, 2, 3, 3}; auto expected_keys = ArrayFromJSON(utf8(), R"(["joe", "mark", "cap"])"); auto expected_values = ArrayFromJSON(int32(), "[0, null, 8]"); - ASSERT_OK_AND_ASSIGN(auto expected_null_bitmap, BitUtil::BytesToBits({1, 0, 1, 1})); + ASSERT_OK_AND_ASSIGN(auto expected_null_bitmap, BytesToBits({1, 0, 1, 1})); auto expected = std::make_shared(type, 4, Buffer::Wrap(offsets), expected_keys, expected_values, expected_null_bitmap, 1); diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc index de6fa54a294..00c39420f7d 100644 --- a/cpp/src/arrow/ipc/test_common.cc +++ b/cpp/src/arrow/ipc/test_common.cc @@ -37,6 +37,7 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit_util.h" +#include "arrow/util/bitmap_builders.h" #include "arrow/util/checked_cast.h" namespace arrow { @@ -160,11 +161,11 @@ Status MakeRandomBooleanArray(const int length, bool include_nulls, std::shared_ptr* out) { std::vector values(length); random_null_bytes(length, 0.5, values.data()); - ARROW_ASSIGN_OR_RAISE(auto data, BitUtil::BytesToBits(values)); + ARROW_ASSIGN_OR_RAISE(auto data, internal::BytesToBits(values)); if (include_nulls) { std::vector valid_bytes(length); - ARROW_ASSIGN_OR_RAISE(auto null_bitmap, BitUtil::BytesToBits(valid_bytes)); + ARROW_ASSIGN_OR_RAISE(auto null_bitmap, internal::BytesToBits(valid_bytes)); random_null_bytes(length, 0.1, valid_bytes.data()); *out = std::make_shared(length, data, null_bitmap, -1); } else { @@ -422,7 +423,7 @@ Status MakeStruct(std::shared_ptr* out) { std::shared_ptr no_nulls(new StructArray(type, list_batch->num_rows(), columns)); std::vector null_bytes(list_batch->num_rows(), 1); null_bytes[0] = 0; - ARROW_ASSIGN_OR_RAISE(auto null_bitmap, BitUtil::BytesToBits(null_bytes)); + ARROW_ASSIGN_OR_RAISE(auto null_bitmap, internal::BytesToBits(null_bytes)); std::shared_ptr with_nulls( new StructArray(type, list_batch->num_rows(), columns, null_bitmap, 1)); @@ -479,7 +480,7 @@ Status MakeUnion(std::shared_ptr* out) { std::vector null_bytes(length, 1); null_bytes[2] = 0; - ARROW_ASSIGN_OR_RAISE(auto null_bitmap, BitUtil::BytesToBits(null_bytes)); + ARROW_ASSIGN_OR_RAISE(auto null_bitmap, internal::BytesToBits(null_bytes)); // construct individual nullable/non-nullable struct arrays auto sparse_no_nulls = @@ -757,7 +758,7 @@ Status MakeDecimal(std::shared_ptr* out) { random_null_bytes(length, 0.1, is_valid_bytes.data()); ARROW_ASSIGN_OR_RAISE(std::shared_ptr is_valid, - BitUtil::BytesToBits(is_valid_bytes)); + internal::BytesToBits(is_valid_bytes)); auto a1 = std::make_shared(f0->type(), length, data, is_valid, kUnknownNullCount); diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index c8bb9a336b4..ae0af10791e 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -45,6 +45,7 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit_util.h" +#include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" #include "arrow/util/compression.h" #include "arrow/util/key_value_metadata.h" diff --git a/cpp/src/arrow/json/parser.cc b/cpp/src/arrow/json/parser.cc index 334af097699..0ff04f0e946 100644 --- a/cpp/src/arrow/json/parser.cc +++ b/cpp/src/arrow/json/parser.cc @@ -33,6 +33,7 @@ #include "arrow/builder.h" #include "arrow/memory_pool.h" #include "arrow/type.h" +#include "arrow/util/bitset_stack.h" #include "arrow/util/logging.h" #include "arrow/util/make_unique.h" #include "arrow/util/string_view.h" diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index 329fb310910..76a0ac8eff9 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -36,6 +36,8 @@ #include "arrow/type_fwd.h" #include "arrow/type_traits.h" #include "arrow/util/bit_util.h" +#include "arrow/util/bitmap_generate.h" +#include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" diff --git a/cpp/src/arrow/sparse_tensor_test.cc b/cpp/src/arrow/sparse_tensor_test.cc index 5602c6f0f46..0e28b2c3bc7 100644 --- a/cpp/src/arrow/sparse_tensor_test.cc +++ b/cpp/src/arrow/sparse_tensor_test.cc @@ -17,6 +17,7 @@ // Unit tests for DataType (and subclasses), Field, and Schema +#include #include #include #include diff --git a/cpp/src/arrow/util/bit_block_counter.cc b/cpp/src/arrow/util/bit_block_counter.cc new file mode 100644 index 00000000000..a596c07ef85 --- /dev/null +++ b/cpp/src/arrow/util/bit_block_counter.cc @@ -0,0 +1,75 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/util/bit_block_counter.h" + +#include +#include + +#include "arrow/util/bit_util.h" +#include "arrow/util/bitmap_ops.h" +#include "arrow/util/ubsan.h" + +namespace arrow { +namespace internal { + +BitBlockCounter::Block BitBlockCounter::NextBlock() { + auto load_word = [](const uint8_t* bytes) -> uint64_t { + return BitUtil::ToLittleEndian(util::SafeLoadAs(bytes)); + }; + auto shift_word = [](uint64_t current, uint64_t next, int64_t shift) -> uint64_t { + return (current >> shift) | (next << (64 - shift)); + }; + + // When the offset is > 0, we need there to be a word beyond the last aligned + // word in the bitmap for the bit shifting logic. + const int64_t bits_required_to_scan_words = offset_ == 0 ? 256 : 256 + (64 - offset_); + if (bits_remaining_ < bits_required_to_scan_words) { + // End of the bitmap, leave it to the caller to decide how to best check + // these bits, no need to do redundant computation here. + const int16_t run_length = static_cast(bits_remaining_); + bits_remaining_ -= run_length; + return {run_length, static_cast(CountSetBits(bitmap_, offset_, run_length))}; + } + + int64_t total_popcount = 0; + if (offset_ == 0) { + total_popcount += BitUtil::PopCount(load_word(bitmap_)); + total_popcount += BitUtil::PopCount(load_word(bitmap_ + 8)); + total_popcount += BitUtil::PopCount(load_word(bitmap_ + 16)); + total_popcount += BitUtil::PopCount(load_word(bitmap_ + 24)); + } else { + auto current = load_word(bitmap_); + auto next = load_word(bitmap_ + 8); + total_popcount += BitUtil::PopCount(shift_word(current, next, offset_)); + current = next; + next = load_word(bitmap_ + 16); + total_popcount += BitUtil::PopCount(shift_word(current, next, offset_)); + current = next; + next = load_word(bitmap_ + 24); + total_popcount += BitUtil::PopCount(shift_word(current, next, offset_)); + current = next; + next = load_word(bitmap_ + 32); + total_popcount += BitUtil::PopCount(shift_word(current, next, offset_)); + } + bitmap_ += BitUtil::BytesForBits(kTargetBlockLength); + bits_remaining_ -= 256; + return {256, static_cast(total_popcount)}; +} + +} // namespace internal +} // namespace arrow diff --git a/cpp/src/arrow/util/bit_block_counter.h b/cpp/src/arrow/util/bit_block_counter.h new file mode 100644 index 00000000000..92e2b9d3aa4 --- /dev/null +++ b/cpp/src/arrow/util/bit_block_counter.h @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/util/visibility.h" + +namespace arrow { +namespace internal { + +/// \brief A class that scans through a true/false bitmap to yield blocks of up +/// to 256 bits at a time along with their popcount. This is used to accelerate +/// processing of mostly-not-null array data. +class ARROW_EXPORT BitBlockCounter { + public: + struct Block { + int16_t length; + int16_t popcount; + }; + + static constexpr int16_t kTargetBlockLength = 256; + + BitBlockCounter(const uint8_t* bitmap, int64_t start_offset, int64_t length) + : bitmap_(bitmap + start_offset / 8), + bits_remaining_(length), + offset_(start_offset % 8) {} + + /// \brief Return the next run of available bits, up to 256. The returned + /// pair contains the size of run and the number of true values + Block NextBlock(); + + private: + const uint8_t* bitmap_; + int64_t bits_remaining_; + int64_t offset_; +}; + +} // namespace internal +} // namespace arrow diff --git a/cpp/src/arrow/util/bit_util.cc b/cpp/src/arrow/util/bit_util.cc index 8b63da6b2df..6e23678ddf9 100644 --- a/cpp/src/arrow/util/bit_util.cc +++ b/cpp/src/arrow/util/bit_util.cc @@ -15,35 +15,13 @@ // specific language governing permissions and limitations // under the License. -#include +#include "arrow/util/bit_util.h" + #include #include -#include -#include -#include -#include - -#include "arrow/array/array_primitive.h" -#include "arrow/buffer.h" -#include "arrow/status.h" -#include "arrow/util/align_util.h" -#include "arrow/util/bit_util.h" -#include "arrow/util/logging.h" -#include "arrow/util/ubsan.h" namespace arrow { namespace BitUtil { -namespace { - -void FillBitsFromBytes(const std::vector& bytes, uint8_t* bits) { - for (size_t i = 0; i < bytes.size(); ++i) { - if (bytes[i] > 0) { - SetBit(bits, i); - } - } -} - -} // namespace void SetBitsTo(uint8_t* bits, int64_t start_offset, int64_t length, bool bits_are_set) { if (length == 0) { @@ -89,550 +67,5 @@ void SetBitsTo(uint8_t* bits, int64_t start_offset, int64_t length, bool bits_ar bits[bytes_end - 1] |= static_cast(fill_byte & ~last_byte_mask); } -Result> BytesToBits(const std::vector& bytes, - MemoryPool* pool) { - int64_t bit_length = BytesForBits(bytes.size()); - - ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateBuffer(bit_length, pool)); - uint8_t* out_buf = buffer->mutable_data(); - memset(out_buf, 0, static_cast(buffer->capacity())); - FillBitsFromBytes(bytes, out_buf); - return std::move(buffer); -} - } // namespace BitUtil - -namespace internal { - -int64_t CountSetBits(const uint8_t* data, int64_t bit_offset, int64_t length) { - constexpr int64_t pop_len = sizeof(uint64_t) * 8; - DCHECK_GE(bit_offset, 0); - int64_t count = 0; - - const auto p = BitmapWordAlign(data, bit_offset, length); - for (int64_t i = bit_offset; i < bit_offset + p.leading_bits; ++i) { - if (BitUtil::GetBit(data, i)) { - ++count; - } - } - - if (p.aligned_words > 0) { - // popcount as much as possible with the widest possible count - const uint64_t* u64_data = reinterpret_cast(p.aligned_start); - DCHECK_EQ(reinterpret_cast(u64_data) & 7, 0); - const uint64_t* end = u64_data + p.aligned_words; - - for (auto iter = u64_data; iter < end; ++iter) { - count += BitUtil::PopCount(*iter); - } - } - - // Account for left over bits (in theory we could fall back to smaller - // versions of popcount but the code complexity is likely not worth it) - for (int64_t i = p.trailing_bit_offset; i < bit_offset + length; ++i) { - if (BitUtil::GetBit(data, i)) { - ++count; - } - } - - return count; -} - -template -void TransferBitmap(const uint8_t* data, int64_t offset, int64_t length, - int64_t dest_offset, uint8_t* dest) { - int64_t byte_offset = offset / 8; - int64_t bit_offset = offset % 8; - int64_t dest_byte_offset = dest_offset / 8; - int64_t dest_bit_offset = dest_offset % 8; - int64_t num_bytes = BitUtil::BytesForBits(length); - // Shift dest by its byte offset - dest += dest_byte_offset; - - if (bit_offset || dest_bit_offset) { - data += byte_offset; - - const int64_t n_words = length / 64; - if (n_words > 1) { - auto load_word = [](const uint8_t* bytes) -> uint64_t { - return BitUtil::ToLittleEndian(util::SafeLoadAs(bytes)); - }; - auto shift_word = [](uint64_t current, uint64_t next, int64_t shift) -> uint64_t { - if (shift == 0) return current; - return (current >> shift) | (next << (64 - shift)); - }; - auto write_word = [](uint8_t* bytes, uint64_t word) { - util::SafeStore(bytes, BitUtil::FromLittleEndian(word)); - }; - - const uint64_t dest_mask = (1U << dest_bit_offset) - 1; - auto data_current = load_word(data); - auto dest_current = load_word(dest); - - for (int64_t i = 0; i < n_words - 1; ++i) { - data += 8; - const auto data_next = load_word(data); - auto word = shift_word(data_current, data_next, bit_offset); - data_current = data_next; - if (invert_bits) { - word = ~word; - } - - if (dest_bit_offset) { - word = (word << dest_bit_offset) | (word >> (64 - dest_bit_offset)); - auto dest_next = load_word(dest + 8); - dest_current = (dest_current & dest_mask) | (word & ~dest_mask); - dest_next = (dest_next & ~dest_mask) | (word & dest_mask); - write_word(dest, dest_current); - write_word(dest + 8, dest_next); - dest_current = dest_next; - } else { - write_word(dest, word); - } - dest += 8; - } - - length -= (n_words - 1) * 64; - } - - internal::BitmapReader valid_reader(data, bit_offset, length); - internal::BitmapWriter valid_writer(dest, dest_bit_offset, length); - - for (int64_t i = 0; i < length; i++) { - if (invert_bits ^ valid_reader.IsSet()) { - valid_writer.Set(); - } else { - valid_writer.Clear(); - } - valid_reader.Next(); - valid_writer.Next(); - } - valid_writer.Finish(); - } else { - // Take care of the trailing bits in the last byte - int64_t trailing_bits = num_bytes * 8 - length; - uint8_t trail = 0; - if (trailing_bits && restore_trailing_bits) { - trail = dest[num_bytes - 1]; - } - - if (invert_bits) { - for (int64_t i = 0; i < num_bytes; i++) { - dest[i] = static_cast(~(data[byte_offset + i])); - } - } else { - std::memcpy(dest, data + byte_offset, static_cast(num_bytes)); - } - - if (restore_trailing_bits) { - for (int i = 0; i < trailing_bits; i++) { - if (BitUtil::GetBit(&trail, i + 8 - trailing_bits)) { - BitUtil::SetBit(dest, length + i); - } else { - BitUtil::ClearBit(dest, length + i); - } - } - } - } -} - -template -Result> TransferBitmap(MemoryPool* pool, const uint8_t* data, - int64_t offset, int64_t length) { - ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateEmptyBitmap(length, pool)); - uint8_t* dest = buffer->mutable_data(); - - TransferBitmap(data, offset, length, 0, dest); - - // As we have freshly allocated this bitmap, we should take care of zeroing the - // remaining bits. - int64_t num_bytes = BitUtil::BytesForBits(length); - int64_t bits_to_zero = num_bytes * 8 - length; - for (int64_t i = length; i < length + bits_to_zero; ++i) { - // Both branches may copy extra bits - unsetting to match specification. - BitUtil::ClearBit(dest, i); - } - return buffer; -} - -void CopyBitmap(const uint8_t* data, int64_t offset, int64_t length, uint8_t* dest, - int64_t dest_offset, bool restore_trailing_bits) { - if (restore_trailing_bits) { - TransferBitmap(data, offset, length, dest_offset, dest); - } else { - TransferBitmap(data, offset, length, dest_offset, dest); - } -} - -void InvertBitmap(const uint8_t* data, int64_t offset, int64_t length, uint8_t* dest, - int64_t dest_offset) { - TransferBitmap(data, offset, length, dest_offset, dest); -} - -Result> CopyBitmap(MemoryPool* pool, const uint8_t* data, - int64_t offset, int64_t length) { - return TransferBitmap(pool, data, offset, length); -} - -Result> InvertBitmap(MemoryPool* pool, const uint8_t* data, - int64_t offset, int64_t length, - std::shared_ptr* out) { - return TransferBitmap(pool, data, offset, length); -} - -bool BitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right, - int64_t right_offset, int64_t bit_length) { - if (left_offset % 8 == 0 && right_offset % 8 == 0) { - // byte aligned, can use memcmp - bool bytes_equal = std::memcmp(left + left_offset / 8, right + right_offset / 8, - bit_length / 8) == 0; - if (!bytes_equal) { - return false; - } - for (int64_t i = (bit_length / 8) * 8; i < bit_length; ++i) { - if (BitUtil::GetBit(left, left_offset + i) != - BitUtil::GetBit(right, right_offset + i)) { - return false; - } - } - return true; - } - - // Unaligned slow case - left += left_offset / 8; - right += right_offset / 8; - left_offset %= 8; - right_offset %= 8; - - // process in 64 bits, may touch two adjacent words in one iteration - const int64_t n_words = bit_length / 64; - if (n_words > 1) { - auto load_word = [](const uint8_t* bytes) -> uint64_t { - return BitUtil::ToLittleEndian(util::SafeLoadAs(bytes)); - }; - auto shift_word = [](uint64_t current, uint64_t next, int64_t shift) -> uint64_t { - if (shift == 0) return current; - return (current >> shift) | (next << (64 - shift)); - }; - - auto left_current = load_word(left); - auto right_current = load_word(right); - - for (int64_t i = 0; i < n_words - 1; ++i) { - left += 8; - auto left_next = load_word(left); - auto left_word = shift_word(left_current, left_next, left_offset); - left_current = left_next; - - right += 8; - auto right_next = load_word(right); - auto right_word = shift_word(right_current, right_next, right_offset); - right_current = right_next; - - if (left_word != right_word) { - return false; - } - } - - bit_length -= (n_words - 1) * 64; - } - - // process in bit - for (int64_t i = 0; i < bit_length; ++i) { - if (BitUtil::GetBit(left, left_offset + i) != - BitUtil::GetBit(right, right_offset + i)) { - return false; - } - } - return true; -} - -namespace { - -template