diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index c65824f5385..e78e5d73164 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -130,9 +130,11 @@ if (ARROW_COMPUTE) add_subdirectory(compute) set(ARROW_SRCS ${ARROW_SRCS} compute/context.cc + compute/kernels/aggregate.cc compute/kernels/boolean.cc compute/kernels/cast.cc compute/kernels/hash.cc + compute/kernels/sum.cc compute/kernels/util-internal.cc ) endif() diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h index adfea897c88..2270a48c9a9 100644 --- a/cpp/src/arrow/compute/kernel.h +++ b/cpp/src/arrow/compute/kernel.h @@ -57,9 +57,50 @@ class ARROW_EXPORT OpKernel { /// \brief Placeholder for Scalar values until we implement these struct ARROW_EXPORT Scalar { - ~Scalar() {} + util::variant + value; - ARROW_DISALLOW_COPY_AND_ASSIGN(Scalar); + explicit Scalar(bool value) : value(value) {} + explicit Scalar(uint8_t value) : value(value) {} + explicit Scalar(int8_t value) : value(value) {} + explicit Scalar(uint16_t value) : value(value) {} + explicit Scalar(int16_t value) : value(value) {} + explicit Scalar(uint32_t value) : value(value) {} + explicit Scalar(int32_t value) : value(value) {} + explicit Scalar(uint64_t value) : value(value) {} + explicit Scalar(int64_t value) : value(value) {} + explicit Scalar(float value) : value(value) {} + explicit Scalar(double value) : value(value) {} + + Type::type kind() const { + switch (this->value.which()) { + case 0: + return Type::BOOL; + case 1: + return Type::UINT8; + case 2: + return Type::INT8; + case 3: + return Type::UINT16; + case 4: + return Type::INT16; + case 5: + return Type::UINT32; + case 6: + return Type::INT32; + case 7: + return Type::UINT64; + case 8: + return Type::INT64; + case 9: + return Type::FLOAT; + case 10: + return Type::DOUBLE; + default: + return Type::NA; + } + } }; /// \class Datum @@ -67,7 +108,7 @@ struct ARROW_EXPORT Scalar { struct ARROW_EXPORT Datum { enum 
type { NONE, SCALAR, ARRAY, CHUNKED_ARRAY, RECORD_BATCH, TABLE, COLLECTION }; - util::variant, std::shared_ptr, + util::variant, std::shared_ptr, std::shared_ptr, std::shared_ptr, std::vector> value; @@ -75,7 +116,7 @@ struct ARROW_EXPORT Datum { /// \brief Empty datum, to be populated elsewhere Datum() : value(NULLPTR) {} - Datum(const std::shared_ptr& value) // NOLINT implicit conversion + Datum(const Scalar& value) // NOLINT implicit conversion : value(value) {} Datum(const std::shared_ptr& value) // NOLINT implicit conversion : value(value) {} @@ -147,6 +188,10 @@ struct ARROW_EXPORT Datum { return util::get>(this->value); } + Scalar scalar() const { return util::get(this->value); } + + bool is_array() const { return this->kind() == Datum::ARRAY; } + bool is_arraylike() const { return this->kind() == Datum::ARRAY || this->kind() == Datum::CHUNKED_ARRAY; } diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index 4d508aacb99..df80fb2b84e 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -20,3 +20,7 @@ ARROW_INSTALL_ALL_HEADERS("arrow/compute/kernels") ADD_ARROW_TEST(boolean-test PREFIX "arrow-compute") ADD_ARROW_TEST(cast-test PREFIX "arrow-compute") ADD_ARROW_TEST(hash-test PREFIX "arrow-compute") + +# Aggregates +ADD_ARROW_TEST(aggregate-test PREFIX "arrow-compute") +ADD_ARROW_BENCHMARK(aggregate-benchmark PREFIX "arrow-compute") diff --git a/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc b/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc new file mode 100644 index 00000000000..0c31d2bede5 --- /dev/null +++ b/cpp/src/arrow/compute/kernels/aggregate-benchmark.cc @@ -0,0 +1,349 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "benchmark/benchmark.h" + +#include +#ifdef _MSC_VER +#include +#else +#include +#endif + +#include "arrow/builder.h" +#include "arrow/memory_pool.h" +#include "arrow/test-random.h" +#include "arrow/test-util.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/cpu-info.h" + +#include "arrow/compute/context.h" +#include "arrow/compute/kernel.h" +#include "arrow/compute/kernels/sum.h" + +namespace arrow { +namespace compute { + +#include +#include +#include +#include + +using internal::CpuInfo; +static CpuInfo* cpu_info = CpuInfo::GetInstance(); + +static const int64_t kL1Size = cpu_info->CacheSize(CpuInfo::L1_CACHE); +static const int64_t kL2Size = cpu_info->CacheSize(CpuInfo::L2_CACHE); +static const int64_t kL3Size = cpu_info->CacheSize(CpuInfo::L3_CACHE); + +namespace BitUtil = arrow::BitUtil; +using arrow::internal::BitmapReader; + +template +struct SumState { + using ValueType = T; + + SumState() : total(0), valid_count(0) {} + + T total = 0; + int64_t valid_count = 0; +}; + +template +struct Traits {}; + +template <> +struct Traits { + using ArrayType = typename CTypeTraits::ArrayType; + static constexpr int64_t null_sentinel = std::numeric_limits::lowest(); + + static void FixSentinel(std::shared_ptr& array) { + auto data = array->data(); + for (int64_t i = 0; i < array->length(); i++) + if (array->IsNull(i)) { + int64_t* val_ptr = data->GetMutableValues(1, i); 
+ *val_ptr = null_sentinel; + } + } + + static inline bool IsNull(int64_t val) { return val == null_sentinel; } + + static inline bool NotNull(int64_t val) { return val != null_sentinel; } +}; + +template +struct Summer { + public: + using ValueType = T; + using ArrowType = typename CTypeTraits::ArrowType; +}; + +template +struct SumNoNulls : public Summer { + using ArrayType = typename CTypeTraits::ArrayType; + + static void Sum(const ArrayType& array, SumState* state) { + SumState local; + + const auto values = array.raw_values(); + for (int64_t i = 0; i < array.length(); ++i) { + local.total += values[i]; + } + + local.valid_count = array.length(); + *state = local; + } +}; + +template +struct SumNoNullsUnrolled : public Summer { + using ArrayType = typename CTypeTraits::ArrayType; + + static void Sum(const ArrayType& array, SumState* state) { + SumState local; + + const auto values = array.raw_values(); + const auto length = array.length(); + const int64_t length_rounded = BitUtil::RoundDown(length, 8); + for (int64_t i = 0; i < length_rounded; i += 8) { + local.total += values[i + 0] + values[i + 1] + values[i + 2] + values[i + 3] + + values[i + 4] + values[i + 5] + values[i + 6] + values[i + 7]; + } + + for (int64_t i = length_rounded; i < length; ++i) { + local.total += values[i]; + } + + local.valid_count = length; + + *state = local; + } +}; + +template +struct SumSentinel : public Summer { + using ArrayType = typename CTypeTraits::ArrayType; + + static void Sum(const ArrayType& array, SumState* state) { + SumState local; + + const auto values = array.raw_values(); + const auto length = array.length(); + for (int64_t i = 0; i < length; i++) { + // NotNull() is 0 for the null sentinel, so sentinel slots add nothing to the total + local.total += values[i] * Traits::NotNull(values[i]); + local.valid_count++; + } + + *state = local; + } +}; + +template +struct SumSentinelUnrolled : public Summer { + using ArrayType = typename CTypeTraits::ArrayType; + + static void Sum(const ArrayType& array, SumState* state) { +
SumState local; + +#define SUM_NOT_NULL(ITEM) \ + do { \ + local.total += values[i + ITEM] * Traits::NotNull(values[i + ITEM]); \ + local.valid_count++; \ + } while (0) + + const auto values = array.raw_values(); + const auto length = array.length(); + const int64_t length_rounded = BitUtil::RoundDown(length, 8); + for (int64_t i = 0; i < length_rounded; i += 8) { + SUM_NOT_NULL(0); + SUM_NOT_NULL(1); + SUM_NOT_NULL(2); + SUM_NOT_NULL(3); + SUM_NOT_NULL(4); + SUM_NOT_NULL(5); + SUM_NOT_NULL(6); + SUM_NOT_NULL(7); + } + +#undef SUM_NOT_NULL + + for (int64_t i = length_rounded; i < length; ++i) { /* tail: the fewer-than-8 values left after the unrolled groups */ + local.total += values[i] * Traits::NotNull(values[i]); + ++local.valid_count; + } + + *state = local; + } +}; + +template +struct SumBitmapNaive : public Summer { + using ArrayType = typename CTypeTraits::ArrayType; + + static void Sum(const ArrayType& array, SumState* state) { + SumState local; + + const auto values = array.raw_values(); + const auto bitmap = array.null_bitmap_data(); + const auto length = array.length(); + + for (int64_t i = 0; i < length; ++i) { + if (BitUtil::GetBit(bitmap, i)) { + local.total += values[i]; + ++local.valid_count; + } + } + + *state = local; + } +}; + +template +struct SumBitmapReader : public Summer { + using ArrayType = typename CTypeTraits::ArrayType; + + static void Sum(const ArrayType& array, SumState* state) { + SumState local; + + const auto values = array.raw_values(); + const auto bitmap = array.null_bitmap_data(); + const auto length = array.length(); + BitmapReader bit_reader(bitmap, 0, length); + for (int64_t i = 0; i < length; ++i) { + if (bit_reader.IsSet()) { + local.total += values[i]; + ++local.valid_count; + } + + bit_reader.Next(); + } + + *state = local; + } +}; + +template +struct SumBitmapVectorizeUnroll : public Summer { + using ArrayType = typename CTypeTraits::ArrayType; + + static void Sum(const ArrayType& array, SumState* state) { + SumState local; + + const auto values = array.raw_values(); + const auto
bitmap = array.null_bitmap_data(); + const auto length = array.length(); + const int64_t length_rounded = BitUtil::RoundDown(length, 8); + for (int64_t i = 0; i < length_rounded; i += 8) { + const uint8_t valid_byte = bitmap[i / 8]; + +#define SUM_SHIFT(ITEM) (values[i + ITEM] * ((valid_byte >> ITEM) & 1)) + + if (valid_byte < 0xFF) { + // Some nulls + local.total += SUM_SHIFT(0); + local.total += SUM_SHIFT(1); + local.total += SUM_SHIFT(2); + local.total += SUM_SHIFT(3); + local.total += SUM_SHIFT(4); + local.total += SUM_SHIFT(5); + local.total += SUM_SHIFT(6); + local.total += SUM_SHIFT(7); + local.valid_count += BitUtil::kBytePopcount[valid_byte]; + } else { + // No nulls + local.total += values[i + 0] + values[i + 1] + values[i + 2] + values[i + 3] + + values[i + 4] + values[i + 5] + values[i + 6] + values[i + 7]; + local.valid_count += 8; + } + } + +#undef SUM_SHIFT + + for (int64_t i = length_rounded; i < length; ++i) { + if (BitUtil::GetBit(bitmap, i)) { + local.total += values[i]; /* accumulate the tail values into the running total */ + ++local.valid_count; + } + } + + *state = local; + } +}; + +template +void BenchSum(benchmark::State& state) { + using T = typename Functor::ValueType; + + const int64_t array_size = state.range(0) / sizeof(int64_t); + const double null_percent = static_cast(state.range(1)) / 100.0; + auto rand = random::RandomArrayGenerator(1923); + auto array = std::static_pointer_cast>( + rand.Int64(array_size, -100, 100, null_percent)); + + Traits::FixSentinel(array); + + for (auto _ : state) { + SumState sum_state; + Functor::Sum(*array, &sum_state); + benchmark::DoNotOptimize(sum_state); + } + + state.counters["size"] = static_cast(state.range(0)); + state.counters["null_percent"] = static_cast(state.range(1)); + state.SetBytesProcessed(state.iterations() * array_size * sizeof(T)); +} + +static void SetArgs(benchmark::internal::Benchmark* bench) { + bench->Unit(benchmark::kMicrosecond); + + for (auto size : {kL1Size, kL2Size, kL3Size, kL3Size * 4}) + for (auto nulls : std::vector({0, 1,
10, 50})) + bench->Args({static_cast(size), nulls}); +} + +BENCHMARK_TEMPLATE(BenchSum, SumNoNulls)->Apply(SetArgs); +BENCHMARK_TEMPLATE(BenchSum, SumNoNullsUnrolled)->Apply(SetArgs); +BENCHMARK_TEMPLATE(BenchSum, SumSentinel)->Apply(SetArgs); +BENCHMARK_TEMPLATE(BenchSum, SumSentinelUnrolled)->Apply(SetArgs); +BENCHMARK_TEMPLATE(BenchSum, SumBitmapNaive)->Apply(SetArgs); +BENCHMARK_TEMPLATE(BenchSum, SumBitmapReader)->Apply(SetArgs); +BENCHMARK_TEMPLATE(BenchSum, SumBitmapVectorizeUnroll)->Apply(SetArgs); + +static void BenchSumKernel(benchmark::State& state) { + const int64_t array_size = state.range(0) / sizeof(int64_t); + const double null_percent = static_cast(state.range(1)) / 100.0; + auto rand = random::RandomArrayGenerator(1923); + auto array = std::static_pointer_cast>( + rand.Int64(array_size, -100, 100, null_percent)); + + FunctionContext ctx; + for (auto _ : state) { + Datum out; + ABORT_NOT_OK(Sum(&ctx, Datum(array), &out)); + benchmark::DoNotOptimize(out); + } + + state.counters["size"] = static_cast(state.range(0)); + state.counters["null_percent"] = static_cast(state.range(1)); + state.SetBytesProcessed(state.iterations() * array_size * sizeof(int64_t)); +} + +BENCHMARK(BenchSumKernel)->Apply(SetArgs); + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/aggregate-test.cc b/cpp/src/arrow/compute/kernels/aggregate-test.cc new file mode 100644 index 00000000000..a142a6d17b7 --- /dev/null +++ b/cpp/src/arrow/compute/kernels/aggregate-test.cc @@ -0,0 +1,156 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include + +#include "arrow/array.h" +#include "arrow/test-common.h" +#include "arrow/test-random.h" +#include "arrow/test-util.h" +#include "arrow/type.h" + +#include "arrow/compute/kernel.h" +#include "arrow/compute/kernels/sum.h" +#include "arrow/compute/test-util.h" + +using std::shared_ptr; +using std::vector; + +namespace arrow { +namespace compute { + +template +struct DatumEqual { + static void EnsureEqual(const Datum& lhs, const Datum& rhs) {} +}; + +template +struct DatumEqual::value>::type> { + static constexpr double kArbitraryDoubleErrorBound = 1.0; + + static void EnsureEqual(const Datum& lhs, const Datum& rhs) { + ASSERT_EQ(lhs.kind(), rhs.kind()); + if (lhs.kind() == Datum::SCALAR) { + ASSERT_EQ(lhs.scalar().kind(), rhs.scalar().kind()); + ASSERT_NEAR(util::get(lhs.scalar().value), + util::get(rhs.scalar().value), kArbitraryDoubleErrorBound); + } + } +}; + +template +struct DatumEqual::value>::type> { + static void EnsureEqual(const Datum& lhs, const Datum& rhs) { + ASSERT_EQ(lhs.kind(), rhs.kind()); + if (lhs.kind() == Datum::SCALAR) { + ASSERT_EQ(lhs.scalar().kind(), rhs.scalar().kind()); + ASSERT_EQ(util::get(lhs.scalar().value), + util::get(rhs.scalar().value)); + } + } +}; + +template +void ValidateSum(FunctionContext* ctx, const Array& input, Datum expected) { + using CType = typename ArrowType::c_type; + using SumType = typename FindAccumulatorType::Type; + + Datum result; + ASSERT_OK(Sum(ctx, input, &result)); + DatumEqual::EnsureEqual(result, expected); +} + +template +void 
ValidateSum(FunctionContext* ctx, const char* json, Datum expected) { + auto array = ArrayFromJSON(TypeTraits::type_singleton(), json); + ValidateSum(ctx, *array, expected); +} + +template +static Datum DummySum(const Array& array) { /* reference sum: adds every slot whose validity bit is set; empty Datum when none is set */ + using CType = typename ArrowType::c_type; + using ArrayType = typename TypeTraits::ArrayType; + using SumType = typename FindAccumulatorType::Type; + + SumType sum = 0; + int64_t count = 0; + + const auto& array_numeric = static_cast<const ArrayType&>(array); /* checked downcast; reinterpret_cast is not a valid way to go from base to derived */ + const auto values = array_numeric.raw_values(); + const auto bitmap = array.null_bitmap_data(); + for (int64_t i = 0; i < array.length(); i++) { + if (BitUtil::GetBit(bitmap, i)) { + sum += values[i]; + count++; + } + } + + return (count > 0) ? Datum(Scalar(sum)) : Datum(); +} + +template +void ValidateSum(FunctionContext* ctx, const Array& array) { + ValidateSum(ctx, array, DummySum(array)); +} + +template +class TestSumKernelNumeric : public ComputeFixture, public TestBase {}; + +typedef ::testing::Types + NumericArrowTypes; + +TYPED_TEST_CASE(TestSumKernelNumeric, NumericArrowTypes); +TYPED_TEST(TestSumKernelNumeric, SimpleSum) { + using CType = typename TypeParam::c_type; + using SumType = typename FindAccumulatorType::Type; + + ValidateSum(&this->ctx_, "[]", Datum()); + + ValidateSum(&this->ctx_, "[0, 1, 2, 3, 4, 5]", + Datum(Scalar(static_cast(5 * 6 / 2)))); + + // Skip this test for (U)Int8Type: 1000 and 300 do not fit in an 8-bit value + if (sizeof(CType) > 1) + ValidateSum(&this->ctx_, "[1000, null, 300, null, 30, null, 7]", + Datum(Scalar(static_cast(1337)))); +} + +template +class TestRandomSumKernelNumeric : public ComputeFixture, public TestBase {}; + +TYPED_TEST_CASE(TestRandomSumKernelNumeric, NumericArrowTypes); +TYPED_TEST(TestRandomSumKernelNumeric, RandomArraySum) { + auto rand = random::RandomArrayGenerator(0x5487655); + for (size_t i = 5; i < 14; i++) { + for (auto null_probability : {0.0, 0.01, 0.1, 0.25, 0.5, 1.0}) { + for (auto length_offset : {-2, -1, 0, 1, 2}) { + int64_t length = (1UL << i) +
length_offset; + auto array = rand.Numeric(length, 0, 100, null_probability); + ValidateSum(&this->ctx_, *array); + } + } + } +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/aggregate.cc b/cpp/src/arrow/compute/kernels/aggregate.cc new file mode 100644 index 00000000000..e1e2dd942a1 --- /dev/null +++ b/cpp/src/arrow/compute/kernels/aggregate.cc @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "arrow/compute/context.h" +#include "arrow/compute/kernels/aggregate.h" + +namespace arrow { +namespace compute { + +// Helper class that properly invokes destructor when state goes out of scope. 
+class ManagedAggregateState { + public: + ManagedAggregateState(std::shared_ptr& desc, + std::shared_ptr&& buffer) + : desc_(desc), state_(buffer) { + desc_->New(state_->mutable_data()); + } + + ~ManagedAggregateState() { desc_->Delete(state_->mutable_data()); } + + void* mutable_data() { return state_->mutable_data(); } + + static std::shared_ptr Make( + std::shared_ptr& desc, MemoryPool* pool) { + std::shared_ptr buf; + if (!AllocateBuffer(pool, desc->Size(), &buf).ok()) return nullptr; + + return std::make_shared(desc, std::move(buf)); + } + + private: + std::shared_ptr desc_; + std::shared_ptr state_; +}; + +Status AggregateUnaryKernel::Call(FunctionContext* ctx, const Datum& input, Datum* out) { + if (!input.is_array()) return Status::Invalid("AggregateKernel expects Array datum"); + + auto state = ManagedAggregateState::Make(aggregate_function_, ctx->memory_pool()); + if (!state) return Status::OutOfMemory("AggregateState allocation failed"); + + auto array = input.make_array(); + RETURN_NOT_OK(aggregate_function_->Consume(*array, state->mutable_data())); + RETURN_NOT_OK(aggregate_function_->Finalize(state->mutable_data(), out)); + + return Status::OK(); +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/aggregate.h b/cpp/src/arrow/compute/kernels/aggregate.h new file mode 100644 index 00000000000..4bc869aec3f --- /dev/null +++ b/cpp/src/arrow/compute/kernels/aggregate.h @@ -0,0 +1,111 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/compute/kernel.h" + +namespace arrow { + +class Array; +class Status; + +namespace compute { + +class FunctionContext; +struct Datum; + +/// AggregateFunction is an interface for Aggregates +/// +/// An aggregate transforms an array into a single result called a state via the +/// Consume method. State supports the merge operation via the Merge method. +/// State can be sealed into a final result via the Finalize method. +/// +/// State ownership is handled by callers, thus the interface exposes 3 methods +/// for the caller to manage memory: +/// - Size +/// - New (placement new constructor invocation) +/// - Delete (state destructor) +/// +/// Design inspired by ClickHouse aggregate functions. +class AggregateFunction { + public: + /// \brief Consume an array into a state. + virtual Status Consume(const Array& input, void* state) const = 0; + + /// \brief Merge states. + virtual Status Merge(const void* src, void* dst) const = 0; + + /// \brief Convert state into a final result. + virtual Status Finalize(const void* src, Datum* output) const = 0; + + virtual ~AggregateFunction() {} + + /// State management methods.
+ virtual int64_t Size() const = 0; + virtual void New(void* ptr) const = 0; + virtual void Delete(void* ptr) const = 0; +}; + +/// AggregateFunction partial implementation for static type state +template +class AggregateFunctionStaticState : public AggregateFunction { + virtual Status Consume(const Array& input, State* state) const = 0; + virtual Status Merge(const State& src, State* dst) const = 0; + virtual Status Finalize(const State& src, Datum* output) const = 0; + + Status Consume(const Array& input, void* state) const final { + return Consume(input, static_cast(state)); + } + + Status Merge(const void* src, void* dst) const final { + return Merge(*static_cast(src), static_cast(dst)); + } + + /// \brief Convert state into a final result. + Status Finalize(const void* src, Datum* output) const final { + return Finalize(*static_cast(src), output); + } + + int64_t Size() const final { return sizeof(State); } + + void New(void* ptr) const final { + // By using placement-new syntax, the constructor of the State is invoked + // in the memory location defined by the caller. This only supports State + // with a parameter-less constructor. 
+ new (ptr) State; + } + + void Delete(void* ptr) const final { static_cast(ptr)->~State(); } +}; + +/// \brief UnaryKernel implemented by an AggregateFunction +class ARROW_EXPORT AggregateUnaryKernel : public UnaryKernel { + public: + explicit AggregateUnaryKernel(std::shared_ptr& aggregate) + : aggregate_function_(aggregate) {} + + Status Call(FunctionContext* ctx, const Datum& input, Datum* out) override; + + private: + std::shared_ptr aggregate_function_; +}; + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/hash-test.cc b/cpp/src/arrow/compute/kernels/hash-test.cc index f20575f621b..17c8c9b5583 100644 --- a/cpp/src/arrow/compute/kernels/hash-test.cc +++ b/cpp/src/arrow/compute/kernels/hash-test.cc @@ -114,12 +114,8 @@ TYPED_TEST(TestHashKernelPrimitive, DictEncode) { TYPED_TEST(TestHashKernelPrimitive, PrimitiveResizeTable) { using T = typename TypeParam::c_type; - // Skip this test for (u)int8 - if (sizeof(Scalar) == 1) { - return; - } - const int64_t kTotalValues = 1000000; + const int64_t kTotalValues = std::min(INT16_MAX, 1UL << sizeof(T) / 2); const int64_t kRepeats = 5; vector values; diff --git a/cpp/src/arrow/compute/kernels/sum.cc b/cpp/src/arrow/compute/kernels/sum.cc new file mode 100644 index 00000000000..cb37c4a0961 --- /dev/null +++ b/cpp/src/arrow/compute/kernels/sum.cc @@ -0,0 +1,192 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/compute/kernels/sum.h" + +#include "arrow/array.h" +#include "arrow/compute/kernel.h" +#include "arrow/compute/kernels/aggregate.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/logging.h" +#include "arrow/visitor_inline.h" + +namespace arrow { +namespace compute { + +template ::Type> +struct SumState { /* accumulator pairing a value count with a running sum; two states combine field by field */ + using ThisType = SumState; + + ThisType operator+(const ThisType& rhs) const { + return ThisType(*this) += rhs; /* copy this state, then merge rhs into the copy; the struct has no (count, sum) constructor */ + } + + ThisType& operator+=(const ThisType& rhs) { + this->count += rhs.count; + this->sum += rhs.sum; /* add the other state's sum, not our own */ + + return *this; + } + + size_t count = 0; + SumType sum = 0; +}; + +template > +class SumAggregateFunction final : public AggregateFunctionStaticState { + using CType = typename TypeTraits::CType; + using ArrayType = typename TypeTraits::ArrayType; + + public: + Status Consume(const Array& input, StateType* state) const override { + const ArrayType& array = static_cast(input); + + if (input.null_count() > 0) { + *state = ConsumeSparse(array); + } else { + *state = ConsumeDense(array); + } + + return Status::OK(); + } + + Status Merge(const StateType& src, StateType* dst) const override { + *dst += src; + return Status::OK(); + } + + Status Finalize(const StateType& src, Datum* output) const override { + *output = (src.count > 0) ?
Datum(Scalar(src.sum)) : Datum(); + return Status::OK(); + } + + private: + StateType ConsumeDense(const ArrayType& array) const { + StateType local; + + const auto values = array.raw_values(); + for (int64_t i = 0; i < array.length(); i++) { + local.sum += values[i]; + } + + local.count = array.length(); + + return local; + } + + StateType ConsumeSparse(const ArrayType& array) const { + StateType local; + + // TODO(fsaintjacques): This fails on slice not byte-aligned. + DCHECK_EQ(array.offset() % 8, 0); + + const auto values = array.raw_values(); + const auto bitmap = array.null_bitmap_data() + array.offset() / 8; /* one validity bit per value, so an aligned offset skips offset / 8 whole bitmap bytes */ + const auto length = array.length(); + const auto length_rounded = BitUtil::RoundDown(length, 8); + + for (int64_t i = 0; i < length_rounded; i += 8) { + const uint8_t valid_byte = bitmap[i / 8]; + if (valid_byte < 0xFF) { +#define SUM_SHIFT(ITEM) \ + static_cast(values[i + ITEM] * static_cast(((valid_byte >> ITEM) & 1U))) + // Some nulls + local.sum += SUM_SHIFT(0); + local.sum += SUM_SHIFT(1); + local.sum += SUM_SHIFT(2); + local.sum += SUM_SHIFT(3); + local.sum += SUM_SHIFT(4); + local.sum += SUM_SHIFT(5); + local.sum += SUM_SHIFT(6); + local.sum += SUM_SHIFT(7); + local.count += BitUtil::kBytePopcount[valid_byte]; +#undef SUM_SHIFT + } else { + // No nulls + local.sum += values[i + 0] + values[i + 1] + values[i + 2] + values[i + 3] + + values[i + 4] + values[i + 5] + values[i + 6] + values[i + 7]; + local.count += 8; + } + } + + for (int64_t i = length_rounded; i < length; ++i) { + if (BitUtil::GetBit(bitmap, i)) { + local.sum += values[i]; + local.count++; + } + } + + return local; + } +}; + +#define SUM_AGG_FN_CASE(T) \ + case T::type_id: \ + return std::static_pointer_cast( \ + std::make_shared>()); + +std::shared_ptr MakeSumAggregateFunction(const DataType& type, + FunctionContext* ctx) { + switch (type.id()) { + SUM_AGG_FN_CASE(UInt8Type); + SUM_AGG_FN_CASE(Int8Type); + SUM_AGG_FN_CASE(UInt16Type); +
SUM_AGG_FN_CASE(Int16Type); + SUM_AGG_FN_CASE(UInt32Type); + SUM_AGG_FN_CASE(Int32Type); + SUM_AGG_FN_CASE(UInt64Type); + SUM_AGG_FN_CASE(Int64Type); + SUM_AGG_FN_CASE(FloatType); + SUM_AGG_FN_CASE(DoubleType); + default: + return nullptr; + } + +#undef SUM_AGG_FN_CASE +} + +static Status GetSumKernel(FunctionContext* ctx, const DataType& type, + std::shared_ptr& kernel) { + std::shared_ptr aggregate = MakeSumAggregateFunction(type, ctx); + if (!aggregate) return Status::Invalid("No sum for type ", type); + + kernel = std::make_shared(aggregate); + + return Status::OK(); +} + +Status Sum(FunctionContext* ctx, const Datum& value, Datum* out) { + std::shared_ptr kernel; + + auto data_type = value.type(); + if (data_type == nullptr) + return Status::Invalid("Datum must be array-like"); + else if (!is_integer(data_type->id()) && !is_floating(data_type->id())) + return Status::Invalid("Datum must contain a NumericType"); + + RETURN_NOT_OK(GetSumKernel(ctx, *data_type, kernel)); + + return kernel->Call(ctx, value, out); +} + +Status Sum(FunctionContext* ctx, const Array& array, Datum* out) { + return Sum(ctx, Datum(array.data()), out); +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/sum.h b/cpp/src/arrow/compute/kernels/sum.h new file mode 100644 index 00000000000..2e2ca3c3d4b --- /dev/null +++ b/cpp/src/arrow/compute/kernels/sum.h @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_COMPUTE_KERNELS_SUM_H +#define ARROW_COMPUTE_KERNELS_SUM_H + +#include +#include + +#include "arrow/status.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; +class DataType; + +namespace compute { + +// Find the largest compatible primitive type for a primitive type. +template +struct FindAccumulatorType { + using Type = double; +}; + +template +struct FindAccumulatorType::value && + std::is_signed::value>::type> { + using Type = int64_t; +}; + +template +struct FindAccumulatorType::value && + std::is_unsigned::value>::type> { + using Type = uint64_t; +}; + +template +struct FindAccumulatorType< + I, typename std::enable_if::value>::type> { + using Type = double; +}; + +struct Datum; +class FunctionContext; +class AggregateFunction; + +ARROW_EXPORT +std::shared_ptr MakeSumAggregateFunction(const DataType& type, + FunctionContext* context); + +/// \brief Sum values of a numeric array. +/// +/// \param[in] context the FunctionContext +/// \param[in] value datum to sum, expecting Array or ChunkedArray +/// \param[out] out resulting datum +/// +/// \since 0.13.0 +/// \note API not yet finalized +ARROW_EXPORT +Status Sum(FunctionContext* context, const Datum& value, Datum* out); + +/// \brief Sum values of a numeric array. 
+/// +/// \param[in] context the FunctionContext +/// \param[in] array to sum +/// \param[out] out resulting datum +/// +/// \since 0.13.0 +/// \note API not yet finalized +ARROW_EXPORT +Status Sum(FunctionContext* context, const Array& array, Datum* out); + +} // namespace compute +} // namespace arrow + +#endif // ARROW_COMPUTE_KERNELS_SUM_H diff --git a/cpp/src/arrow/io/memory-benchmark.cc b/cpp/src/arrow/io/memory-benchmark.cc index b36be4de163..b37aff96649 100644 --- a/cpp/src/arrow/io/memory-benchmark.cc +++ b/cpp/src/arrow/io/memory-benchmark.cc @@ -32,49 +32,137 @@ namespace arrow { -static const int kNumCores = internal::CpuInfo::GetInstance()->num_cores(); +using internal::CpuInfo; +static CpuInfo* cpu_info = CpuInfo::GetInstance(); + +static const int kNumCores = cpu_info->num_cores(); +static const int64_t kL1Size = cpu_info->CacheSize(CpuInfo::L1_CACHE); +static const int64_t kL2Size = cpu_info->CacheSize(CpuInfo::L2_CACHE); +static const int64_t kL3Size = cpu_info->CacheSize(CpuInfo::L3_CACHE); + constexpr size_t kMemoryPerCore = 32 * 1024 * 1024; using BufferPtr = std::shared_ptr; +#ifdef ARROW_AVX512 + +using VectorType = __m512i; +#define VectorSet _mm512_set1_epi32 +#define VectorLoad _mm512_stream_load_si512 +#define VectorLoadAsm(SRC, DST) \ + asm volatile("vmovaps %[src], %[dst]" : [dst] "=v"(DST) : [src] "m"(SRC) :) +#define VectorStreamLoad _mm512_stream_load_si512 +#define VectorStreamLoadAsm(SRC, DST) \ + asm volatile("vmovntdqa %[src], %[dst]" : [dst] "=v"(DST) : [src] "m"(SRC) :) +#define VectorStreamWrite _mm512_stream_si512 + +#else + +#ifdef ARROW_AVX2 + +using VectorType = __m256i; +#define VectorSet _mm256_set1_epi32 +#define VectorLoad _mm256_stream_load_si256 +#define VectorLoadAsm(SRC, DST) \ + asm volatile("vmovaps %[src], %[dst]" : [dst] "=v"(DST) : [src] "m"(SRC) :) +#define VectorStreamLoad _mm256_stream_load_si256 +#define VectorStreamLoadAsm(SRC, DST) \ + asm volatile("vmovntdqa %[src], %[dst]" : [dst] "=v"(DST) : [src]
"m"(SRC) :) +#define VectorStreamWrite _mm256_stream_si256 + +#else + using VectorType = __m128i; +#define VectorSet _mm_set1_epi32 +#define VectorLoad _mm_stream_load_si128 +#define VectorLoadAsm(SRC, DST) \ + asm volatile("movaps %[src], %[dst]" : [dst] "=x"(DST) : [src] "m"(SRC) :) +#define VectorStreamLoad _mm_stream_load_si128 +#define VectorStreamLoadAsm(SRC, DST) \ + asm volatile("movntdqa %[src], %[dst]" : [dst] "=x"(DST) : [src] "m"(SRC) :) +#define VectorStreamWrite _mm_stream_si128 + +#endif +#endif + +static void Read(void* src, void* dst, size_t size) { + const auto simd = static_cast(src); + VectorType a, b, c, d; + (void)dst; + + for (size_t i = 0; i < size / sizeof(VectorType); i += 4) { + VectorLoadAsm(simd[i], a); + VectorLoadAsm(simd[i + 1], b); + VectorLoadAsm(simd[i + 2], c); + VectorLoadAsm(simd[i + 3], d); + } + + memset(&a, 0, sizeof(a)); + memset(&b, 0, sizeof(b)); + memset(&c, 0, sizeof(c)); + memset(&d, 0, sizeof(d)); + + benchmark::DoNotOptimize(a + b + c + d); +} // See http://codearcana.com/posts/2013/05/18/achieving-maximum-memory-bandwidth.html // for the usage of stream loads/writes. Or section 6.1, page 47 of // https://akkadia.org/drepper/cpumemory.pdf . 
- -static void Read(void* src, void* dst, size_t size) { +static void StreamRead(void* src, void* dst, size_t size) { auto simd = static_cast(src); + VectorType a, b, c, d; (void)dst; - for (size_t i = 0; i < size / sizeof(VectorType); i++) - benchmark::DoNotOptimize(_mm_stream_load_si128(&simd[i])); + memset(&a, 0, sizeof(a)); + memset(&b, 0, sizeof(b)); + memset(&c, 0, sizeof(c)); + memset(&d, 0, sizeof(d)); + + for (size_t i = 0; i < size / sizeof(VectorType); i += 4) { + VectorStreamLoadAsm(simd[i], a); + VectorStreamLoadAsm(simd[i + 1], b); + VectorStreamLoadAsm(simd[i + 2], c); + VectorStreamLoadAsm(simd[i + 3], d); + } + + benchmark::DoNotOptimize(a + b + c + d); } -static void Write(void* src, void* dst, size_t size) { +static void StreamWrite(void* src, void* dst, size_t size) { auto simd = static_cast(dst); - const VectorType ones = _mm_set1_epi32(1); + const VectorType ones = VectorSet(1); (void)src; - for (size_t i = 0; i < size / sizeof(VectorType); i++) _mm_stream_si128(&simd[i], ones); + for (size_t i = 0; i < size / sizeof(VectorType); i += 4) { + VectorStreamWrite(&simd[i], ones); + VectorStreamWrite(&simd[i + 1], ones); + VectorStreamWrite(&simd[i + 2], ones); + VectorStreamWrite(&simd[i + 3], ones); + } } -static void ReadWrite(void* src, void* dst, size_t size) { +static void StreamReadWrite(void* src, void* dst, size_t size) { auto src_simd = static_cast(src); auto dst_simd = static_cast(dst); - for (size_t i = 0; i < size / sizeof(VectorType); i++) - _mm_stream_si128(&dst_simd[i], _mm_stream_load_si128(&src_simd[i])); + for (size_t i = 0; i < size / sizeof(VectorType); i += 4) { + VectorStreamWrite(&dst_simd[i], VectorStreamLoad(&src_simd[i])); + VectorStreamWrite(&dst_simd[i + 1], VectorStreamLoad(&src_simd[i + 1])); + VectorStreamWrite(&dst_simd[i + 2], VectorStreamLoad(&src_simd[i + 2])); + VectorStreamWrite(&dst_simd[i + 3], VectorStreamLoad(&src_simd[i + 3])); + } } +static void PlatformMemcpy(void* src, void* dst, size_t size) { 
memcpy(dst, src, size); } + using ApplyFn = decltype(Read); template static void MemoryBandwidth(benchmark::State& state) { // NOLINT non-const reference - const size_t buffer_size = kMemoryPerCore; + const size_t buffer_size = state.range(0); BufferPtr src, dst; - ABORT_NOT_OK(AllocateBuffer(buffer_size, &src)); ABORT_NOT_OK(AllocateBuffer(buffer_size, &dst)); + ABORT_NOT_OK(AllocateBuffer(buffer_size, &src)); random_bytes(buffer_size, 0, src->mutable_data()); while (state.KeepRunning()) { @@ -84,11 +172,29 @@ static void MemoryBandwidth(benchmark::State& state) { // NOLINT non-const refe state.SetBytesProcessed(state.iterations() * buffer_size); } -// `UseRealTime` is required due to threads, otherwise the cumulative CPU time -// is used which will skew the results by the number of threads. -BENCHMARK_TEMPLATE(MemoryBandwidth, Read)->ThreadRange(1, kNumCores)->UseRealTime(); -BENCHMARK_TEMPLATE(MemoryBandwidth, Write)->ThreadRange(1, kNumCores)->UseRealTime(); -BENCHMARK_TEMPLATE(MemoryBandwidth, ReadWrite)->ThreadRange(1, kNumCores)->UseRealTime(); +static void SetCacheBandwidthArgs(benchmark::internal::Benchmark* bench) { + auto cache_sizes = {kL1Size, kL2Size, kL3Size}; + for (auto size : cache_sizes) { + bench->Arg(size / 2); + bench->Arg(size); + bench->Arg(size * 2); + } + + bench->ArgName("size"); +} + +BENCHMARK_TEMPLATE(MemoryBandwidth, Read)->Apply(SetCacheBandwidthArgs); + +static void SetMemoryBandwidthArgs(benchmark::internal::Benchmark* bench) { + // `UseRealTime` is required due to threads, otherwise the cumulative CPU time + // is used which will skew the results by the number of threads.
+ bench->Arg(kMemoryPerCore)->ThreadRange(1, kNumCores)->UseRealTime(); +} + +BENCHMARK_TEMPLATE(MemoryBandwidth, StreamRead)->Apply(SetMemoryBandwidthArgs); +BENCHMARK_TEMPLATE(MemoryBandwidth, StreamWrite)->Apply(SetMemoryBandwidthArgs); +BENCHMARK_TEMPLATE(MemoryBandwidth, StreamReadWrite)->Apply(SetMemoryBandwidthArgs); +BENCHMARK_TEMPLATE(MemoryBandwidth, PlatformMemcpy)->Apply(SetMemoryBandwidthArgs); static void ParallelMemoryCopy(benchmark::State& state) { // NOLINT non-const reference const int64_t n_threads = state.range(0); @@ -107,9 +213,12 @@ static void ParallelMemoryCopy(benchmark::State& state) { // NOLINT non-const r } state.SetBytesProcessed(int64_t(state.iterations()) * buffer_size); - state.counters["threads"] = static_cast(n_threads); } -BENCHMARK(ParallelMemoryCopy)->RangeMultiplier(2)->Range(1, kNumCores)->UseRealTime(); +BENCHMARK(ParallelMemoryCopy) + ->RangeMultiplier(2) + ->Range(1, kNumCores) + ->ArgName("threads") + ->UseRealTime(); } // namespace arrow diff --git a/cpp/src/arrow/test-random.h b/cpp/src/arrow/test-random.h index dc57dcab025..1ed8d03e8a1 100644 --- a/cpp/src/arrow/test-random.h +++ b/cpp/src/arrow/test-random.h @@ -22,6 +22,7 @@ #include #include +#include "arrow/type.h" #include "arrow/util/visibility.h" namespace arrow { @@ -158,6 +159,45 @@ class ARROW_EXPORT RandomArrayGenerator { std::shared_ptr Float64(int64_t size, double min, double max, double null_probability); + template + std::shared_ptr Numeric(int64_t size, CType min, CType max, + double null_probability) { + switch (ArrowType::type_id) { + case Type::UINT8: + return UInt8(size, static_cast(min), static_cast(max), + null_probability); + case Type::INT8: + return Int8(size, static_cast(min), static_cast(max), + null_probability); + case Type::UINT16: + return UInt16(size, static_cast(min), static_cast(max), + null_probability); + case Type::INT16: + return Int16(size, static_cast(min), static_cast(max), + null_probability); + case Type::UINT32: + return 
UInt32(size, static_cast(min), static_cast(max), + null_probability); + case Type::INT32: + return Int32(size, static_cast(min), static_cast(max), + null_probability); + case Type::UINT64: + return UInt64(size, static_cast(min), static_cast(max), + null_probability); + case Type::INT64: + return Int64(size, static_cast(min), static_cast(max), + null_probability); + case Type::FLOAT: + return Float32(size, static_cast(min), static_cast(max), + null_probability); + case Type::DOUBLE: + return Float64(size, static_cast(min), static_cast(max), + null_probability); + default: + return nullptr; + } + } + private: SeedType seed() { return seed_distribution_(seed_rng_); } diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index bfdb44f255c..0e4a8640b14 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -81,6 +81,23 @@ typename std::make_unsigned::type as_unsigned(Integer x) { namespace BitUtil { +// The number of set bits in a given unsigned byte value, pre-computed +// +// Generated with the following Python code +// output = 'static constexpr uint8_t kBytePopcount[] = {{{0}}};' +// popcounts = [str(bin(i).count('1')) for i in range(0, 256)] +// print(output.format(', '.join(popcounts))) +static constexpr uint8_t kBytePopcount[] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, + 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, + 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, + 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, + 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, + 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, + 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, + 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 
5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; + // // Bit-related computations on integer values // @@ -117,6 +134,11 @@ constexpr int64_t RoundUp(int64_t value, int64_t factor) { return (value + (factor - 1)) / factor * factor; } +// Returns 'value' rounded down to the nearest multiple of 'factor' +constexpr int64_t RoundDown(int64_t value, int64_t factor) { + return (value / factor) * factor; +} + // Returns 'value' rounded up to the nearest multiple of 'factor' when factor // is a power of two. // The result is undefined on overflow, i.e. if `value > 2**64 - factor`,