diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc index e593cf7e6c4..6892e5f0a91 100644 --- a/cpp/src/arrow/array/array_binary_test.cc +++ b/cpp/src/arrow/array/array_binary_test.cc @@ -43,11 +43,6 @@ namespace arrow { using internal::checked_cast; -using StringTypes = - ::testing::Types; - -using UTF8Types = ::testing::Types; - // ---------------------------------------------------------------------- // String / Binary tests @@ -329,7 +324,7 @@ class TestStringArray : public ::testing::Test { std::shared_ptr strings_; }; -TYPED_TEST_SUITE(TestStringArray, StringTypes); +TYPED_TEST_SUITE(TestStringArray, BinaryArrowTypes); TYPED_TEST(TestStringArray, TestArrayBasics) { this->TestArrayBasics(); } @@ -386,7 +381,7 @@ class TestUTF8Array : public ::testing::Test { } }; -TYPED_TEST_SUITE(TestUTF8Array, UTF8Types); +TYPED_TEST_SUITE(TestUTF8Array, StringArrowTypes); TYPED_TEST(TestUTF8Array, TestValidateUTF8) { this->TestValidateUTF8(); } @@ -666,7 +661,7 @@ class TestStringBuilder : public TestBuilder { std::shared_ptr result_; }; -TYPED_TEST_SUITE(TestStringBuilder, StringTypes); +TYPED_TEST_SUITE(TestStringBuilder, BinaryArrowTypes); TYPED_TEST(TestStringBuilder, TestScalarAppend) { this->TestScalarAppend(); } @@ -896,7 +891,7 @@ class TestBinaryDataVisitor : public ::testing::Test { std::shared_ptr type_; }; -TYPED_TEST_SUITE(TestBinaryDataVisitor, StringTypes); +TYPED_TEST_SUITE(TestBinaryDataVisitor, BinaryArrowTypes); TYPED_TEST(TestBinaryDataVisitor, Basics) { this->TestBasics(); } diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index ebeaa0f6397..305910c247c 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -133,10 +133,7 @@ class PrimitiveConcatenateTest : public ConcatenateTest { public: }; -using PrimitiveTypes = - ::testing::Types; -TYPED_TEST_SUITE(PrimitiveConcatenateTest, PrimitiveTypes); +TYPED_TEST_SUITE(PrimitiveConcatenateTest, PrimitiveArrowTypes); TYPED_TEST(PrimitiveConcatenateTest, Primitives) { this->Check([this](int64_t size, double null_probability, std::shared_ptr* out) { diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index b19536d33ab..323d1e7ca9e 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -607,10 +607,7 @@ void AddMinMaxKernels(KernelInit init, const std::vector>& types, ScalarAggregateFunction* func, SimdLevel::type simd_level) { for (const auto& ty : types) { - // any[T] -> scalar[struct] - auto out_ty = struct_({field("min", ty), field("max", ty)}); - auto sig = KernelSignature::Make({InputType(ty->id())}, ValueDescr::Scalar(out_ty)); - AddAggKernel(std::move(sig), init, func, simd_level); + AddMinMaxKernel(init, ty, func, simd_level); } } @@ -764,8 +761,12 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { func = std::make_shared( "min_max", Arity::Unary(), &min_max_doc, &default_scalar_aggregate_options); - aggregate::AddMinMaxKernels(aggregate::MinMaxInit, {boolean()}, func.get()); + aggregate::AddMinMaxKernels(aggregate::MinMaxInit, {null(), boolean()}, func.get()); aggregate::AddMinMaxKernels(aggregate::MinMaxInit, NumericTypes(), func.get()); + aggregate::AddMinMaxKernels(aggregate::MinMaxInit, TemporalTypes(), func.get()); + aggregate::AddMinMaxKernels(aggregate::MinMaxInit, BaseBinaryTypes(), func.get()); + aggregate::AddMinMaxKernel(aggregate::MinMaxInit, Type::FIXED_SIZE_BINARY, func.get()); + aggregate::AddMinMaxKernel(aggregate::MinMaxInit, Type::INTERVAL_MONTHS, func.get()); aggregate::AddMinMaxKernel(aggregate::MinMaxInit, Type::DECIMAL128, func.get()); aggregate::AddMinMaxKernel(aggregate::MinMaxInit, Type::DECIMAL256, func.get()); // Add the SIMD variants for min max diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_avx2.cc b/cpp/src/arrow/compute/kernels/aggregate_basic_avx2.cc index 55e9f290e0e..dd12d5244f5 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_avx2.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_avx2.cc @@ -55,30 +55,32 @@ Result> MeanInitAvx2(KernelContext* ctx, Result> MinMaxInitAvx2(KernelContext* ctx, const KernelInitArgs& args) { + ARROW_ASSIGN_OR_RAISE(auto out_type, + args.kernel->signature->out_type().Resolve(ctx, args.inputs)); MinMaxInitState visitor( - ctx, *args.inputs[0].type, args.kernel->signature->out_type().type(), + ctx, *args.inputs[0].type, std::move(out_type.type), static_cast(*args.options)); return visitor.Create(); } void AddSumAvx2AggKernels(ScalarAggregateFunction* func) { - AddBasicAggKernels(SumInitAvx2, internal::SignedIntTypes(), int64(), func, - SimdLevel::AVX2); - AddBasicAggKernels(SumInitAvx2, internal::UnsignedIntTypes(), uint64(), func, - SimdLevel::AVX2); - AddBasicAggKernels(SumInitAvx2, internal::FloatingPointTypes(), float64(), func, - SimdLevel::AVX2); + AddBasicAggKernels(SumInitAvx2, SignedIntTypes(), int64(), func, SimdLevel::AVX2); + AddBasicAggKernels(SumInitAvx2, UnsignedIntTypes(), uint64(), func, SimdLevel::AVX2); + AddBasicAggKernels(SumInitAvx2, FloatingPointTypes(), float64(), func, SimdLevel::AVX2); } void AddMeanAvx2AggKernels(ScalarAggregateFunction* func) { - AddBasicAggKernels(MeanInitAvx2, internal::NumericTypes(), float64(), func, - SimdLevel::AVX2); + AddBasicAggKernels(MeanInitAvx2, NumericTypes(), float64(), func, SimdLevel::AVX2); } void AddMinMaxAvx2AggKernels(ScalarAggregateFunction* func) { // Enable int types for AVX2 variants. // No auto vectorize for float/double as it use fmin/fmax which has NaN handling. - AddMinMaxKernels(MinMaxInitAvx2, internal::IntTypes(), func, SimdLevel::AVX2); + AddMinMaxKernels(MinMaxInitAvx2, IntTypes(), func, SimdLevel::AVX2); + AddMinMaxKernels(MinMaxInitAvx2, TemporalTypes(), func, SimdLevel::AVX2); + AddMinMaxKernels(MinMaxInitAvx2, BaseBinaryTypes(), func, SimdLevel::AVX2); + AddMinMaxKernel(MinMaxInitAvx2, Type::FIXED_SIZE_BINARY, func, SimdLevel::AVX2); + AddMinMaxKernel(MinMaxInitAvx2, Type::INTERVAL_MONTHS, func, SimdLevel::AVX2); } } // namespace aggregate diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_avx512.cc b/cpp/src/arrow/compute/kernels/aggregate_basic_avx512.cc index df33dedabba..ebe748d685d 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_avx512.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_avx512.cc @@ -55,23 +55,24 @@ Result> MeanInitAvx512(KernelContext* ctx, Result> MinMaxInitAvx512(KernelContext* ctx, const KernelInitArgs& args) { + ARROW_ASSIGN_OR_RAISE(auto out_type, + args.kernel->signature->out_type().Resolve(ctx, args.inputs)); MinMaxInitState visitor( - ctx, *args.inputs[0].type, args.kernel->signature->out_type().type(), + ctx, *args.inputs[0].type, std::move(out_type.type), static_cast(*args.options)); return visitor.Create(); } void AddSumAvx512AggKernels(ScalarAggregateFunction* func) { - AddBasicAggKernels(SumInitAvx512, internal::SignedIntTypes(), int64(), func, + AddBasicAggKernels(SumInitAvx512, SignedIntTypes(), int64(), func, SimdLevel::AVX512); + AddBasicAggKernels(SumInitAvx512, UnsignedIntTypes(), uint64(), func, SimdLevel::AVX512); - AddBasicAggKernels(SumInitAvx512, internal::UnsignedIntTypes(), uint64(), func, - SimdLevel::AVX512); - AddBasicAggKernels(SumInitAvx512, internal::FloatingPointTypes(), float64(), func, + AddBasicAggKernels(SumInitAvx512, FloatingPointTypes(), float64(), func, SimdLevel::AVX512); } void AddMeanAvx512AggKernels(ScalarAggregateFunction* func) { - aggregate::AddBasicAggKernels(MeanInitAvx512, internal::NumericTypes(), float64(), func, + aggregate::AddBasicAggKernels(MeanInitAvx512, NumericTypes(), float64(), func, SimdLevel::AVX512); } @@ -79,6 +80,10 @@ void AddMinMaxAvx512AggKernels(ScalarAggregateFunction* func) { // Enable 32/64 int types for avx512 variants, no advantage on 8/16 int. AddMinMaxKernels(MinMaxInitAvx512, {int32(), uint32(), int64(), uint64()}, func, SimdLevel::AVX512); + AddMinMaxKernels(MinMaxInitAvx512, TemporalTypes(), func, SimdLevel::AVX512); + AddMinMaxKernels(MinMaxInitAvx512, BaseBinaryTypes(), func, SimdLevel::AVX2); + AddMinMaxKernel(MinMaxInitAvx512, Type::FIXED_SIZE_BINARY, func, SimdLevel::AVX2); + AddMinMaxKernel(MinMaxInitAvx512, Type::INTERVAL_MONTHS, func, SimdLevel::AVX512); } } // namespace aggregate diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index b97af066585..f5ea9a0d65a 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -209,7 +209,6 @@ struct MinMaxState> { ThisType& operator+=(const ThisType& rhs) { this->has_nulls |= rhs.has_nulls; - this->has_values |= rhs.has_values; this->min = this->min && rhs.min; this->max = this->max || rhs.max; return *this; @@ -223,17 +222,16 @@ struct MinMaxState> { T min = true; T max = false; bool has_nulls = false; - bool has_values = false; }; template struct MinMaxState> { using ThisType = MinMaxState; using T = typename ArrowType::c_type; + using ScalarType = typename TypeTraits::ScalarType; ThisType& operator+=(const ThisType& rhs) { this->has_nulls |= rhs.has_nulls; - this->has_values |= rhs.has_values; this->min = std::min(this->min, rhs.min); this->max = std::max(this->max, rhs.max); return *this; @@ -247,17 +245,16 @@ struct MinMaxState> { T min = std::numeric_limits::max(); T max = std::numeric_limits::min(); bool has_nulls = false; - bool has_values = false; }; template struct MinMaxState> { using ThisType = MinMaxState; using T = typename ArrowType::c_type; + using ScalarType = typename TypeTraits::ScalarType; ThisType& operator+=(const ThisType& rhs) { this->has_nulls |= rhs.has_nulls; - this->has_values |= rhs.has_values; this->min = std::fmin(this->min, rhs.min); this->max = std::fmax(this->max, rhs.max); return *this; @@ -271,25 +268,26 @@ struct MinMaxState> { T min = std::numeric_limits::infinity(); T max = -std::numeric_limits::infinity(); bool has_nulls = false; - bool has_values = false; }; template struct MinMaxState> { using ThisType = MinMaxState; using T = typename TypeTraits::CType; + using ScalarType = typename TypeTraits::ScalarType; MinMaxState() : min(T::GetMaxSentinel()), max(T::GetMinSentinel()) {} ThisType& operator+=(const ThisType& rhs) { this->has_nulls |= rhs.has_nulls; - this->has_values |= rhs.has_values; this->min = std::min(this->min, rhs.min); this->max = std::max(this->max, rhs.max); return *this; } - void MergeOne(const uint8_t* value) { MergeOne(T(value)); } + void MergeOne(util::string_view value) { + MergeOne(T(reinterpret_cast(value.data()))); + } void MergeOne(const T value) { this->min = std::min(this->min, value); @@ -299,7 +297,50 @@ struct MinMaxState> { T min; T max; bool has_nulls = false; - bool has_values = false; +}; + +template +struct MinMaxState::value || + std::is_same::value>> { + using ThisType = MinMaxState; + using ScalarType = typename TypeTraits::ScalarType; + + ThisType& operator+=(const ThisType& rhs) { + if (!this->seen && rhs.seen) { + this->min = rhs.min; + this->max = rhs.max; + } else if (this->seen && rhs.seen) { + if (this->min > rhs.min) { + this->min = rhs.min; + } + if (this->max < rhs.max) { + this->max = rhs.max; + } + } + this->has_nulls |= rhs.has_nulls; + this->seen |= rhs.seen; + return *this; + } + + void MergeOne(util::string_view value) { + if (!seen) { + this->min = std::string(value); + this->max = std::string(value); + } else { + if (value < util::string_view(this->min)) { + this->min = std::string(value); + } else if (value > util::string_view(this->max)) { + this->max = std::string(value); + } + } + this->seen = true; + } + + std::string min; + std::string max; + bool has_nulls = false; + bool seen = false; }; template @@ -309,7 +350,9 @@ struct MinMaxImpl : public ScalarAggregator { using StateType = MinMaxState; MinMaxImpl(std::shared_ptr out_type, ScalarAggregateOptions options) - : out_type(std::move(out_type)), options(std::move(options)) {} + : out_type(std::move(out_type)), options(std::move(options)), count(0) { + this->options.min_count = std::max(1, this->options.min_count); + } Status Consume(KernelContext*, const ExecBatch& batch) override { if (batch[0].is_array()) { @@ -321,7 +364,7 @@ struct MinMaxImpl : public ScalarAggregator { Status ConsumeScalar(const Scalar& scalar) { StateType local; local.has_nulls = !scalar.is_valid; - local.has_values = scalar.is_valid; + this->count += scalar.is_valid; if (local.has_nulls && !options.skip_nulls) { this->state = local; @@ -338,7 +381,7 @@ struct MinMaxImpl : public ScalarAggregator { const auto null_count = arr.null_count(); local.has_nulls = null_count > 0; - local.has_values = (arr.length() - null_count) > 0; + this->count += arr.length() - null_count; if (local.has_nulls && !options.skip_nulls) { this->state = local; @@ -349,7 +392,7 @@ struct MinMaxImpl : public ScalarAggregator { local += ConsumeWithNulls(arr); } else { // All true values for (int64_t i = 0; i < arr.length(); i++) { - local.MergeOne(arr.Value(i)); + local.MergeOne(arr.GetView(i)); } } this->state = local; @@ -359,23 +402,26 @@ struct MinMaxImpl : public ScalarAggregator { Status MergeFrom(KernelContext*, KernelState&& src) override { const auto& other = checked_cast(src); this->state += other.state; + this->count += other.count; return Status::OK(); } Status Finalize(KernelContext*, Datum* out) override { - using ScalarType = typename TypeTraits::ScalarType; - const auto& struct_type = checked_cast(*out_type); const auto& child_type = struct_type.field(0)->type(); std::vector> values; - if (!state.has_values || (state.has_nulls && !options.skip_nulls)) { + // Physical type != result type + if ((state.has_nulls && !options.skip_nulls) || (this->count < options.min_count)) { // (null, null) - values = {std::make_shared(child_type), - std::make_shared(child_type)}; + auto null_scalar = MakeNullScalar(child_type); + values = {null_scalar, null_scalar}; } else { - values = {std::make_shared(state.min, child_type), - std::make_shared(state.max, child_type)}; + ARROW_ASSIGN_OR_RAISE(auto min_scalar, + MakeScalar(child_type, std::move(state.min))); + ARROW_ASSIGN_OR_RAISE(auto max_scalar, + MakeScalar(child_type, std::move(state.max))); + values = {std::move(min_scalar), std::move(max_scalar)}; } out->value = std::make_shared(std::move(values), this->out_type); return Status::OK(); @@ -383,6 +429,7 @@ struct MinMaxImpl : public ScalarAggregator { std::shared_ptr out_type; ScalarAggregateOptions options; + int64_t count; MinMaxState state; private: @@ -398,7 +445,7 @@ struct MinMaxImpl : public ScalarAggregator { const int64_t leading_bits = p.leading_bits; while (idx < leading_bits) { if (BitUtil::GetBit(bitmap, offset)) { - local.MergeOne(arr.Value(idx)); + local.MergeOne(arr.GetView(idx)); } idx++; offset++; @@ -416,7 +463,7 @@ struct MinMaxImpl : public ScalarAggregator { current_block = data_counter.NextWord(); } for (int64_t i = 0; i < run_length; i++) { - local.MergeOne(arr.Value(idx + i)); + local.MergeOne(arr.GetView(idx + i)); } idx += run_length; offset += run_length; @@ -426,7 +473,7 @@ struct MinMaxImpl : public ScalarAggregator { BitmapReader reader(arr.null_bitmap_data(), offset, current_block.length); for (int64_t i = 0; i < current_block.length; i++) { if (reader.IsSet()) { - local.MergeOne(arr.Value(idx + i)); + local.MergeOne(arr.GetView(idx + i)); } reader.Next(); } @@ -463,7 +510,7 @@ struct BooleanMinMaxImpl : public MinMaxImpl { const auto valid_count = arr_length - null_count; local.has_nulls = null_count > 0; - local.has_values = valid_count > 0; + this->count += valid_count; if (local.has_nulls && !options.skip_nulls) { this->state = local; return Status::OK(); @@ -482,7 +529,7 @@ struct BooleanMinMaxImpl : public MinMaxImpl { StateType local; local.has_nulls = !scalar.is_valid; - local.has_values = scalar.is_valid; + this->count += scalar.is_valid; if (local.has_nulls && !options.skip_nulls) { this->state = local; return Status::OK(); @@ -498,6 +545,20 @@ struct BooleanMinMaxImpl : public MinMaxImpl { } }; +struct NullMinMaxImpl : public ScalarAggregator { + Status Consume(KernelContext*, const ExecBatch& batch) override { return Status::OK(); } + + Status MergeFrom(KernelContext*, KernelState&& src) override { return Status::OK(); } + + Status Finalize(KernelContext*, Datum* out) override { + std::vector> values{std::make_shared(), + std::make_shared()}; + out->value = std::make_shared( + std::move(values), struct_({field("min", null()), field("max", null())})); + return Status::OK(); + } +}; + template struct MinMaxInitState { std::unique_ptr state; @@ -511,12 +572,17 @@ struct MinMaxInitState { const ScalarAggregateOptions& options) : ctx(ctx), in_type(in_type), out_type(out_type), options(options) {} - Status Visit(const DataType&) { - return Status::NotImplemented("No min/max implemented"); + Status Visit(const DataType& ty) { + return Status::NotImplemented("No min/max implemented for ", ty); } - Status Visit(const HalfFloatType&) { - return Status::NotImplemented("No min/max implemented"); + Status Visit(const HalfFloatType& ty) { + return Status::NotImplemented("No min/max implemented for ", ty); + } + + Status Visit(const NullType&) { + state.reset(new NullMinMaxImpl()); + return Status::OK(); } Status Visit(const BooleanType&) { @@ -525,13 +591,26 @@ struct MinMaxInitState { } template - enable_if_number Visit(const Type&) { + enable_if_physical_integer Visit(const Type&) { + using PhysicalType = typename Type::PhysicalType; + state.reset(new MinMaxImpl(out_type, options)); + return Status::OK(); + } + + template + enable_if_floating_point Visit(const Type&) { state.reset(new MinMaxImpl(out_type, options)); return Status::OK(); } template - enable_if_decimal Visit(const Type&) { + enable_if_base_binary Visit(const Type&) { + state.reset(new MinMaxImpl(out_type, options)); + return Status::OK(); + } + + template + enable_if_fixed_size_binary Visit(const Type&) { state.reset(new MinMaxImpl(out_type, options)); return Status::OK(); } diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index fcf48e25a92..54405b229a9 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -1215,7 +1215,9 @@ class TestPrimitiveMinMaxKernel : public ::testing::Test { AssertMinMaxIsNull(array, options); } - std::shared_ptr type_singleton() { return Traits::type_singleton(); } + std::shared_ptr type_singleton() { + return default_type_instance(); + } }; template @@ -1225,6 +1227,8 @@ template class TestFloatingMinMaxKernel : public TestPrimitiveMinMaxKernel {}; class TestBooleanMinMaxKernel : public TestPrimitiveMinMaxKernel {}; +class TestDayTimeIntervalMinMaxKernel + : public TestPrimitiveMinMaxKernel {}; TEST_F(TestBooleanMinMaxKernel, Basics) { ScalarAggregateOptions options; @@ -1270,20 +1274,20 @@ TEST_F(TestBooleanMinMaxKernel, Basics) { options = ScalarAggregateOptions(/*skip_nulls=*/false, /*min_count=*/2); EXPECT_THAT(MinMax(MakeNullScalar(boolean()), options), ResultWith(null_min_max)); - EXPECT_THAT(MinMax(MakeScalar(true), options), ResultWith(true_min_max)); + EXPECT_THAT(MinMax(MakeScalar(true), options), ResultWith(null_min_max)); options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/0); this->AssertMinMaxIsNull("[]", options); this->AssertMinMaxIsNull("[null]", options); } -TYPED_TEST_SUITE(TestIntegerMinMaxKernel, IntegralArrowTypes); +TYPED_TEST_SUITE(TestIntegerMinMaxKernel, PhysicalIntegralArrowTypes); TYPED_TEST(TestIntegerMinMaxKernel, Basics) { ScalarAggregateOptions options; std::vector chunked_input1 = {"[5, 1, 2, 3, 4]", "[9, 1, null, 3, 4]"}; std::vector chunked_input2 = {"[5, null, 2, 3, 4]", "[9, 1, 2, 3, 4]"}; std::vector chunked_input3 = {"[5, 1, 2, 3, null]", "[9, 1, null, 3, 4]"}; - auto item_ty = TypeTraits::type_singleton(); + auto item_ty = default_type_instance(); auto ty = struct_({field("min", item_ty), field("max", item_ty)}); // SKIP nulls by default @@ -1303,14 +1307,22 @@ TYPED_TEST(TestIntegerMinMaxKernel, Basics) { EXPECT_THAT(MinMax(MakeNullScalar(item_ty)), ResultWith(null_min_max)); EXPECT_THAT(MinMax(one_scalar), ResultWith(one_min_max)); - options = ScalarAggregateOptions(/*skip_nulls=*/false); + options = ScalarAggregateOptions(/*skip_nulls=*/false, /*min_count=*/1); this->AssertMinMaxIs("[5, 1, 2, 3, 4]", 1, 5, options); - // output null this->AssertMinMaxIsNull("[5, null, 2, 3, 4]", options); - // output null this->AssertMinMaxIsNull(chunked_input1, options); this->AssertMinMaxIsNull(chunked_input2, options); this->AssertMinMaxIsNull(chunked_input3, options); + + options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/5); + this->AssertMinMaxIs("[5, 1, 2, 3, 4]", 1, 5, options); + this->AssertMinMaxIsNull("[5, null, 2, 3, 4]", options); + this->AssertMinMaxIs(chunked_input1, 1, 9, options); + + options = ScalarAggregateOptions(/*skip_nulls=*/false, /*min_count=*/5); + this->AssertMinMaxIs("[5, 1, 2, 3, 4]", 1, 5, options); + this->AssertMinMaxIsNull("[5, null, 2, 3, 4]", options); + this->AssertMinMaxIsNull(chunked_input1, options); } TYPED_TEST_SUITE(TestFloatingMinMaxKernel, RealArrowTypes); @@ -1342,11 +1354,8 @@ TYPED_TEST(TestFloatingMinMaxKernel, Floats) { options = ScalarAggregateOptions(/*skip_nulls=*/false); this->AssertMinMaxIs("[5, 1, 2, 3, 4]", 1, 5, options); this->AssertMinMaxIs("[5, -Inf, 2, 3, 4]", -INFINITY, 5, options); - // output null this->AssertMinMaxIsNull("[5, null, 2, 3, 4]", options); - // output null this->AssertMinMaxIsNull("[5, -Inf, null, 3, 4]", options); - // output null this->AssertMinMaxIsNull(chunked_input1, options); this->AssertMinMaxIsNull(chunked_input2, options); this->AssertMinMaxIsNull(chunked_input3, options); @@ -1358,6 +1367,16 @@ TYPED_TEST(TestFloatingMinMaxKernel, Floats) { options = ScalarAggregateOptions(/*skip_nulls=*/false, /*min_count=*/1); this->AssertMinMaxIsNull("[]", options); this->AssertMinMaxIsNull("[null]", options); + + options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/5); + this->AssertMinMaxIs("[5, 1, 2, 3, 4]", 1, 5, options); + this->AssertMinMaxIsNull("[5, null, 2, 3, 4]", options); + this->AssertMinMaxIs(chunked_input1, 1, 9, options); + + options = ScalarAggregateOptions(/*skip_nulls=*/false, /*min_count=*/5); + this->AssertMinMaxIs("[5, 1, 2, 3, 4]", 1, 5, options); + this->AssertMinMaxIsNull("[5, null, 2, 3, 4]", options); + this->AssertMinMaxIsNull(chunked_input1, options); } TYPED_TEST(TestFloatingMinMaxKernel, DefaultOptions) { @@ -1418,7 +1437,6 @@ TEST(TestDecimalMinMaxKernel, Decimals) { MinMax(ArrayFromJSON(item_ty, R"(["5.10", "-1.23", "2.00", "3.45", "4.56"])"), options), ResultWith(ScalarFromJSON(ty, R"({"min": "-1.23", "max": "5.10"})"))); - // output null EXPECT_THAT( MinMax(ArrayFromJSON(item_ty, R"(["5.10", null, "2.00", "3.45", "4.56"])"), options), @@ -1436,6 +1454,16 @@ TEST(TestDecimalMinMaxKernel, Decimals) { EXPECT_THAT(MinMax(ArrayFromJSON(item_ty, R"([null])"), options), ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); + options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/5); + EXPECT_THAT(MinMax(ArrayFromJSON( + item_ty, R"(["5.10", "-1.23", "2.00", "3.45", "4.56", null])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": "-1.23", "max": "5.10"})"))); + EXPECT_THAT( + MinMax(ArrayFromJSON(item_ty, R"(["5.10", "-1.23", "2.00", "3.45", null])"), + options), + ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); + options = ScalarAggregateOptions(/*skip_nulls=*/false, /*min_count=*/1); EXPECT_THAT(MinMax(ArrayFromJSON(item_ty, R"([])"), options), ResultWith(ScalarFromJSON(ty, R"({"min": null, "max": null})"))); @@ -1444,6 +1472,134 @@ TEST(TestDecimalMinMaxKernel, Decimals) { } } +TEST(TestNullMinMaxKernel, Basics) { + auto item_ty = null(); + auto ty = struct_({field("min", item_ty), field("max", item_ty)}); + Datum result = ScalarFromJSON(ty, "[null, null]"); + EXPECT_THAT(MinMax(ScalarFromJSON(item_ty, "null")), ResultWith(result)); + EXPECT_THAT(MinMax(ArrayFromJSON(item_ty, "[]")), ResultWith(result)); + EXPECT_THAT(MinMax(ArrayFromJSON(item_ty, "[null]")), ResultWith(result)); + EXPECT_THAT(MinMax(ChunkedArrayFromJSON(item_ty, {"[null]", "[]", "[null, null]"})), + ResultWith(result)); +} + +template +class TestBaseBinaryMinMaxKernel : public ::testing::Test {}; +TYPED_TEST_SUITE(TestBaseBinaryMinMaxKernel, BinaryArrowTypes); +TYPED_TEST(TestBaseBinaryMinMaxKernel, Basics) { + std::vector chunked_input1 = {R"(["cc", "", "aa", "b", "c"])", + R"(["d", "", null, "b", "c"])"}; + std::vector chunked_input2 = {R"(["cc", null, "aa", "b", "c"])", + R"(["d", "", "aa", "b", "c"])"}; + std::vector chunked_input3 = {R"(["cc", "", "aa", "b", null])", + R"(["d", "", null, "b", "c"])"}; + auto ty = std::make_shared(); + auto res_ty = struct_({field("min", ty), field("max", ty)}); + Datum null = ScalarFromJSON(res_ty, R"([null, null])"); + + // SKIP nulls by default + EXPECT_THAT(MinMax(ArrayFromJSON(ty, R"([])")), ResultWith(null)); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, R"([null, null, null])")), ResultWith(null)); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input1[0])), + ResultWith(ScalarFromJSON(res_ty, R"(["", "cc"])"))); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input2[0])), + ResultWith(ScalarFromJSON(res_ty, R"(["aa", "cc"])"))); + EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input1)), + ResultWith(ScalarFromJSON(res_ty, R"(["", "d"])"))); + EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input2)), + ResultWith(ScalarFromJSON(res_ty, R"(["", "d"])"))); + EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input3)), + ResultWith(ScalarFromJSON(res_ty, R"(["", "d"])"))); + + EXPECT_THAT(MinMax(MakeNullScalar(ty)), ResultWith(null)); + EXPECT_THAT(MinMax(ScalarFromJSON(ty, R"("one")")), + ResultWith(ScalarFromJSON(res_ty, R"(["one", "one"])"))); + + ScalarAggregateOptions options(/*skip_nulls=*/false); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input1[0]), options), + ResultWith(ScalarFromJSON(res_ty, R"(["", "cc"])"))); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input2[0]), options), ResultWith(null)); + EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input1), options), + ResultWith(null)); + EXPECT_THAT(MinMax(MakeNullScalar(ty), options), ResultWith(null)); + EXPECT_THAT(MinMax(ScalarFromJSON(ty, R"("one")"), options), + ResultWith(ScalarFromJSON(res_ty, R"(["one", "one"])"))); + + options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/9); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input1[0]), options), ResultWith(null)); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input2[0]), options), ResultWith(null)); + EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input1), options), + ResultWith(ScalarFromJSON(res_ty, R"(["", "d"])"))); + EXPECT_THAT(MinMax(MakeNullScalar(ty), options), ResultWith(null)); + EXPECT_THAT(MinMax(ScalarFromJSON(ty, R"("one")"), options), ResultWith(null)); + + options = ScalarAggregateOptions(/*skip_nulls=*/false, /*min_count=*/4); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input1[0]), options), + ResultWith(ScalarFromJSON(res_ty, R"(["", "cc"])"))); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input2[0]), options), ResultWith(null)); + EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input1), options), + ResultWith(null)); + EXPECT_THAT(MinMax(MakeNullScalar(ty), options), ResultWith(null)); + EXPECT_THAT(MinMax(ScalarFromJSON(ty, R"("one")"), options), ResultWith(null)); +} + +TEST(TestFixedSizeBinaryMinMaxKernel, Basics) { + auto ty = fixed_size_binary(2); + std::vector chunked_input1 = {R"(["cd", "aa", "ab", "bb", "cc"])", + R"(["da", "aa", null, "bb", "cc"])"}; + std::vector chunked_input2 = {R"(["cd", null, "ab", "bb", "cc"])", + R"(["da", "aa", "ab", "bb", "cc"])"}; + std::vector chunked_input3 = {R"(["cd", "aa", "ab", "bb", null])", + R"(["da", "aa", null, "bb", "cc"])"}; + auto res_ty = struct_({field("min", ty), field("max", ty)}); + Datum null = ScalarFromJSON(res_ty, R"([null, null])"); + + // SKIP nulls by default + EXPECT_THAT(MinMax(ArrayFromJSON(ty, R"([])")), ResultWith(null)); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, R"([null, null, null])")), ResultWith(null)); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input1[0])), + ResultWith(ScalarFromJSON(res_ty, R"(["aa", "cd"])"))); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input2[0])), + ResultWith(ScalarFromJSON(res_ty, R"(["ab", "cd"])"))); + EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input1)), + ResultWith(ScalarFromJSON(res_ty, R"(["aa", "da"])"))); + EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input2)), + ResultWith(ScalarFromJSON(res_ty, R"(["aa", "da"])"))); + EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input3)), + ResultWith(ScalarFromJSON(res_ty, R"(["aa", "da"])"))); + + EXPECT_THAT(MinMax(MakeNullScalar(ty)), ResultWith(null)); + EXPECT_THAT(MinMax(ScalarFromJSON(ty, R"("aa")")), + ResultWith(ScalarFromJSON(res_ty, R"(["aa", "aa"])"))); + + ScalarAggregateOptions options(/*skip_nulls=*/false); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input1[0]), options), + ResultWith(ScalarFromJSON(res_ty, R"(["aa", "cd"])"))); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input2[0]), options), ResultWith(null)); + EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input1), options), + ResultWith(null)); + EXPECT_THAT(MinMax(MakeNullScalar(ty), options), ResultWith(null)); + EXPECT_THAT(MinMax(ScalarFromJSON(ty, R"("aa")"), options), + ResultWith(ScalarFromJSON(res_ty, R"(["aa", "aa"])"))); + + options = ScalarAggregateOptions(/*skip_nulls=*/true, /*min_count=*/9); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input1[0]), options), ResultWith(null)); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input2[0]), options), ResultWith(null)); + EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input1), options), + ResultWith(ScalarFromJSON(res_ty, R"(["aa", "da"])"))); + EXPECT_THAT(MinMax(MakeNullScalar(ty), options), ResultWith(null)); + EXPECT_THAT(MinMax(ScalarFromJSON(ty, R"("aa")"), options), ResultWith(null)); + + options = ScalarAggregateOptions(/*skip_nulls=*/false, /*min_count=*/4); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input1[0]), options), + ResultWith(ScalarFromJSON(res_ty, R"(["aa", "cd"])"))); + EXPECT_THAT(MinMax(ArrayFromJSON(ty, chunked_input2[0]), options), ResultWith(null)); + EXPECT_THAT(MinMax(ChunkedArrayFromJSON(ty, chunked_input1), options), + ResultWith(null)); + EXPECT_THAT(MinMax(MakeNullScalar(ty), options), ResultWith(null)); + EXPECT_THAT(MinMax(ScalarFromJSON(ty, R"("aa")"), options), ResultWith(null)); +} + template struct MinMaxResult { using T = typename ArrowType::c_type; diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.cc b/cpp/src/arrow/compute/kernels/codegen_internal.cc index f230ca7ff73..c8bbebd10d2 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.cc +++ b/cpp/src/arrow/compute/kernels/codegen_internal.cc @@ -41,131 +41,6 @@ ArrayKernelExec MakeFlippedBinaryExec(ArrayKernelExec exec) { }; } -std::vector> g_signed_int_types; -std::vector> g_unsigned_int_types; -std::vector> g_int_types; -std::vector> g_floating_types; -std::vector> g_numeric_types; -std::vector> g_base_binary_types; -std::vector> g_temporal_types; -std::vector> g_interval_types; -std::vector> g_primitive_types; -std::vector g_decimal_type_ids; -static std::once_flag codegen_static_initialized; - -template -void Extend(const std::vector& values, std::vector* out) { - for (const auto& t : values) { - out->push_back(t); - } -} - -static void InitStaticData() { - // Signed int types - g_signed_int_types = {int8(), int16(), int32(), int64()}; - - // Unsigned int types - g_unsigned_int_types = {uint8(), uint16(), uint32(), uint64()}; - - // All int types - Extend(g_unsigned_int_types, &g_int_types); - Extend(g_signed_int_types, &g_int_types); - - // Floating point types - g_floating_types = {float32(), float64()}; - - // Decimal types - g_decimal_type_ids = {Type::DECIMAL128, Type::DECIMAL256}; - - // Numeric types - Extend(g_int_types, &g_numeric_types); - Extend(g_floating_types, &g_numeric_types); - - // Temporal types - g_temporal_types = {date32(), - date64(), - time32(TimeUnit::SECOND), - time32(TimeUnit::MILLI), - time64(TimeUnit::MICRO), - time64(TimeUnit::NANO), - timestamp(TimeUnit::SECOND), - timestamp(TimeUnit::MILLI), - timestamp(TimeUnit::MICRO), - timestamp(TimeUnit::NANO)}; - - // Interval types - g_interval_types = {day_time_interval(), month_interval()}; - - // Base binary types (without FixedSizeBinary) - g_base_binary_types = {binary(), utf8(), large_binary(), large_utf8()}; - - // Non-parametric, non-nested types. This also DOES NOT include - // - // * Decimal - // * Fixed Size Binary - // * Time32 - // * Time64 - // * Timestamp - g_primitive_types = {null(), boolean(), date32(), date64()}; - Extend(g_numeric_types, &g_primitive_types); - Extend(g_base_binary_types, &g_primitive_types); -} - -const std::vector>& BaseBinaryTypes() { - std::call_once(codegen_static_initialized, InitStaticData); - return g_base_binary_types; -} - -const std::vector>& StringTypes() { - static DataTypeVector types = {utf8(), large_utf8()}; - return types; -} - -const std::vector>& SignedIntTypes() { - std::call_once(codegen_static_initialized, InitStaticData); - return g_signed_int_types; -} - -const std::vector>& UnsignedIntTypes() { - std::call_once(codegen_static_initialized, InitStaticData); - return g_unsigned_int_types; -} - -const std::vector>& IntTypes() { - std::call_once(codegen_static_initialized, InitStaticData); - return g_int_types; -} - -const std::vector>& FloatingPointTypes() { - std::call_once(codegen_static_initialized, InitStaticData); - return g_floating_types; -} - -const std::vector& DecimalTypeIds() { - std::call_once(codegen_static_initialized, InitStaticData); - return g_decimal_type_ids; -} - -const std::vector>& NumericTypes() { - std::call_once(codegen_static_initialized, InitStaticData); - return g_numeric_types; -} - -const std::vector>& TemporalTypes() { - std::call_once(codegen_static_initialized, InitStaticData); - return g_temporal_types; -} - -const std::vector>& IntervalTypes() { - std::call_once(codegen_static_initialized, InitStaticData); - return g_interval_types; -} - -const std::vector>& PrimitiveTypes() { - std::call_once(codegen_static_initialized, InitStaticData); - return g_primitive_types; -} - const std::vector>& ExampleParametricTypes() { static DataTypeVector example_parametric_types = { decimal128(12, 2), @@ -185,9 +60,6 @@ const std::vector>& ExampleParametricTypes() { return example_parametric_types; } -// Construct dummy parametric types so that we can get VisitTypeInline to -// work above - Result FirstType(KernelContext*, const std::vector& descrs) { ValueDescr result = descrs.front(); result.shape = GetBroadcastShape(descrs); diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h index 98ca835a14c..6a7261eb653 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.h +++ b/cpp/src/arrow/compute/kernels/codegen_internal.h @@ -407,14 +407,6 @@ ArrayKernelExec MakeFlippedBinaryExec(ArrayKernelExec exec); // Helpers for iterating over common DataType instances for adding kernels to // functions -const std::vector>& BaseBinaryTypes(); -const std::vector>& StringTypes(); -const std::vector>& SignedIntTypes(); -const std::vector>& UnsignedIntTypes(); -const std::vector>& IntTypes(); -const std::vector>& FloatingPointTypes(); -const std::vector& DecimalTypeIds(); - // Returns a vector of example instances of parametric types such as // // * Decimal @@ -433,18 +425,6 @@ const std::vector& DecimalTypeIds(); // corresponding InputType const std::vector>& ExampleParametricTypes(); -// Number types without boolean -const std::vector>& NumericTypes(); - -// Temporal types including time and timestamps for each unit -const std::vector>& TemporalTypes(); - -// Interval types -const std::vector>& IntervalTypes(); - -// Integer, floating point, base binary, and temporal -const std::vector>& PrimitiveTypes(); - // ---------------------------------------------------------------------- // "Applicators" take an operator definition (which may be scalar-valued or // array-valued) and creates an ArrayKernelExec which can be used to add an diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc index 23bb73f2a7f..85389f95abe 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc @@ -816,6 +816,26 @@ Status AddHashAggKernels( return Status::OK(); } +// ---------------------------------------------------------------------- +// Helpers for more easily implementing hash aggregates + +template +struct GroupedValueTraits { + using CType = typename TypeTraits::CType; + + static CType Get(const CType* values, uint32_t g) { return values[g]; } + static void Set(CType* values, uint32_t g, CType v) { values[g] = v; } +}; +template <> +struct GroupedValueTraits { + static bool Get(const uint8_t* values, uint32_t g) { + return BitUtil::GetBit(values, g); + } + static void Set(uint8_t* values, uint32_t g, bool v) { + BitUtil::SetBitTo(values, g, v); + } +}; + template void VisitGroupedValues(const ExecBatch& batch, ConsumeValue&& valid_func, ConsumeNull&& null_func) { @@ -1737,6 +1757,12 @@ struct AntiExtrema { static constexpr CType anti_max() { return std::numeric_limits::min(); } }; +template <> +struct AntiExtrema { + static constexpr bool anti_min() { return true; } + static constexpr bool anti_max() { return false; } +}; + template <> struct AntiExtrema { static constexpr float anti_min() { return std::numeric_limits::infinity(); } @@ -1762,8 +1788,11 @@ struct AntiExtrema { }; template -struct GroupedMinMaxImpl : public GroupedAggregator { +struct GroupedMinMaxImpl final : public GroupedAggregator { using CType = typename TypeTraits::CType; + using GetSet = GroupedValueTraits; + using ArrType = + typename std::conditional::value, uint8_t, CType>::type; Status Init(ExecContext* ctx, const FunctionOptions* options) override { options_ = *checked_cast(options); @@ -1786,14 +1815,14 @@ struct GroupedMinMaxImpl : public GroupedAggregator { } Status Consume(const ExecBatch& batch) override { - auto raw_mins = reinterpret_cast(mins_.mutable_data()); - auto raw_maxes = reinterpret_cast(maxes_.mutable_data()); + auto raw_mins = mins_.mutable_data(); + auto raw_maxes = maxes_.mutable_data(); VisitGroupedValues( batch, [&](uint32_t g, CType val) { - raw_maxes[g] = std::max(raw_maxes[g], val); - raw_mins[g] = std::min(raw_mins[g], val); + GetSet::Set(raw_mins, g, std::min(GetSet::Get(raw_mins, g), val)); + GetSet::Set(raw_maxes, g, std::max(GetSet::Get(raw_maxes, g), val)); BitUtil::SetBit(has_values_.mutable_data(), g); }, [&](uint32_t g) { BitUtil::SetBit(has_nulls_.mutable_data(), g); }); @@ -1804,16 +1833,21 @@ struct GroupedMinMaxImpl : public GroupedAggregator { const ArrayData& group_id_mapping) override { auto other = checked_cast(&raw_other); - auto raw_mins = reinterpret_cast(mins_.mutable_data()); - auto raw_maxes = reinterpret_cast(maxes_.mutable_data()); + auto raw_mins = mins_.mutable_data(); + auto raw_maxes = maxes_.mutable_data(); - auto other_raw_mins = reinterpret_cast(other->mins_.mutable_data()); - auto other_raw_maxes = reinterpret_cast(other->maxes_.mutable_data()); + auto other_raw_mins = other->mins_.mutable_data(); + auto other_raw_maxes = other->maxes_.mutable_data(); auto g = group_id_mapping.GetValues(1); - for (int64_t other_g = 0; other_g < group_id_mapping.length; ++other_g, ++g) { - raw_mins[*g] = std::min(raw_mins[*g], other_raw_mins[other_g]); - raw_maxes[*g] = std::max(raw_maxes[*g], other_raw_maxes[other_g]); + for (uint32_t other_g = 0; static_cast(other_g) < group_id_mapping.length; + ++other_g, ++g) { + GetSet::Set( + raw_mins, *g, + std::min(GetSet::Get(raw_mins, *g), GetSet::Get(other_raw_mins, other_g))); + GetSet::Set( + raw_maxes, *g, + std::max(GetSet::Get(raw_maxes, *g), GetSet::Get(other_raw_maxes, other_g))); if (BitUtil::GetBit(other->has_values_.data(), other_g)) { BitUtil::SetBit(has_values_.mutable_data(), *g); @@ -1856,6 +1890,37 @@ struct GroupedMinMaxImpl : public GroupedAggregator { ScalarAggregateOptions options_; }; +struct GroupedNullMinMaxImpl final : public GroupedAggregator { + Status Init(ExecContext* ctx, const FunctionOptions*) override { return Status::OK(); } + + Status Resize(int64_t new_num_groups) override { + num_groups_ = new_num_groups; + return Status::OK(); + } + + Status Consume(const ExecBatch& batch) override { return Status::OK(); } + + Status Merge(GroupedAggregator&& raw_other, + const ArrayData& group_id_mapping) override { + return Status::OK(); + } + + Result Finalize() override { + return ArrayData::Make( + out_type(), num_groups_, {nullptr}, + { + ArrayData::Make(null(), num_groups_, {nullptr}, num_groups_), + ArrayData::Make(null(), num_groups_, {nullptr}, num_groups_), + }); + } + + std::shared_ptr out_type() const override { + return struct_({field("min", null()), field("max", null())}); + } + + int64_t num_groups_; +}; + template Result> MinMaxInit(KernelContext* ctx, const KernelInitArgs& args) { @@ -1866,8 +1931,21 @@ Result> MinMaxInit(KernelContext* ctx, struct GroupedMinMaxFactory { template - enable_if_number Visit(const T&) { - kernel = MakeKernel(std::move(argument_type), MinMaxInit); + enable_if_physical_integer Visit(const T&) { + using PhysicalType = typename T::PhysicalType; + kernel = MakeKernel(std::move(argument_type), MinMaxInit); + return Status::OK(); + } + + // MSVC2015 apparently doesn't compile this properly if we use + // enable_if_floating_point + Status Visit(const FloatType&) { + kernel = MakeKernel(std::move(argument_type), MinMaxInit); + return Status::OK(); + } + + Status Visit(const DoubleType&) { + kernel = MakeKernel(std::move(argument_type), MinMaxInit); return Status::OK(); } @@ -1877,6 +1955,17 @@ struct GroupedMinMaxFactory { return Status::OK(); } + Status Visit(const BooleanType&) { + kernel = MakeKernel(std::move(argument_type), MinMaxInit); + return Status::OK(); + } + + Status Visit(const NullType&) { + kernel = + MakeKernel(std::move(argument_type), HashAggregateInit); + return Status::OK(); + } + Status Visit(const HalfFloatType& type) { return Status::NotImplemented("Computing min/max of data of type ", type); } @@ -2666,9 +2755,11 @@ void RegisterHashAggregateBasic(FunctionRegistry* registry) { "hash_min_max", Arity::Binary(), &hash_min_max_doc, &default_scalar_aggregate_options); DCHECK_OK(AddHashAggKernels(NumericTypes(), GroupedMinMaxFactory::Make, func.get())); + DCHECK_OK(AddHashAggKernels(TemporalTypes(), GroupedMinMaxFactory::Make, func.get())); // Type parameters are ignored - DCHECK_OK(AddHashAggKernels({decimal128(1, 1), decimal256(1, 1)}, - GroupedMinMaxFactory::Make, func.get())); + DCHECK_OK(AddHashAggKernels( + {null(), boolean(), decimal128(1, 1), decimal256(1, 1), month_interval()}, + GroupedMinMaxFactory::Make, func.get())); DCHECK_OK(registry->AddFunction(std::move(func))); } diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc index df13bd569ea..2a9ceeb7e70 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc @@ -1305,35 +1305,48 @@ TEST(GroupBy, VarianceOptions) { } TEST(GroupBy, MinMaxOnly) { + auto in_schema = schema({ + field("argument", float64()), + field("argument1", null()), + field("argument2", boolean()), + field("key", int64()), + }); for (bool use_exec_plan : {false, true}) { for (bool use_threads : {true, false}) { SCOPED_TRACE(use_threads ? "parallel/merged" : "serial"); - auto table = TableFromJSON( - schema({field("argument", float64()), field("key", int64())}), {R"([ - [1.0, 1], - [null, 1] - ])", - R"([ - [0.0, 2], - [null, 3], - [4.0, null], - [3.25, 1], - [0.125, 2] - ])", - R"([ - [-0.25, 2], - [0.75, null], - [null, 3] - ])"}); + auto table = TableFromJSON(in_schema, {R"([ + [1.0, null, true, 1], + [null, null, true, 1] +])", + R"([ + [0.0, null, false, 2], + [null, null, false, 3], + [4.0, null, null, null], + [3.25, null, true, 1], + [0.125, null, false, 2] +])", + R"([ + [-0.25, null, false, 2], + [0.75, null, true, null], + [null, null, true, 3] +])"}); ASSERT_OK_AND_ASSIGN(Datum aggregated_and_grouped, - GroupByTest({table->GetColumnByName("argument")}, - {table->GetColumnByName("key")}, - { - {"hash_min_max", nullptr}, - }, - use_threads, use_exec_plan)); + GroupByTest( + { + table->GetColumnByName("argument"), + table->GetColumnByName("argument1"), + table->GetColumnByName("argument2"), + }, + {table->GetColumnByName("key")}, + { + {"hash_min_max", nullptr}, + {"hash_min_max", nullptr}, + {"hash_min_max", nullptr}, + }, + use_threads, use_exec_plan)); + ValidateOutput(aggregated_and_grouped); SortBy({"key_0"}, &aggregated_and_grouped); AssertDatumsEqual( @@ -1342,13 +1355,21 @@ TEST(GroupBy, MinMaxOnly) { field("min", float64()), field("max", float64()), })), + field("hash_min_max", struct_({ + field("min", null()), + field("max", null()), + })), + field("hash_min_max", struct_({ + field("min", boolean()), + field("max", boolean()), + })), field("key_0", int64()), }), R"([ - [{"min": 1.0, "max": 3.25}, 1], - [{"min": -0.25, "max": 0.125}, 2], - [{"min": null, "max": null}, 3], - [{"min": 0.75, "max": 4.0}, null] + [{"min": 1.0, "max": 3.25}, {"min": null, "max": null}, {"min": true, "max": true}, 1], + [{"min": -0.25, "max": 0.125}, {"min": null, "max": null}, {"min": false, "max": false}, 2], + [{"min": null, "max": null}, {"min": null, "max": null}, {"min": false, "max": true}, 3], + [{"min": 0.75, "max": 4.0}, {"min": null, "max": null}, {"min": true, "max": true}, null] ])"), aggregated_and_grouped, /*verbose=*/true); @@ -1356,6 +1377,59 @@ TEST(GroupBy, MinMaxOnly) { } } +TEST(GroupBy, MinMaxTypes) { + std::vector> types; + types.insert(types.end(), NumericTypes().begin(), NumericTypes().end()); + types.insert(types.end(), TemporalTypes().begin(), TemporalTypes().end()); + types.push_back(month_interval()); + for (const auto& ty : types) { + SCOPED_TRACE(ty->ToString()); + auto in_schema = schema({field("argument0", ty), field("key", int64())}); + auto table = TableFromJSON(in_schema, {R"([ + [1, 1], + [null, 1] +])", + R"([ + [0, 2], + [null, 3], + [3, 4], + [5, 4], + [4, null], + [3, 1], + [0, 2] +])", + R"([ + [0, 2], + [1, null], + [null, 3] +])"}); + + ASSERT_OK_AND_ASSIGN( + Datum aggregated_and_grouped, + GroupByTest({table->GetColumnByName("argument0")}, + {table->GetColumnByName("key")}, {{"hash_min_max", nullptr}}, + /*use_threads=*/true, /*use_exec_plan=*/true)); + ValidateOutput(aggregated_and_grouped); + SortBy({"key_0"}, &aggregated_and_grouped); + + AssertDatumsEqual( + ArrayFromJSON( + struct_({ + field("hash_min_max", struct_({field("min", ty), field("max", ty)})), + field("key_0", int64()), + }), + R"([ + [{"min": 1, "max": 3}, 1], + [{"min": 0, "max": 0}, 2], + [{"min": null, "max": null}, 3], + [{"min": 3, "max": 5}, 4], + [{"min": 1, "max": 4}, null] + ])"), + aggregated_and_grouped, + /*verbose=*/true); + } +} + TEST(GroupBy, MinMaxDecimal) { auto in_schema = schema({ field("argument0", decimal128(3, 2)), diff --git a/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc b/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc index 4a686ea6db5..12f8c14e0c2 100644 --- a/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc +++ b/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc @@ -1929,7 +1929,7 @@ void RegisterScalarArithmetic(FunctionRegistry* registry) { AddDecimalBinaryKernels("subtract", &subtract); // Add subtract(timestamp, timestamp) -> duration - for (auto unit : ::arrow::internal::AllTimeUnits()) { + for (auto unit : TimeUnit::values()) { InputType in_type(match::TimestampTypeUnit(unit)); auto exec = ArithmeticExecFromOp(Type::TIMESTAMP); DCHECK_OK(subtract->AddKernel({in_type, in_type}, duration(unit), std::move(exec))); diff --git a/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc b/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc index 11a730c71f3..35b734e29f6 100644 --- a/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc @@ -562,7 +562,7 @@ TEST(TestBinaryArithmetic, SubtractTimestamps) { auto rhs = rand.Int64(length, 0, 100000000); auto expected_int64 = (*Subtract(lhs, rhs)).make_array(); - for (auto unit : ::arrow::internal::AllTimeUnits()) { + for (auto unit : TimeUnit::values()) { auto timestamp_ty = timestamp(unit); auto duration_ty = duration(unit); diff --git a/cpp/src/arrow/compute/kernels/scalar_compare.cc b/cpp/src/arrow/compute/kernels/scalar_compare.cc index 5d0d41f4e0d..17eae5adbbd 100644 --- a/cpp/src/arrow/compute/kernels/scalar_compare.cc +++ b/cpp/src/arrow/compute/kernels/scalar_compare.cc @@ -24,7 +24,6 @@ namespace arrow { -using internal::AllTimeUnits; using internal::checked_cast; using internal::checked_pointer_cast; using util::string_view; @@ -223,7 +222,7 @@ std::shared_ptr MakeCompareFunction(std::string name, AddGenericCompare(float64(), func.get()); // Add timestamp kernels - for (auto unit : AllTimeUnits()) { + for (auto unit : TimeUnit::values()) { InputType in_type(match::TimestampTypeUnit(unit)); auto exec = GeneratePhysicalInteger( @@ -232,7 +231,7 @@ std::shared_ptr MakeCompareFunction(std::string name, } // Duration - for (auto unit : AllTimeUnits()) { + for (auto unit : TimeUnit::values()) { InputType in_type(match::DurationTypeUnit(unit)); auto exec = GeneratePhysicalInteger( @@ -262,7 +261,7 @@ std::shared_ptr MakeCompareFunction(std::string name, DCHECK_OK(func->AddKernel({ty, ty}, boolean(), std::move(exec))); } - for (const auto id : DecimalTypeIds()) { + for (const auto id : {Type::DECIMAL128, Type::DECIMAL256}) { auto exec = GenerateDecimal(id); DCHECK_OK( func->AddKernel({InputType(id), InputType(id)}, boolean(), std::move(exec))); diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal.cc b/cpp/src/arrow/compute/kernels/scalar_temporal.cc index e7aeb3fcf0b..59ffee73c97 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal.cc @@ -28,7 +28,6 @@ namespace arrow { -using internal::AllTimeUnits; using internal::checked_cast; using internal::checked_pointer_cast; @@ -858,7 +857,7 @@ std::shared_ptr MakeTemporal( } case WithTimestamps: { - for (auto unit : AllTimeUnits()) { + for (auto unit : TimeUnit::values()) { InputType in_type{match::TimestampTypeUnit(unit)}; switch (unit) { case TimeUnit::SECOND: { @@ -914,7 +913,7 @@ std::shared_ptr MakeSimpleUnaryTemporal( break; } case WithTimestamps: { - for (auto unit : AllTimeUnits()) { + for (auto unit : TimeUnit::values()) { InputType in_type{match::TimestampTypeUnit(unit)}; switch (unit) { case TimeUnit::SECOND: { diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc index 0e7b4beb95e..85bfc203589 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc @@ -27,7 +27,6 @@ namespace arrow { -using internal::AllTimeUnits; using internal::StringFormatter; namespace compute { @@ -170,7 +169,7 @@ TEST_F(ScalarTemporalTest, TestTemporalComponentExtractionAllTemporalTypes) { } TEST_F(ScalarTemporalTest, TestTemporalComponentExtractionWithDifferentUnits) { - for (auto u : AllTimeUnits()) { + for (auto u : TimeUnit::values()) { auto unit = timestamp(u); CheckScalarUnary("year", unit, times_seconds_precision, int64(), year); CheckScalarUnary("month", unit, times_seconds_precision, int64(), month); @@ -298,7 +297,7 @@ TEST_F(ScalarTemporalTest, TestZoned1) { } TEST_F(ScalarTemporalTest, TestZoned2) { - for (auto u : AllTimeUnits()) { + for (auto u : TimeUnit::values()) { auto unit = timestamp(u, "Australia/Broken_Hill"); auto iso_calendar_type = struct_({field("iso_year", int64()), field("iso_week", int64()), @@ -361,7 +360,7 @@ TEST_F(ScalarTemporalTest, TestNonexistentTimezone) { auto data_buffer = Buffer::Wrap(std::vector{1, 2, 3}); auto null_buffer = Buffer::FromString("\xff"); - for (auto u : AllTimeUnits()) { + for (auto u : TimeUnit::values()) { auto ts_type = timestamp(u, "Mars/Mariner_Valley"); auto timestamp_array = std::make_shared>( ts_type, 2, data_buffer, null_buffer, 0); @@ -448,7 +447,7 @@ TEST_F(ScalarTemporalTest, TestAssumeTimezone) { auto options_us_central = AssumeTimezoneOptions(timezone_us_central); auto options_invalid = AssumeTimezoneOptions("Europe/Brusselsss"); - for (auto u : AllTimeUnits()) { + for (auto u : TimeUnit::values()) { auto unit = timestamp(u); auto unit_utc = timestamp(u, timezone_utc); auto unit_kolkata = timestamp(u, timezone_kolkata); @@ -486,7 +485,7 @@ TEST_F(ScalarTemporalTest, TestAssumeTimezoneAmbiguous) { auto options_raise = AssumeTimezoneOptions(timezone, AssumeTimezoneOptions::AMBIGUOUS_RAISE); - for (auto u : AllTimeUnits()) { + for (auto u : TimeUnit::values()) { auto unit = timestamp(u); auto unit_local = timestamp(u, timezone); ASSERT_RAISES(Invalid, AssumeTimezone(ArrayFromJSON(unit, times), options_raise)); @@ -519,7 +518,7 @@ TEST_F(ScalarTemporalTest, TestAssumeTimezoneNonexistent) { AssumeTimezoneOptions(timezone, AssumeTimezoneOptions::AMBIGUOUS_RAISE, AssumeTimezoneOptions::NONEXISTENT_EARLIEST); - for (auto u : AllTimeUnits()) { + for (auto u : TimeUnit::values()) { auto unit = timestamp(u); auto unit_local = timestamp(u, timezone); ASSERT_RAISES(Invalid, AssumeTimezone(ArrayFromJSON(unit, times), options_raise)); diff --git a/cpp/src/arrow/compute/kernels/vector_sort.cc b/cpp/src/arrow/compute/kernels/vector_sort.cc index 70537954510..da6d9cabac8 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort.cc @@ -662,7 +662,7 @@ void AddSortingKernels(VectorKernel base, VectorFunction* func) { base.exec = GenerateNumeric(*physical_type); DCHECK_OK(func->AddKernel(base)); } - for (const auto id : DecimalTypeIds()) { + for (const auto id : {Type::DECIMAL128, Type::DECIMAL256}) { base.signature = KernelSignature::Make({InputType::Array(id)}, uint64()); base.exec = GenerateDecimal(id); DCHECK_OK(func->AddKernel(base)); diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index 81d9f2dc80e..3b601992b58 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -567,6 +567,19 @@ struct MakeScalarImpl { return Status::OK(); } + // Enable constructing string/binary scalars (but not decimal, etc) from std::string + template + enable_if_t< + std::is_same::type, std::string>::value && + (is_base_binary_type::value || std::is_same::value), + Status> + Visit(const T& t) { + using ScalarType = typename TypeTraits::ScalarType; + out_ = std::make_shared(Buffer::FromString(std::move(value_)), + std::move(type_)); + return Status::OK(); + } + Status Visit(const DataType& t) { return Status::NotImplemented("constructing scalars of type ", t, " from unboxed values"); diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index 247cd1d9f9d..99bcaec095b 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -557,6 +557,14 @@ TEST(TestStringScalar, MakeScalarImplicit) { ASSERT_EQ(StringScalar("three"), *three); } +TEST(TestStringScalar, MakeScalarString) { + // MakeScalar(std::string) creates a StringScalar via FromBuffer + std::string buf = "three"; + auto three = MakeScalar(std::move(buf)); + ASSERT_OK(three->ValidateFull()); + ASSERT_EQ(StringScalar("three"), *three); +} + TEST(TestFixedSizeBinaryScalar, Basics) { std::string data = "test data"; auto buf = std::make_shared(data); diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index 13bab35a66d..f0021e05603 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -161,6 +161,15 @@ using RealArrowTypes = ::testing::Types; using IntegralArrowTypes = ::testing::Types; +using PhysicalIntegralArrowTypes = + ::testing::Types; + +using PrimitiveArrowTypes = + ::testing::Types; + using TemporalArrowTypes = ::testing::Types; diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 260686177f6..ff6a7508cf5 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include #include // IWYU pragma: keep #include @@ -187,12 +188,6 @@ int GetByteWidth(const DataType& type) { return fw_type.bit_width() / CHAR_BIT; } -const std::vector& AllTimeUnits() { - static std::vector units = {TimeUnit::SECOND, TimeUnit::MILLI, - TimeUnit::MICRO, TimeUnit::NANO}; - return units; -} - } // namespace internal namespace { @@ -2294,4 +2289,128 @@ std::string Decimal256Type::ToString() const { return s.str(); } +namespace { + +std::vector> g_signed_int_types; +std::vector> g_unsigned_int_types; +std::vector> g_int_types; +std::vector> g_floating_types; +std::vector> g_numeric_types; +std::vector> g_base_binary_types; +std::vector> g_temporal_types; +std::vector> g_interval_types; +std::vector> g_primitive_types; +std::once_flag static_data_initialized; + +template +void Extend(const std::vector& values, std::vector* out) { + out->insert(out->end(), values.begin(), values.end()); +} + +void InitStaticData() { + // Signed int types + g_signed_int_types = {int8(), int16(), int32(), int64()}; + + // Unsigned int types + g_unsigned_int_types = {uint8(), uint16(), uint32(), uint64()}; + + // All int types + Extend(g_unsigned_int_types, &g_int_types); + Extend(g_signed_int_types, &g_int_types); + + // Floating point types + g_floating_types = {float32(), float64()}; + + // Numeric types + Extend(g_int_types, &g_numeric_types); + Extend(g_floating_types, &g_numeric_types); + + // Temporal types + g_temporal_types = {date32(), + date64(), + time32(TimeUnit::SECOND), + time32(TimeUnit::MILLI), + time64(TimeUnit::MICRO), + time64(TimeUnit::NANO), + timestamp(TimeUnit::SECOND), + timestamp(TimeUnit::MILLI), + timestamp(TimeUnit::MICRO), + timestamp(TimeUnit::NANO)}; + + // Interval types + g_interval_types = {day_time_interval(), month_interval()}; + + // Base binary types (without FixedSizeBinary) + g_base_binary_types = {binary(), utf8(), large_binary(), large_utf8()}; + + // Non-parametric, non-nested types. This also DOES NOT include + // + // * Decimal + // * Fixed Size Binary + // * Time32 + // * Time64 + // * Timestamp + g_primitive_types = {null(), boolean(), date32(), date64()}; + Extend(g_numeric_types, &g_primitive_types); + Extend(g_base_binary_types, &g_primitive_types); +} + +} // namespace + +const std::vector>& BaseBinaryTypes() { + std::call_once(static_data_initialized, InitStaticData); + return g_base_binary_types; +} + +const std::vector>& StringTypes() { + static DataTypeVector types = {utf8(), large_utf8()}; + return types; +} + +const std::vector>& SignedIntTypes() { + std::call_once(static_data_initialized, InitStaticData); + return g_signed_int_types; +} + +const std::vector>& UnsignedIntTypes() { + std::call_once(static_data_initialized, InitStaticData); + return g_unsigned_int_types; +} + +const std::vector>& IntTypes() { + std::call_once(static_data_initialized, InitStaticData); + return g_int_types; +} + +const std::vector>& FloatingPointTypes() { + std::call_once(static_data_initialized, InitStaticData); + return g_floating_types; +} + +const std::vector>& NumericTypes() { + std::call_once(static_data_initialized, InitStaticData); + return g_numeric_types; +} + +const std::vector>& TemporalTypes() { + std::call_once(static_data_initialized, InitStaticData); + return g_temporal_types; +} + +const std::vector>& IntervalTypes() { + std::call_once(static_data_initialized, InitStaticData); + return g_interval_types; +} + +const std::vector>& PrimitiveTypes() { + std::call_once(static_data_initialized, InitStaticData); + return g_primitive_types; +} + +const std::vector& TimeUnit::values() { + static std::vector units = {TimeUnit::SECOND, TimeUnit::MILLI, + TimeUnit::MICRO, TimeUnit::NANO}; + return units; +} + } // namespace arrow diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index acd5e66a822..2b0fadd5bd3 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1287,8 +1287,8 @@ class ARROW_EXPORT DayTimeIntervalType : public IntervalType { struct DayMilliseconds { int32_t days = 0; int32_t milliseconds = 0; - DayMilliseconds() = default; - DayMilliseconds(int32_t days, int32_t milliseconds) + constexpr DayMilliseconds() = default; + constexpr DayMilliseconds(int32_t days, int32_t milliseconds) : days(days), milliseconds(milliseconds) {} bool operator==(DayMilliseconds other) const { return this->days == other.days && this->milliseconds == other.milliseconds; @@ -1955,9 +1955,34 @@ std::string ToString(TimeUnit::type unit); ARROW_EXPORT int GetByteWidth(const DataType& type); -ARROW_EXPORT -const std::vector& AllTimeUnits(); - } // namespace internal +// Helpers to get instances of data types based on general categories + +ARROW_EXPORT +const std::vector>& SignedIntTypes(); +ARROW_EXPORT +const std::vector>& UnsignedIntTypes(); +ARROW_EXPORT +const std::vector>& IntTypes(); +ARROW_EXPORT +const std::vector>& FloatingPointTypes(); +// Number types without boolean +ARROW_EXPORT +const std::vector>& NumericTypes(); +// Binary and string-like types (except fixed-size binary) +ARROW_EXPORT +const std::vector>& BaseBinaryTypes(); +ARROW_EXPORT +const std::vector>& StringTypes(); +// Temporal types including time and timestamps for each unit +ARROW_EXPORT +const std::vector>& TemporalTypes(); +// Interval types +ARROW_EXPORT +const std::vector>& IntervalTypes(); +// Integer, floating point, base binary, and temporal +ARROW_EXPORT +const std::vector>& PrimitiveTypes(); + } // namespace arrow diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 7c52c87eb71..45afd7af2e6 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -222,9 +222,12 @@ using Date64Array = NumericArray; using Date64Builder = NumericBuilder; struct Date64Scalar; -struct TimeUnit { +struct ARROW_EXPORT TimeUnit { /// The unit for a time or timestamp DataType enum type { SECOND = 0, MILLI = 1, MICRO = 2, NANO = 3 }; + + /// Iterate over all valid time units + static const std::vector& values(); }; class TimeType; diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 02337cf1178..ac2c5322616 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -221,6 +221,7 @@ struct TypeTraits { using ArrayType = DayTimeIntervalArray; using BuilderType = DayTimeIntervalBuilder; using ScalarType = DayTimeIntervalScalar; + using CType = DayTimeIntervalType::c_type; static constexpr int64_t bytes_required(int64_t elements) { return elements * static_cast(sizeof(DayTimeIntervalType::DayMilliseconds)); @@ -774,7 +775,8 @@ template using is_physical_signed_integer_type = std::integral_constant::value || - (is_temporal_type::value && has_c_type::value)>; + (is_temporal_type::value && has_c_type::value && + std::is_integral::value)>; template using enable_if_physical_signed_integer = diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 9e9556d3a0b..662776b86d0 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -175,6 +175,9 @@ of general type categories: * "List-like": List, LargeList, sometimes also FixedSizeList. +* "Nested": List-likes (including FixedSizeList), Struct, Union, and + related types like Map. + If you are unsure whether a function supports a concrete input type, we recommend you try it out. Unsupported input types return a ``TypeError`` :class:`Status`. @@ -185,35 +188,35 @@ Aggregations Scalar aggregations operate on a (chunked) array or scalar value and reduce the input to a single output value. -+---------------+-------+-------------+------------------------+----------------------------------+-------+ -| Function name | Arity | Input types | Output type | Options class | Notes | -+===============+=======+=============+========================+==================================+=======+ -| all | Unary | Boolean | Scalar Boolean | :struct:`ScalarAggregateOptions` | \(1) | -+---------------+-------+-------------+------------------------+----------------------------------+-------+ -| any | Unary | Boolean | Scalar Boolean | :struct:`ScalarAggregateOptions` | \(1) | -+---------------+-------+-------------+------------------------+----------------------------------+-------+ -| count | Unary | Any | Scalar Int64 | :struct:`CountOptions` | \(2) | -+---------------+-------+-------------+------------------------+----------------------------------+-------+ -| index | Unary | Any | Scalar Int64 | :struct:`IndexOptions` | | -+---------------+-------+-------------+------------------------+----------------------------------+-------+ -| mean | Unary | Numeric | Scalar Decimal/Float64 | :struct:`ScalarAggregateOptions` | | -+---------------+-------+-------------+------------------------+----------------------------------+-------+ -| min_max | Unary | Numeric | Scalar Struct | :struct:`ScalarAggregateOptions` | \(3) | -+---------------+-------+-------------+------------------------+----------------------------------+-------+ -| mode | Unary | Numeric | Struct | :struct:`ModeOptions` | \(4) | -+---------------+-------+-------------+------------------------+----------------------------------+-------+ -| product | Unary | Numeric | Scalar Numeric | :struct:`ScalarAggregateOptions` | \(5) | -+---------------+-------+-------------+------------------------+----------------------------------+-------+ -| quantile | Unary | Numeric | Scalar Numeric | :struct:`QuantileOptions` | \(6) | -+---------------+-------+-------------+------------------------+----------------------------------+-------+ -| stddev | Unary | Numeric | Scalar Float64 | :struct:`VarianceOptions` | | -+---------------+-------+-------------+------------------------+----------------------------------+-------+ -| sum | Unary | Numeric | Scalar Numeric | :struct:`ScalarAggregateOptions` | \(5) | -+---------------+-------+-------------+------------------------+----------------------------------+-------+ -| tdigest | Unary | Numeric | Scalar Float64 | :struct:`TDigestOptions` | \(7) | -+---------------+-------+-------------+------------------------+----------------------------------+-------+ -| variance | Unary | Numeric | Scalar Float64 | :struct:`VarianceOptions` | | -+---------------+-------+-------------+------------------------+----------------------------------+-------+ ++---------------+-------+------------------+------------------------+----------------------------------+-------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++===============+=======+==================+========================+==================================+=======+ +| all | Unary | Boolean | Scalar Boolean | :struct:`ScalarAggregateOptions` | \(1) | ++---------------+-------+------------------+------------------------+----------------------------------+-------+ +| any | Unary | Boolean | Scalar Boolean | :struct:`ScalarAggregateOptions` | \(1) | ++---------------+-------+------------------+------------------------+----------------------------------+-------+ +| count | Unary | Any | Scalar Int64 | :struct:`CountOptions` | \(2) | ++---------------+-------+------------------+------------------------+----------------------------------+-------+ +| index | Unary | Any | Scalar Int64 | :struct:`IndexOptions` | | ++---------------+-------+------------------+------------------------+----------------------------------+-------+ +| mean | Unary | Numeric | Scalar Decimal/Float64 | :struct:`ScalarAggregateOptions` | | ++---------------+-------+------------------+------------------------+----------------------------------+-------+ +| min_max | Unary | Non-nested types | Scalar Struct | :struct:`ScalarAggregateOptions` | \(3) | ++---------------+-------+------------------+------------------------+----------------------------------+-------+ +| mode | Unary | Numeric | Struct | :struct:`ModeOptions` | \(4) | ++---------------+-------+------------------+------------------------+----------------------------------+-------+ +| product | Unary | Numeric | Scalar Numeric | :struct:`ScalarAggregateOptions` | \(5) | ++---------------+-------+------------------+------------------------+----------------------------------+-------+ +| quantile | Unary | Numeric | Scalar Numeric | :struct:`QuantileOptions` | \(6) | ++---------------+-------+------------------+------------------------+----------------------------------+-------+ +| stddev | Unary | Numeric | Scalar Float64 | :struct:`VarianceOptions` | | ++---------------+-------+------------------+------------------------+----------------------------------+-------+ +| sum | Unary | Numeric | Scalar Numeric | :struct:`ScalarAggregateOptions` | \(5) | ++---------------+-------+------------------+------------------------+----------------------------------+-------+ +| tdigest | Unary | Numeric | Scalar Float64 | :struct:`TDigestOptions` | \(7) | ++---------------+-------+------------------+------------------------+----------------------------------+-------+ +| variance | Unary | Numeric | Scalar Float64 | :struct:`VarianceOptions` | | ++---------------+-------+------------------+------------------------+----------------------------------+-------+ Notes: @@ -226,6 +229,9 @@ Notes: * \(3) Output is a ``{"min": input type, "max": input type}`` Struct. + Of the interval types, only the month interval is supported, as the day-time + and month-day-nano types are not sortable. + * \(4) Output is an array of ``{"mode": input type, "count": Int64}`` Struct. It contains the *N* most common elements in the input, in descending order, where *N* is given in :member:`ModeOptions::n`. @@ -288,33 +294,33 @@ The supported aggregation functions are as follows. All function names are prefixed with ``hash_``, which differentiates them from their scalar equivalents above and reflects how they are implemented internally. -+---------------------+-------+-------------+-----------------+----------------------------------+-------+ -| Function name | Arity | Input types | Output type | Options class | Notes | -+=====================+=======+=============+=================+==================================+=======+ -| hash_all | Unary | Boolean | Boolean | :struct:`ScalarAggregateOptions` | \(1) | -+---------------------+-------+-------------+-----------------+----------------------------------+-------+ -| hash_any | Unary | Boolean | Boolean | :struct:`ScalarAggregateOptions` | \(1) | -+---------------------+-------+-------------+-----------------+----------------------------------+-------+ -| hash_count | Unary | Any | Int64 | :struct:`CountOptions` | \(2) | -+---------------------+-------+-------------+-----------------+----------------------------------+-------+ -| hash_count_distinct | Unary | Any | Int64 | :struct:`CountOptions` | \(2) | -+---------------------+-------+-------------+-----------------+----------------------------------+-------+ -| hash_distinct | Unary | Any | Input type | :struct:`CountOptions` | \(2) | -+---------------------+-------+-------------+-----------------+----------------------------------+-------+ -| hash_mean | Unary | Numeric | Decimal/Float64 | :struct:`ScalarAggregateOptions` | | -+---------------------+-------+-------------+-----------------+----------------------------------+-------+ -| hash_min_max | Unary | Numeric | Struct | :struct:`ScalarAggregateOptions` | \(3) | -+---------------------+-------+-------------+-----------------+----------------------------------+-------+ -| hash_product | Unary | Numeric | Numeric | :struct:`ScalarAggregateOptions` | \(4) | -+---------------------+-------+-------------+-----------------+----------------------------------+-------+ -| hash_stddev | Unary | Numeric | Float64 | :struct:`VarianceOptions` | | -+---------------------+-------+-------------+-----------------+----------------------------------+-------+ -| hash_sum | Unary | Numeric | Numeric | :struct:`ScalarAggregateOptions` | \(4) | -+---------------------+-------+-------------+-----------------+----------------------------------+-------+ -| hash_tdigest | Unary | Numeric | Float64 | :struct:`TDigestOptions` | \(5) | -+---------------------+-------+-------------+-----------------+----------------------------------+-------+ -| hash_variance | Unary | Numeric | Float64 | :struct:`VarianceOptions` | | -+---------------------+-------+-------------+-----------------+----------------------------------+-------+ ++---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++=====================+=======+====================================+=================+==================================+=======+ +| hash_all | Unary | Boolean | Boolean | :struct:`ScalarAggregateOptions` | \(1) | ++---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+ +| hash_any | Unary | Boolean | Boolean | :struct:`ScalarAggregateOptions` | \(1) | ++---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+ +| hash_count | Unary | Any | Int64 | :struct:`CountOptions` | \(2) | ++---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+ +| hash_count_distinct | Unary | Any | Int64 | :struct:`CountOptions` | \(2) | ++---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+ +| hash_distinct | Unary | Any | Input type | :struct:`CountOptions` | \(2) | ++---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+ +| hash_mean | Unary | Numeric | Decimal/Float64 | :struct:`ScalarAggregateOptions` | | ++---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+ +| hash_min_max | Unary | Non-nested, non-binary/string-like | Struct | :struct:`ScalarAggregateOptions` | \(3) | ++---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+ +| hash_product | Unary | Numeric | Numeric | :struct:`ScalarAggregateOptions` | \(4) | ++---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+ +| hash_stddev | Unary | Numeric | Float64 | :struct:`VarianceOptions` | | ++---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+ +| hash_sum | Unary | Numeric | Numeric | :struct:`ScalarAggregateOptions` | \(4) | ++---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+ +| hash_tdigest | Unary | Numeric | Float64 | :struct:`TDigestOptions` | \(5) | ++---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+ +| hash_variance | Unary | Numeric | Float64 | :struct:`VarianceOptions` | | ++---------------------+-------+------------------------------------+-----------------+----------------------------------+-------+ * \(1) If null values are taken into account, by setting the :member:`ScalarAggregateOptions::skip_nulls` to false, then `Kleene logic`_ @@ -326,7 +332,10 @@ equivalents above and reflects how they are implemented internally. are emitted. This never affects the grouping keys, only group values (i.e. you may get a group where the key is null). -* \(3) Output is a ``{"min": input type, "max": input type}`` Struct scalar. +* \(3) Output is a ``{"min": input type, "max": input type}`` Struct array. + + Of the interval types, only the month interval is supported, as the day-time + and month-day-nano types are not sortable. * \(4) Output is Int64, UInt64, Float64, or Decimal128/256, depending on the input type.