From d9d6bb4dfa33f63eb880756e513cba77ef6977a1 Mon Sep 17 00:00:00 2001
From: David Li
Date: Tue, 22 Jun 2021 17:53:39 -0400
Subject: [PATCH 01/14] ARROW-11932: [C++] Implement ArrayBuilder::AppendScalar
---
cpp/src/arrow/array/array_test.cc | 33 ++++++++
cpp/src/arrow/array/builder_base.cc | 116 ++++++++++++++++++++++++++++
cpp/src/arrow/array/builder_base.h | 4 +
cpp/src/arrow/array/builder_dict.h | 1 +
cpp/src/arrow/testing/generator.cc | 90 +++------------------
5 files changed, 163 insertions(+), 81 deletions(-)
diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc
index a97bf134604..97431a736e6 100644
--- a/cpp/src/arrow/array/array_test.cc
+++ b/cpp/src/arrow/array/array_test.cc
@@ -397,6 +397,33 @@ TEST_F(TestArray, TestMakeArrayOfNullUnion) {
}
}
+void AssertAppendScalar(MemoryPool* pool, const std::shared_ptr& scalar) {
+ std::unique_ptr builder;
+ auto null_scalar = MakeNullScalar(scalar->type);
+ ASSERT_OK(MakeBuilder(pool, scalar->type, &builder));
+ ASSERT_OK(builder->AppendScalar(*scalar));
+ ASSERT_OK(builder->AppendScalar(*scalar));
+ ASSERT_OK(builder->AppendScalar(*null_scalar));
+ ASSERT_OK(builder->AppendScalars({scalar, null_scalar}));
+
+ std::shared_ptr out;
+ FinishAndCheckPadding(builder.get(), &out);
+ ASSERT_OK(out->ValidateFull());
+ ASSERT_EQ(out->length(), 5);
+ ASSERT_EQ(out->null_count(), 2);
+ ASSERT_FALSE(out->IsNull(0));
+ ASSERT_FALSE(out->IsNull(1));
+ ASSERT_TRUE(out->IsNull(2));
+ ASSERT_FALSE(out->IsNull(3));
+ ASSERT_TRUE(out->IsNull(4));
+ ASSERT_OK_AND_ASSIGN(auto scalar0, out->GetScalar(0));
+ ASSERT_OK_AND_ASSIGN(auto scalar1, out->GetScalar(1));
+ ASSERT_OK_AND_ASSIGN(auto scalar3, out->GetScalar(3));
+ AssertScalarsEqual(*scalar, *scalar0, /*verbose=*/true);
+ AssertScalarsEqual(*scalar, *scalar1, /*verbose=*/true);
+ AssertScalarsEqual(*scalar, *scalar3, /*verbose=*/true);
+}
+
TEST_F(TestArray, TestMakeArrayFromScalar) {
ASSERT_OK_AND_ASSIGN(auto null_array, MakeArrayFromScalar(NullScalar(), 5));
ASSERT_OK(null_array->ValidateFull());
@@ -447,6 +474,10 @@ TEST_F(TestArray, TestMakeArrayFromScalar) {
ASSERT_EQ(array->null_count(), 0);
}
}
+
+ for (auto scalar : scalars) {
+ AssertAppendScalar(pool_, scalar);
+ }
}
TEST_F(TestArray, TestMakeArrayFromDictionaryScalar) {
@@ -481,6 +512,8 @@ TEST_F(TestArray, TestMakeArrayFromMapScalar) {
ASSERT_OK_AND_ASSIGN(auto item, array->GetScalar(i));
ASSERT_TRUE(item->Equals(scalar));
}
+
+ AssertAppendScalar(pool_, std::make_shared(scalar));
}
TEST_F(TestArray, ValidateBuffersPrimitive) {
diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc
index b92cc285894..59202241111 100644
--- a/cpp/src/arrow/array/builder_base.cc
+++ b/cpp/src/arrow/array/builder_base.cc
@@ -24,8 +24,11 @@
#include "arrow/array/data.h"
#include "arrow/array/util.h"
#include "arrow/buffer.h"
+#include "arrow/builder.h"
+#include "arrow/scalar.h"
#include "arrow/status.h"
#include "arrow/util/logging.h"
+#include "arrow/visitor_inline.h"
namespace arrow {
@@ -92,6 +95,119 @@ Status ArrayBuilder::Advance(int64_t elements) {
return null_bitmap_builder_.Advance(elements);
}
+struct AppendScalarImpl {
+ template ::BuilderType,
+ typename ScalarType = typename TypeTraits::ScalarType>
+ Status UseBuilder(const AppendScalar& append) {
+ for (const auto scalar : scalars_) {
+ if (scalar->is_valid) {
+ RETURN_NOT_OK(append(internal::checked_cast(*scalar),
+ static_cast(builder_)));
+ } else {
+ RETURN_NOT_OK(builder_->AppendNull());
+ }
+ }
+ return Status::OK();
+ }
+
+ struct AppendValue {
+ template
+ Status operator()(const ScalarType& s, BuilderType* builder) const {
+ return builder->Append(s.value);
+ }
+ };
+
+ struct AppendBuffer {
+ template
+ Status operator()(const ScalarType& s, BuilderType* builder) const {
+ const Buffer& buffer = *s.value;
+ return builder->Append(util::string_view{buffer});
+ }
+ };
+
+ struct AppendList {
+ template
+ Status operator()(const ScalarType& s, BuilderType* builder) const {
+ RETURN_NOT_OK(builder->Append());
+ const Array& list = *s.value;
+ for (int64_t i = 0; i < list.length(); i++) {
+ ARROW_ASSIGN_OR_RAISE(auto scalar, list.GetScalar(i));
+ RETURN_NOT_OK(builder->value_builder()->AppendScalar(*scalar));
+ }
+ return Status::OK();
+ }
+ };
+
+ template
+ enable_if_has_c_type Visit(const T&) {
+ return UseBuilder(AppendValue{});
+ }
+
+ template
+ enable_if_has_string_view Visit(const T&) {
+ return UseBuilder(AppendBuffer{});
+ }
+
+ template
+ enable_if_decimal Visit(const T&) {
+ return UseBuilder(AppendValue{});
+ }
+
+ template
+ enable_if_list_like Visit(const T&) {
+ return UseBuilder(AppendList{});
+ }
+
+ Status Visit(const StructType& type) {
+ auto* builder = static_cast(builder_);
+ for (const auto s : scalars_) {
+ const auto& scalar = internal::checked_cast(*s);
+ for (int field_index = 0; field_index < type.num_fields(); ++field_index) {
+ if (!scalar.is_valid || !scalar.value[field_index]) {
+ RETURN_NOT_OK(builder->field_builder(field_index)->AppendNull());
+ } else {
+ RETURN_NOT_OK(builder->field_builder(field_index)
+ ->AppendScalar(*scalar.value[field_index]));
+ }
+ }
+ RETURN_NOT_OK(builder->Append(scalar.is_valid));
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const DataType& type) {
+ return Status::NotImplemented("AppendScalar for type ", type);
+ }
+
+ Status Convert() { return VisitTypeInline(*scalars_[0]->type, this); }
+
+ std::vector scalars_;
+ ArrayBuilder* builder_;
+};
+
+Status ArrayBuilder::AppendScalar(const Scalar& scalar) {
+ if (!scalar.type->Equals(type())) {
+ return Status::Invalid("Cannot append scalar of type ", scalar.type->ToString(),
+ " to builder for type ", type()->ToString());
+ }
+ return AppendScalarImpl{{&scalar}, this}.Convert();
+}
+
+Status ArrayBuilder::AppendScalars(const ScalarVector& scalars) {
+ if (scalars.empty()) return Status::OK();
+ std::vector refs;
+ refs.reserve(scalars.size());
+ for (const auto& scalar : scalars) {
+ if (!scalar->type->Equals(type())) {
+ return Status::Invalid("Cannot append scalar of type ", scalar->type->ToString(),
+ " to builder for type ", type()->ToString());
+ }
+ refs.push_back(scalar.get());
+ }
+ return AppendScalarImpl{refs, this}.Convert();
+}
+
Status ArrayBuilder::Finish(std::shared_ptr* out) {
std::shared_ptr internal_data;
RETURN_NOT_OK(FinishInternal(&internal_data));
diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h
index 15c726241b5..e0ac4d68e1a 100644
--- a/cpp/src/arrow/array/builder_base.h
+++ b/cpp/src/arrow/array/builder_base.h
@@ -116,6 +116,10 @@ class ARROW_EXPORT ArrayBuilder {
/// This method is useful when appending null values to a parent nested type.
virtual Status AppendEmptyValues(int64_t length) = 0;
+ /// \brief Append a value from a scalar
+ virtual Status AppendScalar(const Scalar& scalar);
+ virtual Status AppendScalars(const ScalarVector& scalars);
+
/// For cases where raw data was memcpy'd into the internal buffers, allows us
/// to advance the length of the builder. It is your responsibility to use
/// this function responsibly.
diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h
index 40d6ce1ba9a..455cb3df7b1 100644
--- a/cpp/src/arrow/array/builder_dict.h
+++ b/cpp/src/arrow/array/builder_dict.h
@@ -29,6 +29,7 @@
#include "arrow/array/builder_primitive.h" // IWYU pragma: export
#include "arrow/array/data.h"
#include "arrow/array/util.h"
+#include "arrow/scalar.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
diff --git a/cpp/src/arrow/testing/generator.cc b/cpp/src/arrow/testing/generator.cc
index 71fad394d00..33371d55c6d 100644
--- a/cpp/src/arrow/testing/generator.cc
+++ b/cpp/src/arrow/testing/generator.cc
@@ -95,88 +95,16 @@ std::shared_ptr ConstantArrayGenerator::String(int64_t size,
return ConstantArray(size, value);
}
-struct ScalarVectorToArrayImpl {
- template ::BuilderType,
- typename ScalarType = typename TypeTraits::ScalarType>
- Status UseBuilder(const AppendScalar& append) {
- BuilderType builder(type_, default_memory_pool());
- for (const auto& s : scalars_) {
- if (s->is_valid) {
- RETURN_NOT_OK(append(internal::checked_cast(*s), &builder));
- } else {
- RETURN_NOT_OK(builder.AppendNull());
- }
- }
- return builder.FinishInternal(&data_);
- }
-
- struct AppendValue {
- template
- Status operator()(const ScalarType& s, BuilderType* builder) const {
- return builder->Append(s.value);
- }
- };
-
- struct AppendBuffer {
- template
- Status operator()(const ScalarType& s, BuilderType* builder) const {
- const Buffer& buffer = *s.value;
- return builder->Append(util::string_view{buffer});
- }
- };
-
- template
- enable_if_primitive_ctype Visit(const T&) {
- return UseBuilder(AppendValue{});
- }
-
- template
- enable_if_has_string_view Visit(const T&) {
- return UseBuilder(AppendBuffer{});
- }
-
- Status Visit(const StructType& type) {
- data_ = ArrayData::Make(type_, static_cast(scalars_.size()),
- {/*null_bitmap=*/nullptr});
- data_->child_data.resize(type_->num_fields());
-
- ScalarVector field_scalars(scalars_.size());
-
- for (int field_index = 0; field_index < type.num_fields(); ++field_index) {
- for (size_t i = 0; i < scalars_.size(); ++i) {
- field_scalars[i] =
- internal::checked_cast(scalars_[i].get())->value[field_index];
- }
-
- ARROW_ASSIGN_OR_RAISE(data_->child_data[field_index],
- ScalarVectorToArrayImpl{}.Convert(field_scalars));
- }
- return Status::OK();
- }
-
- Status Visit(const DataType& type) {
- return Status::NotImplemented("ScalarVectorToArray for type ", type);
- }
-
- Result> Convert(const ScalarVector& scalars) && {
- if (scalars.size() == 0) {
- return Status::NotImplemented("ScalarVectorToArray with no scalars");
- }
- scalars_ = std::move(scalars);
- type_ = scalars_[0]->type;
- RETURN_NOT_OK(VisitTypeInline(*type_, this));
- return std::move(data_);
- }
-
- std::shared_ptr type_;
- ScalarVector scalars_;
- std::shared_ptr data_;
-};
-
Result> ScalarVectorToArray(const ScalarVector& scalars) {
- ARROW_ASSIGN_OR_RAISE(auto data, ScalarVectorToArrayImpl{}.Convert(scalars));
- return MakeArray(std::move(data));
+ if (scalars.empty()) {  // the builder type below comes from scalars[0], so there is nothing to build from
+ return Status::NotImplemented("ScalarVectorToArray with no scalars");
+ }
+ std::unique_ptr builder;
+ RETURN_NOT_OK(MakeBuilder(default_memory_pool(), scalars[0]->type, &builder));  // builder for the first scalar's type
+ RETURN_NOT_OK(builder->AppendScalars(scalars));  // AppendScalars returns Status::Invalid if any scalar's type differs from the builder's
+ std::shared_ptr out;
+ RETURN_NOT_OK(builder->Finish(&out));
+ return out;
}
} // namespace arrow
From bd6ac13d54a177ed2ae9296001cc6964ac9d418e Mon Sep 17 00:00:00 2001
From: David Li
Date: Thu, 24 Jun 2021 11:45:26 -0400
Subject: [PATCH 02/14] ARROW-11932: [C++] Add ArrayBuilder::AppendScalar(const
Scalar&, int64_t)
---
cpp/src/arrow/array/array_test.cc | 25 +++++++--------
cpp/src/arrow/array/builder_base.cc | 49 ++++++++++++++++++-----------
cpp/src/arrow/array/builder_base.h | 5 +--
3 files changed, 46 insertions(+), 33 deletions(-)
diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc
index 97431a736e6..682baab208d 100644
--- a/cpp/src/arrow/array/array_test.cc
+++ b/cpp/src/arrow/array/array_test.cc
@@ -405,23 +405,22 @@ void AssertAppendScalar(MemoryPool* pool, const std::shared_ptr& scalar)
ASSERT_OK(builder->AppendScalar(*scalar));
ASSERT_OK(builder->AppendScalar(*null_scalar));
ASSERT_OK(builder->AppendScalars({scalar, null_scalar}));
+ ASSERT_OK(builder->AppendScalar(*scalar, /*n_repeats=*/2));
+ ASSERT_OK(builder->AppendScalar(*null_scalar, /*n_repeats=*/2));
std::shared_ptr out;
FinishAndCheckPadding(builder.get(), &out);
ASSERT_OK(out->ValidateFull());
- ASSERT_EQ(out->length(), 5);
- ASSERT_EQ(out->null_count(), 2);
- ASSERT_FALSE(out->IsNull(0));
- ASSERT_FALSE(out->IsNull(1));
- ASSERT_TRUE(out->IsNull(2));
- ASSERT_FALSE(out->IsNull(3));
- ASSERT_TRUE(out->IsNull(4));
- ASSERT_OK_AND_ASSIGN(auto scalar0, out->GetScalar(0));
- ASSERT_OK_AND_ASSIGN(auto scalar1, out->GetScalar(1));
- ASSERT_OK_AND_ASSIGN(auto scalar3, out->GetScalar(3));
- AssertScalarsEqual(*scalar, *scalar0, /*verbose=*/true);
- AssertScalarsEqual(*scalar, *scalar1, /*verbose=*/true);
- AssertScalarsEqual(*scalar, *scalar3, /*verbose=*/true);
+ ASSERT_EQ(out->length(), 9);
+ ASSERT_EQ(out->null_count(), 4);
+ for (const auto index : {0, 1, 3, 5, 6}) {
+ ASSERT_FALSE(out->IsNull(index));
+ ASSERT_OK_AND_ASSIGN(auto scalar_i, out->GetScalar(index));
+ AssertScalarsEqual(*scalar, *scalar_i, /*verbose=*/true);
+ }
+ for (const auto index : {2, 4, 7, 8}) {
+ ASSERT_TRUE(out->IsNull(index));
+ }
}
TEST_F(TestArray, TestMakeArrayFromScalar) {
diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc
index 59202241111..8dbad29008a 100644
--- a/cpp/src/arrow/array/builder_base.cc
+++ b/cpp/src/arrow/array/builder_base.cc
@@ -100,12 +100,14 @@ struct AppendScalarImpl {
typename BuilderType = typename TypeTraits::BuilderType,
typename ScalarType = typename TypeTraits::ScalarType>
Status UseBuilder(const AppendScalar& append) {
- for (const auto scalar : scalars_) {
- if (scalar->is_valid) {
- RETURN_NOT_OK(append(internal::checked_cast(*scalar),
- static_cast(builder_)));
- } else {
- RETURN_NOT_OK(builder_->AppendNull());
+ for (int64_t i = 0; i < n_repeats_; i++) {
+ for (const auto scalar : scalars_) {
+ if (scalar->is_valid) {
+ RETURN_NOT_OK(append(internal::checked_cast(*scalar),
+ static_cast(builder_)));
+ } else {
+ RETURN_NOT_OK(builder_->AppendNull());
+ }
}
}
return Status::OK();
@@ -160,18 +162,20 @@ struct AppendScalarImpl {
}
Status Visit(const StructType& type) {
- auto* builder = static_cast(builder_);
- for (const auto s : scalars_) {
- const auto& scalar = internal::checked_cast(*s);
- for (int field_index = 0; field_index < type.num_fields(); ++field_index) {
- if (!scalar.is_valid || !scalar.value[field_index]) {
- RETURN_NOT_OK(builder->field_builder(field_index)->AppendNull());
- } else {
- RETURN_NOT_OK(builder->field_builder(field_index)
- ->AppendScalar(*scalar.value[field_index]));
+ for (int64_t i = 0; i < n_repeats_; i++) {
+ auto* builder = static_cast(builder_);
+ for (const auto s : scalars_) {
+ const auto& scalar = internal::checked_cast(*s);
+ for (int field_index = 0; field_index < type.num_fields(); ++field_index) {
+ if (!scalar.is_valid || !scalar.value[field_index]) {
+ RETURN_NOT_OK(builder->field_builder(field_index)->AppendNull());
+ } else {
+ RETURN_NOT_OK(builder->field_builder(field_index)
+ ->AppendScalar(*scalar.value[field_index]));
+ }
}
+ RETURN_NOT_OK(builder->Append(scalar.is_valid));
}
- RETURN_NOT_OK(builder->Append(scalar.is_valid));
}
return Status::OK();
}
@@ -183,6 +187,7 @@ struct AppendScalarImpl {
Status Convert() { return VisitTypeInline(*scalars_[0]->type, this); }
std::vector scalars_;
+ int64_t n_repeats_;
ArrayBuilder* builder_;
};
@@ -191,7 +196,15 @@ Status ArrayBuilder::AppendScalar(const Scalar& scalar) {
return Status::Invalid("Cannot append scalar of type ", scalar.type->ToString(),
" to builder for type ", type()->ToString());
}
- return AppendScalarImpl{{&scalar}, this}.Convert();
+ return AppendScalarImpl{{&scalar}, /*n_repeats=*/1, this}.Convert();
+}
+
+Status ArrayBuilder::AppendScalar(const Scalar& scalar, int64_t n_repeats) {
+ if (!scalar.type->Equals(type())) {
+ return Status::Invalid("Cannot append scalar of type ", scalar.type->ToString(),
+ " to builder for type ", type()->ToString());
+ }
+ return AppendScalarImpl{{&scalar}, n_repeats, this}.Convert();
}
Status ArrayBuilder::AppendScalars(const ScalarVector& scalars) {
@@ -205,7 +218,7 @@ Status ArrayBuilder::AppendScalars(const ScalarVector& scalars) {
}
refs.push_back(scalar.get());
}
- return AppendScalarImpl{refs, this}.Convert();
+ return AppendScalarImpl{refs, /*n_repeats=*/1, this}.Convert();
}
Status ArrayBuilder::Finish(std::shared_ptr* out) {
diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h
index e0ac4d68e1a..8e60c306796 100644
--- a/cpp/src/arrow/array/builder_base.h
+++ b/cpp/src/arrow/array/builder_base.h
@@ -117,8 +117,9 @@ class ARROW_EXPORT ArrayBuilder {
virtual Status AppendEmptyValues(int64_t length) = 0;
/// \brief Append a value from a scalar
- virtual Status AppendScalar(const Scalar& scalar);
- virtual Status AppendScalars(const ScalarVector& scalars);
+ Status AppendScalar(const Scalar& scalar);  // appends `scalar` once; Status::Invalid if its type differs from type()
+ Status AppendScalar(const Scalar& scalar, int64_t n_repeats);  // appends `scalar` n_repeats times; same type check
+ Status AppendScalars(const ScalarVector& scalars);  // appends each scalar in order; an empty vector returns OK without appending
/// For cases where raw data was memcpy'd into the internal buffers, allows us
/// to advance the length of the builder. It is your responsibility to use
From 87d66dea345dfedbba2fd6881c5054b652c70b51 Mon Sep 17 00:00:00 2001
From: David Li
Date: Mon, 28 Jun 2021 10:29:41 -0400
Subject: [PATCH 03/14] ARROW-11932: [C++] Reserve() in AppendScalar
---
cpp/src/arrow/array/builder_base.cc | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc
index 8dbad29008a..584c244d989 100644
--- a/cpp/src/arrow/array/builder_base.cc
+++ b/cpp/src/arrow/array/builder_base.cc
@@ -95,6 +95,7 @@ Status ArrayBuilder::Advance(int64_t elements) {
return null_bitmap_builder_.Advance(elements);
}
+namespace {
struct AppendScalarImpl {
template ::BuilderType,
@@ -190,6 +191,7 @@ struct AppendScalarImpl {
int64_t n_repeats_;
ArrayBuilder* builder_;
};
+} // namespace
Status ArrayBuilder::AppendScalar(const Scalar& scalar) {
if (!scalar.type->Equals(type())) {
@@ -204,11 +206,13 @@ Status ArrayBuilder::AppendScalar(const Scalar& scalar, int64_t n_repeats) {
return Status::Invalid("Cannot append scalar of type ", scalar.type->ToString(),
" to builder for type ", type()->ToString());
}
+ RETURN_NOT_OK(Reserve(n_repeats));
return AppendScalarImpl{{&scalar}, n_repeats, this}.Convert();
}
Status ArrayBuilder::AppendScalars(const ScalarVector& scalars) {
if (scalars.empty()) return Status::OK();
+ RETURN_NOT_OK(Reserve(scalars.size()));
std::vector refs;
refs.reserve(scalars.size());
for (const auto& scalar : scalars) {
From 0da2e621a0932ab236336df9fd6a8bee059cca7f Mon Sep 17 00:00:00 2001
From: David Li
Date: Mon, 28 Jun 2021 15:44:55 -0400
Subject: [PATCH 04/14] ARROW-11932: [C++] Don't require an allocation in
AppendScalar
---
cpp/src/arrow/array/builder_base.cc | 32 +++++++++++++++--------------
1 file changed, 17 insertions(+), 15 deletions(-)
diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc
index 584c244d989..b127dd15415 100644
--- a/cpp/src/arrow/array/builder_base.cc
+++ b/cpp/src/arrow/array/builder_base.cc
@@ -102,9 +102,10 @@ struct AppendScalarImpl {
typename ScalarType = typename TypeTraits::ScalarType>
Status UseBuilder(const AppendScalar& append) {
for (int64_t i = 0; i < n_repeats_; i++) {
- for (const auto scalar : scalars_) {
- if (scalar->is_valid) {
- RETURN_NOT_OK(append(internal::checked_cast(*scalar),
+ for (const std::shared_ptr* scalar = scalars_begin_; scalar != scalars_end_;
+ scalar++) {
+ if ((*scalar)->is_valid) {
+ RETURN_NOT_OK(append(internal::checked_cast(**scalar),
static_cast(builder_)));
} else {
RETURN_NOT_OK(builder_->AppendNull());
@@ -165,8 +166,8 @@ struct AppendScalarImpl {
Status Visit(const StructType& type) {
for (int64_t i = 0; i < n_repeats_; i++) {
auto* builder = static_cast(builder_);
- for (const auto s : scalars_) {
- const auto& scalar = internal::checked_cast(*s);
+ for (const std::shared_ptr* s = scalars_begin_; s != scalars_end_; s++) {
+ const auto& scalar = internal::checked_cast(**s);
for (int field_index = 0; field_index < type.num_fields(); ++field_index) {
if (!scalar.is_valid || !scalar.value[field_index]) {
RETURN_NOT_OK(builder->field_builder(field_index)->AppendNull());
@@ -185,9 +186,10 @@ struct AppendScalarImpl {
return Status::NotImplemented("AppendScalar for type ", type);
}
- Status Convert() { return VisitTypeInline(*scalars_[0]->type, this); }
+ Status Convert() { return VisitTypeInline(*(*scalars_begin_)->type, this); }
- std::vector scalars_;
+ const std::shared_ptr* scalars_begin_;
+ const std::shared_ptr* scalars_end_;
int64_t n_repeats_;
ArrayBuilder* builder_;
};
@@ -198,7 +200,8 @@ Status ArrayBuilder::AppendScalar(const Scalar& scalar) {
return Status::Invalid("Cannot append scalar of type ", scalar.type->ToString(),
" to builder for type ", type()->ToString());
}
- return AppendScalarImpl{{&scalar}, /*n_repeats=*/1, this}.Convert();
+ std::shared_ptr shared{const_cast(&scalar), [](Scalar*) {}};  // non-owning: the no-op deleter means `scalar` is never freed through this pointer
+ return AppendScalarImpl{&shared, &shared + 1, /*n_repeats=*/1, this}.Convert();  // [&shared, &shared + 1) is a one-element range
}
Status ArrayBuilder::AppendScalar(const Scalar& scalar, int64_t n_repeats) {
@@ -207,22 +210,21 @@ Status ArrayBuilder::AppendScalar(const Scalar& scalar, int64_t n_repeats) {
" to builder for type ", type()->ToString());
}
RETURN_NOT_OK(Reserve(n_repeats));
- return AppendScalarImpl{{&scalar}, n_repeats, this}.Convert();
+ std::shared_ptr shared{const_cast(&scalar), [](Scalar*) {}};
+ return AppendScalarImpl{&shared, &shared + 1, n_repeats, this}.Convert();
}
Status ArrayBuilder::AppendScalars(const ScalarVector& scalars) {
if (scalars.empty()) return Status::OK();
- RETURN_NOT_OK(Reserve(scalars.size()));
- std::vector refs;
- refs.reserve(scalars.size());
+ const auto ty = type();
for (const auto& scalar : scalars) {
- if (!scalar->type->Equals(type())) {
+ if (!scalar->type->Equals(ty)) {
return Status::Invalid("Cannot append scalar of type ", scalar->type->ToString(),
" to builder for type ", type()->ToString());
}
- refs.push_back(scalar.get());
}
- return AppendScalarImpl{refs, /*n_repeats=*/1, this}.Convert();
+ return AppendScalarImpl{&*scalars.cbegin(), &*scalars.cend(), /*n_repeats=*/1, this}
+ .Convert();
}
Status ArrayBuilder::Finish(std::shared_ptr* out) {
From 0a76cb7b7f99a809dc2b0b074097e932adf34e74 Mon Sep 17 00:00:00 2001
From: David Li
Date: Mon, 28 Jun 2021 17:04:50 -0400
Subject: [PATCH 05/14] ARROW-11932: [C++] Fix test on MSVC
---
cpp/src/arrow/array/builder_base.cc | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc
index b127dd15415..ab04e099904 100644
--- a/cpp/src/arrow/array/builder_base.cc
+++ b/cpp/src/arrow/array/builder_base.cc
@@ -223,7 +223,8 @@ Status ArrayBuilder::AppendScalars(const ScalarVector& scalars) {
" to builder for type ", type()->ToString());
}
}
- return AppendScalarImpl{&*scalars.cbegin(), &*scalars.cend(), /*n_repeats=*/1, this}
+ return AppendScalarImpl{scalars.data(), scalars.data() + scalars.size(),  // [data, data + size) covers the elements without dereferencing the end iterator
+ /*n_repeats=*/1, this}
.Convert();
}
From f273db17f54a4cff185449fc69e02b0e710b6310 Mon Sep 17 00:00:00 2001
From: David Li
Date: Tue, 29 Jun 2021 11:04:21 -0400
Subject: [PATCH 06/14] ARROW-11932: [C++] Reserve up front and use UnsafeAppend in AppendScalarImpl
Co-authored-by: Benjamin Kietzman
---
cpp/src/arrow/array/builder_base.cc | 50 +++++++++++++++++++++++------
1 file changed, 41 insertions(+), 9 deletions(-)
diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc
index ab04e099904..a61f92f0fc2 100644
--- a/cpp/src/arrow/array/builder_base.cc
+++ b/cpp/src/arrow/array/builder_base.cc
@@ -144,20 +144,52 @@ struct AppendScalarImpl {
};
template
- enable_if_has_c_type Visit(const T&) {
- return UseBuilder(AppendValue{});
- }
+ enable_if_fixed_width Visit(const T&) {
+ auto builder = checked_cast::BuilderType*>(builder_);
+ RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalar_end_ - scalar_begin_)));
- template
- enable_if_has_string_view Visit(const T&) {
- return UseBuilder(AppendBuffer{});
+ for (int64_t i = 0; i < n_repeats_; i++) {
+ for (const std::shared_ptr* raw = scalars_begin_; raw != scalars_end_;
+ raw++) {
+ auto scalar = checked_cast::ScalarType*>(raw->get());
+ if (scalar->is_valid) {
+ builder->UnsafeAppend(scalar->value);
+ } else {
+ builder->UnsafeAppendNull();
+ }
+ }
+ }
+ return Status::OK();
}
template
- enable_if_decimal Visit(const T&) {
- return UseBuilder(AppendValue{});
- }
+ enable_if_base_binary Visit(const T&) {
+ int64_t data_size = 0;
+ for (const std::shared_ptr* raw = scalars_begin_; raw != scalars_end_;
+ raw++) {
+ auto scalar = checked_cast::ScalarType*>(raw->get());
+ if (scalar->is_valid) {
+ data_size += scalar->value->size();
+ }
+ }
+
+ auto builder = checked_cast::BuilderType*>(builder_);
+ RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalar_end_ - scalar_begin_)));
+ RETURN_NOT_OK(builder->ReserveData(n_repeats_ * data_size));
+ for (int64_t i = 0; i < n_repeats_; i++) {
+ for (const std::shared_ptr* raw = scalars_begin_; raw != scalars_end_;
+ raw++) {
+ auto scalar = checked_cast::ScalarType*>(raw->get());
+ if (scalar->is_valid) {
+ builder->UnsafeAppend(util::string_view{*scalar->value});
+ } else {
+ builder->UnsafeAppendNull();
+ }
+ }
+ }
+ return Status::OK();
+ }
template
enable_if_list_like Visit(const T&) {
return UseBuilder(AppendList{});
From 6d966eae7b33b1e3ba7b12141ec8ffbb72446583 Mon Sep 17 00:00:00 2001
From: David Li
Date: Tue, 29 Jun 2021 11:31:21 -0400
Subject: [PATCH 07/14] ARROW-11932: [C++] Reserve in more cases
---
cpp/src/arrow/array/builder_base.cc | 110 +++++++++++++---------------
1 file changed, 51 insertions(+), 59 deletions(-)
diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc
index a61f92f0fc2..d5d3b4e1288 100644
--- a/cpp/src/arrow/array/builder_base.cc
+++ b/cpp/src/arrow/array/builder_base.cc
@@ -97,61 +97,16 @@ Status ArrayBuilder::Advance(int64_t elements) {
namespace {
struct AppendScalarImpl {
- template ::BuilderType,
- typename ScalarType = typename TypeTraits::ScalarType>
- Status UseBuilder(const AppendScalar& append) {
- for (int64_t i = 0; i < n_repeats_; i++) {
- for (const std::shared_ptr* scalar = scalars_begin_; scalar != scalars_end_;
- scalar++) {
- if ((*scalar)->is_valid) {
- RETURN_NOT_OK(append(internal::checked_cast(**scalar),
- static_cast(builder_)));
- } else {
- RETURN_NOT_OK(builder_->AppendNull());
- }
- }
- }
- return Status::OK();
- }
-
- struct AppendValue {
- template
- Status operator()(const ScalarType& s, BuilderType* builder) const {
- return builder->Append(s.value);
- }
- };
-
- struct AppendBuffer {
- template
- Status operator()(const ScalarType& s, BuilderType* builder) const {
- const Buffer& buffer = *s.value;
- return builder->Append(util::string_view{buffer});
- }
- };
-
- struct AppendList {
- template
- Status operator()(const ScalarType& s, BuilderType* builder) const {
- RETURN_NOT_OK(builder->Append());
- const Array& list = *s.value;
- for (int64_t i = 0; i < list.length(); i++) {
- ARROW_ASSIGN_OR_RAISE(auto scalar, list.GetScalar(i));
- RETURN_NOT_OK(builder->value_builder()->AppendScalar(*scalar));
- }
- return Status::OK();
- }
- };
-
template
- enable_if_fixed_width Visit(const T&) {
- auto builder = checked_cast::BuilderType*>(builder_);
- RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalar_end_ - scalar_begin_)));
+ enable_if_t::value || is_decimal_type::value, Status> Visit(const T&) {
+ auto builder = internal::checked_cast::BuilderType*>(builder_);
+ RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalars_end_ - scalars_begin_)));
for (int64_t i = 0; i < n_repeats_; i++) {
for (const std::shared_ptr* raw = scalars_begin_; raw != scalars_end_;
raw++) {
- auto scalar = checked_cast::ScalarType*>(raw->get());
+ auto scalar =
+ internal::checked_cast::ScalarType*>(raw->get());
if (scalar->is_valid) {
builder->UnsafeAppend(scalar->value);
} else {
@@ -163,24 +118,26 @@ struct AppendScalarImpl {
}
template
- enable_if_base_binary Visit(const T&) {
+ enable_if_has_string_view Visit(const T&) {
int64_t data_size = 0;
for (const std::shared_ptr* raw = scalars_begin_; raw != scalars_end_;
- raw++) {
- auto scalar = checked_cast::ScalarType*>(raw->get());
+ raw++) {
+ auto scalar =
+ internal::checked_cast::ScalarType*>(raw->get());
if (scalar->is_valid) {
data_size += scalar->value->size();
}
}
- auto builder = checked_cast::BuilderType*>(builder_);
- RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalar_end_ - scalar_begin_)));
+ auto builder = internal::checked_cast::BuilderType*>(builder_);
+ RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalars_end_ - scalars_begin_)));
RETURN_NOT_OK(builder->ReserveData(n_repeats_ * data_size));
for (int64_t i = 0; i < n_repeats_; i++) {
for (const std::shared_ptr* raw = scalars_begin_; raw != scalars_end_;
raw++) {
- auto scalar = checked_cast::ScalarType*>(raw->get());
+ auto scalar =
+ internal::checked_cast::ScalarType*>(raw->get());
if (scalar->is_valid) {
builder->UnsafeAppend(util::string_view{*scalar->value});
} else {
@@ -190,14 +147,50 @@ struct AppendScalarImpl {
}
return Status::OK();
}
+
+ struct AppendList {
+ template
+ Status operator()(const ScalarType& s, BuilderType* builder) const {
+ RETURN_NOT_OK(builder->Append());
+ const Array& list = *s.value;
+ for (int64_t i = 0; i < list.length(); i++) {
+ ARROW_ASSIGN_OR_RAISE(auto scalar, list.GetScalar(i));
+ RETURN_NOT_OK(builder->value_builder()->AppendScalar(*scalar));
+ }
+ return Status::OK();
+ }
+ };
+
template
enable_if_list_like Visit(const T&) {
- return UseBuilder(AppendList{});
+ auto builder = internal::checked_cast::BuilderType*>(builder_);
+ for (int64_t i = 0; i < n_repeats_; i++) {
+ for (const std::shared_ptr* scalar = scalars_begin_; scalar != scalars_end_;
+ scalar++) {
+ if ((*scalar)->is_valid) {
+ RETURN_NOT_OK(builder->Append());
+ const Array& list =
+ *internal::checked_cast(**scalar).value;
+ for (int64_t i = 0; i < list.length(); i++) {
+ ARROW_ASSIGN_OR_RAISE(auto scalar, list.GetScalar(i));
+ RETURN_NOT_OK(builder->value_builder()->AppendScalar(*scalar));
+ }
+ } else {
+ RETURN_NOT_OK(builder_->AppendNull());
+ }
+ }
+ }
+ return Status::OK();
}
Status Visit(const StructType& type) {
+ auto* builder = internal::checked_cast(builder_);
+ auto count = n_repeats_ * (scalars_end_ - scalars_begin_);
+ RETURN_NOT_OK(builder->Reserve(count));
+ for (int field_index = 0; field_index < type.num_fields(); ++field_index) {
+ RETURN_NOT_OK(builder->field_builder(field_index)->Reserve(count));
+ }
for (int64_t i = 0; i < n_repeats_; i++) {
- auto* builder = static_cast(builder_);
for (const std::shared_ptr* s = scalars_begin_; s != scalars_end_; s++) {
const auto& scalar = internal::checked_cast(**s);
for (int field_index = 0; field_index < type.num_fields(); ++field_index) {
@@ -241,7 +234,6 @@ Status ArrayBuilder::AppendScalar(const Scalar& scalar, int64_t n_repeats) {
return Status::Invalid("Cannot append scalar of type ", scalar.type->ToString(),
" to builder for type ", type()->ToString());
}
- RETURN_NOT_OK(Reserve(n_repeats));
std::shared_ptr shared{const_cast(&scalar), [](Scalar*) {}};
return AppendScalarImpl{&shared, &shared + 1, n_repeats, this}.Convert();
}
From c64f6a7feb1cfb257b21e5c98af6be69c0967836 Mon Sep 17 00:00:00 2001
From: David Li
Date: Tue, 29 Jun 2021 14:29:44 -0400
Subject: [PATCH 08/14] ARROW-11932: [C++] Handle fixed size binary with other
fixed size types
---
cpp/src/arrow/array/builder_base.cc | 7 +++++--
cpp/src/arrow/array/builder_binary.h | 12 ++++++++++++
2 files changed, 17 insertions(+), 2 deletions(-)
diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc
index d5d3b4e1288..fc1c3a5057a 100644
--- a/cpp/src/arrow/array/builder_base.cc
+++ b/cpp/src/arrow/array/builder_base.cc
@@ -98,7 +98,10 @@ Status ArrayBuilder::Advance(int64_t elements) {
namespace {
struct AppendScalarImpl {
template
- enable_if_t::value || is_decimal_type::value, Status> Visit(const T&) {
+ enable_if_t::value || is_decimal_type::value ||
+ is_fixed_size_binary_type::value,
+ Status>
+ Visit(const T&) {
auto builder = internal::checked_cast::BuilderType*>(builder_);
RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalars_end_ - scalars_begin_)));
@@ -118,7 +121,7 @@ struct AppendScalarImpl {
}
template
- enable_if_has_string_view Visit(const T&) {
+ enable_if_base_binary Visit(const T&) {
int64_t data_size = 0;
for (const std::shared_ptr* raw = scalars_begin_; raw != scalars_end_;
raw++) {
diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h
index c1c664a1249..7653eeca5c4 100644
--- a/cpp/src/arrow/array/builder_binary.h
+++ b/cpp/src/arrow/array/builder_binary.h
@@ -467,6 +467,14 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
return Status::OK();
}
+ Status Append(const Buffer& s) {
+ ARROW_RETURN_NOT_OK(Reserve(1));
+ UnsafeAppend(util::string_view(s));
+ return Status::OK();
+ }
+
+ Status Append(const std::shared_ptr& s) { return Append(*s); }
+
template
Status Append(const std::array& value) {
ARROW_RETURN_NOT_OK(Reserve(1));
@@ -502,6 +510,10 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
UnsafeAppend(reinterpret_cast(value.data()));
}
+ void UnsafeAppend(const Buffer& s) { UnsafeAppend(util::string_view(s)); }
+
+ void UnsafeAppend(const std::shared_ptr& s) { UnsafeAppend(*s); }
+
void UnsafeAppendNull() {
UnsafeAppendToBitmap(false);
byte_builder_.UnsafeAppend(/*num_copies=*/byte_width_, 0);
From 9a48b4a747e6eb64a0891c55e85faa0bf3a18401 Mon Sep 17 00:00:00 2001
From: David Li
Date: Tue, 29 Jun 2021 14:37:13 -0400
Subject: [PATCH 09/14] ARROW-11932: [C++] Clean up implementation for lists
---
cpp/src/arrow/array/builder_base.cc | 22 +++++++++-------------
1 file changed, 9 insertions(+), 13 deletions(-)
diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc
index fc1c3a5057a..c892e3d664b 100644
--- a/cpp/src/arrow/array/builder_base.cc
+++ b/cpp/src/arrow/array/builder_base.cc
@@ -151,22 +151,18 @@ struct AppendScalarImpl {
return Status::OK();
}
- struct AppendList {
- template
- Status operator()(const ScalarType& s, BuilderType* builder) const {
- RETURN_NOT_OK(builder->Append());
- const Array& list = *s.value;
- for (int64_t i = 0; i < list.length(); i++) {
- ARROW_ASSIGN_OR_RAISE(auto scalar, list.GetScalar(i));
- RETURN_NOT_OK(builder->value_builder()->AppendScalar(*scalar));
- }
- return Status::OK();
- }
- };
-
template
enable_if_list_like Visit(const T&) {
auto builder = internal::checked_cast::BuilderType*>(builder_);
+ int64_t num_children = 0;
+ for (const std::shared_ptr* scalar = scalars_begin_; scalar != scalars_end_;
+ scalar++) {
+ if (!(*scalar)->is_valid) continue;
+ num_children +=
+ internal::checked_cast(**scalar).value->length();
+ }
+ RETURN_NOT_OK(builder->value_builder()->Reserve(num_children * n_repeats_));
+
for (int64_t i = 0; i < n_repeats_; i++) {
for (const std::shared_ptr* scalar = scalars_begin_; scalar != scalars_end_;
scalar++) {
From ab60ef12a333719f1e243548dff1538cb656ec12 Mon Sep 17 00:00:00 2001
From: David Li
Date: Wed, 9 Jun 2021 17:30:35 -0400
Subject: [PATCH 10/14] ARROW-13025: [C++] Add FunctionOptions::Equals/ToString
---
cpp/src/arrow/CMakeLists.txt | 1 +
cpp/src/arrow/compute/api_aggregate.cc | 94 +++
cpp/src/arrow/compute/api_aggregate.h | 52 +-
cpp/src/arrow/compute/api_scalar.cc | 214 ++++++-
cpp/src/arrow/compute/api_scalar.h | 136 +++--
cpp/src/arrow/compute/api_vector.cc | 99 +++
cpp/src/arrow/compute/api_vector.h | 64 +-
cpp/src/arrow/compute/cast.cc | 29 +-
cpp/src/arrow/compute/cast.h | 12 +-
cpp/src/arrow/compute/exec.h | 2 +-
cpp/src/arrow/compute/exec/expression.cc | 176 +-----
.../arrow/compute/exec/expression_internal.h | 11 -
cpp/src/arrow/compute/exec/expression_test.cc | 41 +-
cpp/src/arrow/compute/exec/test_util.cc | 2 +-
cpp/src/arrow/compute/exec_test.cc | 53 +-
cpp/src/arrow/compute/function.cc | 35 ++
cpp/src/arrow/compute/function.h | 38 +-
cpp/src/arrow/compute/function_internal.cc | 104 ++++
cpp/src/arrow/compute/function_internal.h | 577 ++++++++++++++++++
cpp/src/arrow/compute/function_test.cc | 94 +++
cpp/src/arrow/compute/kernel.h | 2 +-
cpp/src/arrow/compute/registry.cc | 42 ++
cpp/src/arrow/compute/registry.h | 10 +
cpp/src/arrow/compute/registry_internal.h | 6 +
cpp/src/arrow/compute/type_fwd.h | 4 +-
cpp/src/arrow/util/reflection_internal.h | 5 +
python/pyarrow/_compute.pxd | 3 +
python/pyarrow/_compute.pyx | 302 ++++-----
python/pyarrow/includes/libarrow.pxd | 15 +-
python/pyarrow/tests/test_compute.py | 34 ++
30 files changed, 1746 insertions(+), 511 deletions(-)
create mode 100644 cpp/src/arrow/compute/function_internal.cc
create mode 100644 cpp/src/arrow/compute/function_internal.h
diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index 79b48461f9b..484c3e9e769 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -371,6 +371,7 @@ if(ARROW_COMPUTE)
compute/exec/exec_plan.cc
compute/exec/expression.cc
compute/function.cc
+ compute/function_internal.cc
compute/kernel.cc
compute/registry.cc
compute/kernels/aggregate_basic.cc
diff --git a/cpp/src/arrow/compute/api_aggregate.cc b/cpp/src/arrow/compute/api_aggregate.cc
index efff4ac67df..17f9cd50552 100644
--- a/cpp/src/arrow/compute/api_aggregate.cc
+++ b/cpp/src/arrow/compute/api_aggregate.cc
@@ -18,10 +18,104 @@
#include "arrow/compute/api_aggregate.h"
#include "arrow/compute/exec.h"
+#include "arrow/compute/function_internal.h"
+#include "arrow/compute/registry.h"
+#include "arrow/compute/util_internal.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
namespace arrow {
namespace compute {
+// ----------------------------------------------------------------------
+// Function options
+
+using ::arrow::internal::checked_cast;
+
+namespace internal {
+template <>
+struct EnumTraits
+ : BasicEnumTraits {
+ static std::string name() { return "QuantileOptions::Interpolation"; }
+ static std::array values() {
+ return {
+ QuantileOptions::LINEAR, QuantileOptions::LOWER, QuantileOptions::HIGHER,
+ QuantileOptions::NEAREST, QuantileOptions::MIDPOINT,
+ };
+ }
+};
+namespace {
+using ::arrow::internal::DataMember;
+static auto kScalarAggregateOptionsType = GetFunctionOptionsType(
+ DataMember("skip_nulls", &ScalarAggregateOptions::skip_nulls),
+ DataMember("min_count", &ScalarAggregateOptions::min_count));
+static auto kModeOptionsType =
+ GetFunctionOptionsType(DataMember("n", &ModeOptions::n));
+static auto kVarianceOptionsType =
+ GetFunctionOptionsType(DataMember("ddof", &VarianceOptions::ddof));
+static auto kQuantileOptionsType = GetFunctionOptionsType(
+ DataMember("q", &QuantileOptions::q),
+ DataMember("interpolation", &QuantileOptions::interpolation));
+static auto kTDigestOptionsType = GetFunctionOptionsType(
+ DataMember("q", &TDigestOptions::q), DataMember("delta", &TDigestOptions::delta),
+ DataMember("buffer_size", &TDigestOptions::buffer_size));
+static auto kIndexOptionsType =
+ GetFunctionOptionsType(DataMember("value", &IndexOptions::value));
+} // namespace
+} // namespace internal
+
+ScalarAggregateOptions::ScalarAggregateOptions(bool skip_nulls, uint32_t min_count)
+ : FunctionOptions(internal::kScalarAggregateOptionsType),
+ skip_nulls(skip_nulls),
+ min_count(min_count) {}
+constexpr char ScalarAggregateOptions::kTypeName[];
+
+ModeOptions::ModeOptions(int64_t n) : FunctionOptions(internal::kModeOptionsType), n(n) {}
+constexpr char ModeOptions::kTypeName[];
+
+VarianceOptions::VarianceOptions(int ddof)
+ : FunctionOptions(internal::kVarianceOptionsType), ddof(ddof) {}
+constexpr char VarianceOptions::kTypeName[];
+
+QuantileOptions::QuantileOptions(double q, enum Interpolation interpolation)
+ : FunctionOptions(internal::kQuantileOptionsType),
+ q{q},
+ interpolation{interpolation} {}
+QuantileOptions::QuantileOptions(std::vector q, enum Interpolation interpolation)
+ : FunctionOptions(internal::kQuantileOptionsType),
+ q{std::move(q)},
+ interpolation{interpolation} {}
+constexpr char QuantileOptions::kTypeName[];
+
+TDigestOptions::TDigestOptions(double q, uint32_t delta, uint32_t buffer_size)
+ : FunctionOptions(internal::kTDigestOptionsType),
+ q{q},
+ delta{delta},
+ buffer_size{buffer_size} {}
+TDigestOptions::TDigestOptions(std::vector q, uint32_t delta,
+ uint32_t buffer_size)
+ : FunctionOptions(internal::kTDigestOptionsType),
+ q{std::move(q)},
+ delta{delta},
+ buffer_size{buffer_size} {}
+constexpr char TDigestOptions::kTypeName[];
+
+IndexOptions::IndexOptions(std::shared_ptr value)
+ : FunctionOptions(internal::kIndexOptionsType), value{std::move(value)} {}
+IndexOptions::IndexOptions() : IndexOptions(std::make_shared()) {}
+constexpr char IndexOptions::kTypeName[];
+
+namespace internal {
+void RegisterAggregateOptions(FunctionRegistry* registry) {
+ DCHECK_OK(registry->AddFunctionOptionsType(kScalarAggregateOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kModeOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kVarianceOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kQuantileOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kTDigestOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kIndexOptionsType));
+}
+} // namespace internal
+
// ----------------------------------------------------------------------
// Scalar aggregates
diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h
index 121896f1c97..9be0b406aa4 100644
--- a/cpp/src/arrow/compute/api_aggregate.h
+++ b/cpp/src/arrow/compute/api_aggregate.h
@@ -43,10 +43,10 @@ class ExecContext;
/// \brief Control general scalar aggregate kernel behavior
///
/// By default, null values are ignored
-struct ARROW_EXPORT ScalarAggregateOptions : public FunctionOptions {
- explicit ScalarAggregateOptions(bool skip_nulls = true, uint32_t min_count = 1)
- : skip_nulls(skip_nulls), min_count(min_count) {}
-
+class ARROW_EXPORT ScalarAggregateOptions : public FunctionOptions {
+ public:
+ explicit ScalarAggregateOptions(bool skip_nulls = true, uint32_t min_count = 1);
+ constexpr static char const kTypeName[] = "scalar_aggregate";
static ScalarAggregateOptions Defaults() { return ScalarAggregateOptions{}; }
bool skip_nulls;
@@ -57,9 +57,10 @@ struct ARROW_EXPORT ScalarAggregateOptions : public FunctionOptions {
///
/// Returns top-n common values and counts.
/// By default, returns the most common value and count.
-struct ARROW_EXPORT ModeOptions : public FunctionOptions {
- explicit ModeOptions(int64_t n = 1) : n(n) {}
-
+class ARROW_EXPORT ModeOptions : public FunctionOptions {
+ public:
+ explicit ModeOptions(int64_t n = 1);
+ constexpr static char const kTypeName[] = "mode";
static ModeOptions Defaults() { return ModeOptions{}; }
int64_t n = 1;
@@ -69,9 +70,10 @@ struct ARROW_EXPORT ModeOptions : public FunctionOptions {
///
/// The divisor used in calculations is N - ddof, where N is the number of elements.
/// By default, ddof is zero, and population variance or stddev is returned.
-struct ARROW_EXPORT VarianceOptions : public FunctionOptions {
- explicit VarianceOptions(int ddof = 0) : ddof(ddof) {}
-
+class ARROW_EXPORT VarianceOptions : public FunctionOptions {
+ public:
+ explicit VarianceOptions(int ddof = 0);
+ constexpr static char const kTypeName[] = "variance";
static VarianceOptions Defaults() { return VarianceOptions{}; }
int ddof = 0;
@@ -80,7 +82,8 @@ struct ARROW_EXPORT VarianceOptions : public FunctionOptions {
/// \brief Control Quantile kernel behavior
///
/// By default, returns the median value.
-struct ARROW_EXPORT QuantileOptions : public FunctionOptions {
+class ARROW_EXPORT QuantileOptions : public FunctionOptions {
+ public:
/// Interpolation method to use when quantile lies between two data points
enum Interpolation {
LINEAR = 0,
@@ -90,13 +93,12 @@ struct ARROW_EXPORT QuantileOptions : public FunctionOptions {
MIDPOINT,
};
- explicit QuantileOptions(double q = 0.5, enum Interpolation interpolation = LINEAR)
- : q{q}, interpolation{interpolation} {}
+ explicit QuantileOptions(double q = 0.5, enum Interpolation interpolation = LINEAR);
explicit QuantileOptions(std::vector q,
- enum Interpolation interpolation = LINEAR)
- : q{std::move(q)}, interpolation{interpolation} {}
+ enum Interpolation interpolation = LINEAR);
+ constexpr static char const kTypeName[] = "quantile";
static QuantileOptions Defaults() { return QuantileOptions{}; }
/// quantile must be between 0 and 1 inclusive
@@ -107,15 +109,13 @@ struct ARROW_EXPORT QuantileOptions : public FunctionOptions {
/// \brief Control TDigest approximate quantile kernel behavior
///
/// By default, returns the median value.
-struct ARROW_EXPORT TDigestOptions : public FunctionOptions {
+class ARROW_EXPORT TDigestOptions : public FunctionOptions {
+ public:
explicit TDigestOptions(double q = 0.5, uint32_t delta = 100,
- uint32_t buffer_size = 500)
- : q{q}, delta{delta}, buffer_size{buffer_size} {}
-
+ uint32_t buffer_size = 500);
explicit TDigestOptions(std::vector q, uint32_t delta = 100,
- uint32_t buffer_size = 500)
- : q{std::move(q)}, delta{delta}, buffer_size{buffer_size} {}
-
+ uint32_t buffer_size = 500);
+ constexpr static char const kTypeName[] = "t_digest";
static TDigestOptions Defaults() { return TDigestOptions{}; }
/// quantile must be between 0 and 1 inclusive
@@ -127,8 +127,12 @@ struct ARROW_EXPORT TDigestOptions : public FunctionOptions {
};
/// \brief Control Index kernel behavior
-struct ARROW_EXPORT IndexOptions : public FunctionOptions {
- explicit IndexOptions(std::shared_ptr value) : value{std::move(value)} {}
+class ARROW_EXPORT IndexOptions : public FunctionOptions {
+ public:
+ explicit IndexOptions(std::shared_ptr value);
+ // Default constructor for serialization
+ IndexOptions();
+ constexpr static char const kTypeName[] = "index";
std::shared_ptr value;
};
diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc
index db1cac290cf..9018e0b192c 100644
--- a/cpp/src/arrow/compute/api_scalar.cc
+++ b/cpp/src/arrow/compute/api_scalar.cc
@@ -21,13 +21,225 @@
#include
#include
+#include "arrow/array/array_base.h"
#include "arrow/compute/exec.h"
+#include "arrow/compute/function_internal.h"
+#include "arrow/compute/registry.h"
+#include "arrow/compute/util_internal.h"
#include "arrow/status.h"
#include "arrow/type.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
namespace arrow {
namespace compute {
+// ----------------------------------------------------------------------
+// Function options
+
+using ::arrow::internal::checked_cast;
+
+namespace internal {
+template <>
+struct EnumTraits
+ : BasicEnumTraits {
+ static std::string name() { return "JoinOptions::NullHandlingBehavior"; }
+ static std::array values() {
+ return {
+ JoinOptions::NullHandlingBehavior::EMIT_NULL,
+ JoinOptions::NullHandlingBehavior::SKIP,
+ JoinOptions::NullHandlingBehavior::REPLACE,
+ };
+ }
+};
+template <>
+struct EnumTraits : BasicEnumTraits {
+ static std::string name() { return "TimeUnit::type"; }
+ static std::array values() {
+ return {
+ TimeUnit::type::SECOND,
+ TimeUnit::type::MILLI,
+ TimeUnit::type::MICRO,
+ TimeUnit::type::NANO,
+ };
+ }
+};
+namespace {
+using ::arrow::internal::DataMember;
+static auto kElementWiseAggregateOptionsType =
+ GetFunctionOptionsType(
+ DataMember("skip_nulls", &ElementWiseAggregateOptions::skip_nulls));
+static auto kJoinOptionsType = GetFunctionOptionsType(
+ DataMember("null_handling", &JoinOptions::null_handling),
+ DataMember("null_replacement", &JoinOptions::null_replacement));
+static auto kMatchSubstringOptionsType = GetFunctionOptionsType(
+ DataMember("pattern", &MatchSubstringOptions::pattern),
+ DataMember("ignore_case", &MatchSubstringOptions::ignore_case));
+static auto kSplitOptionsType = GetFunctionOptionsType(
+ DataMember("max_splits", &SplitOptions::max_splits),
+ DataMember("reverse", &SplitOptions::reverse));
+static auto kSplitPatternOptionsType = GetFunctionOptionsType(
+ DataMember("pattern", &SplitPatternOptions::pattern),
+ DataMember("max_splits", &SplitPatternOptions::max_splits),
+ DataMember("reverse", &SplitPatternOptions::reverse));
+static auto kReplaceSliceOptionsType = GetFunctionOptionsType(
+ DataMember("start", &ReplaceSliceOptions::start),
+ DataMember("stop", &ReplaceSliceOptions::stop),
+ DataMember("replacement", &ReplaceSliceOptions::replacement));
+static auto kReplaceSubstringOptionsType =
+ GetFunctionOptionsType(
+ DataMember("pattern", &ReplaceSubstringOptions::pattern),
+ DataMember("replacement", &ReplaceSubstringOptions::replacement),
+ DataMember("max_replacements", &ReplaceSubstringOptions::max_replacements));
+static auto kExtractRegexOptionsType = GetFunctionOptionsType(
+ DataMember("pattern", &ExtractRegexOptions::pattern));
+static auto kSetLookupOptionsType = GetFunctionOptionsType(
+ DataMember("value_set", &SetLookupOptions::value_set),
+ DataMember("skip_nulls", &SetLookupOptions::skip_nulls));
+static auto kStrptimeOptionsType = GetFunctionOptionsType(
+ DataMember("format", &StrptimeOptions::format),
+ DataMember("unit", &StrptimeOptions::unit));
+static auto kPadOptionsType = GetFunctionOptionsType(
+ DataMember("width", &PadOptions::width), DataMember("padding", &PadOptions::padding));
+static auto kTrimOptionsType = GetFunctionOptionsType(
+ DataMember("characters", &TrimOptions::characters));
+static auto kSliceOptionsType = GetFunctionOptionsType(
+ DataMember("start", &SliceOptions::start), DataMember("stop", &SliceOptions::stop),
+ DataMember("step", &SliceOptions::step));
+static auto kProjectOptionsType = GetFunctionOptionsType(
+ DataMember("field_names", &ProjectOptions::field_names),
+ DataMember("field_nullability", &ProjectOptions::field_nullability),
+ DataMember("field_metadata", &ProjectOptions::field_metadata));
+} // namespace
+} // namespace internal
+
+ElementWiseAggregateOptions::ElementWiseAggregateOptions(bool skip_nulls)
+ : FunctionOptions(internal::kElementWiseAggregateOptionsType),
+ skip_nulls(skip_nulls) {}
+constexpr char ElementWiseAggregateOptions::kTypeName[];
+
+JoinOptions::JoinOptions(NullHandlingBehavior null_handling, std::string null_replacement)
+ : FunctionOptions(internal::kJoinOptionsType),
+ null_handling(null_handling),
+ null_replacement(std::move(null_replacement)) {}
+constexpr char JoinOptions::kTypeName[];
+
+MatchSubstringOptions::MatchSubstringOptions(std::string pattern, bool ignore_case)
+ : FunctionOptions(internal::kMatchSubstringOptionsType),
+ pattern(std::move(pattern)),
+ ignore_case(ignore_case) {}
+MatchSubstringOptions::MatchSubstringOptions() : MatchSubstringOptions("", false) {}
+constexpr char MatchSubstringOptions::kTypeName[];
+
+SplitOptions::SplitOptions(int64_t max_splits, bool reverse)
+ : FunctionOptions(internal::kSplitOptionsType),
+ max_splits(max_splits),
+ reverse(reverse) {}
+constexpr char SplitOptions::kTypeName[];
+
+SplitPatternOptions::SplitPatternOptions(std::string pattern, int64_t max_splits,
+ bool reverse)
+ : FunctionOptions(internal::kSplitPatternOptionsType),
+ pattern(std::move(pattern)),
+ max_splits(max_splits),
+ reverse(reverse) {}
+SplitPatternOptions::SplitPatternOptions() : SplitPatternOptions("", -1, false) {}
+constexpr char SplitPatternOptions::kTypeName[];
+
+ReplaceSliceOptions::ReplaceSliceOptions(int64_t start, int64_t stop,
+ std::string replacement)
+ : FunctionOptions(internal::kReplaceSliceOptionsType),
+ start(start),
+ stop(stop),
+ replacement(std::move(replacement)) {}
+ReplaceSliceOptions::ReplaceSliceOptions() : ReplaceSliceOptions(0, 0, "") {}
+constexpr char ReplaceSliceOptions::kTypeName[];
+
+ReplaceSubstringOptions::ReplaceSubstringOptions(std::string pattern,
+ std::string replacement,
+ int64_t max_replacements)
+ : FunctionOptions(internal::kReplaceSubstringOptionsType),
+ pattern(std::move(pattern)),
+ replacement(std::move(replacement)),
+ max_replacements(max_replacements) {}
+ReplaceSubstringOptions::ReplaceSubstringOptions()
+ : ReplaceSubstringOptions("", "", -1) {}
+constexpr char ReplaceSubstringOptions::kTypeName[];
+
+ExtractRegexOptions::ExtractRegexOptions(std::string pattern)
+ : FunctionOptions(internal::kExtractRegexOptionsType), pattern(std::move(pattern)) {}
+ExtractRegexOptions::ExtractRegexOptions() : ExtractRegexOptions("") {}
+constexpr char ExtractRegexOptions::kTypeName[];
+
+SetLookupOptions::SetLookupOptions(Datum value_set, bool skip_nulls)
+ : FunctionOptions(internal::kSetLookupOptionsType),
+ value_set(std::move(value_set)),
+ skip_nulls(skip_nulls) {}
+SetLookupOptions::SetLookupOptions() : SetLookupOptions({}, false) {}
+constexpr char SetLookupOptions::kTypeName[];
+
+StrptimeOptions::StrptimeOptions(std::string format, TimeUnit::type unit)
+ : FunctionOptions(internal::kStrptimeOptionsType),
+ format(std::move(format)),
+ unit(unit) {}
+StrptimeOptions::StrptimeOptions() : StrptimeOptions("", TimeUnit::SECOND) {}
+constexpr char StrptimeOptions::kTypeName[];
+
+PadOptions::PadOptions(int64_t width, std::string padding)
+ : FunctionOptions(internal::kPadOptionsType),
+ width(width),
+ padding(std::move(padding)) {}
+PadOptions::PadOptions() : PadOptions(0, " ") {}
+constexpr char PadOptions::kTypeName[];
+
+TrimOptions::TrimOptions(std::string characters)
+ : FunctionOptions(internal::kTrimOptionsType), characters(std::move(characters)) {}
+TrimOptions::TrimOptions() : TrimOptions("") {}
+constexpr char TrimOptions::kTypeName[];
+
+SliceOptions::SliceOptions(int64_t start, int64_t stop, int64_t step)
+ : FunctionOptions(internal::kSliceOptionsType),
+ start(start),
+ stop(stop),
+ step(step) {}
+SliceOptions::SliceOptions() : SliceOptions(0, 0, 1) {}
+constexpr char SliceOptions::kTypeName[];
+
+ProjectOptions::ProjectOptions(std::vector n, std::vector r,
+ std::vector> m)
+ : FunctionOptions(internal::kProjectOptionsType),
+ field_names(std::move(n)),
+ field_nullability(std::move(r)),
+ field_metadata(std::move(m)) {}
+
+ProjectOptions::ProjectOptions(std::vector n)
+ : FunctionOptions(internal::kProjectOptionsType),
+ field_names(std::move(n)),
+ field_nullability(field_names.size(), true),
+ field_metadata(field_names.size(), NULLPTR) {}
+
+ProjectOptions::ProjectOptions() : ProjectOptions(std::vector()) {}
+constexpr char ProjectOptions::kTypeName[];
+
+namespace internal {
+void RegisterScalarOptions(FunctionRegistry* registry) {
+ DCHECK_OK(registry->AddFunctionOptionsType(kElementWiseAggregateOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kJoinOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kMatchSubstringOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kSplitOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kSplitPatternOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kReplaceSliceOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kReplaceSubstringOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kExtractRegexOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kSetLookupOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kStrptimeOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kPadOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kTrimOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kSliceOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kProjectOptionsType));
+}
+} // namespace internal
+
#define SCALAR_EAGER_UNARY(NAME, REGISTRY_NAME) \
Result NAME(const Datum& value, ExecContext* ctx) { \
return CallFunction(REGISTRY_NAME, {value}, ctx); \
@@ -153,7 +365,7 @@ Result Compare(const Datum& left, const Datum& right, CompareOptions opti
func_name = "less_equal";
break;
}
- return CallFunction(func_name, {left, right}, &options, ctx);
+ return CallFunction(func_name, {left, right}, nullptr, ctx);
}
// ----------------------------------------------------------------------
diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h
index 5c83dcb5c85..282d5cdab0b 100644
--- a/cpp/src/arrow/compute/api_scalar.h
+++ b/cpp/src/arrow/compute/api_scalar.h
@@ -37,19 +37,25 @@ namespace compute {
///
/// @{
-struct ArithmeticOptions : public FunctionOptions {
- ArithmeticOptions() : check_overflow(false) {}
+struct ARROW_EXPORT ArithmeticOptions {
+ public:
+ explicit ArithmeticOptions(bool check_overflow = false)
+ : check_overflow(check_overflow) {}
bool check_overflow;
};
-struct ARROW_EXPORT ElementWiseAggregateOptions : public FunctionOptions {
- explicit ElementWiseAggregateOptions(bool skip_nulls = true) : skip_nulls(skip_nulls) {}
+class ARROW_EXPORT ElementWiseAggregateOptions : public FunctionOptions {
+ public:
+ explicit ElementWiseAggregateOptions(bool skip_nulls = true);
+ constexpr static char const kTypeName[] = "element_wise_aggregate";
static ElementWiseAggregateOptions Defaults() { return ElementWiseAggregateOptions{}; }
+
bool skip_nulls;
};
/// Options for var_args_join.
-struct ARROW_EXPORT JoinOptions : public FunctionOptions {
+class ARROW_EXPORT JoinOptions : public FunctionOptions {
+ public:
/// How to handle null values. (A null separator always results in a null output.)
enum NullHandlingBehavior {
/// A null in any input results in a null in the output.
@@ -60,16 +66,18 @@ struct ARROW_EXPORT JoinOptions : public FunctionOptions {
REPLACE,
};
explicit JoinOptions(NullHandlingBehavior null_handling = EMIT_NULL,
- std::string null_replacement = "")
- : null_handling(null_handling), null_replacement(std::move(null_replacement)) {}
+ std::string null_replacement = "");
+ constexpr static char const kTypeName[] = "join";
static JoinOptions Defaults() { return JoinOptions(); }
NullHandlingBehavior null_handling;
std::string null_replacement;
};
-struct ARROW_EXPORT MatchSubstringOptions : public FunctionOptions {
- explicit MatchSubstringOptions(std::string pattern, bool ignore_case = false)
- : pattern(std::move(pattern)), ignore_case(ignore_case) {}
+class ARROW_EXPORT MatchSubstringOptions : public FunctionOptions {
+ public:
+ explicit MatchSubstringOptions(std::string pattern, bool ignore_case = false);
+ MatchSubstringOptions();
+ constexpr static char const kTypeName[] = "match_substring";
/// The exact substring (or regex, depending on kernel) to look for inside input values.
std::string pattern;
@@ -77,9 +85,10 @@ struct ARROW_EXPORT MatchSubstringOptions : public FunctionOptions {
bool ignore_case = false;
};
-struct ARROW_EXPORT SplitOptions : public FunctionOptions {
- explicit SplitOptions(int64_t max_splits = -1, bool reverse = false)
- : max_splits(max_splits), reverse(reverse) {}
+class ARROW_EXPORT SplitOptions : public FunctionOptions {
+ public:
+ explicit SplitOptions(int64_t max_splits = -1, bool reverse = false);
+ constexpr static char const kTypeName[] = "split";
/// Maximum number of splits allowed, or unlimited when -1
int64_t max_splits;
@@ -87,18 +96,26 @@ struct ARROW_EXPORT SplitOptions : public FunctionOptions {
bool reverse;
};
-struct ARROW_EXPORT SplitPatternOptions : public SplitOptions {
+class ARROW_EXPORT SplitPatternOptions : public FunctionOptions {
+ public:
explicit SplitPatternOptions(std::string pattern, int64_t max_splits = -1,
- bool reverse = false)
- : SplitOptions(max_splits, reverse), pattern(std::move(pattern)) {}
+ bool reverse = false);
+ SplitPatternOptions();
+ constexpr static char const kTypeName[] = "split_pattern";
- /// The exact substring to look for inside input values.
+ /// The exact substring to split on.
std::string pattern;
+ /// Maximum number of splits allowed, or unlimited when -1
+ int64_t max_splits;
+ /// Start splitting from the end of the string (only relevant when max_splits != -1)
+ bool reverse;
};
-struct ARROW_EXPORT ReplaceSliceOptions : public FunctionOptions {
- explicit ReplaceSliceOptions(int64_t start, int64_t stop, std::string replacement)
- : start(start), stop(stop), replacement(std::move(replacement)) {}
+class ARROW_EXPORT ReplaceSliceOptions : public FunctionOptions {
+ public:
+ explicit ReplaceSliceOptions(int64_t start, int64_t stop, std::string replacement);
+ ReplaceSliceOptions();
+ constexpr static char const kTypeName[] = "replace_slice";
/// Index to start slicing at
int64_t start;
@@ -108,12 +125,12 @@ struct ARROW_EXPORT ReplaceSliceOptions : public FunctionOptions {
std::string replacement;
};
-struct ARROW_EXPORT ReplaceSubstringOptions : public FunctionOptions {
+class ARROW_EXPORT ReplaceSubstringOptions : public FunctionOptions {
+ public:
explicit ReplaceSubstringOptions(std::string pattern, std::string replacement,
- int64_t max_replacements = -1)
- : pattern(std::move(pattern)),
- replacement(std::move(replacement)),
- max_replacements(max_replacements) {}
+ int64_t max_replacements = -1);
+ ReplaceSubstringOptions();
+ constexpr static char const kTypeName[] = "replace_substring";
/// Pattern to match, literal, or regular expression depending on which kernel is used
std::string pattern;
@@ -123,17 +140,22 @@ struct ARROW_EXPORT ReplaceSubstringOptions : public FunctionOptions {
int64_t max_replacements;
};
-struct ARROW_EXPORT ExtractRegexOptions : public FunctionOptions {
- explicit ExtractRegexOptions(std::string pattern) : pattern(std::move(pattern)) {}
+class ARROW_EXPORT ExtractRegexOptions : public FunctionOptions {
+ public:
+ explicit ExtractRegexOptions(std::string pattern);
+ ExtractRegexOptions();
+ constexpr static char const kTypeName[] = "extract_regex";
/// Regular expression with named capture fields
std::string pattern;
};
/// Options for IsIn and IndexIn functions
-struct ARROW_EXPORT SetLookupOptions : public FunctionOptions {
- explicit SetLookupOptions(Datum value_set, bool skip_nulls = false)
- : value_set(std::move(value_set)), skip_nulls(skip_nulls) {}
+class ARROW_EXPORT SetLookupOptions : public FunctionOptions {
+ public:
+ explicit SetLookupOptions(Datum value_set, bool skip_nulls = false);
+ SetLookupOptions();
+ constexpr static char const kTypeName[] = "set_lookup";
/// The set of values to look up input values into.
Datum value_set;
@@ -146,17 +168,21 @@ struct ARROW_EXPORT SetLookupOptions : public FunctionOptions {
bool skip_nulls;
};
-struct ARROW_EXPORT StrptimeOptions : public FunctionOptions {
- explicit StrptimeOptions(std::string format, TimeUnit::type unit)
- : format(std::move(format)), unit(unit) {}
+class ARROW_EXPORT StrptimeOptions : public FunctionOptions {
+ public:
+ explicit StrptimeOptions(std::string format, TimeUnit::type unit);
+ StrptimeOptions();
+ constexpr static char const kTypeName[] = "strptime";
std::string format;
TimeUnit::type unit;
};
-struct ARROW_EXPORT PadOptions : public FunctionOptions {
- explicit PadOptions(int64_t width, std::string padding = " ")
- : width(width), padding(std::move(padding)) {}
+class ARROW_EXPORT PadOptions : public FunctionOptions {
+ public:
+ explicit PadOptions(int64_t width, std::string padding = " ");
+ PadOptions();
+ constexpr static char const kTypeName[] = "pad";
/// The desired string length.
int64_t width;
@@ -164,18 +190,22 @@ struct ARROW_EXPORT PadOptions : public FunctionOptions {
std::string padding;
};
-struct ARROW_EXPORT TrimOptions : public FunctionOptions {
- explicit TrimOptions(std::string characters) : characters(std::move(characters)) {}
+class ARROW_EXPORT TrimOptions : public FunctionOptions {
+ public:
+ explicit TrimOptions(std::string characters);
+ TrimOptions();
+ constexpr static char const kTypeName[] = "trim";
/// The individual characters that can be trimmed from the string.
std::string characters;
};
-struct ARROW_EXPORT SliceOptions : public FunctionOptions {
+class ARROW_EXPORT SliceOptions : public FunctionOptions {
+ public:
explicit SliceOptions(int64_t start, int64_t stop = std::numeric_limits::max(),
- int64_t step = 1)
- : start(start), stop(stop), step(step) {}
-
+ int64_t step = 1);
+ SliceOptions();
+ constexpr static char const kTypeName[] = "slice";
int64_t start, stop, step;
};
@@ -188,23 +218,19 @@ enum CompareOperator : int8_t {
LESS_EQUAL,
};
-struct CompareOptions : public FunctionOptions {
+struct ARROW_EXPORT CompareOptions {  // plain exported struct with no base class; wraps one CompareOperator
explicit CompareOptions(CompareOperator op) : op(op) {}
enum CompareOperator op;
};
-struct ARROW_EXPORT ProjectOptions : public FunctionOptions {
+class ARROW_EXPORT ProjectOptions : public FunctionOptions {  // options describing the fields of the wrapped columns
+ public:
ProjectOptions(std::vector n, std::vector r,
- std::vector> m)
- : field_names(std::move(n)),
- field_nullability(std::move(r)),
- field_metadata(std::move(m)) {}
-
- explicit ProjectOptions(std::vector n)
- : field_names(std::move(n)),
- field_nullability(field_names.size(), true),
- field_metadata(field_names.size(), NULLPTR) {}
+ std::vector> m);  // declaration only; presumably n/r/m are names/nullability/metadata — confirm against the definition
+ explicit ProjectOptions(std::vector n);  // names-only overload, declaration only; defaults for the other fields are not visible here
+ ProjectOptions();  // default constructor, declaration only
+ constexpr static char const kTypeName[] = "project";  // string tag for this options class; how it is consumed is not visible here
/// Names for wrapped columns
std::vector field_names;
@@ -348,8 +374,8 @@ Result MinElementWise(
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
-Result Compare(const Datum& left, const Datum& right,
- struct CompareOptions options, ExecContext* ctx = NULLPTR);
+Result Compare(const Datum& left, const Datum& right, CompareOptions options,
+ ExecContext* ctx = NULLPTR);  // options is taken by value; ctx may be omitted and then defaults to NULLPTR
/// \brief Invert the values of a boolean datum
/// \param[in] value datum to invert
diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc
index 0082d48112d..7a084a44678 100644
--- a/cpp/src/arrow/compute/api_vector.cc
+++ b/cpp/src/arrow/compute/api_vector.cc
@@ -18,23 +18,122 @@
#include "arrow/compute/api_vector.h"
#include
+#include
#include
#include
#include "arrow/array/array_nested.h"
#include "arrow/array/builder_primitive.h"
#include "arrow/compute/exec.h"
+#include "arrow/compute/function_internal.h"
+#include "arrow/compute/registry.h"
#include "arrow/datum.h"
#include "arrow/record_batch.h"
#include "arrow/result.h"
#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
namespace arrow {
+using internal::checked_cast;
using internal::checked_pointer_cast;
namespace compute {
+// ----------------------------------------------------------------------
+// Function options
+
+bool SortKey::Equals(const SortKey& other) const {  // true only when both name and order match
+ return name == other.name && order == other.order;
+}
+std::string SortKey::ToString() const {  // renders the key as "<name> ASC" or "<name> DESC"
+ std::stringstream ss;
+ ss << name << ' ';  // name followed by a single separating space
+ switch (order) {  // no default case: any other order value appends nothing after the space
+ case SortOrder::Ascending:
+ ss << "ASC";
+ break;
+ case SortOrder::Descending:
+ ss << "DESC";
+ break;
+ }
+ return ss.str();
+}
+
+namespace internal {
+template <>
+struct EnumTraits  // NOTE(review): specialization arguments appear to be missing in this copy — confirm against upstream
+ : BasicEnumTraits {
+ static std::string name() { return "FilterOptions::NullSelectionBehavior"; }  // qualified enum name as a string
+ static std::array values() {  // returns the two enumerators listed below: DROP and EMIT_NULL
+ return {FilterOptions::NullSelectionBehavior::DROP,
+ FilterOptions::NullSelectionBehavior::EMIT_NULL};
+ }
+};
+template <>
+struct EnumTraits  // NOTE(review): specialization arguments appear to be missing in this copy — confirm against upstream
+ : BasicEnumTraits {
+ static std::string name() { return "DictionaryEncodeOptions::NullEncodingBehavior"; }  // qualified enum name as a string
+ static std::array values() {  // returns the two enumerators listed below: ENCODE and MASK
+ return {DictionaryEncodeOptions::NullEncodingBehavior::ENCODE,
+ DictionaryEncodeOptions::NullEncodingBehavior::MASK};
+ }
+};
+namespace {
+using ::arrow::internal::DataMember;  // each DataMember call below pairs a string name with a pointer-to-member
+static auto kFilterOptionsType = GetFunctionOptionsType(  // built from one member: null_selection_behavior
+ DataMember("null_selection_behavior", &FilterOptions::null_selection_behavior));
+static auto kTakeOptionsType = GetFunctionOptionsType(  // built from one member: boundscheck
+ DataMember("boundscheck", &TakeOptions::boundscheck));
+static auto kDictionaryEncodeOptionsType =  // built from one member: null_encoding_behavior
+ GetFunctionOptionsType(DataMember(
+ "null_encoding_behavior", &DictionaryEncodeOptions::null_encoding_behavior));
+static auto kArraySortOptionsType = GetFunctionOptionsType(  // built from one member: order
+ DataMember("order", &ArraySortOptions::order));
+static auto kSortOptionsType =
+ GetFunctionOptionsType