-
Notifications
You must be signed in to change notification settings - Fork 4k
GH-41094: [C++] Support scalar expression special form #42106
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
49950d6
fb7b936
8047d0a
d10d34a
49fb971
cf33b98
6c6ed3b
0112889
42d8c87
8bb5af3
b382a67
2bca08c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,9 +24,9 @@ | |
|
|
||
| #include "arrow/chunked_array.h" | ||
| #include "arrow/compute/api_vector.h" | ||
| #include "arrow/compute/exec_internal.h" | ||
| #include "arrow/compute/expression_internal.h" | ||
| #include "arrow/compute/function_internal.h" | ||
| #include "arrow/compute/special_form.h" | ||
| #include "arrow/compute/util.h" | ||
| #include "arrow/io/memory.h" | ||
| #include "arrow/ipc/reader.h" | ||
|
|
@@ -72,11 +72,12 @@ Expression field_ref(FieldRef ref) { | |
| } | ||
|
|
||
| Expression call(std::string function, std::vector<Expression> arguments, | ||
| std::shared_ptr<compute::FunctionOptions> options) { | ||
| std::shared_ptr<compute::FunctionOptions> options, bool is_special_form) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is exactly the kind of added complexity around calls that I'm trying to convince you is not a good idea. This boolean is effectively saying the arguments are passed in call-by-name fashion and not call-by-value [1]. We should keep the Arrow rewrite system a strict-evaluated system with just 1 or 2 special forms. That means calls are always called by value. We keep the C++ is a strictly evaluated rewrite system. You can't declare a function and say "all the parameters are call-by-name" like you're trying to do with this boolean parameter. So you can implement template<typename T>
T if_else(bool cond, T then_case, T else_case) {
if (cond) { return then_case; } else { return else_case; }
}
// Non-strict evaluation with special form syntax. The branches of the
// if are non-strictly evaluated in a way that can't be achieved with
// the `if_else` Call.
if (den > 0) {
return num / den;
} else {
return 0;
}A general conditional special form can serve for if-else, https://en.wikipedia.org/wiki/Evaluation_strategy#Non-strict_binding_strategies
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Your explanation is convincing and it's hard to not be convinced. I'll need some more time to digest your comments and get back to you. I really appreciate your kind help and insightful comments, and the pointer to the interesting reading too :) Thank you @felipecrv ! |
||
| Expression::Call call; | ||
| call.function_name = std::move(function); | ||
| call.arguments = std::move(arguments); | ||
| call.options = std::move(options); | ||
| call.is_special_form = is_special_form; | ||
| return Expression(std::move(call)); | ||
| } | ||
|
|
||
|
|
@@ -119,6 +120,16 @@ const DataType* Expression::type() const { | |
| return CallNotNull(*this)->type.type; | ||
| } | ||
|
|
||
| bool Expression::selection_vector_aware() const { | ||
| DCHECK(IsBound()); | ||
|
|
||
| if (literal() || field_ref()) { | ||
| return true; | ||
| } | ||
|
|
||
| return CallNotNull(*this)->selection_vector_aware; | ||
| } | ||
|
|
||
| namespace { | ||
|
|
||
| std::string PrintDatum(const Datum& datum) { | ||
|
|
@@ -550,6 +561,11 @@ Result<Expression> BindNonRecursive(Expression::Call call, bool insert_implicit_ | |
|
|
||
| ARROW_ASSIGN_OR_RAISE( | ||
| call.type, call.kernel->signature->out_type().Resolve(&kernel_context, types)); | ||
|
|
||
| if (call.is_special_form) { | ||
| ARROW_ASSIGN_OR_RAISE(call.special_form, SpecialForm::Make(call.function_name)); | ||
| } | ||
|
|
||
| return Status::OK(); | ||
| }; | ||
|
|
||
|
|
@@ -572,7 +588,11 @@ Result<Expression> BindNonRecursive(Expression::Call call, bool insert_implicit_ | |
| types = GetTypesWithSmallestLiteralRepresentation(call.arguments); | ||
| ARROW_ASSIGN_OR_RAISE(call.kernel, call.function->DispatchBest(&types)); | ||
|
|
||
| call.selection_vector_aware = call.kernel->selection_vector_aware; | ||
|
|
||
| for (size_t i = 0; i < types.size(); ++i) { | ||
| call.selection_vector_aware &= call.arguments[i].selection_vector_aware(); | ||
|
|
||
| if (types[i] == call.arguments[i].type()) continue; | ||
|
|
||
| if (const Datum* lit = call.arguments[i].literal()) { | ||
|
|
@@ -716,6 +736,30 @@ Result<Datum> ExecuteScalarExpression(const Expression& expr, const Schema& full | |
| return ExecuteScalarExpression(expr, input, exec_context); | ||
| } | ||
|
|
||
| namespace { | ||
|
|
||
| // Execute a scalar expression that is not fully selection-vector-aware on a batch | ||
| // carrying a valid selection vector using the slow path: gathering the input and | ||
| // scattering the output. | ||
| Result<Datum> ExecuteScalarExpressionWithSelSlowPath(const Expression& expr, | ||
| const ExecBatch& input, | ||
| compute::ExecContext* exec_context) { | ||
| DCHECK(!expr.selection_vector_aware() && input.selection_vector); | ||
| auto values = input.values; | ||
| for (auto& value : values) { | ||
| if (value.is_scalar()) continue; | ||
| ARROW_ASSIGN_OR_RAISE( | ||
| value, Filter(value, input.selection_vector->data(), FilterOptions::Defaults())); | ||
| } | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Aha, this can be implemented like this, funny |
||
| ARROW_ASSIGN_OR_RAISE(ExecBatch selected_batch, ExecBatch::Make(std::move(values))); | ||
| ARROW_ASSIGN_OR_RAISE(auto partial_result, | ||
| ExecuteScalarExpression(expr, selected_batch, exec_context)); | ||
| // TODO: Scatter. | ||
| return partial_result; | ||
| } | ||
|
|
||
| } // namespace | ||
|
|
||
| Result<Datum> ExecuteScalarExpression(const Expression& expr, const ExecBatch& input, | ||
| compute::ExecContext* exec_context) { | ||
| if (exec_context == nullptr) { | ||
|
|
@@ -732,6 +776,10 @@ Result<Datum> ExecuteScalarExpression(const Expression& expr, const ExecBatch& i | |
| "ExecuteScalarExpression cannot Execute non-scalar expression ", expr.ToString()); | ||
| } | ||
|
|
||
| if (!expr.selection_vector_aware() && input.selection_vector) { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This actually implies a strategy (1) that is: if any subexpression (including the top-level expression itself) is not selection vector aware, we do the gather/scatter work only once for the top-level expression. This way all the subexpressions can work on a (gathered) continuous input. An alternative strategy (2) could be: every expression that is not selection vector aware (not counting its subexpressions), does the gather/scatter itself. This way every expression itself maintains the input/output shape the same as the original input. Given that for a long time (possibly forever), there will be much less selection vector aware kernels than the other, it's probably worth it to use 1 instead of 2, as the gather/scatter only happen once for the whole expression. |
||
| return ExecuteScalarExpressionWithSelSlowPath(expr, input, exec_context); | ||
| } | ||
|
|
||
| if (auto lit = expr.literal()) return *lit; | ||
|
|
||
| if (auto param = expr.parameter()) { | ||
|
|
@@ -757,44 +805,20 @@ Result<Datum> ExecuteScalarExpression(const Expression& expr, const ExecBatch& i | |
|
|
||
| auto call = CallNotNull(expr); | ||
|
|
||
| std::vector<Datum> arguments(call->arguments.size()); | ||
|
|
||
| bool all_scalar = true; | ||
| for (size_t i = 0; i < arguments.size(); ++i) { | ||
| ARROW_ASSIGN_OR_RAISE( | ||
| arguments[i], ExecuteScalarExpression(call->arguments[i], input, exec_context)); | ||
| if (arguments[i].is_array()) { | ||
| all_scalar = false; | ||
| } | ||
| } | ||
|
|
||
| int64_t input_length; | ||
| if (!arguments.empty() && all_scalar) { | ||
| // all inputs are scalar, so use a 1-long batch to avoid | ||
| // computing input.length equivalent outputs | ||
| input_length = 1; | ||
| if (call->is_special_form) { | ||
| // Let the special form take over the execution using its own logic of argument | ||
| // evaluation. | ||
| DCHECK(call->special_form); | ||
| return call->special_form->Execute(*call, input, exec_context); | ||
| } else { | ||
| input_length = input.length; | ||
| // Eagerly evaluate all the arguments. | ||
| std::vector<Datum> arguments(call->arguments.size()); | ||
| for (size_t i = 0; i < arguments.size(); ++i) { | ||
| ARROW_ASSIGN_OR_RAISE( | ||
| arguments[i], ExecuteScalarExpression(call->arguments[i], input, exec_context)); | ||
| } | ||
| return ExecuteCallNonRecursive(*call, input, arguments, exec_context); | ||
| } | ||
|
|
||
| auto executor = compute::detail::KernelExecutor::MakeScalar(); | ||
|
|
||
| compute::KernelContext kernel_context(exec_context, call->kernel); | ||
| kernel_context.SetState(call->kernel_state.get()); | ||
|
|
||
| const Kernel* kernel = call->kernel; | ||
| std::vector<TypeHolder> types = GetTypes(arguments); | ||
| auto options = call->options.get(); | ||
| RETURN_NOT_OK(executor->Init(&kernel_context, {kernel, types, options})); | ||
|
|
||
| compute::detail::DatumAccumulator listener; | ||
| RETURN_NOT_OK( | ||
| executor->Execute(ExecBatch(std::move(arguments), input_length), &listener)); | ||
| const auto out = executor->WrapResults(arguments, listener.values()); | ||
| #ifndef NDEBUG | ||
| DCHECK_OK(executor->CheckResultType(out, call->function_name.c_str())); | ||
| #endif | ||
| return out; | ||
| } | ||
|
|
||
| namespace { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -48,6 +48,9 @@ class ARROW_EXPORT Expression { | |
| std::string function_name; | ||
| std::vector<Expression> arguments; | ||
| std::shared_ptr<FunctionOptions> options; | ||
| // Whether this call is a special form (e.g. if-else). If true, the `special_form` | ||
| // field will be resolved in binding. | ||
| bool is_special_form = false; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A special-form is not a call. A special-form is a special form of expression. - using Impl = std::variant<Datum, Parameter, Call>;
+ using Impl = std::variant<Datum, Parameter, Call, Special>;This is fundamental to the approach. Special forms don't have names, they are virtual sub-classes of We can start with
|
||
| // Cached hash value | ||
| size_t hash; | ||
|
|
||
|
|
@@ -56,6 +59,9 @@ class ARROW_EXPORT Expression { | |
| const Kernel* kernel = NULLPTR; | ||
| std::shared_ptr<KernelState> kernel_state; | ||
| TypeHolder type; | ||
| // Whether the entire call (including all its arguments) is selection-vector-aware | ||
| bool selection_vector_aware = false; | ||
| std::shared_ptr<SpecialForm> special_form = NULLPTR; | ||
|
Comment on lines
+63
to
+64
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, a kernel will claim itself as |
||
|
|
||
| void ComputeHash(); | ||
| }; | ||
|
|
@@ -118,6 +124,12 @@ class ARROW_EXPORT Expression { | |
| // XXX someday | ||
| // NullGeneralization::type nullable() const; | ||
|
|
||
| /// Whether the entire expression (including all its subexpressions) is | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this is only something you will discover during evaluation. The same function can have kernels that are aware of selection and some that are not. Another important aspect is the type-checking of the special forms. You need to unify [1] the output type of all the branches, so you can pre-allocate the output and introduce the appropriate casts.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I don't think it's necessarily during evaluation. This can be done in binding.
Binding will resolve the function to concrete kernel, so the selection awareness is known once the kernel is resolved. Also note that I made selection aware flag a property of kernel rather than of function.
(I don't think this is related to selection-awareness. In this proposal, special forms and expression's selection-awareness are relatively separated - they only correlate on the fact that a special form may use selection vector to mask the evaluation of its subexpressions. So I'm assuming this is a general comment about special form.) Before responding to your comment about special forms, I want to raise some other discussion which is part of my reasons making this proposal the current way. In arrow compute, there are "function"s, which users can directly invoke via So my question is, can we assume that for every coming special form in arrow, there exists a function "companion" of it? For some common special forms like "and", "or", "if_else", "case_when", there already are. For other special forms in Lisp, like "quote", "let", "defunc", they don't have corresponding "function"s, but those special forms themselves are more like fundamental parts, like defining variable and assignments, for a Turing-complete language, which I believe arrow won't need to have. So my so-far conclusion is, yes, we can assume it. Then the next question is, given that we'll have the function companion for every special form, can we leverage the function companion to do type-checking/resolution (the function need to do those anyway)? My feeling is yes, at least I can't think of a case that a special form has different type-checking/resolution rule than its function companion. So to summarize, if a special form:
Then we can think of special forms to be a special call with special argument evaluation rules. Of course, if there are counterexamples for any of above assumptions, then my conclusion would be wrong. And I should take the way around. Thanks.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Special forms in Arrow won't be many and we don't have to store them in a table like we have to do for functions. My use of LISP for inspiration might have led you into a path of over-engineering. We don't need the full generality of LISP or languages where users can define functions with call-by-name parameters. Think more C-like (strict evaluation) and less LISP-like (quote, let, defunc...) because that is overkill. Hence my insistence on making special forms a well-defined set of C++ classes. |
||
| /// selection-vector-aware. If true, then the expression can be executed using the "fast | ||
| /// path" - all kernels directly working on the selection vector. Otherwise the | ||
| /// execution takes the "slow path" - gathering the input and scattering the output. | ||
| bool selection_vector_aware() const; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think that this should return the So you can extend |
||
|
|
||
| struct Parameter { | ||
| FieldRef ref; | ||
|
|
||
|
|
@@ -159,14 +171,15 @@ Expression field_ref(FieldRef ref); | |
|
|
||
| ARROW_EXPORT | ||
| Expression call(std::string function, std::vector<Expression> arguments, | ||
| std::shared_ptr<FunctionOptions> options = NULLPTR); | ||
| std::shared_ptr<FunctionOptions> options = NULLPTR, | ||
| bool is_special_form = false); | ||
|
|
||
| template <typename Options, typename = typename std::enable_if< | ||
| std::is_base_of<FunctionOptions, Options>::value>::type> | ||
| Expression call(std::string function, std::vector<Expression> arguments, | ||
| Options options) { | ||
| Expression call(std::string function, std::vector<Expression> arguments, Options options, | ||
| bool is_special_form = false) { | ||
|
Comment on lines
+179
to
+180
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Special forms are not "called", they are evaluated. They are more general expressions than calls. |
||
| return call(std::move(function), std::move(arguments), | ||
| std::make_shared<Options>(std::move(options))); | ||
| std::make_shared<Options>(std::move(options)), is_special_form); | ||
| } | ||
|
|
||
| /// Assemble a list of all fields referenced by an Expression at any depth. | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Bitmaps are alternatives to selection-vectors, yes, but they have a disadvantage: they don't reduce in size as you process them — you start with
lengthbits and end up withlengthbits and have to scan them over and over. Ideally we should have both forms and use bitmaps when the selection is small, but if we are going to simplify things and have a single form it should be selection vectors.