From db634fb867b77ae6d5ca8a02c0b48272b30f3711 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 4 Mar 2022 18:57:55 +0530 Subject: [PATCH 001/131] updating submodule --- testing | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testing b/testing index 53b49804710..634739c6644 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit 53b498047109d9940fcfab388bd9d6aeb8c57425 +Subproject commit 634739c664433cec366b4b9a81d1e1044a8c5eda From 9ad5c8f9a06800cab45a2d6129ddc6eca8d40161 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Sat, 19 Mar 2022 19:14:39 +0530 Subject: [PATCH 002/131] temp commit to remove files in submodule --- .gitmodules | 3 --- testing | 1 - 2 files changed, 4 deletions(-) delete mode 160000 testing diff --git a/.gitmodules b/.gitmodules index 6efc4871542..71722b21777 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,3 @@ [submodule "cpp/submodules/parquet-testing"] path = cpp/submodules/parquet-testing url = https://github.com/apache/parquet-testing.git -[submodule "testing"] - path = testing - url = https://github.com/apache/arrow-testing diff --git a/testing b/testing deleted file mode 160000 index 634739c6644..00000000000 --- a/testing +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 634739c664433cec366b4b9a81d1e1044a8c5eda From 12010e82feef86e812fddbab4703e8d79f3ac4b0 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Sat, 19 Mar 2022 19:15:36 +0530 Subject: [PATCH 003/131] adding submodule --- .gitmodules | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitmodules b/.gitmodules index 71722b21777..6efc4871542 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "cpp/submodules/parquet-testing"] path = cpp/submodules/parquet-testing url = https://github.com/apache/parquet-testing.git +[submodule "testing"] + path = testing + url = https://github.com/apache/arrow-testing From d7c4593f904c09e31b65b9e5db474261d3799dfb Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Sun, 20 Mar 2022 23:09:21 +0530 Subject: [PATCH 004/131] updating testing submodule --- testing | 1 + 1 file changed, 1 insertion(+) create mode 160000 testing diff --git a/testing b/testing new file mode 160000 index 00000000000..d315f798520 --- /dev/null +++ b/testing @@ -0,0 +1 @@ +Subproject commit d315f7985207d2d67fc2c8e41053e9d97d573f4b From 2a2ec2146360e014793e8a95dc4deba94682c0ac Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Sun, 20 Mar 2022 23:11:24 +0530 Subject: [PATCH 005/131] revert to uupstream version --- testing | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testing b/testing index d315f798520..53b49804710 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit d315f7985207d2d67fc2c8e41053e9d97d573f4b +Subproject commit 53b498047109d9940fcfab388bd9d6aeb8c57425 From 7aa5fc473faaae66b43e5e5582f74b3c34458fab Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Mon, 31 Jan 2022 20:49:34 +0530 Subject: [PATCH 006/131] nltk example step 1 --- python/examples/statistics/nltk_example.py | 58 ++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 python/examples/statistics/nltk_example.py diff --git a/python/examples/statistics/nltk_example.py b/python/examples/statistics/nltk_example.py new file mode 100644 index 00000000000..779de9c3046 --- /dev/null +++ b/python/examples/statistics/nltk_example.py @@ -0,0 +1,58 @@ +from turtle import down +import nltk +import pandas as pd +import pyarrow as pa + +def download(): + nltk.download([ + "names", + "stopwords", + "state_union", + "twitter_samples", + "movie_reviews", + "averaged_perceptron_tagger", + "vader_lexicon", + "punkt", + ]) + +def test_nltk(): + from nltk.sentiment import SentimentIntensityAnalyzer + sia = SentimentIntensityAnalyzer() + score = sia.polarity_scores("Wow, NLTK is really powerful!") + for item in score: + print(item, score[item]) + + +def get_nltk_tweets(): + tweets = [t.replace("://", "//") for t in nltk.corpus.twitter_samples.strings()] + return tweets + + +def make_product_info(tweets): + import random + num_records = len(tweets) + product_names = [] + product_quantities = [] + regions = [] + region_types = {0: "US", 1: "UK", 2: "JPN", 3: "IND", 4: "AUS"} + for id in range(num_records): + product_name = "prod-" + str(id) + product_quantity = random.randint(0, 1000) + region = region_types[random.randint(0, 4)] + product_names.append(product_name) + product_quantities.append(product_quantity) + regions.append(region) + dict_data = {"product_name": product_names, + "product_quantity": product_quantities, + "region": regions, + "review" : tweets} + + data_table = pa.Table.from_pydict(dict_data) + return data_table + + +data_table = make_product_info(get_nltk_tweets()) + +print(data_table[0:5].to_pandas()) + + From 994b629747a8cfee5720ca91b686e4a11d4c5cec Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Tue, 8 Feb 2022 14:50:30 +0530 Subject: [PATCH 007/131] rebase --- cpp/examples/arrow/CMakeLists.txt | 5 + cpp/examples/arrow/udf_example.cc | 287 +++++++++++++++++++++++++++ python/pyarrow/_compute.pxd | 12 ++ python/pyarrow/_compute.pyx | 52 +++++ python/pyarrow/compute.py | 2 + python/pyarrow/includes/libarrow.pxd | 10 + python/pyarrow/public-api.pxi | 3 - 7 files changed, 368 insertions(+), 3 deletions(-) create mode 100644 cpp/examples/arrow/udf_example.cc diff --git a/cpp/examples/arrow/CMakeLists.txt b/cpp/examples/arrow/CMakeLists.txt index e11b3bd0ab2..c262ab90805 100644 --- a/cpp/examples/arrow/CMakeLists.txt +++ b/cpp/examples/arrow/CMakeLists.txt @@ -133,4 +133,9 @@ if(ARROW_PARQUET AND ARROW_DATASET) add_arrow_example(join_example EXTRA_LINK_LIBS ${DATASET_EXAMPLES_LINK_LIBS}) add_dependencies(join-example parquet) + add_dependencies(dataset_documentation_example parquet) + + add_arrow_example(udf_example EXTRA_LINK_LIBS + ${DATASET_EXAMPLES_LINK_LIBS}) + add_dependencies(udf_example parquet) endif() diff --git a/cpp/examples/arrow/udf_example.cc b/cpp/examples/arrow/udf_example.cc new file mode 100644 index 00000000000..ac24f1c9403 --- /dev/null +++ b/cpp/examples/arrow/udf_example.cc @@ -0,0 +1,287 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +// Demonstrate registering an Arrow compute function outside of the Arrow source tree + +namespace cp = ::arrow::compute; + +#define ABORT_ON_FAILURE(expr) \ + do { \ + arrow::Status status_ = (expr); \ + if (!status_.ok()) { \ + std::cerr << status_.message() << std::endl; \ + abort(); \ + } \ + } while (0); + +class ExampleFunctionOptionsType : public cp::FunctionOptionsType { + const char* type_name() const override { return "ExampleFunctionOptionsType"; } + std::string Stringify(const cp::FunctionOptions&) const override { + return "ExampleFunctionOptionsType"; + } + bool Compare(const cp::FunctionOptions&, const cp::FunctionOptions&) const override { + return true; + } + std::unique_ptr Copy(const cp::FunctionOptions&) const override; + // optional: support for serialization + // Result> Serialize(const FunctionOptions&) const override; + // Result> Deserialize(const Buffer&) const override; +}; + +cp::FunctionOptionsType* GetExampleFunctionOptionsType() { + static ExampleFunctionOptionsType options_type; + return &options_type; +} + +class ExampleFunctionOptions : public cp::FunctionOptions { + public: + ExampleFunctionOptions() : cp::FunctionOptions(GetExampleFunctionOptionsType()) {} +}; + +std::unique_ptr ExampleFunctionOptionsType::Copy( + const cp::FunctionOptions&) const { + return std::unique_ptr(new ExampleFunctionOptions()); +} + +arrow::Status ExampleFunctionImpl(cp::KernelContext* ctx, const cp::ExecBatch& batch, + arrow::Datum* out) { + auto result = cp::CallFunction("add", {batch[0].array(), batch[0].array()}); + *out->mutable_array() = *result.ValueOrDie().array(); + return arrow::Status::OK(); +} + +struct BatchesWithSchema { + std::vector batches; + std::shared_ptr schema; + // // This method uses internal arrow utilities to + // // convert a vector of record batches to an AsyncGenerator of optional batches + arrow::AsyncGenerator> gen() const { + auto opt_batches = ::arrow::internal::MapVector( + [](cp::ExecBatch batch) { return arrow::util::make_optional(std::move(batch)); }, + batches); + arrow::AsyncGenerator> gen; + gen = arrow::MakeVectorGenerator(std::move(opt_batches)); + return gen; + } +}; + +template ::value | + arrow::is_boolean_type::value | + arrow::is_temporal_type::value>::type> +arrow::Result> GetArrayDataSample( + const std::vector& values) { + using ARROW_ARRAY_TYPE = typename arrow::TypeTraits::ArrayType; + using ARROW_BUILDER_TYPE = typename arrow::TypeTraits::BuilderType; + ARROW_BUILDER_TYPE builder; + ARROW_RETURN_NOT_OK(builder.Reserve(values.size())); + std::shared_ptr array; + ARROW_RETURN_NOT_OK(builder.AppendValues(values)); + ARROW_RETURN_NOT_OK(builder.Finish(&array)); + return array; +} + +arrow::Result> GetSampleRecordBatch( + const arrow::ArrayVector array_vector, const arrow::FieldVector& field_vector) { + std::shared_ptr record_batch; + ARROW_ASSIGN_OR_RAISE(auto struct_result, + arrow::StructArray::Make(array_vector, field_vector)); + return record_batch->FromStructArray(struct_result); +} + +arrow::Result GetExecBatchFromVectors( + const arrow::FieldVector& field_vector, const arrow::ArrayVector& array_vector) { + std::shared_ptr record_batch; + ARROW_ASSIGN_OR_RAISE(auto res_batch, GetSampleRecordBatch(array_vector, field_vector)); + cp::ExecBatch batch{*res_batch}; + return batch; +} + +arrow::Result MakeBasicBatches() { + BatchesWithSchema out; + auto field_vector = {arrow::field("a", arrow::int32()), + arrow::field("b", arrow::boolean())}; + ARROW_ASSIGN_OR_RAISE(auto b1_int, GetArrayDataSample({0, 4})); + ARROW_ASSIGN_OR_RAISE(auto b2_int, GetArrayDataSample({5, 6, 7})); + ARROW_ASSIGN_OR_RAISE(auto b3_int, GetArrayDataSample({8, 9, 10})); + + ARROW_ASSIGN_OR_RAISE(auto b1_bool, + GetArrayDataSample({false, true})); + ARROW_ASSIGN_OR_RAISE(auto b2_bool, + GetArrayDataSample({true, false, true})); + ARROW_ASSIGN_OR_RAISE(auto b3_bool, + GetArrayDataSample({false, true, false})); + + ARROW_ASSIGN_OR_RAISE(auto b1, + GetExecBatchFromVectors(field_vector, {b1_int, b1_bool})); + ARROW_ASSIGN_OR_RAISE(auto b2, + GetExecBatchFromVectors(field_vector, {b2_int, b2_bool})); + ARROW_ASSIGN_OR_RAISE(auto b3, + GetExecBatchFromVectors(field_vector, {b3_int, b3_bool})); + + out.batches = {b1, b2, b3}; + out.schema = arrow::schema(field_vector); + return out; +} + +arrow::Result> GetTable() { + std::shared_ptr out; + + return out; +} + +class ExampleNodeOptions : public cp::ExecNodeOptions {}; + +// a basic ExecNode which ignores all input batches +class ExampleNode : public cp::ExecNode { + public: + ExampleNode(ExecNode* input, const ExampleNodeOptions&) + : ExecNode(/*plan=*/input->plan(), /*inputs=*/{input}, + /*input_labels=*/{"ignored"}, + /*output_schema=*/input->output_schema(), /*num_outputs=*/1) {} + + const char* kind_name() const override { return "ExampleNode"; } + + arrow::Status StartProducing() override { + outputs_[0]->InputFinished(this, 0); + return arrow::Status::OK(); + } + + void ResumeProducing(ExecNode* output) override {} + void PauseProducing(ExecNode* output) override {} + + void StopProducing(ExecNode* output) override { inputs_[0]->StopProducing(this); } + void StopProducing() override { inputs_[0]->StopProducing(); } + + void InputReceived(ExecNode* input, cp::ExecBatch batch) override {} + void ErrorReceived(ExecNode* input, arrow::Status error) override {} + void InputFinished(ExecNode* input, int total_batches) override {} + + arrow::Future<> finished() override { return inputs_[0]->finished(); } +}; + +arrow::Result ExampleExecNodeFactory(cp::ExecPlan* plan, + std::vector inputs, + const cp::ExecNodeOptions& options) { + const auto& example_options = + arrow::internal::checked_cast(options); + + return plan->EmplaceNode(inputs[0], example_options); +} + +const cp::FunctionDoc func_doc{ + "Example function to demonstrate registering an out-of-tree function", + "", + {"x"}, + "ExampleFunctionOptions"}; + +arrow::Status Execute() { + const std::string name = "x+x"; + auto func = std::make_shared(name, cp::Arity::Unary(), &func_doc); + cp::ScalarKernel kernel({cp::InputType::Array(arrow::int32())}, arrow::int32(), + ExampleFunctionImpl); + kernel.mem_allocation = cp::MemAllocation::NO_PREALLOCATE; + ABORT_ON_FAILURE(func->AddKernel(std::move(kernel))); + + auto registry = cp::GetFunctionRegistry(); + ABORT_ON_FAILURE(registry->AddFunction(std::move(func))); + + arrow::Int32Builder builder(arrow::default_memory_pool()); + std::shared_ptr arr; + ABORT_ON_FAILURE(builder.Append(42)); + ABORT_ON_FAILURE(builder.Finish(&arr)); + auto options = std::make_shared(); + auto maybe_result = cp::CallFunction(name, {arr}, options.get()); + ABORT_ON_FAILURE(maybe_result.status()); + + std::cout << "Result 1: " << maybe_result->make_array()->ToString() << std::endl; + + // Expression serialization will raise NotImplemented if an expression includes + // FunctionOptions for which serialization is not supported. + // auto expr = cp::call(name, {}, options); + // auto maybe_serialized = cp::Serialize(expr); + // std::cerr << maybe_serialized.status().ToString() << std::endl; + + auto exec_registry = cp::default_exec_factory_registry(); + ABORT_ON_FAILURE( + exec_registry->AddFactory("compute_register_example", ExampleExecNodeFactory)); + + auto maybe_plan = cp::ExecPlan::Make(); + ABORT_ON_FAILURE(maybe_plan.status()); + auto plan = maybe_plan.ValueOrDie(); + cp::ExecContext exec_context(arrow::default_memory_pool(), + ::arrow::internal::GetCpuThreadPool()); + arrow::AsyncGenerator> source_gen, sink_gen; + ARROW_ASSIGN_OR_RAISE(auto basic_data, MakeBasicBatches()); + + cp::Expression a_times_10 = cp::call("multiply", {cp::field_ref("a"), cp::literal(10)}); + cp::Expression custom_exp = cp::call(name, {cp::field_ref("a")}, options); + + auto source_node_options = cp::SourceNodeOptions{basic_data.schema, basic_data.gen()}; + auto project_node_options = cp::ProjectNodeOptions{{ + cp::field_ref("a"), + custom_exp, + cp::field_ref("b"), + }}; + auto output_schema = arrow::schema({arrow::field("a", arrow::int32()), + arrow::field("a + a", arrow::int32()), arrow::field("b", arrow::boolean())}); + std::shared_ptr out; + ABORT_ON_FAILURE( + cp::Declaration::Sequence( + { + {"source", source_node_options}, + {"project", project_node_options}, + {"table_sink", cp::TableSinkNodeOptions{&out, output_schema}}, + }) + .AddToPlan(plan.get()) + .status()); + + ARROW_RETURN_NOT_OK(plan->StartProducing()); + + std::cout << "Output Table Data : " << std::endl; + std::cout << out->ToString() << std::endl; + + auto future = plan->finished(); + + return future.status(); +} + +int main(int argc, char** argv) { + auto status = Execute(); + if (!status.ok()) { + std::cerr << "Error occurred : " << status.message() << std::endl; + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; +} diff --git a/python/pyarrow/_compute.pxd b/python/pyarrow/_compute.pxd index ebf2066d046..52fdf1a752a 100644 --- a/python/pyarrow/_compute.pxd +++ b/python/pyarrow/_compute.pxd @@ -21,6 +21,18 @@ from pyarrow.lib cimport * from pyarrow.includes.common cimport * from pyarrow.includes.libarrow cimport * +cdef class Arity(_Weakrefable): + cdef: + CArity arity + + cdef void init(self, const CArity &arity) + + +cdef class InputType(_Weakrefable): + cdef: + CInputType input_type + + cdef void init(self, const CInputType &input_type) cdef class FunctionOptions(_Weakrefable): cdef: diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 2ba6ae2462c..1a3b2d95b90 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -198,6 +198,55 @@ FunctionDoc = namedtuple( ("summary", "description", "arg_names", "options_class", "options_required")) +cdef wrap_arity(const CArity c_arity): + cdef Arity arity = Arity.__new__(Arity) + arity.init(c_arity) + return arity + +cdef wrap_input_type(const CInputType c_input_type): + cdef InputType input_type = InputType.__new__(InputType) + input_type.init(c_input_type) + return input_type + +cdef class InputType(_Weakrefable): + + def __init__(self): + raise TypeError("Cannot use constructor to initialize InputType") + + cdef void init(self, const CInputType &input_type): + self.input_type = input_type + + @staticmethod + def scalar(data_type): + cdef: + shared_ptr[CDataType] c_data_type + CInputType c_input_type + c_data_type = pyarrow_unwrap_data_type(data_type) + c_input_type = CInputType.Scalar(c_data_type) + return wrap_input_type(c_input_type) + + @staticmethod + def array(data_type): + cdef: + shared_ptr[CDataType] c_data_type + CInputType c_input_type + c_data_type = pyarrow_unwrap_data_type(data_type) + c_input_type = CInputType.Array(c_data_type) + return wrap_input_type(c_input_type) + + +cdef class Arity(_Weakrefable): + + def __init__(self): + raise TypeError("Cannot use constructor to initialize Arity") + + cdef void init(self, const CArity &arity): + self.arity = arity + + @staticmethod + def unary(): + cdef CArity c_arity = CArity.Unary() + return wrap_arity(c_arity) cdef class Function(_Weakrefable): """ @@ -490,6 +539,9 @@ cdef class FunctionRegistry(_Weakrefable): func = GetResultValue(self.registry.GetFunction(c_name)) return wrap_function(func) + def register_function(self, name, arity, input_types, output_type, function_kind): + pass + cdef FunctionRegistry _global_func_registry = FunctionRegistry() diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 40751eab26a..feabebce78b 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -16,11 +16,13 @@ # under the License. from pyarrow._compute import ( # noqa + Arity, Function, FunctionOptions, FunctionRegistry, HashAggregateFunction, HashAggregateKernel, + InputType, Kernel, ScalarAggregateFunction, ScalarAggregateKernel, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 4485f744cd7..6bd0c29c544 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1832,6 +1832,16 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: cdef cppclass CArity" arrow::compute::Arity": int num_args c_bool is_varargs + + @staticmethod + CArity Unary() + + cdef cppclass CInputType" arrow::compute::InputType": + @staticmethod + CInputType Array(shared_ptr[CDataType] type) + + @staticmethod + CInputType Scalar(shared_ptr[CDataType] type) cdef enum FunctionKind" arrow::compute::Function::Kind": FunctionKind_SCALAR" arrow::compute::Function::SCALAR" diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index c427fb9f5db..1b095c52e55 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -25,11 +25,9 @@ from pyarrow.includes.libarrow cimport (CArray, CDataType, CField, # You cannot assign something to a dereferenced pointer in Cython thus these # methods don't use Status to indicate a successful operation. - cdef api bint pyarrow_is_buffer(object buffer): return isinstance(buffer, Buffer) - cdef api shared_ptr[CBuffer] pyarrow_unwrap_buffer(object buffer): cdef Buffer buf if pyarrow_is_buffer(buffer): @@ -38,7 +36,6 @@ cdef api shared_ptr[CBuffer] pyarrow_unwrap_buffer(object buffer): return shared_ptr[CBuffer]() - cdef api object pyarrow_wrap_buffer(const shared_ptr[CBuffer]& buf): cdef Buffer result = Buffer.__new__(Buffer) result.init(buf) From 86632bedda68ecede13f07c22ece4083b127e8cc Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Wed, 9 Feb 2022 17:21:20 +0530 Subject: [PATCH 008/131] function registry example enhanced for udf implementation --- cpp/examples/arrow/udf_example.cc | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/cpp/examples/arrow/udf_example.cc b/cpp/examples/arrow/udf_example.cc index ac24f1c9403..24683c3b05c 100644 --- a/cpp/examples/arrow/udf_example.cc +++ b/cpp/examples/arrow/udf_example.cc @@ -249,28 +249,28 @@ arrow::Status Execute() { auto source_node_options = cp::SourceNodeOptions{basic_data.schema, basic_data.gen()}; auto project_node_options = cp::ProjectNodeOptions{{ - cp::field_ref("a"), - custom_exp, - cp::field_ref("b"), - }}; + cp::field_ref("a"), + custom_exp, + cp::field_ref("b"), + }}; auto output_schema = arrow::schema({arrow::field("a", arrow::int32()), - arrow::field("a + a", arrow::int32()), arrow::field("b", arrow::boolean())}); + arrow::field("a + a", arrow::int32()), + arrow::field("b", arrow::boolean())}); std::shared_ptr out; - ABORT_ON_FAILURE( - cp::Declaration::Sequence( - { - {"source", source_node_options}, - {"project", project_node_options}, - {"table_sink", cp::TableSinkNodeOptions{&out, output_schema}}, - }) - .AddToPlan(plan.get()) - .status()); + ABORT_ON_FAILURE(cp::Declaration::Sequence( + { + {"source", source_node_options}, + {"project", project_node_options}, + {"table_sink", cp::TableSinkNodeOptions{&out, output_schema}}, + }) + .AddToPlan(plan.get()) + .status()); ARROW_RETURN_NOT_OK(plan->StartProducing()); std::cout << "Output Table Data : " << std::endl; std::cout << out->ToString() << std::endl; - + auto future = plan->finished(); return future.status(); From c49ece3387e0dc053924af74e8bcb551bd630806 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 10 Feb 2022 22:46:59 +0530 Subject: [PATCH 009/131] adding udf poc interfaces --- cpp/examples/arrow/CMakeLists.txt | 14 ++++- cpp/examples/arrow/udf_example.cc | 62 ++++++++++++++++++-- cpp/src/arrow/python/CMakeLists.txt | 3 +- cpp/src/arrow/python/api.h | 2 + cpp/src/arrow/python/udf.cc | 45 +++++++++++++++ cpp/src/arrow/python/udf.h | 87 +++++++++++++++++++++++++++++ python/pyarrow/_compute.pxd | 6 ++ python/pyarrow/_compute.pyx | 46 +++++++++++++++ python/pyarrow/compute.py | 1 + 9 files changed, 259 insertions(+), 7 deletions(-) create mode 100644 cpp/src/arrow/python/udf.cc create mode 100644 cpp/src/arrow/python/udf.h diff --git a/cpp/examples/arrow/CMakeLists.txt b/cpp/examples/arrow/CMakeLists.txt index c262ab90805..c0ee59b8369 100644 --- a/cpp/examples/arrow/CMakeLists.txt +++ b/cpp/examples/arrow/CMakeLists.txt @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. +find_package(Python3Alt REQUIRED) + add_arrow_example(row_wise_conversion_example) if(ARROW_COMPUTE) @@ -112,6 +114,8 @@ if(ARROW_FLIGHT) endif() endif() +include_directories(${NUMPY_INCLUDE_DIRS} ${PYTHON_INCLUDE_DIRS}) + if(ARROW_PARQUET AND ARROW_DATASET) if(ARROW_BUILD_SHARED) set(DATASET_EXAMPLES_LINK_LIBS arrow_dataset_shared) @@ -136,6 +140,14 @@ if(ARROW_PARQUET AND ARROW_DATASET) add_dependencies(dataset_documentation_example parquet) add_arrow_example(udf_example EXTRA_LINK_LIBS - ${DATASET_EXAMPLES_LINK_LIBS}) + ${DATASET_EXAMPLES_LINK_LIBS} ${PYTHON_LIBRARIES} ${PYTHON_OTHER_LIBS}) add_dependencies(udf_example parquet) endif() + +message("PYTHON_INCLUDE_DIRS : ${PYTHON_INCLUDE_DIRS}") + +message("PYTHON_OTHER_LIBS : ${PYTHON_OTHER_LIBS}") + +message("PYTHON_LIBRARIES : ${PYTHON_LIBRARIES}") + + diff --git a/cpp/examples/arrow/udf_example.cc b/cpp/examples/arrow/udf_example.cc index 24683c3b05c..b615cf97c7d 100644 --- a/cpp/examples/arrow/udf_example.cc +++ b/cpp/examples/arrow/udf_example.cc @@ -31,6 +31,9 @@ #include #include +#include + + // Demonstrate registering an Arrow compute function outside of the Arrow source tree namespace cp = ::arrow::compute; @@ -73,6 +76,14 @@ std::unique_ptr ExampleFunctionOptionsType::Copy( return std::unique_ptr(new ExampleFunctionOptions()); } +PyObject* SimpleFunction() { + PyObject* obj = Py_BuildValue("s", "hello"); + return obj; +} + +// PyObject* objectsRepresentation = PyObject_Repr(yourObject); +// const char* s = PyString_AsString(objectsRepresentation); + arrow::Status ExampleFunctionImpl(cp::KernelContext* ctx, const cp::ExecBatch& batch, arrow::Datum* out) { auto result = cp::CallFunction("add", {batch[0].array(), batch[0].array()}); @@ -80,6 +91,17 @@ arrow::Status ExampleFunctionImpl(cp::KernelContext* ctx, const cp::ExecBatch& b return arrow::Status::OK(); } +arrow::Status ExamplePyFunctionImpl(cp::KernelContext* ctx, const cp::ExecBatch& batch, + arrow::Datum* out, PyObject* func) { + auto result = cp::CallFunction("add", {batch[0].array(), batch[0].array()}); + *out->mutable_array() = *result.ValueOrDie().array(); + PyObject* res = SimpleFunction(); + PyObject* objectsRepresentation = PyObject_Repr(res); + const char* s = PyUnicode_AsUTF8(objectsRepresentation); + std::cout << "Message :: " << s << std::endl; + return arrow::Status::OK(); +} + struct BatchesWithSchema { std::vector batches; std::shared_ptr schema; @@ -209,8 +231,10 @@ arrow::Status Execute() { const std::string name = "x+x"; auto func = std::make_shared(name, cp::Arity::Unary(), &func_doc); cp::ScalarKernel kernel({cp::InputType::Array(arrow::int32())}, arrow::int32(), - ExampleFunctionImpl); + ExampleFunctionImpl); + kernel.mem_allocation = cp::MemAllocation::NO_PREALLOCATE; + ABORT_ON_FAILURE(func->AddKernel(std::move(kernel))); auto registry = cp::GetFunctionRegistry(); @@ -228,9 +252,9 @@ arrow::Status Execute() { // Expression serialization will raise NotImplemented if an expression includes // FunctionOptions for which serialization is not supported. - // auto expr = cp::call(name, {}, options); - // auto maybe_serialized = cp::Serialize(expr); - // std::cerr << maybe_serialized.status().ToString() << std::endl; + // auto expr = cp::call(name, {}, options); + // auto maybe_serialized = cp::Serialize(expr); + // std::cerr << maybe_serialized.status().ToString() << std::endl; auto exec_registry = cp::default_exec_factory_registry(); ABORT_ON_FAILURE( @@ -276,8 +300,36 @@ arrow::Status Execute() { return future.status(); } +arrow::Status ExecutePy() { + const std::string name = "x+x"; + auto func2 = std::make_shared(name, cp::Arity::Unary(), &func_doc); + arrow::py::UDFScalarKernel kernel2({cp::InputType::Array(arrow::int32())}, arrow::int32(), + ExamplePyFunctionImpl); + + kernel2.mem_allocation = cp::MemAllocation::NO_PREALLOCATE; + ABORT_ON_FAILURE(func2->AddKernel(std::move(kernel2))); + + + + + // auto registry = cp::GetFunctionRegistry(); + // ABORT_ON_FAILURE(registry->AddFunction(std::move(func2))); + + // arrow::Int32Builder builder(arrow::default_memory_pool()); + // std::shared_ptr arr; + // ABORT_ON_FAILURE(builder.Append(42)); + // ABORT_ON_FAILURE(builder.Finish(&arr)); + // auto options = std::make_shared(); + // auto maybe_result = cp::CallFunction(name, {arr}, options.get()); + // ABORT_ON_FAILURE(maybe_result.status()); + + // std::cout << "Result 1: " << maybe_result->make_array()->ToString() << std::endl; + + return arrow::Status::OK(); +} + int main(int argc, char** argv) { - auto status = Execute(); + auto status = ExecutePy(); if (!status.ok()) { std::cerr << "Error occurred : " << status.message() << std::endl; return EXIT_FAILURE; diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index 23c97cc3209..7235e2d0fe3 100644 --- a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -44,7 +44,8 @@ set(ARROW_PYTHON_SRCS numpy_to_arrow.cc python_to_arrow.cc pyarrow.cc - serialize.cc) + serialize.cc + udf.cc) set_source_files_properties(init.cc PROPERTIES SKIP_PRECOMPILE_HEADERS ON SKIP_UNITY_BUILD_INCLUSION ON) diff --git a/cpp/src/arrow/python/api.h b/cpp/src/arrow/python/api.h index a0b13d6d130..638b05a69fc 100644 --- a/cpp/src/arrow/python/api.h +++ b/cpp/src/arrow/python/api.h @@ -28,3 +28,5 @@ #include "arrow/python/numpy_to_arrow.h" #include "arrow/python/python_to_arrow.h" #include "arrow/python/serialize.h" +#include "arrow/python/udf.h" + diff --git a/cpp/src/arrow/python/udf.cc b/cpp/src/arrow/python/udf.cc new file mode 100644 index 00000000000..33a1fd5e3bd --- /dev/null +++ b/cpp/src/arrow/python/udf.cc @@ -0,0 +1,45 @@ +#include "arrow/python/udf.h" + +#include +#include +#include + +namespace arrow { + +namespace py { + + +/// \brief Add a kernel with given input/output types, no required state +/// initialization, preallocation for fixed-width types, and default null +/// handling (intersect validity bitmaps of inputs). +Status UDFScalarFunction::AddKernel(std::vector in_types, cp::OutputType out_type, + UDFArrayKernelExec exec, cp::KernelInit init) { + + RETURN_NOT_OK(CheckArity(in_types)); + + if (arity_.is_varargs && in_types.size() != 1) { + return Status::Invalid("VarArgs signatures must have exactly one input type"); + } + auto sig = + cp::KernelSignature::Make(std::move(in_types), std::move(out_type), arity_.is_varargs); + kernels_.emplace_back(std::move(sig), exec, init); + return Status::OK(); + +} + +/// \brief Add a kernel (function implementation). Returns error if the +/// kernel's signature does not match the function's arity. +Status UDFScalarFunction::AddKernel(UDFScalarKernel kernel) { + RETURN_NOT_OK(CheckArity(kernel.signature->in_types())); + if (arity_.is_varargs && !kernel.signature->is_varargs()) { + return Status::Invalid("Function accepts varargs but kernel signature does not"); + } + kernels_.emplace_back(std::move(kernel)); + return Status::OK(); +} + + + +} // namespace py + +} // namespace arrow \ No newline at end of file diff --git a/cpp/src/arrow/python/udf.h b/cpp/src/arrow/python/udf.h new file mode 100644 index 00000000000..3aec7b801a1 --- /dev/null +++ b/cpp/src/arrow/python/udf.h @@ -0,0 +1,87 @@ +#pragma once + +#include "arrow/python/platform.h" + +#include +#include + +#include "arrow/compute/function.h" + +#include "arrow/python/common.h" + +namespace cp = arrow::compute; + +namespace arrow { + +namespace py { + +// PyObject* CallUnaryTableUDF(PyObject* func, PyObject* arg1, std::shared_ptr input); +// PyObject* CallUnaryArrayUDF(PyObject* func, PyObject* arg1, std::shared_ptr input); + +using UDFArrayKernelExec = std::function; + +struct UDFArrayKernel : public cp::Kernel { + UDFArrayKernel() = default; + + UDFArrayKernel(std::shared_ptr sig, UDFArrayKernelExec exec, + cp::KernelInit init = NULLPTR) + : Kernel(std::move(sig), init), exec(std::move(exec)) {} + + UDFArrayKernel(std::vector in_types, cp::OutputType out_type, UDFArrayKernelExec exec, + cp::KernelInit init = NULLPTR) + : Kernel(std::move(in_types), std::move(out_type), std::move(init)), + exec(std::move(exec)) {} + + /// \brief Perform a single invocation of this kernel. Depending on the + /// implementation, it may only write into preallocated memory, while in some + /// cases it will allocate its own memory. Any required state is managed + /// through the KernelContext. + UDFArrayKernelExec exec; + + /// \brief Writing execution results into larger contiguous allocations + /// requires that the kernel be able to write into sliced output ArrayData*, + /// including sliced output validity bitmaps. Some kernel implementations may + /// not be able to do this, so setting this to false disables this + /// functionality. + bool can_write_into_slices = true; +}; + +struct UDFScalarKernel : public UDFArrayKernel { + using UDFArrayKernel::UDFArrayKernel; + + // For scalar functions preallocated data and intersecting arg validity + // bitmaps is a reasonable default + cp::NullHandling::type null_handling = cp::NullHandling::INTERSECTION; + cp::MemAllocation::type mem_allocation = cp::MemAllocation::PREALLOCATE; +}; + +class ARROW_PYTHON_EXPORT UDFScalarFunction : public cp::detail::FunctionImpl { + public: + using KernelType = UDFScalarKernel; + + UDFScalarFunction(std::string name, const cp::Arity& arity, const cp::FunctionDoc* doc, + const cp::FunctionOptions* default_options = NULLPTR) + : cp::detail::FunctionImpl(std::move(name), cp::Function::SCALAR, arity, doc, + default_options) {} + + /// \brief Add a kernel with given input/output types, no required state + /// initialization, preallocation for fixed-width types, and default null + /// handling (intersect validity bitmaps of inputs). + Status AddKernel(std::vector in_types, cp::OutputType out_type, + UDFArrayKernelExec exec, cp::KernelInit init = NULLPTR); + + /// \brief Add a kernel (function implementation). Returns error if the + /// kernel's signature does not match the function's arity. + Status AddKernel(UDFScalarKernel kernel); +}; + + +struct ARROW_PYTHON_EXPORT UDFSynthesizer { + UDFSynthesizer(); + + +}; + +} // namespace py + +} // namespace arrow \ No newline at end of file diff --git a/python/pyarrow/_compute.pxd b/python/pyarrow/_compute.pxd index 52fdf1a752a..1274eb0aeeb 100644 --- a/python/pyarrow/_compute.pxd +++ b/python/pyarrow/_compute.pxd @@ -34,6 +34,12 @@ cdef class InputType(_Weakrefable): cdef void init(self, const CInputType &input_type) +# cdef class FunctionDoc(_Weakrefable): +# cdef: +# CFunctionDoc function_doc + +# cdef void init(self, const CFunctionDoc &function_doc) + cdef class FunctionOptions(_Weakrefable): cdef: shared_ptr[CFunctionOptions] wrapped diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 1a3b2d95b90..b0b958e4bf8 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -208,6 +208,18 @@ cdef wrap_input_type(const CInputType c_input_type): input_type.init(c_input_type) return input_type +# cdef class FunctionDoc(_Weakrefable): + +# def __init__(self): +# raise TypeError("Cannot use constructor to initialize FunctionDoc") + +# cdef void init(self, const CFunctionDoc &function_doc): +# self.function_doc = function_doc + +# @staticmethod +# def create(self): +# pass + cdef class InputType(_Weakrefable): def __init__(self): @@ -2327,3 +2339,37 @@ cdef CExpression _bind(Expression filter, Schema schema) except *: return GetResultValue(filter.unwrap().Bind( deref(pyarrow_unwrap_schema(schema).get()))) + +cdef CFunctionDoc _make_function_doc(func_doc): + cdef: + CFunctionDoc f_doc + vector[c_string] c_arg_names + if isinstance(func_doc, dict): + f_doc.summary = func_doc["summary"].encode() + f_doc.description = func_doc["description"].encode() + for arg_name in func_doc["arg_names"]: + c_arg_names.push_back(arg_name.encode()) + f_doc.arg_names = c_arg_names + f_doc.options_class = func_doc["arg_names"].encode() + f_doc.options_required = func_doc["options_required"] + return f_doc + else: + raise TypeError(f"func_doc must be a dictionary") + +cdef class UDFInterpreter: + + def __init__(self): + raise TypeError("Cannot initialize UDFInterpreter from the constructor") + + @staticmethod + def create_scalar_kernel(function_name, arity, function_doc): + cdef: + c_string c_func_name + Arity c_arity + CFunctionDoc c_func_doc + + c_func_name = function_name.encode() + c_arity = (arity) + c_func_doc = _make_function_doc(function_doc) + + \ No newline at end of file diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index feabebce78b..16b287eff22 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -80,6 +80,7 @@ _group_by, # Expressions Expression, + _make_function_doc ) from collections import namedtuple From f2f35cb2608cf9d1ad97a7f0e384d070c5a229af Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 11 Feb 2022 16:51:08 +0530 Subject: [PATCH 010/131] testing udf --- cpp/examples/arrow/CMakeLists.txt | 2 +- cpp/examples/arrow/udf_example.cc | 160 ++++++++++++++++++++-------- cpp/src/arrow/python/api.h | 1 - cpp/src/arrow/python/python_test.cc | 92 +++++++++++++++- cpp/src/arrow/python/udf.cc | 39 +------ cpp/src/arrow/python/udf.h | 58 +++++++--- 6 files changed, 256 insertions(+), 96 deletions(-) diff --git a/cpp/examples/arrow/CMakeLists.txt b/cpp/examples/arrow/CMakeLists.txt index c0ee59b8369..57c35112665 100644 --- a/cpp/examples/arrow/CMakeLists.txt +++ b/cpp/examples/arrow/CMakeLists.txt @@ -141,7 +141,7 @@ if(ARROW_PARQUET AND ARROW_DATASET) add_arrow_example(udf_example EXTRA_LINK_LIBS ${DATASET_EXAMPLES_LINK_LIBS} ${PYTHON_LIBRARIES} ${PYTHON_OTHER_LIBS}) - add_dependencies(udf_example parquet) + add_dependencies(udf_example parquet arrow_python) endif() message("PYTHON_INCLUDE_DIRS : ${PYTHON_INCLUDE_DIRS}") diff --git a/cpp/examples/arrow/udf_example.cc b/cpp/examples/arrow/udf_example.cc index b615cf97c7d..0baf34021b0 100644 --- a/cpp/examples/arrow/udf_example.cc +++ b/cpp/examples/arrow/udf_example.cc @@ -31,8 +31,7 @@ #include #include -#include - +#include // Demonstrate registering an Arrow compute function outside of the Arrow source tree @@ -77,8 +76,9 @@ std::unique_ptr ExampleFunctionOptionsType::Copy( } PyObject* SimpleFunction() { - PyObject* obj = Py_BuildValue("s", "hello"); - return obj; + PyObject* out = Py_BuildValue("s", "hello"); + std::cout << "HELLO" << std::endl; + return std::move(out); } // PyObject* objectsRepresentation = PyObject_Repr(yourObject); @@ -86,19 +86,29 @@ PyObject* SimpleFunction() { arrow::Status ExampleFunctionImpl(cp::KernelContext* ctx, const cp::ExecBatch& batch, arrow::Datum* out) { + std::cout << "calling udf :" << batch.length << std::endl; + Py_Initialize(); + PyObject* res = SimpleFunction(); + PyObject* objectsRepresentation = PyObject_Repr(res); + const char* s = PyUnicode_AsUTF8(objectsRepresentation); + std::cout << "Message :: " << s << std::endl; + Py_Finalize(); auto result = cp::CallFunction("add", {batch[0].array(), batch[0].array()}); *out->mutable_array() = *result.ValueOrDie().array(); return arrow::Status::OK(); } - +// cp::KernelContext*, const cp::ExecBatch&, Datum*, PyObject* func arrow::Status ExamplePyFunctionImpl(cp::KernelContext* ctx, const cp::ExecBatch& batch, - arrow::Datum* out, PyObject* func) { + arrow::Datum* out, PyObject* func) { + std::cout << "H" << std::endl; auto result = cp::CallFunction("add", {batch[0].array(), batch[0].array()}); *out->mutable_array() = *result.ValueOrDie().array(); - PyObject* res = SimpleFunction(); - PyObject* objectsRepresentation = PyObject_Repr(res); - const char* s = PyUnicode_AsUTF8(objectsRepresentation); - std::cout << "Message :: " << s << std::endl; + // PyObject* res = SimpleFunction(); + // PyObject* objectsRepresentation = PyObject_Repr(res); + // const char* s = PyUnicode_AsUTF8(objectsRepresentation); + std::cout << "Message :: " + << "s" << std::endl; + return arrow::Status::OK(); } @@ -151,11 +161,11 @@ arrow::Result GetExecBatchFromVectors( arrow::Result MakeBasicBatches() { BatchesWithSchema out; - auto field_vector = {arrow::field("a", arrow::int32()), + auto field_vector = {arrow::field("a", arrow::int64()), arrow::field("b", arrow::boolean())}; - ARROW_ASSIGN_OR_RAISE(auto b1_int, GetArrayDataSample({0, 4})); - ARROW_ASSIGN_OR_RAISE(auto b2_int, GetArrayDataSample({5, 6, 7})); - ARROW_ASSIGN_OR_RAISE(auto b3_int, GetArrayDataSample({8, 9, 10})); + ARROW_ASSIGN_OR_RAISE(auto b1_int, GetArrayDataSample({0, 4})); + ARROW_ASSIGN_OR_RAISE(auto b2_int, GetArrayDataSample({5, 6, 7})); + ARROW_ASSIGN_OR_RAISE(auto b3_int, GetArrayDataSample({8, 9, 10})); ARROW_ASSIGN_OR_RAISE(auto b1_bool, GetArrayDataSample({false, true})); @@ -224,28 +234,79 @@ arrow::Result ExampleExecNodeFactory(cp::ExecPlan* plan, const cp::FunctionDoc func_doc{ "Example function to demonstrate registering an out-of-tree function", "", - {"x"}, + {"x", "y"}, "ExampleFunctionOptions"}; +const cp::FunctionDoc func_doc2{ + "Example function to demonstrate registering an out-of-tree function", + "", + {"x"}, + "ExampleFunctionOptions2"}; + +PyObject* MultiplyFunction(PyObject* scalar) { + PyObject* constant = PyLong_FromLong(2); + PyObject* res = PyNumber_Multiply(constant, scalar); + return std::move(res); +} + +class ScalarUDF { + + public: + ScalarUDF(); + explicit ScalarUDF(cp::Arity arity, std::vector input_types, + cp::OutputType output_type, PyObject* (*function)(PyObject*)) : arity_(std::move(arity)), input_types_(std::move(input_types)), + output_type_(output_type), function_(function) {} + + arrow::Status Make(cp::KernelContext* ctx, const cp::ExecBatch& batch, + arrow::Datum* out) { + Py_Initialize(); + PyObject* args = PyTuple_Pack(1,PyLong_FromLong(2)); + PyObject* myResult = function_(args); + int64_t result = PyLong_AsLong(myResult); + Py_Finalize(); + std::cout << "Value : " << result << std::endl; + arrow::Result maybe_result; + arrow::Int64Builder builder(arrow::default_memory_pool()); + std::shared_ptr arr; + ABORT_ON_FAILURE(builder.Append(result)); + ABORT_ON_FAILURE(builder.Finish(&arr)); + maybe_result = cp::CallFunction("add", {batch[0].array(), arr}); + *out->mutable_array() = *maybe_result.ValueOrDie().array(); + return arrow::Status::OK(); + } + + private: + cp::Arity arity_; + std::vector input_types_; + cp::OutputType output_type_; + PyObject* (*function_)(PyObject*); + +}; + arrow::Status Execute() { const std::string name = "x+x"; - auto func = std::make_shared(name, cp::Arity::Unary(), &func_doc); - cp::ScalarKernel kernel({cp::InputType::Array(arrow::int32())}, arrow::int32(), - ExampleFunctionImpl); - kernel.mem_allocation = cp::MemAllocation::NO_PREALLOCATE; + ScalarUDF func_gen(cp::Arity::Unary(), {cp::InputType::Array(arrow::int64())}, arrow::int64(), &MultiplyFunction); + + + auto func = std::make_shared(name, cp::Arity::Unary(), &func_doc2); + cp::ScalarKernel kernel({cp::InputType::Array(arrow::int64())}, arrow::int64(), ExampleFunctionImpl); + kernel.mem_allocation = cp::MemAllocation::NO_PREALLOCATE; + ABORT_ON_FAILURE(func->AddKernel(std::move(kernel))); auto registry = cp::GetFunctionRegistry(); ABORT_ON_FAILURE(registry->AddFunction(std::move(func))); - arrow::Int32Builder builder(arrow::default_memory_pool()); - std::shared_ptr arr; + arrow::Int64Builder builder(arrow::default_memory_pool()); + std::shared_ptr arr1, arr2; ABORT_ON_FAILURE(builder.Append(42)); - ABORT_ON_FAILURE(builder.Finish(&arr)); + ABORT_ON_FAILURE(builder.Finish(&arr1)); + ABORT_ON_FAILURE(builder.Append(58)); + ABORT_ON_FAILURE(builder.Finish(&arr2)); auto options = std::make_shared(); - auto maybe_result = cp::CallFunction(name, {arr}, options.get()); + auto maybe_result = cp::CallFunction(name, {arr1}, options.get()); ABORT_ON_FAILURE(maybe_result.status()); std::cout << "Result 1: " << maybe_result->make_array()->ToString() << std::endl; @@ -277,8 +338,8 @@ arrow::Status Execute() { custom_exp, cp::field_ref("b"), }}; - auto output_schema = arrow::schema({arrow::field("a", arrow::int32()), - arrow::field("a + a", arrow::int32()), + auto output_schema = arrow::schema({arrow::field("a", arrow::int64()), + arrow::field("a + a", arrow::int64()), arrow::field("b", arrow::boolean())}); std::shared_ptr out; ABORT_ON_FAILURE(cp::Declaration::Sequence( @@ -301,39 +362,50 @@ arrow::Status Execute() { } arrow::Status ExecutePy() { - const std::string name = "x+x"; - auto func2 = std::make_shared(name, cp::Arity::Unary(), &func_doc); - arrow::py::UDFScalarKernel kernel2({cp::InputType::Array(arrow::int32())}, arrow::int32(), - ExamplePyFunctionImpl); - + cp::ExecContext exec_context(arrow::default_memory_pool(), + ::arrow::internal::GetCpuThreadPool()); + const std::string name = "simple_func"; + auto func2 = + std::make_shared(name, cp::Arity::Unary(), &func_doc2); + arrow::py::UDFScalarKernel kernel2({cp::InputType::Array(arrow::int64())}, + arrow::int64(), ExamplePyFunctionImpl); + kernel2.mem_allocation = cp::MemAllocation::NO_PREALLOCATE; ABORT_ON_FAILURE(func2->AddKernel(std::move(kernel2))); - - + auto registry = cp::GetFunctionRegistry(); + + auto size_before_registration = registry->GetFunctionNames().size(); - // auto registry = cp::GetFunctionRegistry(); - // ABORT_ON_FAILURE(registry->AddFunction(std::move(func2))); + std::cout << "[Before] Func Reg Size: " << size_before_registration << ", " << registry->num_functions() << std::endl; - // arrow::Int32Builder builder(arrow::default_memory_pool()); - // std::shared_ptr arr; - // ABORT_ON_FAILURE(builder.Append(42)); - // ABORT_ON_FAILURE(builder.Finish(&arr)); - // auto options = std::make_shared(); - // auto maybe_result = cp::CallFunction(name, {arr}, options.get()); - // ABORT_ON_FAILURE(maybe_result.status()); + ABORT_ON_FAILURE(registry->AddFunction(std::move(func2))); - // std::cout << "Result 1: " << maybe_result->make_array()->ToString() << std::endl; + auto size_after_registration = registry->GetFunctionNames().size(); + + std::cout << "[After] Func Reg Size: " << size_after_registration << ", " << registry->num_functions() << std::endl; + + arrow::Int64Builder builder(arrow::default_memory_pool()); + std::shared_ptr arr; + ABORT_ON_FAILURE(builder.Append(42)); + ABORT_ON_FAILURE(builder.Finish(&arr)); + auto options = std::make_shared(); + + std::cout << "Calling function :" << arr->ToString() << std::endl; + + auto maybe_result = cp::CallFunction(name, {arr}, options.get()); + ABORT_ON_FAILURE(maybe_result.status()); + + std::cout << "Result 1: " << maybe_result->make_array()->ToString() << std::endl; return arrow::Status::OK(); } int main(int argc, char** argv) { - auto status = ExecutePy(); + auto status = Execute(); if (!status.ok()) { std::cerr << "Error occurred : " << status.message() << std::endl; return EXIT_FAILURE; } - return EXIT_SUCCESS; } diff --git a/cpp/src/arrow/python/api.h b/cpp/src/arrow/python/api.h index 638b05a69fc..7737d791e31 100644 --- a/cpp/src/arrow/python/api.h +++ b/cpp/src/arrow/python/api.h @@ -29,4 +29,3 @@ #include "arrow/python/python_to_arrow.h" #include "arrow/python/serialize.h" #include "arrow/python/udf.h" - diff --git a/cpp/src/arrow/python/python_test.cc b/cpp/src/arrow/python/python_test.cc index c465fabc680..d3c755993c2 100644 --- a/cpp/src/arrow/python/python_test.cc +++ b/cpp/src/arrow/python/python_test.cc @@ -39,10 +39,18 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" +#include +#include +#include +#include +#include + +#include "arrow/python/udf.h" + namespace arrow { using internal::checked_cast; - +namespace cp = arrow::compute; namespace py { TEST(OwnedRef, TestMoves) { @@ -595,5 +603,87 @@ TEST_F(DecimalTest, UpdateWithNaN) { ASSERT_EQ(std::numeric_limits::min(), metadata.scale()); } +PyObject* SimpleFunction() { + PyObject* obj = Py_BuildValue("s", "hello"); + return obj; +} + +class ExampleFunctionOptionsType : public cp::FunctionOptionsType { + const char* type_name() const override { return "ExampleFunctionOptionsType"; } + std::string Stringify(const cp::FunctionOptions&) const override { + return "ExampleFunctionOptionsType"; + } + bool Compare(const cp::FunctionOptions&, const cp::FunctionOptions&) const override { + return true; + } + std::unique_ptr Copy(const cp::FunctionOptions&) const override; + // optional: support for serialization + // Result> Serialize(const FunctionOptions&) const override; + // Result> Deserialize(const Buffer&) const override; +}; + +cp::FunctionOptionsType* GetExampleFunctionOptionsType() { + static ExampleFunctionOptionsType options_type; + return &options_type; +} + +class ExampleFunctionOptions : public cp::FunctionOptions { + public: + ExampleFunctionOptions() : cp::FunctionOptions(GetExampleFunctionOptionsType()) {} +}; + +std::unique_ptr ExampleFunctionOptionsType::Copy( + const cp::FunctionOptions&) const { + return std::unique_ptr(new ExampleFunctionOptions()); +} + +arrow::Status ExamplePyFunctionImpl(cp::KernelContext* ctx, const cp::ExecBatch& batch, + arrow::Datum* out, PyObject* func) { + std::cout << "H" << std::endl; + auto result = cp::CallFunction("add", {batch[0].array(), batch[0].array()}); + *out->mutable_array() = *result.ValueOrDie().array(); + // PyObject* res = SimpleFunction(); + // PyObject* objectsRepresentation = PyObject_Repr(res); + // const char* s = PyUnicode_AsUTF8(objectsRepresentation); + std::cout << "Message :: " + << "s" << std::endl; + return arrow::Status::OK(); +} + +TEST(UDF, Initialization) { + const cp::FunctionDoc func_doc{ + "Example function to demonstrate registering an out-of-tree function", + "", + {"x"}, + "ExampleFunctionOptions"}; + arrow::Status st; + const std::string name = "x+x"; + auto func2 = + std::make_shared(name, cp::Arity::Unary(), &func_doc); + arrow::py::UDFScalarKernel kernel2({cp::InputType::Array(arrow::int32())}, + arrow::int32(), ExamplePyFunctionImpl); + + kernel2.mem_allocation = cp::MemAllocation::NO_PREALLOCATE; + + st = func2->AddKernel(std::move(kernel2)); + + auto registry = cp::GetFunctionRegistry(); + st = registry->AddFunction(std::move(func2)); + + arrow::Int32Builder builder(arrow::default_memory_pool()); + std::shared_ptr arr; + + st = builder.Append(42); + st = builder.Finish(&arr); + auto options = std::make_shared(); + + // auto func = registry->GetFunction("x+x").ValueOrDie(); + + auto maybe_result = cp::CallFunction(name, {arr}, options.get()); + st = maybe_result.status(); + + std::cout << "Result 1: " << maybe_result->make_array()->ToString() << std::endl; +} + } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/udf.cc b/cpp/src/arrow/python/udf.cc index 33a1fd5e3bd..c38b69ee432 100644 --- a/cpp/src/arrow/python/udf.cc +++ b/cpp/src/arrow/python/udf.cc @@ -4,42 +4,13 @@ #include #include -namespace arrow { - -namespace py { - - -/// \brief Add a kernel with given input/output types, no required state -/// initialization, preallocation for fixed-width types, and default null -/// handling (intersect validity bitmaps of inputs). -Status UDFScalarFunction::AddKernel(std::vector in_types, cp::OutputType out_type, - UDFArrayKernelExec exec, cp::KernelInit init) { - - RETURN_NOT_OK(CheckArity(in_types)); - - if (arity_.is_varargs && in_types.size() != 1) { - return Status::Invalid("VarArgs signatures must have exactly one input type"); - } - auto sig = - cp::KernelSignature::Make(std::move(in_types), std::move(out_type), arity_.is_varargs); - kernels_.emplace_back(std::move(sig), exec, init); - return Status::OK(); - -} - -/// \brief Add a kernel (function implementation). Returns error if the -/// kernel's signature does not match the function's arity. -Status UDFScalarFunction::AddKernel(UDFScalarKernel kernel) { - RETURN_NOT_OK(CheckArity(kernel.signature->in_types())); - if (arity_.is_varargs && !kernel.signature->is_varargs()) { - return Status::Invalid("Function accepts varargs but kernel signature does not"); - } - kernels_.emplace_back(std::move(kernel)); - return Status::OK(); -} +#include "arrow/compute/function.h" +#include "arrow/python/common.h" +namespace cp = arrow::compute; +namespace arrow { -} // namespace py +namespace py {} // namespace py } // namespace arrow \ No newline at end of file diff --git a/cpp/src/arrow/python/udf.h b/cpp/src/arrow/python/udf.h index 3aec7b801a1..d6ce993eb4d 100644 --- a/cpp/src/arrow/python/udf.h +++ b/cpp/src/arrow/python/udf.h @@ -5,9 +5,21 @@ #include #include +#include "arrow/compute/api_scalar.h" +#include "arrow/compute/cast.h" +#include "arrow/compute/exec.h" +#include "arrow/compute/exec_internal.h" #include "arrow/compute/function.h" +#include "arrow/compute/function_internal.h" +#include "arrow/compute/kernels/common.h" +#include "arrow/compute/registry.h" +#include "arrow/datum.h" +#include "arrow/util/cpu_info.h" +#include "arrow/util/logging.h" +#include "arrow/util/tracing_internal.h" #include "arrow/python/common.h" +#include "arrow/python/visibility.h" namespace cp = arrow::compute; @@ -15,21 +27,23 @@ namespace arrow { namespace py { -// PyObject* CallUnaryTableUDF(PyObject* func, PyObject* arg1, std::shared_ptr
input); -// PyObject* CallUnaryArrayUDF(PyObject* func, PyObject* arg1, std::shared_ptr input); +// PyObject* CallUnaryTableUDF(PyObject* func, PyObject* arg1, std::shared_ptr
+// input); PyObject* CallUnaryArrayUDF(PyObject* func, PyObject* arg1, +// std::shared_ptr input); -using UDFArrayKernelExec = std::function; +using UDFArrayKernelExec = std::function; struct UDFArrayKernel : public cp::Kernel { UDFArrayKernel() = default; UDFArrayKernel(std::shared_ptr sig, UDFArrayKernelExec exec, - cp::KernelInit init = NULLPTR) - : Kernel(std::move(sig), init), exec(std::move(exec)) {} + cp::KernelInit init = NULLPTR) + : cp::Kernel(std::move(sig), init), exec(std::move(exec)) {} - UDFArrayKernel(std::vector in_types, cp::OutputType out_type, UDFArrayKernelExec exec, - cp::KernelInit init = NULLPTR) - : Kernel(std::move(in_types), std::move(out_type), std::move(init)), + UDFArrayKernel(std::vector in_types, cp::OutputType out_type, + UDFArrayKernelExec exec, cp::KernelInit init = NULLPTR) + : cp::Kernel(std::move(in_types), std::move(out_type), std::move(init)), exec(std::move(exec)) {} /// \brief Perform a single invocation of this kernel. Depending on the @@ -55,14 +69,15 @@ struct UDFScalarKernel : public UDFArrayKernel { cp::MemAllocation::type mem_allocation = cp::MemAllocation::PREALLOCATE; }; -class ARROW_PYTHON_EXPORT UDFScalarFunction : public cp::detail::FunctionImpl { +class ARROW_PYTHON_EXPORT UDFScalarFunction + : public cp::detail::FunctionImpl { public: using KernelType = UDFScalarKernel; UDFScalarFunction(std::string name, const cp::Arity& arity, const cp::FunctionDoc* doc, - const cp::FunctionOptions* default_options = NULLPTR) - : cp::detail::FunctionImpl(std::move(name), cp::Function::SCALAR, arity, doc, - default_options) {} + const cp::FunctionOptions* default_options = NULLPTR) + : cp::detail::FunctionImpl(std::move(name), cp::Function::SCALAR, + arity, doc, default_options) {} /// \brief Add a kernel with given input/output types, no required state /// initialization, preallocation for fixed-width types, and default null @@ -72,14 +87,27 @@ class ARROW_PYTHON_EXPORT UDFScalarFunction : public cp::detail::FunctionImplin_types())); + if (arity_.is_varargs && + !kernel.signature->is_varargs()) { + return Status::Invalid("Function accepts varargs but kernel signature does not"); + } + kernels_.emplace_back(std::move(kernel)); + return Status::OK(); + } + + Status Hello1() { return Status::OK(); } + + Status Hello2(); }; - struct ARROW_PYTHON_EXPORT UDFSynthesizer { UDFSynthesizer(); - + static void MakeFunction(PyObject* func) { + + } }; } // namespace py From 5598cdc4f5a571b3dcfe7c7be820f8b71c73be7d Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Mon, 14 Feb 2022 15:36:00 +0530 Subject: [PATCH 011/131] adding cpp interfaces in cython and creating a basic UDF synthesizer --- cpp/examples/arrow/udf_example.cc | 255 +++++++++++++++------------ cpp/src/arrow/python/CMakeLists.txt | 3 +- cpp/src/arrow/python/udf.h | 42 ++++- python/pyarrow/_compute.pyx | 43 ++++- python/pyarrow/compute.py | 2 +- python/pyarrow/includes/libarrow.pxd | 15 ++ 6 files changed, 231 insertions(+), 129 deletions(-) diff --git a/cpp/examples/arrow/udf_example.cc b/cpp/examples/arrow/udf_example.cc index 0baf34021b0..7d9ca3e30dd 100644 --- a/cpp/examples/arrow/udf_example.cc +++ b/cpp/examples/arrow/udf_example.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -31,6 +32,7 @@ #include #include +#include #include // Demonstrate registering an Arrow compute function outside of the Arrow source tree @@ -46,72 +48,6 @@ namespace cp = ::arrow::compute; } \ } while (0); -class ExampleFunctionOptionsType : public cp::FunctionOptionsType { - const char* type_name() const override { return "ExampleFunctionOptionsType"; } - std::string Stringify(const cp::FunctionOptions&) const override { - return "ExampleFunctionOptionsType"; - } - bool Compare(const cp::FunctionOptions&, const cp::FunctionOptions&) const override { - return true; - } - std::unique_ptr Copy(const cp::FunctionOptions&) const override; - // optional: support for serialization - // Result> Serialize(const FunctionOptions&) const override; - // Result> Deserialize(const Buffer&) const override; -}; - -cp::FunctionOptionsType* GetExampleFunctionOptionsType() { - static ExampleFunctionOptionsType options_type; - return &options_type; -} - -class ExampleFunctionOptions : public cp::FunctionOptions { - public: - ExampleFunctionOptions() : cp::FunctionOptions(GetExampleFunctionOptionsType()) {} -}; - -std::unique_ptr ExampleFunctionOptionsType::Copy( - const cp::FunctionOptions&) const { - return std::unique_ptr(new ExampleFunctionOptions()); -} - -PyObject* SimpleFunction() { - PyObject* out = Py_BuildValue("s", "hello"); - std::cout << "HELLO" << std::endl; - return std::move(out); -} - -// PyObject* objectsRepresentation = PyObject_Repr(yourObject); -// const char* s = PyString_AsString(objectsRepresentation); - -arrow::Status ExampleFunctionImpl(cp::KernelContext* ctx, const cp::ExecBatch& batch, - arrow::Datum* out) { - std::cout << "calling udf :" << batch.length << std::endl; - Py_Initialize(); - PyObject* res = SimpleFunction(); - PyObject* objectsRepresentation = PyObject_Repr(res); - const char* s = PyUnicode_AsUTF8(objectsRepresentation); - std::cout << "Message :: " << s << std::endl; - Py_Finalize(); - auto result = cp::CallFunction("add", {batch[0].array(), batch[0].array()}); - *out->mutable_array() = *result.ValueOrDie().array(); - return arrow::Status::OK(); -} -// cp::KernelContext*, const cp::ExecBatch&, Datum*, PyObject* func -arrow::Status ExamplePyFunctionImpl(cp::KernelContext* ctx, const cp::ExecBatch& batch, - arrow::Datum* out, PyObject* func) { - std::cout << "H" << std::endl; - auto result = cp::CallFunction("add", {batch[0].array(), batch[0].array()}); - *out->mutable_array() = *result.ValueOrDie().array(); - // PyObject* res = SimpleFunction(); - // PyObject* objectsRepresentation = PyObject_Repr(res); - // const char* s = PyUnicode_AsUTF8(objectsRepresentation); - std::cout << "Message :: " - << "s" << std::endl; - - return arrow::Status::OK(); -} - struct BatchesWithSchema { std::vector batches; std::shared_ptr schema; @@ -186,10 +122,80 @@ arrow::Result MakeBasicBatches() { return out; } -arrow::Result> GetTable() { - std::shared_ptr out; +class ExampleFunctionOptionsType : public cp::FunctionOptionsType { + const char* type_name() const override { return "ExampleFunctionOptionsType"; } + std::string Stringify(const cp::FunctionOptions&) const override { + return "ExampleFunctionOptionsType"; + } + bool Compare(const cp::FunctionOptions&, const cp::FunctionOptions&) const override { + return true; + } + std::unique_ptr Copy(const cp::FunctionOptions&) const override; + // optional: support for serialization + // Result> Serialize(const FunctionOptions&) const override; + // Result> Deserialize(const Buffer&) const override; +}; - return out; +cp::FunctionOptionsType* GetExampleFunctionOptionsType() { + static ExampleFunctionOptionsType options_type; + return &options_type; +} + +class ExampleFunctionOptions : public cp::FunctionOptions { + public: + ExampleFunctionOptions() : cp::FunctionOptions(GetExampleFunctionOptionsType()) {} +}; + +std::unique_ptr ExampleFunctionOptionsType::Copy( + const cp::FunctionOptions&) const { + return std::unique_ptr(new ExampleFunctionOptions()); +} + +PyObject* SimpleFunction() { + PyObject* out = Py_BuildValue("s", "hello"); + std::cout << "HELLO" << std::endl; + return std::move(out); +} + +arrow::Status rb_test() { + auto datasource = MakeBasicBatches(); + auto batches = datasource->batches; + + ARROW_ASSIGN_OR_RAISE(auto rb, batches[0].ToRecordBatch(datasource->schema, + arrow::default_memory_pool())); + ARROW_ASSIGN_OR_RAISE(auto result, cp::CallFunction("add", {rb, rb})); + return arrow::Status::OK(); +} + +// PyObject* objectsRepresentation = PyObject_Repr(yourObject); +// const char* s = PyString_AsString(objectsRepresentation); + +arrow::Status ExampleFunctionImpl(cp::KernelContext* ctx, const cp::ExecBatch& batch, + arrow::Datum* out) { + std::cout << "calling udf :" << batch.length << std::endl; + Py_Initialize(); + PyObject* res = SimpleFunction(); + PyObject* objectsRepresentation = PyObject_Repr(res); + const char* s = PyUnicode_AsUTF8(objectsRepresentation); + std::cout << "Message :: " << s << std::endl; + Py_Finalize(); + auto result = cp::CallFunction("add", {batch[0].array(), batch[0].array()}); + *out->mutable_array() = *result.ValueOrDie().array(); + return arrow::Status::OK(); +} +// cp::KernelContext*, const cp::ExecBatch&, Datum*, PyObject* func +arrow::Status ExamplePyFunctionImpl(cp::KernelContext* ctx, const cp::ExecBatch& batch, + arrow::Datum* out, PyObject* func) { + std::cout << "H" << std::endl; + auto result = cp::CallFunction("add", {batch[0].array(), batch[0].array()}); + *out->mutable_array() = *result.ValueOrDie().array(); + // PyObject* res = SimpleFunction(); + // PyObject* objectsRepresentation = PyObject_Repr(res); + // const char* s = PyUnicode_AsUTF8(objectsRepresentation); + std::cout << "Message :: " + << "s" << std::endl; + + return arrow::Status::OK(); } class ExampleNodeOptions : public cp::ExecNodeOptions {}; @@ -250,48 +256,46 @@ PyObject* MultiplyFunction(PyObject* scalar) { } class ScalarUDF { + public: + ScalarUDF(); + explicit ScalarUDF(cp::Arity arity, std::vector input_types, + cp::OutputType output_type, PyObject* (*function)(PyObject*)) + : arity_(std::move(arity)), + input_types_(std::move(input_types)), + output_type_(output_type), + function_(function) {} + + arrow::Status Make(cp::KernelContext* ctx, const cp::ExecBatch& batch, + arrow::Datum* out) { + Py_Initialize(); + PyObject* args = PyTuple_Pack(1, PyLong_FromLong(2)); + PyObject* myResult = function_(args); + int64_t result = PyLong_AsLong(myResult); + Py_Finalize(); + std::cout << "Value : " << result << std::endl; + arrow::Result maybe_result; + arrow::Int64Builder builder(arrow::default_memory_pool()); + std::shared_ptr arr; + ABORT_ON_FAILURE(builder.Append(result)); + ABORT_ON_FAILURE(builder.Finish(&arr)); + maybe_result = cp::CallFunction("add", {batch[0].array(), arr}); + *out->mutable_array() = *maybe_result.ValueOrDie().array(); + return arrow::Status::OK(); + } - public: - ScalarUDF(); - explicit ScalarUDF(cp::Arity arity, std::vector input_types, - cp::OutputType output_type, PyObject* (*function)(PyObject*)) : arity_(std::move(arity)), input_types_(std::move(input_types)), - output_type_(output_type), function_(function) {} - - arrow::Status Make(cp::KernelContext* ctx, const cp::ExecBatch& batch, - arrow::Datum* out) { - Py_Initialize(); - PyObject* args = PyTuple_Pack(1,PyLong_FromLong(2)); - PyObject* myResult = function_(args); - int64_t result = PyLong_AsLong(myResult); - Py_Finalize(); - std::cout << "Value : " << result << std::endl; - arrow::Result maybe_result; - arrow::Int64Builder builder(arrow::default_memory_pool()); - std::shared_ptr arr; - ABORT_ON_FAILURE(builder.Append(result)); - ABORT_ON_FAILURE(builder.Finish(&arr)); - maybe_result = cp::CallFunction("add", {batch[0].array(), arr}); - *out->mutable_array() = *maybe_result.ValueOrDie().array(); - return arrow::Status::OK(); - } - - private: - cp::Arity arity_; - std::vector input_types_; - cp::OutputType output_type_; - PyObject* (*function_)(PyObject*); - + private: + cp::Arity arity_; + std::vector input_types_; + cp::OutputType output_type_; + PyObject* (*function_)(PyObject*); }; arrow::Status Execute() { const std::string name = "x+x"; - - ScalarUDF func_gen(cp::Arity::Unary(), {cp::InputType::Array(arrow::int64())}, arrow::int64(), &MultiplyFunction); - - auto func = std::make_shared(name, cp::Arity::Unary(), &func_doc2); - cp::ScalarKernel kernel({cp::InputType::Array(arrow::int64())}, arrow::int64(), ExampleFunctionImpl); - + cp::ScalarKernel kernel({cp::InputType::Array(arrow::int64())}, arrow::int64(), + ExampleFunctionImpl); + kernel.mem_allocation = cp::MemAllocation::NO_PREALLOCATE; ABORT_ON_FAILURE(func->AddKernel(std::move(kernel))); @@ -361,12 +365,41 @@ arrow::Status Execute() { return future.status(); } +arrow::Status ExecuteSynth() { + + std::string func_name = "simple_func"; + cp::Arity arity = cp::Arity::Unary(); + const cp::FunctionDoc func_doc3{ + "Example function to demonstrate registering an out-of-tree function", + "", + {"x"}, + "ExampleFunctionOptions3"}; + std::vector in_types = {cp::InputType::Array(arrow::int64())}; + cp::OutputType out_type = arrow::int64(); + + arrow::py::UDFSynthesizer udf_sync(func_name, arity, func_doc3, in_types, out_type, ExampleFunctionImpl); + ABORT_ON_FAILURE(udf_sync.MakeFunction()); + + arrow::Int64Builder builder(arrow::default_memory_pool()); + std::shared_ptr arr1, arr2; + ABORT_ON_FAILURE(builder.Append(42)); + ABORT_ON_FAILURE(builder.Finish(&arr1)); + auto options = std::make_shared(); + auto maybe_result = cp::CallFunction(func_name, {arr1}, options.get()); + ABORT_ON_FAILURE(maybe_result.status()); + + std::cout << "Result 1: " << maybe_result->make_array()->ToString() << std::endl; + + + return arrow::Status::OK(); +} + arrow::Status ExecutePy() { cp::ExecContext exec_context(arrow::default_memory_pool(), ::arrow::internal::GetCpuThreadPool()); const std::string name = "simple_func"; - auto func2 = - std::make_shared(name, cp::Arity::Unary(), &func_doc2); + auto func2 = std::make_shared(name, cp::Arity::Unary(), + &func_doc2); arrow::py::UDFScalarKernel kernel2({cp::InputType::Array(arrow::int64())}, arrow::int64(), ExamplePyFunctionImpl); @@ -377,13 +410,15 @@ arrow::Status ExecutePy() { auto size_before_registration = registry->GetFunctionNames().size(); - std::cout << "[Before] Func Reg Size: " << size_before_registration << ", " << registry->num_functions() << std::endl; + std::cout << "[Before] Func Reg Size: " << size_before_registration << ", " + << registry->num_functions() << std::endl; ABORT_ON_FAILURE(registry->AddFunction(std::move(func2))); auto size_after_registration = registry->GetFunctionNames().size(); - std::cout << "[After] Func Reg Size: " << size_after_registration << ", " << registry->num_functions() << std::endl; + std::cout << "[After] Func Reg Size: " << size_after_registration << ", " + << registry->num_functions() << std::endl; arrow::Int64Builder builder(arrow::default_memory_pool()); std::shared_ptr arr; @@ -402,7 +437,7 @@ arrow::Status ExecutePy() { } int main(int argc, char** argv) { - auto status = Execute(); + auto status = ExecuteSynth(); if (!status.ok()) { std::cerr << "Error occurred : " << status.message() << std::endl; return EXIT_FAILURE; diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index 7235e2d0fe3..b75fee4a25e 100644 --- a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -45,7 +45,8 @@ set(ARROW_PYTHON_SRCS python_to_arrow.cc pyarrow.cc serialize.cc - udf.cc) + udf.cc + udf.h) set_source_files_properties(init.cc PROPERTIES SKIP_PRECOMPILE_HEADERS ON SKIP_UNITY_BUILD_INCLUSION ON) diff --git a/cpp/src/arrow/python/udf.h b/cpp/src/arrow/python/udf.h index d6ce993eb4d..b666fda33dd 100644 --- a/cpp/src/arrow/python/udf.h +++ b/cpp/src/arrow/python/udf.h @@ -8,15 +8,11 @@ #include "arrow/compute/api_scalar.h" #include "arrow/compute/cast.h" #include "arrow/compute/exec.h" -#include "arrow/compute/exec_internal.h" #include "arrow/compute/function.h" -#include "arrow/compute/function_internal.h" -#include "arrow/compute/kernels/common.h" #include "arrow/compute/registry.h" #include "arrow/datum.h" #include "arrow/util/cpu_info.h" #include "arrow/util/logging.h" -#include "arrow/util/tracing_internal.h" #include "arrow/python/common.h" #include "arrow/python/visibility.h" @@ -102,12 +98,40 @@ class ARROW_PYTHON_EXPORT UDFScalarFunction Status Hello2(); }; -struct ARROW_PYTHON_EXPORT UDFSynthesizer { - UDFSynthesizer(); - - static void MakeFunction(PyObject* func) { +using KernelExec = std::function; + +class ARROW_PYTHON_EXPORT UDFSynthesizer { + public: + + UDFSynthesizer(std::string func_name, cp::Arity arity, cp::FunctionDoc func_doc, + std::vector in_types, cp::OutputType out_type, KernelExec kernel_exec) + : func_name_(func_name), arity_(arity), func_doc_(func_doc), + in_types_(in_types), out_type_(out_type), kernel_exec_(kernel_exec) {} + + Status MakeFunction() { + Status st; + auto func = std::make_shared(func_name_, arity_, &func_doc_); + cp::ScalarKernel kernel(in_types_, out_type_, kernel_exec_); + kernel.mem_allocation = cp::MemAllocation::NO_PREALLOCATE; + st = func->AddKernel(std::move(kernel)); + if (!st.ok()) { + return Status::ExecutionError("Kernel couldn't be added to the udf"); + } + auto registry = cp::GetFunctionRegistry(); + st = registry->AddFunction(std::move(func)); + if (!st.ok()) { + return Status::ExecutionError("udf registration failed"); + } + return Status::OK(); + } + private: + std::string func_name_; + cp::Arity arity_; + cp::FunctionDoc func_doc_; + std::vector in_types_; + cp::OutputType out_type_; + KernelExec kernel_exec_; - } }; } // namespace py diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index b0b958e4bf8..1cb8f5c17d8 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2345,13 +2345,36 @@ cdef CFunctionDoc _make_function_doc(func_doc): CFunctionDoc f_doc vector[c_string] c_arg_names if isinstance(func_doc, dict): - f_doc.summary = func_doc["summary"].encode() - f_doc.description = func_doc["description"].encode() - for arg_name in func_doc["arg_names"]: - c_arg_names.push_back(arg_name.encode()) - f_doc.arg_names = c_arg_names - f_doc.options_class = func_doc["arg_names"].encode() - f_doc.options_required = func_doc["options_required"] + if func_doc["summary"] and isinstance(func_doc["summary"], str): + f_doc.summary = func_doc["summary"].encode() + else: + raise ValueError("key `summary` cannot be None") + + if func_doc["description"] and isinstance(func_doc["description"], str): + f_doc.description = func_doc["description"].encode() + else: + raise ValueError("key `description` cannot be None") + + if func_doc["arg_names"] and isinstance(func_doc["arg_names"], list): + for arg_name in func_doc["arg_names"]: + if isinstance(arg_name, str): + c_arg_names.push_back(arg_name.encode()) + else: + raise ValueError("key `arg_names` must be a list of strings") + f_doc.arg_names = c_arg_names + else: + raise ValueError("key `arg_names` cannot be None") + + if func_doc["options_class"] and isinstance(func_doc["options_class"], str): + f_doc.options_class = func_doc["options_class"].encode() + else: + raise ValueError("key `options_class` cannot be None") + + if isinstance(func_doc["options_required"], bool): + f_doc.options_required = func_doc["options_required"] + else: + raise ValueError("key `options_required` cannot must be bool") + return f_doc else: raise TypeError(f"func_doc must be a dictionary") @@ -2362,7 +2385,7 @@ cdef class UDFInterpreter: raise TypeError("Cannot initialize UDFInterpreter from the constructor") @staticmethod - def create_scalar_kernel(function_name, arity, function_doc): + def create_scalar_function(function_name, arity, function_doc): cdef: c_string c_func_name Arity c_arity @@ -2372,4 +2395,8 @@ cdef class UDFInterpreter: c_arity = (arity) c_func_doc = _make_function_doc(function_doc) + cdef _create_kernel(self): + + + \ No newline at end of file diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 16b287eff22..70b48641453 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -80,7 +80,7 @@ _group_by, # Expressions Expression, - _make_function_doc + UDFInterpreter ) from collections import namedtuple diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 6bd0c29c544..95a69ea9af6 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1806,6 +1806,18 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: CMemoryPool* memory_pool() const CExecutor* executor() + cdef cppclass CExecBatch" arrow::compute::ExecBatch": + CExecBatch(const CRecordBatch& batch); + + @staticmethod + CResult[CExecBatch] Make(vector[CDatum] values) + CResult[shared_ptr[CRecordBatch]] ToRecordBatch( + shared_ptr[CSchema] schema, CMemoryPool* pool) const + + + cdef cppclass CKernelContext" arrow::compute::KernelContext": + CKernelContext(CExecContext* exec_ctx) + cdef cppclass CKernelSignature" arrow::compute::KernelSignature": c_string ToString() const @@ -1843,6 +1855,9 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: @staticmethod CInputType Scalar(shared_ptr[CDataType] type) + cdef cppclass COutputType" arrow::compute::OutputType": + COutputType(shared_ptr[CDataType] type) + cdef enum FunctionKind" arrow::compute::Function::Kind": FunctionKind_SCALAR" arrow::compute::Function::SCALAR" FunctionKind_VECTOR" arrow::compute::Function::VECTOR" From dc9c8df4b2bca2e3f963c68f16c9a3888704cb1f Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Tue, 15 Feb 2022 09:07:32 +0530 Subject: [PATCH 012/131] adding cython binding for funcptr init --- cpp/src/arrow/python/udf.h | 12 ++++++++---- python/pyarrow/_compute.pyx | 12 ++++++++++-- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/python/udf.h b/cpp/src/arrow/python/udf.h index b666fda33dd..cb17ea01b0d 100644 --- a/cpp/src/arrow/python/udf.h +++ b/cpp/src/arrow/python/udf.h @@ -104,14 +104,15 @@ class ARROW_PYTHON_EXPORT UDFSynthesizer { public: UDFSynthesizer(std::string func_name, cp::Arity arity, cp::FunctionDoc func_doc, - std::vector in_types, cp::OutputType out_type, KernelExec kernel_exec) + std::vector in_types, cp::OutputType out_type, + Status(*callback)(cp::KernelContext*, const cp::ExecBatch&, Datum*)) : func_name_(func_name), arity_(arity), func_doc_(func_doc), - in_types_(in_types), out_type_(out_type), kernel_exec_(kernel_exec) {} + in_types_(in_types), out_type_(out_type), callback_(callback) {} Status MakeFunction() { Status st; auto func = std::make_shared(func_name_, arity_, &func_doc_); - cp::ScalarKernel kernel(in_types_, out_type_, kernel_exec_); + cp::ScalarKernel kernel(in_types_, out_type_, callback_); kernel.mem_allocation = cp::MemAllocation::NO_PREALLOCATE; st = func->AddKernel(std::move(kernel)); if (!st.ok()) { @@ -124,13 +125,16 @@ class ARROW_PYTHON_EXPORT UDFSynthesizer { } return Status::OK(); } + private: + std::string func_name_; cp::Arity arity_; cp::FunctionDoc func_doc_; std::vector in_types_; cp::OutputType out_type_; - KernelExec kernel_exec_; + //KernelExec kernel_exec_; + Status(*callback_)(cp::KernelContext*, const cp::ExecBatch&, Datum*); }; diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 1cb8f5c17d8..e222ae084e6 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2395,8 +2395,16 @@ cdef class UDFInterpreter: c_arity = (arity) c_func_doc = _make_function_doc(function_doc) - cdef _create_kernel(self): - + + + @staticmethod + cdef CStatus ExecFunc(CKernelContext* ctx, const CExecBatch& batch, CDatum* out): + cdef: + CDatum res + val = lib.asarray([10]) + res = CDatum(( val).sp_array) + out = &res + return CStatus_OK() \ No newline at end of file From e5dc4b3cdc7677b9af0335556f29904bb5bbac48 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Tue, 15 Feb 2022 12:03:55 +0530 Subject: [PATCH 013/131] initial version of function registry WIP --- python/pyarrow/_compute.pyx | 55 ++++++++++++++++++++++++---- python/pyarrow/compute.py | 3 +- python/pyarrow/includes/libarrow.pxd | 10 +++++ 3 files changed, 60 insertions(+), 8 deletions(-) diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index e222ae084e6..5e2e0c5bcef 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2390,21 +2390,62 @@ cdef class UDFInterpreter: c_string c_func_name Arity c_arity CFunctionDoc c_func_doc + ExecFunc c_callback c_func_name = function_name.encode() c_arity = (arity) c_func_doc = _make_function_doc(function_doc) - @staticmethod - cdef CStatus ExecFunc(CKernelContext* ctx, const CExecBatch& batch, CDatum* out): + cdef CStatus udf(CKernelContext* ctx, const CExecBatch& batch, CDatum* out): cdef: - CDatum res - val = lib.asarray([10]) - res = CDatum(( val).sp_array) - out = &res + CDatum* res + #val = lib.asarray([10]) + #res = CDatum(( val).sp_array) return CStatus_OK() +cdef CStatus udf(CKernelContext* ctx, const CExecBatch& batch, CDatum* out): + return CStatus_OK() - \ No newline at end of file +cdef class UDFSynthesizer: + + def __init__(self): + # TODO: find a better Exception type to return the response + raise ValueError("Cannot be initialized using the constructor.") + + @staticmethod + def register_function(func_name, arity, function_doc, in_types, out_type, callback): + cdef: + c_string c_func_name + CArity c_arity + CFunctionDoc c_func_doc + CInputType in_tmp + vector[CInputType] c_in_types + ExecFunc c_callback + shared_ptr[CDataType] c_type + + if func_name and isinstance(func_name, str): + c_func_name = func_name.encode() + else: + raise ValueError("func_name should be str") + + if arity and isinstance(arity, Arity): + c_arity = ( arity).arity + else: + raise ValueError("arity must be an instance of Arity") + + c_func_doc = _make_function_doc(function_doc) + + if in_types and isinstance(in_types, list): + for in_type in in_types: + in_tmp = ( in_type).input_type + c_in_types.push_back(in_tmp) + + c_type = pyarrow_unwrap_data_type(out_type) + + c_callback = udf + cdef COutputType* c_out_type = new COutputType(c_type) + cdef CUDFSynthesizer* c_udf_syn = new CUDFSynthesizer(c_func_name, + c_arity, c_func_doc, c_in_types, deref(c_out_type), c_callback) + c_udf_syn.MakeFunction() diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 70b48641453..e9f801931fa 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -80,7 +80,8 @@ _group_by, # Expressions Expression, - UDFInterpreter + UDFInterpreter, + UDFSynthesizer ) from collections import namedtuple diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 95a69ea9af6..eb2b00eb653 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2690,3 +2690,13 @@ cdef extern from "arrow/util/byte_size.h" namespace "arrow::util" nogil: int64_t TotalBufferSize(const CChunkedArray& array) int64_t TotalBufferSize(const CRecordBatch& record_batch) int64_t TotalBufferSize(const CTable& table) + +ctypedef CStatus(*ExecFunc)(CKernelContext*, const CExecBatch&, CDatum*) + +cdef extern from "arrow/python/udf.h" namespace "arrow::py" nogil: + # TODO: determine a better name. This may be confused for a cudf util + cdef cppclass CUDFSynthesizer "arrow::py::UDFSynthesizer": + CUDFSynthesizer(c_string func_name, CArity arity, CFunctionDoc func_doc, + vector[CInputType] in_types, COutputType out_type, ExecFunc) + CStatus MakeFunction() + From dc15bdad9d61fcea7c90c24178cbeca65fb751f8 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Tue, 15 Feb 2022 19:45:03 +0530 Subject: [PATCH 014/131] updating call back API --- python/examples/statistics/udf_example.py | 25 +++++++++++++++++++++++ python/pyarrow/_compute.pyx | 5 +++++ python/pyarrow/includes/libarrow.pxd | 3 +++ 3 files changed, 33 insertions(+) create mode 100644 python/examples/statistics/udf_example.py diff --git a/python/examples/statistics/udf_example.py b/python/examples/statistics/udf_example.py new file mode 100644 index 00000000000..c72a0efe300 --- /dev/null +++ b/python/examples/statistics/udf_example.py @@ -0,0 +1,25 @@ +import pyarrow as pa +from pyarrow.compute import UDFInterpreter, UDFSynthesizer +from pyarrow.compute import Arity, InputType +func_doc = {} +func_doc["summary"] = "summary" +func_doc["description"] = "desc" +func_doc["arg_names"] = ["number"] +func_doc["options_class"] = "SomeOptions" +func_doc["options_required"] = False +arity = Arity.unary() +func_name = "python_udf" +in_types = [InputType.array(pa.int64())] +out_type = pa.int64() + +def udf(): + print("Hello From Python") + +callback = udf +UDFSynthesizer.register_function(func_name, arity, func_doc, in_types, out_type, callback) + +from pyarrow import compute as pc + +func1 = pc.get_function(func_name) + +pc.call_function(func_name, [pa.array([20])]) \ No newline at end of file diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 5e2e0c5bcef..dd644d19d50 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2406,6 +2406,11 @@ cdef class UDFInterpreter: return CStatus_OK() cdef CStatus udf(CKernelContext* ctx, const CExecBatch& batch, CDatum* out): + cdef extern from "Python.h": + Py_Initialize() + cdef c_string tstr = batch.ToString() + PyObject* str = PyUnicode_FromString(tstr) + Py_Finalize() return CStatus_OK() cdef class UDFSynthesizer: diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index eb2b00eb653..6125d6ab03f 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1814,6 +1814,9 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: CResult[shared_ptr[CRecordBatch]] ToRecordBatch( shared_ptr[CSchema] schema, CMemoryPool* pool) const + inline const CDatum& operator[](i) const + c_string ToString() const + cdef cppclass CKernelContext" arrow::compute::KernelContext": CKernelContext(CExecContext* exec_ctx) From 78ca0a964b82470b0d61b4b53f6a9e9e52699b17 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 17 Feb 2022 22:25:18 +0530 Subject: [PATCH 015/131] func registry with a cython udf --- cpp/examples/arrow/udf_example.cc | 3 +- cpp/src/arrow/python/udf.h | 1 - python/examples/statistics/udf_example.py | 6 +-- python/pyarrow/_compute.pyx | 56 ++++++++--------------- python/pyarrow/compute.py | 5 +- python/pyarrow/includes/libarrow.pxd | 10 +++- 6 files changed, 35 insertions(+), 46 deletions(-) diff --git a/cpp/examples/arrow/udf_example.cc b/cpp/examples/arrow/udf_example.cc index 7d9ca3e30dd..54afa87c995 100644 --- a/cpp/examples/arrow/udf_example.cc +++ b/cpp/examples/arrow/udf_example.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -437,7 +438,7 @@ arrow::Status ExecutePy() { } int main(int argc, char** argv) { - auto status = ExecuteSynth(); + auto status = Execute(); if (!status.ok()) { std::cerr << "Error occurred : " << status.message() << std::endl; return EXIT_FAILURE; diff --git a/cpp/src/arrow/python/udf.h b/cpp/src/arrow/python/udf.h index cb17ea01b0d..e68a4651a9b 100644 --- a/cpp/src/arrow/python/udf.h +++ b/cpp/src/arrow/python/udf.h @@ -135,7 +135,6 @@ class ARROW_PYTHON_EXPORT UDFSynthesizer { cp::OutputType out_type_; //KernelExec kernel_exec_; Status(*callback_)(cp::KernelContext*, const cp::ExecBatch&, Datum*); - }; } // namespace py diff --git a/python/examples/statistics/udf_example.py b/python/examples/statistics/udf_example.py index c72a0efe300..1adb4a70517 100644 --- a/python/examples/statistics/udf_example.py +++ b/python/examples/statistics/udf_example.py @@ -1,5 +1,5 @@ import pyarrow as pa -from pyarrow.compute import UDFInterpreter, UDFSynthesizer +from pyarrow.compute import register_function from pyarrow.compute import Arity, InputType func_doc = {} func_doc["summary"] = "summary" @@ -13,10 +13,10 @@ out_type = pa.int64() def udf(): - print("Hello From Python") + pass callback = udf -UDFSynthesizer.register_function(func_name, arity, func_doc, in_types, out_type, callback) +register_function(func_name, arity, func_doc, in_types, out_type, callback) from pyarrow import compute as pc diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index dd644d19d50..4f134357133 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -29,6 +29,8 @@ from pyarrow.lib cimport * from pyarrow.includes.libarrow cimport * import pyarrow.lib as lib +from cpython.ref cimport PyObject + import numpy as np @@ -2379,48 +2381,28 @@ cdef CFunctionDoc _make_function_doc(func_doc): else: raise TypeError(f"func_doc must be a dictionary") -cdef class UDFInterpreter: - - def __init__(self): - raise TypeError("Cannot initialize UDFInterpreter from the constructor") - @staticmethod - def create_scalar_function(function_name, arity, function_doc): - cdef: - c_string c_func_name - Arity c_arity - CFunctionDoc c_func_doc - ExecFunc c_callback +global function_map - c_func_name = function_name.encode() - c_arity = (arity) - c_func_doc = _make_function_doc(function_doc) +function_map = {} +def static_py_udf(arrow_array): + p_new_array = call_function("add", [arrow_array, 1]) + return p_new_array - @staticmethod - cdef CStatus udf(CKernelContext* ctx, const CExecBatch& batch, CDatum* out): - cdef: - CDatum* res - #val = lib.asarray([10]) - #res = CDatum(( val).sp_array) - return CStatus_OK() - -cdef CStatus udf(CKernelContext* ctx, const CExecBatch& batch, CDatum* out): - cdef extern from "Python.h": - Py_Initialize() - cdef c_string tstr = batch.ToString() - PyObject* str = PyUnicode_FromString(tstr) - Py_Finalize() +cdef CStatus udf(CKernelContext* ctx, const CExecBatch& batch, CDatum* out) nogil: + cdef CDatum datum = batch.values[0] + cdef shared_ptr[CArrayData] array_data = datum.array() + cdef shared_ptr[CArray] c_array = MakeArray(array_data) + cdef shared_ptr[CArray] new_array + with gil: + p_array = pyarrow_wrap_array(c_array) + new_array = pyarrow_unwrap_array(static_py_udf(p_array)) + cdef CDatum new_datum = CDatum(new_array) + out[0] = new_datum return CStatus_OK() -cdef class UDFSynthesizer: - - def __init__(self): - # TODO: find a better Exception type to return the response - raise ValueError("Cannot be initialized using the constructor.") - - @staticmethod - def register_function(func_name, arity, function_doc, in_types, out_type, callback): +def register_function(func_name, arity, function_doc, in_types, out_type, callback): cdef: c_string c_func_name CArity c_arity @@ -2453,4 +2435,6 @@ cdef class UDFSynthesizer: cdef COutputType* c_out_type = new COutputType(c_type) cdef CUDFSynthesizer* c_udf_syn = new CUDFSynthesizer(c_func_name, c_arity, c_func_doc, c_in_types, deref(c_out_type), c_callback) + function_map[func_name] = callback c_udf_syn.MakeFunction() + diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index e9f801931fa..e0b010293fb 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -78,10 +78,9 @@ get_function, list_functions, _group_by, + register_function, # Expressions - Expression, - UDFInterpreter, - UDFSynthesizer + Expression ) from collections import namedtuple diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 6125d6ab03f..76386b7ca70 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -19,6 +19,8 @@ from pyarrow.includes.common cimport * +from cpython.ref cimport PyObject + cdef extern from "arrow/util/key_value_metadata.h" namespace "arrow" nogil: cdef cppclass CKeyValueMetadata" arrow::KeyValueMetadata": @@ -1808,13 +1810,15 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: cdef cppclass CExecBatch" arrow::compute::ExecBatch": CExecBatch(const CRecordBatch& batch); + @staticmethod CResult[CExecBatch] Make(vector[CDatum] values) CResult[shared_ptr[CRecordBatch]] ToRecordBatch( shared_ptr[CSchema] schema, CMemoryPool* pool) const - inline const CDatum& operator[](i) const + #inline const CDatum& operator[](i) const + vector[CDatum] values c_string ToString() const @@ -2369,6 +2373,8 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: const shared_ptr[CTable]& table() const const shared_ptr[CScalar]& scalar() const + CArrayData* mutable_array() const + cdef extern from * namespace "arrow::compute": # inlined from compute/function_internal.h to avoid exposing @@ -2700,6 +2706,6 @@ cdef extern from "arrow/python/udf.h" namespace "arrow::py" nogil: # TODO: determine a better name. This may be confused for a cudf util cdef cppclass CUDFSynthesizer "arrow::py::UDFSynthesizer": CUDFSynthesizer(c_string func_name, CArity arity, CFunctionDoc func_doc, - vector[CInputType] in_types, COutputType out_type, ExecFunc) + vector[CInputType] in_types, COutputType out_type, ExecFunc) CStatus MakeFunction() From 9531409f803c3cc7daaf7cbdd2f0e119216da7b2 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 18 Feb 2022 15:55:50 +0530 Subject: [PATCH 016/131] testing udf python expose --- python/examples/statistics/udf_example.py | 9 +++++---- python/pyarrow/_compute.pyx | 13 ++++--------- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/python/examples/statistics/udf_example.py b/python/examples/statistics/udf_example.py index 1adb4a70517..82eced897eb 100644 --- a/python/examples/statistics/udf_example.py +++ b/python/examples/statistics/udf_example.py @@ -1,5 +1,5 @@ import pyarrow as pa -from pyarrow.compute import register_function +from pyarrow.compute import register_function, call_function from pyarrow.compute import Arity, InputType func_doc = {} func_doc["summary"] = "summary" @@ -12,10 +12,11 @@ in_types = [InputType.array(pa.int64())] out_type = pa.int64() -def udf(): - pass +def py_function(arrow_array): + p_new_array = call_function("add", [arrow_array, 1]) + return p_new_array -callback = udf +callback = py_function register_function(func_name, arity, func_doc, in_types, out_type, callback) from pyarrow import compute as pc diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 4f134357133..aec25a30945 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2381,23 +2381,20 @@ cdef CFunctionDoc _make_function_doc(func_doc): else: raise TypeError(f"func_doc must be a dictionary") +cdef object py_function = None -global function_map - -function_map = {} - -def static_py_udf(arrow_array): +def py_function(arrow_array): p_new_array = call_function("add", [arrow_array, 1]) return p_new_array -cdef CStatus udf(CKernelContext* ctx, const CExecBatch& batch, CDatum* out) nogil: +cdef CStatus udf(self, CKernelContext* ctx, const CExecBatch& batch, CDatum* out) nogil: cdef CDatum datum = batch.values[0] cdef shared_ptr[CArrayData] array_data = datum.array() cdef shared_ptr[CArray] c_array = MakeArray(array_data) cdef shared_ptr[CArray] new_array with gil: p_array = pyarrow_wrap_array(c_array) - new_array = pyarrow_unwrap_array(static_py_udf(p_array)) + new_array = pyarrow_unwrap_array(py_function(p_array)) cdef CDatum new_datum = CDatum(new_array) out[0] = new_datum return CStatus_OK() @@ -2430,11 +2427,9 @@ def register_function(func_name, arity, function_doc, in_types, out_type, callba c_in_types.push_back(in_tmp) c_type = pyarrow_unwrap_data_type(out_type) - c_callback = udf cdef COutputType* c_out_type = new COutputType(c_type) cdef CUDFSynthesizer* c_udf_syn = new CUDFSynthesizer(c_func_name, c_arity, c_func_doc, c_in_types, deref(c_out_type), c_callback) - function_map[func_name] = callback c_udf_syn.MakeFunction() From 7400faff22763a7d2f81553ed66afd8e43d1c641 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Mon, 21 Feb 2022 16:32:43 +0530 Subject: [PATCH 017/131] initial version of end-to-end pycallable --- cpp/examples/arrow/udf_example.cc | 58 +++++----- cpp/src/arrow/python/udf.h | 131 ++++++++++++++++------ python/examples/statistics/udf_example.py | 16 ++- python/pyarrow/_compute.pyx | 45 ++++++++ python/pyarrow/compute.py | 1 + python/pyarrow/includes/libarrow.pxd | 3 + 6 files changed, 189 insertions(+), 65 deletions(-) diff --git a/cpp/examples/arrow/udf_example.cc b/cpp/examples/arrow/udf_example.cc index 54afa87c995..1d3073fee55 100644 --- a/cpp/examples/arrow/udf_example.cc +++ b/cpp/examples/arrow/udf_example.cc @@ -154,7 +154,7 @@ std::unique_ptr ExampleFunctionOptionsType::Copy( PyObject* SimpleFunction() { PyObject* out = Py_BuildValue("s", "hello"); - std::cout << "HELLO" << std::endl; + std::cout << "HELLO FROM PYTHON FUNCTION IN C++" << std::endl; return std::move(out); } @@ -366,34 +366,34 @@ arrow::Status Execute() { return future.status(); } -arrow::Status ExecuteSynth() { - - std::string func_name = "simple_func"; - cp::Arity arity = cp::Arity::Unary(); - const cp::FunctionDoc func_doc3{ - "Example function to demonstrate registering an out-of-tree function", - "", - {"x"}, - "ExampleFunctionOptions3"}; - std::vector in_types = {cp::InputType::Array(arrow::int64())}; - cp::OutputType out_type = arrow::int64(); - - arrow::py::UDFSynthesizer udf_sync(func_name, arity, func_doc3, in_types, out_type, ExampleFunctionImpl); - ABORT_ON_FAILURE(udf_sync.MakeFunction()); - - arrow::Int64Builder builder(arrow::default_memory_pool()); - std::shared_ptr arr1, arr2; - ABORT_ON_FAILURE(builder.Append(42)); - ABORT_ON_FAILURE(builder.Finish(&arr1)); - auto options = std::make_shared(); - auto maybe_result = cp::CallFunction(func_name, {arr1}, options.get()); - ABORT_ON_FAILURE(maybe_result.status()); - - std::cout << "Result 1: " << maybe_result->make_array()->ToString() << std::endl; - - - return arrow::Status::OK(); -} +// arrow::Status ExecuteSynth() { +// std::string func_name = "simple_func"; +// cp::Arity arity = cp::Arity::Unary(); +// const cp::FunctionDoc func_doc3{ +// "Example function to demonstrate registering an out-of-tree function", +// "", +// {"x"}, +// "ExampleFunctionOptions3"}; +// std::vector in_types = {cp::InputType::Array(arrow::int64())}; +// cp::OutputType out_type = arrow::int64(); +// PyObject* (*py_callback)(); +// py_callback = &SimpleFunction; +// arrow::py::UDFSynthesizer udf_sync(func_name, arity, func_doc3, in_types, out_type, +// py_callback); +// ABORT_ON_FAILURE(udf_sync.MakePyFunction()); + +// arrow::Int64Builder builder(arrow::default_memory_pool()); +// std::shared_ptr arr1, arr2; +// ABORT_ON_FAILURE(builder.Append(42)); +// ABORT_ON_FAILURE(builder.Finish(&arr1)); +// auto options = std::make_shared(); +// auto maybe_result = cp::CallFunction(func_name, {arr1}, options.get()); +// ABORT_ON_FAILURE(maybe_result.status()); + +// std::cout << "Result 1: " << maybe_result->make_array()->ToString() << std::endl; + +// return arrow::Status::OK(); +// } arrow::Status ExecutePy() { cp::ExecContext exec_context(arrow::default_memory_pool(), diff --git a/cpp/src/arrow/python/udf.h b/cpp/src/arrow/python/udf.h index e68a4651a9b..5b10c9d0920 100644 --- a/cpp/src/arrow/python/udf.h +++ b/cpp/src/arrow/python/udf.h @@ -17,6 +17,8 @@ #include "arrow/python/common.h" #include "arrow/python/visibility.h" +#include + namespace cp = arrow::compute; namespace arrow { @@ -85,8 +87,7 @@ class ARROW_PYTHON_EXPORT UDFScalarFunction /// kernel's signature does not match the function's arity. Status AddKernel(UDFScalarKernel kernel) { ARROW_RETURN_NOT_OK(CheckArity(kernel.signature->in_types())); - if (arity_.is_varargs && - !kernel.signature->is_varargs()) { + if (arity_.is_varargs && !kernel.signature->is_varargs()) { return Status::Invalid("Function accepts varargs but kernel signature does not"); } kernels_.emplace_back(std::move(kernel)); @@ -98,43 +99,109 @@ class ARROW_PYTHON_EXPORT UDFScalarFunction Status Hello2(); }; -using KernelExec = std::function; +using KernelExec = + std::function; class ARROW_PYTHON_EXPORT UDFSynthesizer { - public: - - UDFSynthesizer(std::string func_name, cp::Arity arity, cp::FunctionDoc func_doc, - std::vector in_types, cp::OutputType out_type, - Status(*callback)(cp::KernelContext*, const cp::ExecBatch&, Datum*)) - : func_name_(func_name), arity_(arity), func_doc_(func_doc), - in_types_(in_types), out_type_(out_type), callback_(callback) {} - - Status MakeFunction() { - Status st; - auto func = std::make_shared(func_name_, arity_, &func_doc_); - cp::ScalarKernel kernel(in_types_, out_type_, callback_); - kernel.mem_allocation = cp::MemAllocation::NO_PREALLOCATE; - st = func->AddKernel(std::move(kernel)); - if (!st.ok()) { - return Status::ExecutionError("Kernel couldn't be added to the udf"); + public: + UDFSynthesizer(std::string func_name, cp::Arity arity, cp::FunctionDoc func_doc, + std::vector in_types, cp::OutputType out_type, + Status (*callback)(cp::KernelContext*, const cp::ExecBatch&, Datum*)) + : func_name_(func_name), + arity_(arity), + func_doc_(func_doc), + in_types_(in_types), + out_type_(out_type), + callback_(callback) {} + + UDFSynthesizer(std::string func_name, cp::Arity arity, cp::FunctionDoc func_doc, + std::vector in_types, cp::OutputType out_type) + : func_name_(func_name), + arity_(arity), + func_doc_(func_doc), + in_types_(in_types), + out_type_(out_type) {} + + Status MakeFunction() { + Status st; + auto func = std::make_shared(func_name_, arity_, &func_doc_); + cp::ScalarKernel kernel(in_types_, out_type_, callback_); + kernel.mem_allocation = cp::MemAllocation::NO_PREALLOCATE; + st = func->AddKernel(std::move(kernel)); + if (!st.ok()) { + return Status::ExecutionError("Kernel couldn't be added to the udf"); + } + auto registry = cp::GetFunctionRegistry(); + st = registry->AddFunction(std::move(func)); + if (!st.ok()) { + return Status::ExecutionError("udf registration failed"); + } + return Status::OK(); + } + + Status MakePyFunction(PyObject* function, PyObject* args) { + Status st; + auto func = std::make_shared(func_name_, arity_, &func_doc_); + Py_XINCREF(function); + Py_XINCREF(args); + //double result = PyFloat_AsDouble(args); + //std::cout << "Make Function Args : " << result << std::endl; + auto call_back_lambda = [function, args](cp::KernelContext* ctx, const cp::ExecBatch& batch, + Datum* out) { + PyGILState_STATE state = PyGILState_Ensure(); + // PyObject* obj = Py_BuildValue("s", "hello"); + //Py_XINCREF(function); + //Py_XINCREF(args); + if (function == NULL) { + PyGILState_Release(state); + return Status::ExecutionError("python function cannot be null"); } - auto registry = cp::GetFunctionRegistry(); - st = registry->AddFunction(std::move(func)); - if (!st.ok()) { - return Status::ExecutionError("udf registration failed"); + + int res = PyCallable_Check(function); + + if (res == 1) { + std::cout << "This is a PyCallback" << std::endl; + PyObject *result = PyObject_CallObject(function, args); + Py_DECREF(function); + if (result == NULL) { + PyGILState_Release(state); + return Status::ExecutionError("Error occured in computation"); + } + } else { + std::cout << "This is not a callable" << std::endl; + PyErr_Print(); } + // Python Way Ends + auto res_func = cp::CallFunction("add", {batch[0].array(), batch[0].array()}); + *out->mutable_array() = *res_func.ValueOrDie().array(); + PyGILState_Release(state); return Status::OK(); + }; + cp::ScalarKernel kernel(in_types_, out_type_, call_back_lambda); + kernel.mem_allocation = cp::MemAllocation::NO_PREALLOCATE; + st = func->AddKernel(std::move(kernel)); + if (!st.ok()) { + return Status::ExecutionError("Kernel couldn't be added to the udf"); + } + auto registry = cp::GetFunctionRegistry(); + st = registry->AddFunction(std::move(func)); + if (!st.ok()) { + return Status::ExecutionError("udf registration failed"); } + return Status::OK(); + } - private: - - std::string func_name_; - cp::Arity arity_; - cp::FunctionDoc func_doc_; - std::vector in_types_; - cp::OutputType out_type_; - //KernelExec kernel_exec_; - Status(*callback_)(cp::KernelContext*, const cp::ExecBatch&, Datum*); + private: + std::string func_name_; + cp::Arity arity_; + cp::FunctionDoc func_doc_; + std::vector in_types_; + cp::OutputType out_type_; + // KernelExec kernel_exec_; + Status (*callback_)(cp::KernelContext*, const cp::ExecBatch&, Datum*); + // C++ way + //PyObject* (*py_call_back_)(); + // Python way }; } // namespace py diff --git a/python/examples/statistics/udf_example.py b/python/examples/statistics/udf_example.py index 82eced897eb..01fb147cd10 100644 --- a/python/examples/statistics/udf_example.py +++ b/python/examples/statistics/udf_example.py @@ -1,5 +1,5 @@ import pyarrow as pa -from pyarrow.compute import register_function, call_function +from pyarrow.compute import register_function, call_function, register_pyfunction from pyarrow.compute import Arity, InputType func_doc = {} func_doc["summary"] = "summary" @@ -16,11 +16,19 @@ def py_function(arrow_array): p_new_array = call_function("add", [arrow_array, 1]) return p_new_array -callback = py_function -register_function(func_name, arity, func_doc, in_types, out_type, callback) +def simple_function(args): + print("\t \tHello From Python") + print(args) + return args + +callback = simple_function +args = tuple([12345]) +register_pyfunction(func_name, arity, func_doc, in_types, out_type, callback, args) from pyarrow import compute as pc func1 = pc.get_function(func_name) -pc.call_function(func_name, [pa.array([20])]) \ No newline at end of file +a = pc.call_function(func_name, [pa.array([20])]) + +print(a) \ No newline at end of file diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index aec25a30945..08ef45637b7 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2433,3 +2433,48 @@ def register_function(func_name, arity, function_doc, in_types, out_type, callba c_arity, c_func_doc, c_in_types, deref(c_out_type), c_callback) c_udf_syn.MakeFunction() +def register_pyfunction(func_name, arity, function_doc, in_types, out_type, callback, args): + cdef: + c_string c_func_name + CArity c_arity + CFunctionDoc c_func_doc + CInputType in_tmp + vector[CInputType] c_in_types + PyObject* c_callback + PyObject* c_args + shared_ptr[CDataType] c_type + object obj + + if func_name and isinstance(func_name, str): + c_func_name = func_name.encode() + else: + raise ValueError("func_name should be str") + + if arity and isinstance(arity, Arity): + c_arity = ( arity).arity + else: + raise ValueError("arity must be an instance of Arity") + + c_func_doc = _make_function_doc(function_doc) + + if in_types and isinstance(in_types, list): + for in_type in in_types: + in_tmp = ( in_type).input_type + c_in_types.push_back(in_tmp) + + c_type = pyarrow_unwrap_data_type(out_type) + c_callback = callback + # TODO: make sure to add a validation about args being a tuple + # the PyObject_CallObject(...) expects args to be a tuple + c_args = args + #c_callback = udf + cdef COutputType* c_out_type = new COutputType(c_type) + cdef CUDFSynthesizer* c_udf_syn = new CUDFSynthesizer(c_func_name, + c_arity, c_func_doc, c_in_types, deref(c_out_type)) + c_udf_syn.MakePyFunction(c_callback, c_args) + obj = c_callback + pargs = c_args + obj(pargs) + +cdef public void py_caller(py_function): + py_function() diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index e0b010293fb..ed30e780b39 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -79,6 +79,7 @@ list_functions, _group_by, register_function, + register_pyfunction, # Expressions Expression ) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 76386b7ca70..d897c005f9a 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2707,5 +2707,8 @@ cdef extern from "arrow/python/udf.h" namespace "arrow::py" nogil: cdef cppclass CUDFSynthesizer "arrow::py::UDFSynthesizer": CUDFSynthesizer(c_string func_name, CArity arity, CFunctionDoc func_doc, vector[CInputType] in_types, COutputType out_type, ExecFunc) + CUDFSynthesizer(c_string func_name, CArity arity, CFunctionDoc func_doc, + vector[CInputType] in_types, COutputType out_type) CStatus MakeFunction() + CStatus MakePyFunction(PyObject* function, PyObject* args) From 5ec38eda9863eed7576a724c0ee6540a01ae2d2b Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Mon, 21 Feb 2022 17:31:53 +0530 Subject: [PATCH 018/131] adding and end-to-end udf for scalar array --- cpp/src/arrow/python/udf.h | 29 ++++++++++------- python/examples/statistics/udf_example.py | 38 ++++++++++++++++++----- python/pyarrow/_compute.pyx | 14 ++------- 3 files changed, 49 insertions(+), 32 deletions(-) diff --git a/cpp/src/arrow/python/udf.h b/cpp/src/arrow/python/udf.h index 5b10c9d0920..91668eba531 100644 --- a/cpp/src/arrow/python/udf.h +++ b/cpp/src/arrow/python/udf.h @@ -15,6 +15,7 @@ #include "arrow/util/logging.h" #include "arrow/python/common.h" +#include "arrow/python/pyarrow.h" #include "arrow/python/visibility.h" #include @@ -139,41 +140,45 @@ class ARROW_PYTHON_EXPORT UDFSynthesizer { return Status::OK(); } - Status MakePyFunction(PyObject* function, PyObject* args) { + Status MakePyFunction(PyObject* function) { Status st; auto func = std::make_shared(func_name_, arity_, &func_doc_); Py_XINCREF(function); - Py_XINCREF(args); //double result = PyFloat_AsDouble(args); //std::cout << "Make Function Args : " << result << std::endl; - auto call_back_lambda = [function, args](cp::KernelContext* ctx, const cp::ExecBatch& batch, + auto call_back_lambda = [function](cp::KernelContext* ctx, const cp::ExecBatch& batch, Datum* out) { PyGILState_STATE state = PyGILState_Ensure(); - // PyObject* obj = Py_BuildValue("s", "hello"); - //Py_XINCREF(function); - //Py_XINCREF(args); + std::shared_ptr c_res_array; if (function == NULL) { PyGILState_Release(state); return Status::ExecutionError("python function cannot be null"); } int res = PyCallable_Check(function); - if (res == 1) { std::cout << "This is a PyCallback" << std::endl; - PyObject *result = PyObject_CallObject(function, args); + auto c_array = batch[0].make_array(); + PyObject* py_array = wrap_array(c_array); + PyObject* arg_tuple = PyTuple_Pack(1, py_array); + PyObject *result = PyObject_CallObject(function, arg_tuple); Py_DECREF(function); if (result == NULL) { PyGILState_Release(state); return Status::ExecutionError("Error occured in computation"); } + auto res = unwrap_array(result); + if(!res.status().ok()) { + PyGILState_Release(state); + return res.status(); + } + c_res_array = res.ValueOrDie(); } else { - std::cout << "This is not a callable" << std::endl; PyErr_Print(); + return Status::ExecutionError("Error occured in computation"); } - // Python Way Ends - auto res_func = cp::CallFunction("add", {batch[0].array(), batch[0].array()}); - *out->mutable_array() = *res_func.ValueOrDie().array(); + auto datum = new Datum(c_res_array); + *out->mutable_array() = *datum->array(); PyGILState_Release(state); return Status::OK(); }; diff --git a/python/examples/statistics/udf_example.py b/python/examples/statistics/udf_example.py index 01fb147cd10..8bce7a9f1a0 100644 --- a/python/examples/statistics/udf_example.py +++ b/python/examples/statistics/udf_example.py @@ -1,5 +1,6 @@ import pyarrow as pa -from pyarrow.compute import register_function, call_function, register_pyfunction +from pyarrow import compute as pc +from pyarrow.compute import call_function, register_pyfunction from pyarrow.compute import Arity, InputType func_doc = {} func_doc["summary"] = "summary" @@ -17,18 +18,39 @@ def py_function(arrow_array): return p_new_array def simple_function(args): - print("\t \tHello From Python") + print("=" * 80) + print("Hello From Python") + print("=" * 80) print(args) return args -callback = simple_function -args = tuple([12345]) -register_pyfunction(func_name, arity, func_doc, in_types, out_type, callback, args) +def add_constant(array): + return pc.call_function("add", [array, 1]) -from pyarrow import compute as pc + +# example 1 +print("=" * 80) +print("Example 1") +print("=" * 80) +callback = simple_function +register_pyfunction(func_name, arity, func_doc, in_types, out_type, callback) func1 = pc.get_function(func_name) -a = pc.call_function(func_name, [pa.array([20])]) +a1 = pc.call_function(func_name, [pa.array([20])]) + +print(a1) + +# example 2 +print("=" * 80) +print("Example 2") +print("=" * 80) +callback = add_constant +func_name = "py_add_func" +register_pyfunction(func_name, arity, func_doc, in_types, out_type, callback) + +func2 = pc.get_function(func_name) + +a2 = pc.call_function(func_name, [pa.array([20])]) -print(a) \ No newline at end of file +print(a2) \ No newline at end of file diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 08ef45637b7..30abd269203 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2433,7 +2433,7 @@ def register_function(func_name, arity, function_doc, in_types, out_type, callba c_arity, c_func_doc, c_in_types, deref(c_out_type), c_callback) c_udf_syn.MakeFunction() -def register_pyfunction(func_name, arity, function_doc, in_types, out_type, callback, args): +def register_pyfunction(func_name, arity, function_doc, in_types, out_type, callback): cdef: c_string c_func_name CArity c_arity @@ -2441,7 +2441,6 @@ def register_pyfunction(func_name, arity, function_doc, in_types, out_type, call CInputType in_tmp vector[CInputType] c_in_types PyObject* c_callback - PyObject* c_args shared_ptr[CDataType] c_type object obj @@ -2464,17 +2463,8 @@ def register_pyfunction(func_name, arity, function_doc, in_types, out_type, call c_type = pyarrow_unwrap_data_type(out_type) c_callback = callback - # TODO: make sure to add a validation about args being a tuple - # the PyObject_CallObject(...) expects args to be a tuple - c_args = args #c_callback = udf cdef COutputType* c_out_type = new COutputType(c_type) cdef CUDFSynthesizer* c_udf_syn = new CUDFSynthesizer(c_func_name, c_arity, c_func_doc, c_in_types, deref(c_out_type)) - c_udf_syn.MakePyFunction(c_callback, c_args) - obj = c_callback - pargs = c_args - obj(pargs) - -cdef public void py_caller(py_function): - py_function() + c_udf_syn.MakePyFunction(c_callback) From bd3f1ce12b7e2b7e8fa93b57a5ec320838962038 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Mon, 21 Feb 2022 17:33:00 +0530 Subject: [PATCH 019/131] reformat --- python/examples/statistics/udf_example.py | 2 +- python/pyarrow/includes/libarrow.pxd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/examples/statistics/udf_example.py b/python/examples/statistics/udf_example.py index 8bce7a9f1a0..f7b91c303fe 100644 --- a/python/examples/statistics/udf_example.py +++ b/python/examples/statistics/udf_example.py @@ -53,4 +53,4 @@ def add_constant(array): a2 = pc.call_function(func_name, [pa.array([20])]) -print(a2) \ No newline at end of file +print(a2) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index d897c005f9a..defe5f84c22 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2710,5 +2710,5 @@ cdef extern from "arrow/python/udf.h" namespace "arrow::py" nogil: CUDFSynthesizer(c_string func_name, CArity arity, CFunctionDoc func_doc, vector[CInputType] in_types, COutputType out_type) CStatus MakeFunction() - CStatus MakePyFunction(PyObject* function, PyObject* args) + CStatus MakePyFunction(PyObject* function) From 9f5e865fb72204f2440efed6ed63ad435e4695bd Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 24 Feb 2022 10:21:31 +0530 Subject: [PATCH 020/131] minor changes --- python/examples/statistics/udf_example.py | 3 +++ python/pyarrow/_compute.pyx | 7 +++++++ python/pyarrow/compute.py | 1 + 3 files changed, 11 insertions(+) diff --git a/python/examples/statistics/udf_example.py b/python/examples/statistics/udf_example.py index f7b91c303fe..f82c35cdce4 100644 --- a/python/examples/statistics/udf_example.py +++ b/python/examples/statistics/udf_example.py @@ -10,7 +10,10 @@ func_doc["options_required"] = False arity = Arity.unary() func_name = "python_udf" +# TODO: evaluate this properly, the input type can be a record_batch, array or a table +# Caveat, a recordbatch or a table does not have type information. in_types = [InputType.array(pa.int64())] +# TODO: evaluate this properly, whether the output type can support table, array or recordbatch out_type = pa.int64() def py_function(arrow_array): diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 30abd269203..caab1385b17 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2468,3 +2468,10 @@ def register_pyfunction(func_name, arity, function_doc, in_types, out_type, call cdef CUDFSynthesizer* c_udf_syn = new CUDFSynthesizer(c_func_name, c_arity, c_func_doc, c_in_types, deref(c_out_type)) c_udf_syn.MakePyFunction(c_callback) + +def register_pyfunction2(func_name, func_doc, callable): + import inspect + signature = inspect.signature(callable) + return_annot = signature.return_annotation + parameter_vals = signature.parameters.values() + input_arrow_types = [val.annotation for val in parameter_vals] diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index ed30e780b39..20591929a39 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -80,6 +80,7 @@ _group_by, register_function, register_pyfunction, + register_pyfunction2, # Expressions Expression ) From 6d98c176740fab5a9452351815b2d536133fb2ed Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 25 Feb 2022 12:20:10 +0530 Subject: [PATCH 021/131] removing inspect func --- python/pyarrow/_compute.pyx | 7 ------- python/pyarrow/compute.py | 1 - 2 files changed, 8 deletions(-) diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index caab1385b17..30abd269203 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2468,10 +2468,3 @@ def register_pyfunction(func_name, arity, function_doc, in_types, out_type, call cdef CUDFSynthesizer* c_udf_syn = new CUDFSynthesizer(c_func_name, c_arity, c_func_doc, c_in_types, deref(c_out_type)) c_udf_syn.MakePyFunction(c_callback) - -def register_pyfunction2(func_name, func_doc, callable): - import inspect - signature = inspect.signature(callable) - return_annot = signature.return_annotation - parameter_vals = signature.parameters.values() - input_arrow_types = [val.annotation for val in parameter_vals] diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 20591929a39..ed30e780b39 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -80,7 +80,6 @@ _group_by, register_function, register_pyfunction, - register_pyfunction2, # Expressions Expression ) From b4a8fd3ac7bbcf0777a868879fa0e006629522f6 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 25 Feb 2022 13:11:55 +0530 Subject: [PATCH 022/131] cleaning up the current python API --- cpp/src/arrow/python/python_test.cc | 25 ----------- python/examples/statistics/udf_example.py | 6 +-- python/pyarrow/_compute.pyx | 54 +---------------------- python/pyarrow/compute.py | 1 - 4 files changed, 4 insertions(+), 82 deletions(-) diff --git a/cpp/src/arrow/python/python_test.cc b/cpp/src/arrow/python/python_test.cc index d3c755993c2..a1cd0b1e07d 100644 --- a/cpp/src/arrow/python/python_test.cc +++ b/cpp/src/arrow/python/python_test.cc @@ -658,31 +658,6 @@ TEST(UDF, Initialization) { "ExampleFunctionOptions"}; arrow::Status st; const std::string name = "x+x"; - auto func2 = - std::make_shared(name, cp::Arity::Unary(), &func_doc); - arrow::py::UDFScalarKernel kernel2({cp::InputType::Array(arrow::int32())}, - arrow::int32(), ExamplePyFunctionImpl); - - kernel2.mem_allocation = cp::MemAllocation::NO_PREALLOCATE; - - st = func2->AddKernel(std::move(kernel2)); - - auto registry = cp::GetFunctionRegistry(); - st = registry->AddFunction(std::move(func2)); - - arrow::Int32Builder builder(arrow::default_memory_pool()); - std::shared_ptr arr; - - st = builder.Append(42); - st = builder.Finish(&arr); - auto options = std::make_shared(); - - // auto func = registry->GetFunction("x+x").ValueOrDie(); - - auto maybe_result = cp::CallFunction(name, {arr}, options.get()); - st = maybe_result.status(); - - std::cout << "Result 1: " << maybe_result->make_array()->ToString() << std::endl; } } // namespace py diff --git a/python/examples/statistics/udf_example.py b/python/examples/statistics/udf_example.py index f82c35cdce4..4cdabe34b9a 100644 --- a/python/examples/statistics/udf_example.py +++ b/python/examples/statistics/udf_example.py @@ -1,6 +1,6 @@ import pyarrow as pa from pyarrow import compute as pc -from pyarrow.compute import call_function, register_pyfunction +from pyarrow.compute import call_function, register_function from pyarrow.compute import Arity, InputType func_doc = {} func_doc["summary"] = "summary" @@ -36,7 +36,7 @@ def add_constant(array): print("Example 1") print("=" * 80) callback = simple_function -register_pyfunction(func_name, arity, func_doc, in_types, out_type, callback) +register_function(func_name, arity, func_doc, in_types, out_type, callback) func1 = pc.get_function(func_name) @@ -50,7 +50,7 @@ def add_constant(array): print("=" * 80) callback = add_constant func_name = "py_add_func" -register_pyfunction(func_name, arity, func_doc, in_types, out_type, callback) +register_function(func_name, arity, func_doc, in_types, out_type, callback) func2 = pc.get_function(func_name) diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 30abd269203..86d48023027 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2381,59 +2381,7 @@ cdef CFunctionDoc _make_function_doc(func_doc): else: raise TypeError(f"func_doc must be a dictionary") -cdef object py_function = None - -def py_function(arrow_array): - p_new_array = call_function("add", [arrow_array, 1]) - return p_new_array - -cdef CStatus udf(self, CKernelContext* ctx, const CExecBatch& batch, CDatum* out) nogil: - cdef CDatum datum = batch.values[0] - cdef shared_ptr[CArrayData] array_data = datum.array() - cdef shared_ptr[CArray] c_array = MakeArray(array_data) - cdef shared_ptr[CArray] new_array - with gil: - p_array = pyarrow_wrap_array(c_array) - new_array = pyarrow_unwrap_array(py_function(p_array)) - cdef CDatum new_datum = CDatum(new_array) - out[0] = new_datum - return CStatus_OK() - def register_function(func_name, arity, function_doc, in_types, out_type, callback): - cdef: - c_string c_func_name - CArity c_arity - CFunctionDoc c_func_doc - CInputType in_tmp - vector[CInputType] c_in_types - ExecFunc c_callback - shared_ptr[CDataType] c_type - - if func_name and isinstance(func_name, str): - c_func_name = func_name.encode() - else: - raise ValueError("func_name should be str") - - if arity and isinstance(arity, Arity): - c_arity = ( arity).arity - else: - raise ValueError("arity must be an instance of Arity") - - c_func_doc = _make_function_doc(function_doc) - - if in_types and isinstance(in_types, list): - for in_type in in_types: - in_tmp = ( in_type).input_type - c_in_types.push_back(in_tmp) - - c_type = pyarrow_unwrap_data_type(out_type) - c_callback = udf - cdef COutputType* c_out_type = new COutputType(c_type) - cdef CUDFSynthesizer* c_udf_syn = new CUDFSynthesizer(c_func_name, - c_arity, c_func_doc, c_in_types, deref(c_out_type), c_callback) - c_udf_syn.MakeFunction() - -def register_pyfunction(func_name, arity, function_doc, in_types, out_type, callback): cdef: c_string c_func_name CArity c_arity @@ -2463,7 +2411,7 @@ def register_pyfunction(func_name, arity, function_doc, in_types, out_type, call c_type = pyarrow_unwrap_data_type(out_type) c_callback = callback - #c_callback = udf + cdef COutputType* c_out_type = new COutputType(c_type) cdef CUDFSynthesizer* c_udf_syn = new CUDFSynthesizer(c_func_name, c_arity, c_func_doc, c_in_types, deref(c_out_type)) diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index ed30e780b39..e0b010293fb 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -79,7 +79,6 @@ list_functions, _group_by, register_function, - register_pyfunction, # Expressions Expression ) From e13259e6b56e35d6167c7037afa24750b4013e0f Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 25 Feb 2022 13:59:48 +0530 Subject: [PATCH 023/131] cleaning up the current code --- cpp/examples/arrow/udf_example.cc | 71 -------------- cpp/src/arrow/python/udf.cc | 6 +- cpp/src/arrow/python/udf.h | 113 +--------------------- python/examples/statistics/udf_example.py | 2 + python/pyarrow/_compute.pyx | 2 +- python/pyarrow/includes/libarrow.pxd | 7 +- 6 files changed, 10 insertions(+), 191 deletions(-) diff --git a/cpp/examples/arrow/udf_example.cc b/cpp/examples/arrow/udf_example.cc index 1d3073fee55..6d127ec1afe 100644 --- a/cpp/examples/arrow/udf_example.cc +++ b/cpp/examples/arrow/udf_example.cc @@ -366,77 +366,6 @@ arrow::Status Execute() { return future.status(); } -// arrow::Status ExecuteSynth() { -// std::string func_name = "simple_func"; -// cp::Arity arity = cp::Arity::Unary(); -// const cp::FunctionDoc func_doc3{ -// "Example function to demonstrate registering an out-of-tree function", -// "", -// {"x"}, -// "ExampleFunctionOptions3"}; -// std::vector in_types = {cp::InputType::Array(arrow::int64())}; -// cp::OutputType out_type = arrow::int64(); -// PyObject* (*py_callback)(); -// py_callback = &SimpleFunction; -// arrow::py::UDFSynthesizer udf_sync(func_name, arity, func_doc3, in_types, out_type, -// py_callback); -// ABORT_ON_FAILURE(udf_sync.MakePyFunction()); - -// arrow::Int64Builder builder(arrow::default_memory_pool()); -// std::shared_ptr arr1, arr2; -// ABORT_ON_FAILURE(builder.Append(42)); -// ABORT_ON_FAILURE(builder.Finish(&arr1)); -// auto options = std::make_shared(); -// auto maybe_result = cp::CallFunction(func_name, {arr1}, options.get()); -// ABORT_ON_FAILURE(maybe_result.status()); - -// std::cout << "Result 1: " << maybe_result->make_array()->ToString() << std::endl; - -// return arrow::Status::OK(); -// } - -arrow::Status ExecutePy() { - cp::ExecContext exec_context(arrow::default_memory_pool(), - ::arrow::internal::GetCpuThreadPool()); - const std::string name = "simple_func"; - auto func2 = std::make_shared(name, cp::Arity::Unary(), - &func_doc2); - arrow::py::UDFScalarKernel kernel2({cp::InputType::Array(arrow::int64())}, - arrow::int64(), ExamplePyFunctionImpl); - - kernel2.mem_allocation = cp::MemAllocation::NO_PREALLOCATE; - ABORT_ON_FAILURE(func2->AddKernel(std::move(kernel2))); - - auto registry = cp::GetFunctionRegistry(); - - auto size_before_registration = registry->GetFunctionNames().size(); - - std::cout << "[Before] Func Reg Size: " << size_before_registration << ", " - << registry->num_functions() << std::endl; - - ABORT_ON_FAILURE(registry->AddFunction(std::move(func2))); - - auto size_after_registration = registry->GetFunctionNames().size(); - - std::cout << "[After] Func Reg Size: " << size_after_registration << ", " - << registry->num_functions() << std::endl; - - arrow::Int64Builder builder(arrow::default_memory_pool()); - std::shared_ptr arr; - ABORT_ON_FAILURE(builder.Append(42)); - ABORT_ON_FAILURE(builder.Finish(&arr)); - auto options = std::make_shared(); - - std::cout << "Calling function :" << arr->ToString() << std::endl; - - auto maybe_result = cp::CallFunction(name, {arr}, options.get()); - ABORT_ON_FAILURE(maybe_result.status()); - - std::cout << "Result 1: " << maybe_result->make_array()->ToString() << std::endl; - - return arrow::Status::OK(); -} - int main(int argc, char** argv) { auto status = Execute(); if (!status.ok()) { diff --git a/cpp/src/arrow/python/udf.cc b/cpp/src/arrow/python/udf.cc index c38b69ee432..17de2bfbd08 100644 --- a/cpp/src/arrow/python/udf.cc +++ b/cpp/src/arrow/python/udf.cc @@ -11,6 +11,10 @@ namespace cp = arrow::compute; namespace arrow { -namespace py {} // namespace py +namespace py { + + + +} // namespace py } // namespace arrow \ No newline at end of file diff --git a/cpp/src/arrow/python/udf.h b/cpp/src/arrow/python/udf.h index 91668eba531..4aac7b89c56 100644 --- a/cpp/src/arrow/python/udf.h +++ b/cpp/src/arrow/python/udf.h @@ -26,94 +26,8 @@ namespace arrow { namespace py { -// PyObject* CallUnaryTableUDF(PyObject* func, PyObject* arg1, std::shared_ptr
-// input); PyObject* CallUnaryArrayUDF(PyObject* func, PyObject* arg1, -// std::shared_ptr input); - -using UDFArrayKernelExec = std::function; - -struct UDFArrayKernel : public cp::Kernel { - UDFArrayKernel() = default; - - UDFArrayKernel(std::shared_ptr sig, UDFArrayKernelExec exec, - cp::KernelInit init = NULLPTR) - : cp::Kernel(std::move(sig), init), exec(std::move(exec)) {} - - UDFArrayKernel(std::vector in_types, cp::OutputType out_type, - UDFArrayKernelExec exec, cp::KernelInit init = NULLPTR) - : cp::Kernel(std::move(in_types), std::move(out_type), std::move(init)), - exec(std::move(exec)) {} - - /// \brief Perform a single invocation of this kernel. Depending on the - /// implementation, it may only write into preallocated memory, while in some - /// cases it will allocate its own memory. Any required state is managed - /// through the KernelContext. - UDFArrayKernelExec exec; - - /// \brief Writing execution results into larger contiguous allocations - /// requires that the kernel be able to write into sliced output ArrayData*, - /// including sliced output validity bitmaps. Some kernel implementations may - /// not be able to do this, so setting this to false disables this - /// functionality. - bool can_write_into_slices = true; -}; - -struct UDFScalarKernel : public UDFArrayKernel { - using UDFArrayKernel::UDFArrayKernel; - - // For scalar functions preallocated data and intersecting arg validity - // bitmaps is a reasonable default - cp::NullHandling::type null_handling = cp::NullHandling::INTERSECTION; - cp::MemAllocation::type mem_allocation = cp::MemAllocation::PREALLOCATE; -}; - -class ARROW_PYTHON_EXPORT UDFScalarFunction - : public cp::detail::FunctionImpl { - public: - using KernelType = UDFScalarKernel; - - UDFScalarFunction(std::string name, const cp::Arity& arity, const cp::FunctionDoc* doc, - const cp::FunctionOptions* default_options = NULLPTR) - : cp::detail::FunctionImpl(std::move(name), cp::Function::SCALAR, - arity, doc, default_options) {} - - /// \brief Add a kernel with given input/output types, no required state - /// initialization, preallocation for fixed-width types, and default null - /// handling (intersect validity bitmaps of inputs). - Status AddKernel(std::vector in_types, cp::OutputType out_type, - UDFArrayKernelExec exec, cp::KernelInit init = NULLPTR); - - /// \brief Add a kernel (function implementation). Returns error if the - /// kernel's signature does not match the function's arity. - Status AddKernel(UDFScalarKernel kernel) { - ARROW_RETURN_NOT_OK(CheckArity(kernel.signature->in_types())); - if (arity_.is_varargs && !kernel.signature->is_varargs()) { - return Status::Invalid("Function accepts varargs but kernel signature does not"); - } - kernels_.emplace_back(std::move(kernel)); - return Status::OK(); - } - - Status Hello1() { return Status::OK(); } - - Status Hello2(); -}; - -using KernelExec = - std::function; - class ARROW_PYTHON_EXPORT UDFSynthesizer { public: - UDFSynthesizer(std::string func_name, cp::Arity arity, cp::FunctionDoc func_doc, - std::vector in_types, cp::OutputType out_type, - Status (*callback)(cp::KernelContext*, const cp::ExecBatch&, Datum*)) - : func_name_(func_name), - arity_(arity), - func_doc_(func_doc), - in_types_(in_types), - out_type_(out_type), - callback_(callback) {} UDFSynthesizer(std::string func_name, cp::Arity arity, cp::FunctionDoc func_doc, std::vector in_types, cp::OutputType out_type) @@ -123,29 +37,10 @@ class ARROW_PYTHON_EXPORT UDFSynthesizer { in_types_(in_types), out_type_(out_type) {} - Status MakeFunction() { - Status st; - auto func = std::make_shared(func_name_, arity_, &func_doc_); - cp::ScalarKernel kernel(in_types_, out_type_, callback_); - kernel.mem_allocation = cp::MemAllocation::NO_PREALLOCATE; - st = func->AddKernel(std::move(kernel)); - if (!st.ok()) { - return Status::ExecutionError("Kernel couldn't be added to the udf"); - } - auto registry = cp::GetFunctionRegistry(); - st = registry->AddFunction(std::move(func)); - if (!st.ok()) { - return Status::ExecutionError("udf registration failed"); - } - return Status::OK(); - } - - Status MakePyFunction(PyObject* function) { + Status MakeFunction(PyObject* function) { Status st; auto func = std::make_shared(func_name_, arity_, &func_doc_); Py_XINCREF(function); - //double result = PyFloat_AsDouble(args); - //std::cout << "Make Function Args : " << result << std::endl; auto call_back_lambda = [function](cp::KernelContext* ctx, const cp::ExecBatch& batch, Datum* out) { PyGILState_STATE state = PyGILState_Ensure(); @@ -157,7 +52,6 @@ class ARROW_PYTHON_EXPORT UDFSynthesizer { int res = PyCallable_Check(function); if (res == 1) { - std::cout << "This is a PyCallback" << std::endl; auto c_array = batch[0].make_array(); PyObject* py_array = wrap_array(c_array); PyObject* arg_tuple = PyTuple_Pack(1, py_array); @@ -202,11 +96,6 @@ class ARROW_PYTHON_EXPORT UDFSynthesizer { cp::FunctionDoc func_doc_; std::vector in_types_; cp::OutputType out_type_; - // KernelExec kernel_exec_; - Status (*callback_)(cp::KernelContext*, const cp::ExecBatch&, Datum*); - // C++ way - //PyObject* (*py_call_back_)(); - // Python way }; } // namespace py diff --git a/python/examples/statistics/udf_example.py b/python/examples/statistics/udf_example.py index 4cdabe34b9a..40255d8d24e 100644 --- a/python/examples/statistics/udf_example.py +++ b/python/examples/statistics/udf_example.py @@ -57,3 +57,5 @@ def add_constant(array): a2 = pc.call_function(func_name, [pa.array([20])]) print(a2) + + diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 86d48023027..99da8449d7d 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2415,4 +2415,4 @@ def register_function(func_name, arity, function_doc, in_types, out_type, callba cdef COutputType* c_out_type = new COutputType(c_type) cdef CUDFSynthesizer* c_udf_syn = new CUDFSynthesizer(c_func_name, c_arity, c_func_doc, c_in_types, deref(c_out_type)) - c_udf_syn.MakePyFunction(c_callback) + c_udf_syn.MakeFunction(c_callback) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index defe5f84c22..08b1bc05f3f 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2700,15 +2700,10 @@ cdef extern from "arrow/util/byte_size.h" namespace "arrow::util" nogil: int64_t TotalBufferSize(const CRecordBatch& record_batch) int64_t TotalBufferSize(const CTable& table) -ctypedef CStatus(*ExecFunc)(CKernelContext*, const CExecBatch&, CDatum*) - cdef extern from "arrow/python/udf.h" namespace "arrow::py" nogil: # TODO: determine a better name. This may be confused for a cudf util cdef cppclass CUDFSynthesizer "arrow::py::UDFSynthesizer": - CUDFSynthesizer(c_string func_name, CArity arity, CFunctionDoc func_doc, - vector[CInputType] in_types, COutputType out_type, ExecFunc) CUDFSynthesizer(c_string func_name, CArity arity, CFunctionDoc func_doc, vector[CInputType] in_types, COutputType out_type) - CStatus MakeFunction() - CStatus MakePyFunction(PyObject* function) + CStatus MakeFunction(PyObject* function) From 541fffd063a9c169d2d2be097a277a70c312ecce Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 25 Feb 2022 16:18:05 +0530 Subject: [PATCH 024/131] temp checkin --- cpp/examples/arrow/udf_example.cc | 85 ++++++++++++++++++++++++++++++- cpp/src/arrow/python/udf.cc | 56 +++++++++++++++++++- cpp/src/arrow/python/udf.h | 54 +------------------- 3 files changed, 140 insertions(+), 55 deletions(-) diff --git a/cpp/examples/arrow/udf_example.cc b/cpp/examples/arrow/udf_example.cc index 6d127ec1afe..06576a360f5 100644 --- a/cpp/examples/arrow/udf_example.cc +++ b/cpp/examples/arrow/udf_example.cc @@ -366,8 +366,91 @@ arrow::Status Execute() { return future.status(); } +arrow::Status ExecuteVar() { + const std::string name = "x+x"; + auto func = std::make_shared(name, cp::Arity::Unary(), &func_doc2); + auto exec_func = [](cp::KernelContext* ctx, const cp::ExecBatch& batch, + arrow::Datum* out) -> arrow::Status { + auto tb = batch[0].table(); + std::cout << "Batch as Table " << std::endl; + std::cout << tb->num_columns() << std::endl; + *out->mutable_array() = *batch[0].array(); + return arrow::Status::OK(); + }; + cp::ScalarKernel kernel({cp::InputType::Array(arrow::int64())}, arrow::int64(), + exec_func); + + kernel.mem_allocation = cp::MemAllocation::NO_PREALLOCATE; + + ABORT_ON_FAILURE(func->AddKernel(std::move(kernel))); + + auto registry = cp::GetFunctionRegistry(); + ABORT_ON_FAILURE(registry->AddFunction(std::move(func))); + + arrow::Int64Builder builder(arrow::default_memory_pool()); + std::shared_ptr arr1, arr2; + ABORT_ON_FAILURE(builder.Append(42)); + ABORT_ON_FAILURE(builder.Finish(&arr1)); + ABORT_ON_FAILURE(builder.Append(58)); + ABORT_ON_FAILURE(builder.Finish(&arr2)); + auto options = std::make_shared(); + auto maybe_result = cp::CallFunction(name, {arr1}, options.get()); + ABORT_ON_FAILURE(maybe_result.status()); + + std::cout << "Result 1: " << maybe_result->make_array()->ToString() << std::endl; + + // Expression serialization will raise NotImplemented if an expression includes + // FunctionOptions for which serialization is not supported. + // auto expr = cp::call(name, {}, options); + // auto maybe_serialized = cp::Serialize(expr); + // std::cerr << maybe_serialized.status().ToString() << std::endl; + + auto exec_registry = cp::default_exec_factory_registry(); + ABORT_ON_FAILURE( + exec_registry->AddFactory("compute_register_example", ExampleExecNodeFactory)); + + auto maybe_plan = cp::ExecPlan::Make(); + ABORT_ON_FAILURE(maybe_plan.status()); + auto plan = maybe_plan.ValueOrDie(); + cp::ExecContext exec_context(arrow::default_memory_pool(), + ::arrow::internal::GetCpuThreadPool()); + arrow::AsyncGenerator> source_gen, sink_gen; + ARROW_ASSIGN_OR_RAISE(auto basic_data, MakeBasicBatches()); + + cp::Expression a_times_10 = cp::call("multiply", {cp::field_ref("a"), cp::literal(10)}); + cp::Expression custom_exp = cp::call(name, {cp::field_ref("a")}, options); + + auto source_node_options = cp::SourceNodeOptions{basic_data.schema, basic_data.gen()}; + auto project_node_options = cp::ProjectNodeOptions{{ + cp::field_ref("a"), + custom_exp, + cp::field_ref("b"), + }}; + auto output_schema = arrow::schema({arrow::field("a", arrow::int64()), + arrow::field("a + a", arrow::int64()), + arrow::field("b", arrow::boolean())}); + std::shared_ptr out; + ABORT_ON_FAILURE(cp::Declaration::Sequence( + { + {"source", source_node_options}, + {"project", project_node_options}, + {"table_sink", cp::TableSinkNodeOptions{&out, output_schema}}, + }) + .AddToPlan(plan.get()) + .status()); + + ARROW_RETURN_NOT_OK(plan->StartProducing()); + + std::cout << "Output Table Data : " << std::endl; + std::cout << out->ToString() << std::endl; + + auto future = plan->finished(); + + return future.status(); +} + int main(int argc, char** argv) { - auto status = Execute(); + auto status = ExecuteVar(); if (!status.ok()) { std::cerr << "Error occurred : " << status.message() << std::endl; return EXIT_FAILURE; diff --git a/cpp/src/arrow/python/udf.cc b/cpp/src/arrow/python/udf.cc index 17de2bfbd08..4729245f30b 100644 --- a/cpp/src/arrow/python/udf.cc +++ b/cpp/src/arrow/python/udf.cc @@ -13,7 +13,61 @@ namespace arrow { namespace py { - +Status UDFSynthesizer::MakeFunction(PyObject* function) { + Status st; + auto func = std::make_shared(func_name_, arity_, &func_doc_); + Py_XINCREF(function); + auto call_back_lambda = [function](cp::KernelContext* ctx, const cp::ExecBatch& batch, + Datum* out) -> Status { + PyAcquireGIL lock; + PyObject* py_array = NULLPTR; + PyObject* arg_tuple = NULLPTR; + PyObject* result = NULLPTR; + std::shared_ptr c_res_array; + if (function == NULL) { + return Status::ExecutionError("python function cannot be null"); + } + + if (PyCallable_Check(function)) { + auto c_array = batch[0].make_array(); + Py_XINCREF(py_array); + Py_XINCREF(arg_tuple); + Py_XINCREF(result); + py_array = wrap_array(c_array); + arg_tuple = PyTuple_Pack(1, py_array); + result = PyObject_CallObject(function, arg_tuple); + Py_XDECREF(function); + if (result == NULL) { + return Status::ExecutionError("Error occured in computation"); + } + auto res = unwrap_array(result); + if (!res.status().ok()) { + return res.status(); + } + c_res_array = res.ValueOrDie(); + Py_XDECREF(py_array); + Py_XDECREF(arg_tuple); + Py_XDECREF(result); + } else { + return Status::ExecutionError("Error occured in computation"); + } + auto datum = new Datum(c_res_array); + *out->mutable_array() = *datum->array(); + return Status::OK(); + }; + cp::ScalarKernel kernel(in_types_, out_type_, call_back_lambda); + kernel.mem_allocation = cp::MemAllocation::NO_PREALLOCATE; + st = func->AddKernel(std::move(kernel)); + if (!st.ok()) { + return Status::ExecutionError("Kernel couldn't be added to the udf"); + } + auto registry = cp::GetFunctionRegistry(); + st = registry->AddFunction(std::move(func)); + if (!st.ok()) { + return Status::ExecutionError("udf registration failed"); + } + return Status::OK(); +} } // namespace py diff --git a/cpp/src/arrow/python/udf.h b/cpp/src/arrow/python/udf.h index 4aac7b89c56..07ed47d23ff 100644 --- a/cpp/src/arrow/python/udf.h +++ b/cpp/src/arrow/python/udf.h @@ -28,7 +28,6 @@ namespace py { class ARROW_PYTHON_EXPORT UDFSynthesizer { public: - UDFSynthesizer(std::string func_name, cp::Arity arity, cp::FunctionDoc func_doc, std::vector in_types, cp::OutputType out_type) : func_name_(func_name), @@ -37,58 +36,7 @@ class ARROW_PYTHON_EXPORT UDFSynthesizer { in_types_(in_types), out_type_(out_type) {} - Status MakeFunction(PyObject* function) { - Status st; - auto func = std::make_shared(func_name_, arity_, &func_doc_); - Py_XINCREF(function); - auto call_back_lambda = [function](cp::KernelContext* ctx, const cp::ExecBatch& batch, - Datum* out) { - PyGILState_STATE state = PyGILState_Ensure(); - std::shared_ptr c_res_array; - if (function == NULL) { - PyGILState_Release(state); - return Status::ExecutionError("python function cannot be null"); - } - - int res = PyCallable_Check(function); - if (res == 1) { - auto c_array = batch[0].make_array(); - PyObject* py_array = wrap_array(c_array); - PyObject* arg_tuple = PyTuple_Pack(1, py_array); - PyObject *result = PyObject_CallObject(function, arg_tuple); - Py_DECREF(function); - if (result == NULL) { - PyGILState_Release(state); - return Status::ExecutionError("Error occured in computation"); - } - auto res = unwrap_array(result); - if(!res.status().ok()) { - PyGILState_Release(state); - return res.status(); - } - c_res_array = res.ValueOrDie(); - } else { - PyErr_Print(); - return Status::ExecutionError("Error occured in computation"); - } - auto datum = new Datum(c_res_array); - *out->mutable_array() = *datum->array(); - PyGILState_Release(state); - return Status::OK(); - }; - cp::ScalarKernel kernel(in_types_, out_type_, call_back_lambda); - kernel.mem_allocation = cp::MemAllocation::NO_PREALLOCATE; - st = func->AddKernel(std::move(kernel)); - if (!st.ok()) { - return Status::ExecutionError("Kernel couldn't be added to the udf"); - } - auto registry = cp::GetFunctionRegistry(); - st = registry->AddFunction(std::move(func)); - if (!st.ok()) { - return Status::ExecutionError("udf registration failed"); - } - return Status::OK(); - } + Status MakeFunction(PyObject* function); private: std::string func_name_; From 82b71b78375d381fea1b69deddf46eec036b9da4 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 3 Mar 2022 07:32:41 +0530 Subject: [PATCH 025/131] minor changes --- cpp/examples/arrow/CMakeLists.txt | 7 +- cpp/examples/arrow/aggregate_example.cc | 138 ++++++++++++++++++++++++ cpp/examples/arrow/udf_example.cc | 44 ++++---- cpp/src/arrow/python/udf.h | 4 + 4 files changed, 170 insertions(+), 23 deletions(-) create mode 100644 cpp/examples/arrow/aggregate_example.cc diff --git a/cpp/examples/arrow/CMakeLists.txt b/cpp/examples/arrow/CMakeLists.txt index 57c35112665..503bc3266a4 100644 --- a/cpp/examples/arrow/CMakeLists.txt +++ b/cpp/examples/arrow/CMakeLists.txt @@ -136,12 +136,15 @@ if(ARROW_PARQUET AND ARROW_DATASET) add_dependencies(execution-plan-documentation-examples parquet) add_arrow_example(join_example EXTRA_LINK_LIBS ${DATASET_EXAMPLES_LINK_LIBS}) - add_dependencies(join-example parquet) - add_dependencies(dataset_documentation_example parquet) + add_dependencies(join_example parquet) add_arrow_example(udf_example EXTRA_LINK_LIBS ${DATASET_EXAMPLES_LINK_LIBS} ${PYTHON_LIBRARIES} ${PYTHON_OTHER_LIBS}) add_dependencies(udf_example parquet arrow_python) + + add_arrow_example(aggregate_example EXTRA_LINK_LIBS ${DATASET_EXAMPLES_LINK_LIBS}) + add_dependencies(aggregate_example parquet) + endif() message("PYTHON_INCLUDE_DIRS : ${PYTHON_INCLUDE_DIRS}") diff --git a/cpp/examples/arrow/aggregate_example.cc b/cpp/examples/arrow/aggregate_example.cc new file mode 100644 index 00000000000..793b760d6df --- /dev/null +++ b/cpp/examples/arrow/aggregate_example.cc @@ -0,0 +1,138 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This example showcases various ways to work with Datasets. It's +// intended to be paired with the documentation. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace cp = arrow::compute; + +#define ABORT_ON_FAILURE(expr) \ + do { \ + arrow::Status status_ = (expr); \ + if (!status_.ok()) { \ + std::cerr << status_.message() << std::endl; \ + abort(); \ + } \ + } while (0); + +template ::value | + arrow::is_boolean_type::value | + arrow::is_temporal_type::value>::type> +arrow::Result> GetArrayDataSample( + const std::vector& values) { + using ARROW_ARRAY_TYPE = typename arrow::TypeTraits::ArrayType; + using ARROW_BUILDER_TYPE = typename arrow::TypeTraits::BuilderType; + ARROW_BUILDER_TYPE builder; + ARROW_RETURN_NOT_OK(builder.Reserve(values.size())); + std::shared_ptr array; + ARROW_RETURN_NOT_OK(builder.AppendValues(values)); + ARROW_RETURN_NOT_OK(builder.Finish(&array)); + return array; +} + +arrow::Result> GetTable() { + std::shared_ptr table; + + auto field_vector = {arrow::field("a", arrow::int64()), + arrow::field("b", arrow::boolean()), + arrow::field("c", arrow::int64())}; + ARROW_ASSIGN_OR_RAISE(auto int_array, + GetArrayDataSample({0, 1, 2, 0, 4, 1, 0, 5})); + ARROW_ASSIGN_OR_RAISE(auto bool_array, GetArrayDataSample( + {false, true, false, true, true, false, true, false})); + ARROW_ASSIGN_OR_RAISE(auto data_array, + GetArrayDataSample({10, 11, 12, 10, 11, 11, 10, 15})); + + auto schema = arrow::schema(field_vector); + auto data_vector = {int_array, bool_array, data_array}; + + table = arrow::Table::Make(schema, data_vector, 8); + + return table; +} + +arrow::Status DoAggregate() { + auto maybe_plan = cp::ExecPlan::Make(); + ABORT_ON_FAILURE(maybe_plan.status()); + auto plan = maybe_plan.ValueOrDie(); + cp::ExecContext exec_context(arrow::default_memory_pool(), + ::arrow::internal::GetCpuThreadPool()); + + ARROW_ASSIGN_OR_RAISE(auto table, GetTable()); + + std::cout << "Source Table" << std::endl; + + std::cout << table->ToString() << std::endl; + + std::shared_ptr out; + cp::CountOptions options(cp::CountOptions::ONLY_VALID); + auto aggregate_options = + cp::AggregateNodeOptions{/*aggregates=*/{{"sum", &options}}, + /*targets=*/{"c"}, + /*names=*/{"count(c)"}, + /*keys=*/{}}; + auto schema = arrow::schema({ + arrow::field("count(c)", arrow::int64()) + //arrow::field("a", arrow::int64()) + }); + + ABORT_ON_FAILURE(cp::Declaration::Sequence( + { + {"table_source", cp::TableSourceNodeOptions{table, 2}}, + {"aggregate", aggregate_options}, + {"table_sink", cp::TableSinkNodeOptions{&out, schema}}, + }) + .AddToPlan(plan.get()) + .status()); + + ARROW_RETURN_NOT_OK(plan->StartProducing()); + + std::cout << "Output Table Data : " << std::endl; + std::cout << out->ToString() << std::endl; + + auto future = plan->finished(); + + return future.status(); +} + +int main(int argc, char** argv) { + auto status = DoAggregate(); + if (!status.ok()) { + std::cerr << "Error occurred: " << status.message() << std::endl; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; +} diff --git a/cpp/examples/arrow/udf_example.cc b/cpp/examples/arrow/udf_example.cc index 06576a360f5..4ad31a5a761 100644 --- a/cpp/examples/arrow/udf_example.cc +++ b/cpp/examples/arrow/udf_example.cc @@ -123,6 +123,24 @@ arrow::Result MakeBasicBatches() { return out; } +arrow::Result> GetTable() { + std::shared_ptr table; + + auto field_vector = {arrow::field("a", arrow::int64()), + arrow::field("b", arrow::boolean())}; + ARROW_ASSIGN_OR_RAISE(auto int_array, + GetArrayDataSample({0, 4, 10, 20, 30})); + ARROW_ASSIGN_OR_RAISE(auto bool_array, GetArrayDataSample( + {false, true, false, true, true})); + + auto schema = arrow::schema(field_vector); + auto data_vector = {int_array, bool_array}; + + table = arrow::Table::Make(schema, data_vector, 5); + + return table; +} + class ExampleFunctionOptionsType : public cp::FunctionOptionsType { const char* type_name() const override { return "ExampleFunctionOptionsType"; } std::string Stringify(const cp::FunctionOptions&) const override { @@ -371,12 +389,11 @@ arrow::Status ExecuteVar() { auto func = std::make_shared(name, cp::Arity::Unary(), &func_doc2); auto exec_func = [](cp::KernelContext* ctx, const cp::ExecBatch& batch, arrow::Datum* out) -> arrow::Status { - auto tb = batch[0].table(); std::cout << "Batch as Table " << std::endl; - std::cout << tb->num_columns() << std::endl; *out->mutable_array() = *batch[0].array(); return arrow::Status::OK(); }; + auto options = std::make_shared(); cp::ScalarKernel kernel({cp::InputType::Array(arrow::int64())}, arrow::int64(), exec_func); @@ -387,24 +404,6 @@ arrow::Status ExecuteVar() { auto registry = cp::GetFunctionRegistry(); ABORT_ON_FAILURE(registry->AddFunction(std::move(func))); - arrow::Int64Builder builder(arrow::default_memory_pool()); - std::shared_ptr arr1, arr2; - ABORT_ON_FAILURE(builder.Append(42)); - ABORT_ON_FAILURE(builder.Finish(&arr1)); - ABORT_ON_FAILURE(builder.Append(58)); - ABORT_ON_FAILURE(builder.Finish(&arr2)); - auto options = std::make_shared(); - auto maybe_result = cp::CallFunction(name, {arr1}, options.get()); - ABORT_ON_FAILURE(maybe_result.status()); - - std::cout << "Result 1: " << maybe_result->make_array()->ToString() << std::endl; - - // Expression serialization will raise NotImplemented if an expression includes - // FunctionOptions for which serialization is not supported. - // auto expr = cp::call(name, {}, options); - // auto maybe_serialized = cp::Serialize(expr); - // std::cerr << maybe_serialized.status().ToString() << std::endl; - auto exec_registry = cp::default_exec_factory_registry(); ABORT_ON_FAILURE( exec_registry->AddFactory("compute_register_example", ExampleExecNodeFactory)); @@ -421,6 +420,8 @@ arrow::Status ExecuteVar() { cp::Expression custom_exp = cp::call(name, {cp::field_ref("a")}, options); auto source_node_options = cp::SourceNodeOptions{basic_data.schema, basic_data.gen()}; + ARROW_ASSIGN_OR_RAISE(auto table, GetTable()); + auto table_source_node_options = cp::TableSourceNodeOptions{table, 2}; auto project_node_options = cp::ProjectNodeOptions{{ cp::field_ref("a"), custom_exp, @@ -432,7 +433,7 @@ arrow::Status ExecuteVar() { std::shared_ptr out; ABORT_ON_FAILURE(cp::Declaration::Sequence( { - {"source", source_node_options}, + {"table_source", table_source_node_options}, {"project", project_node_options}, {"table_sink", cp::TableSinkNodeOptions{&out, output_schema}}, }) @@ -449,6 +450,7 @@ arrow::Status ExecuteVar() { return future.status(); } + int main(int argc, char** argv) { auto status = ExecuteVar(); if (!status.ok()) { diff --git a/cpp/src/arrow/python/udf.h b/cpp/src/arrow/python/udf.h index 07ed47d23ff..6d360ffcce1 100644 --- a/cpp/src/arrow/python/udf.h +++ b/cpp/src/arrow/python/udf.h @@ -46,6 +46,10 @@ class ARROW_PYTHON_EXPORT UDFSynthesizer { cp::OutputType out_type_; }; +class ARROW_PYTHON_EXPORT UDFScalarAggregator{ + +}; + } // namespace py } // namespace arrow \ No newline at end of file From c31ccc1b1c5d640208b7b5d6943cb79623715e5b Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 4 Mar 2022 13:51:37 +0530 Subject: [PATCH 026/131] updating cmakelist --- cpp/examples/arrow/CMakeLists.txt | 8 -------- 1 file changed, 8 deletions(-) diff --git a/cpp/examples/arrow/CMakeLists.txt b/cpp/examples/arrow/CMakeLists.txt index 503bc3266a4..b265dc50093 100644 --- a/cpp/examples/arrow/CMakeLists.txt +++ b/cpp/examples/arrow/CMakeLists.txt @@ -146,11 +146,3 @@ if(ARROW_PARQUET AND ARROW_DATASET) add_dependencies(aggregate_example parquet) endif() - -message("PYTHON_INCLUDE_DIRS : ${PYTHON_INCLUDE_DIRS}") - -message("PYTHON_OTHER_LIBS : ${PYTHON_OTHER_LIBS}") - -message("PYTHON_LIBRARIES : ${PYTHON_LIBRARIES}") - - From 92caca74db735cbe999b9e61cc309afb3f23127e Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 4 Mar 2022 14:54:54 +0530 Subject: [PATCH 027/131] updating cmakelist(examples) --- cpp/examples/arrow/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/examples/arrow/CMakeLists.txt b/cpp/examples/arrow/CMakeLists.txt index b265dc50093..b3232e9735e 100644 --- a/cpp/examples/arrow/CMakeLists.txt +++ b/cpp/examples/arrow/CMakeLists.txt @@ -136,13 +136,13 @@ if(ARROW_PARQUET AND ARROW_DATASET) add_dependencies(execution-plan-documentation-examples parquet) add_arrow_example(join_example EXTRA_LINK_LIBS ${DATASET_EXAMPLES_LINK_LIBS}) - add_dependencies(join_example parquet) + add_dependencies(join-example parquet) add_arrow_example(udf_example EXTRA_LINK_LIBS ${DATASET_EXAMPLES_LINK_LIBS} ${PYTHON_LIBRARIES} ${PYTHON_OTHER_LIBS}) - add_dependencies(udf_example parquet arrow_python) + add_dependencies(udf-example parquet arrow_python) add_arrow_example(aggregate_example EXTRA_LINK_LIBS ${DATASET_EXAMPLES_LINK_LIBS}) - add_dependencies(aggregate_example parquet) + add_dependencies(aggregate-example parquet) endif() From 1ff043f60516e3cd0a60675a373286a203754373 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 4 Mar 2022 18:56:04 +0530 Subject: [PATCH 028/131] minor fix for python --- cpp/examples/arrow/aggregate_example.cc | 1 - cpp/src/arrow/python/CMakeLists.txt | 3 +- cpp/src/arrow/python/udf.cc | 115 +++++++++++++----------- cpp/src/arrow/python/udf.h | 57 +++++++----- python/pyarrow/_compute.pyx | 6 +- python/pyarrow/includes/libarrow.pxd | 14 +-- python/setup.py | 3 + 7 files changed, 110 insertions(+), 89 deletions(-) diff --git a/cpp/examples/arrow/aggregate_example.cc b/cpp/examples/arrow/aggregate_example.cc index 793b760d6df..38f8cb0e97c 100644 --- a/cpp/examples/arrow/aggregate_example.cc +++ b/cpp/examples/arrow/aggregate_example.cc @@ -106,7 +106,6 @@ arrow::Status DoAggregate() { /*keys=*/{}}; auto schema = arrow::schema({ arrow::field("count(c)", arrow::int64()) - //arrow::field("a", arrow::int64()) }); ABORT_ON_FAILURE(cp::Declaration::Sequence( diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index b75fee4a25e..7235e2d0fe3 100644 --- a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -45,8 +45,7 @@ set(ARROW_PYTHON_SRCS python_to_arrow.cc pyarrow.cc serialize.cc - udf.cc - udf.h) + udf.cc) set_source_files_properties(init.cc PROPERTIES SKIP_PRECOMPILE_HEADERS ON SKIP_UNITY_BUILD_INCLUSION ON) diff --git a/cpp/src/arrow/python/udf.cc b/cpp/src/arrow/python/udf.cc index 4729245f30b..8520da41484 100644 --- a/cpp/src/arrow/python/udf.cc +++ b/cpp/src/arrow/python/udf.cc @@ -13,61 +13,68 @@ namespace arrow { namespace py { -Status UDFSynthesizer::MakeFunction(PyObject* function) { - Status st; - auto func = std::make_shared(func_name_, arity_, &func_doc_); - Py_XINCREF(function); - auto call_back_lambda = [function](cp::KernelContext* ctx, const cp::ExecBatch& batch, - Datum* out) -> Status { - PyAcquireGIL lock; - PyObject* py_array = NULLPTR; - PyObject* arg_tuple = NULLPTR; - PyObject* result = NULLPTR; - std::shared_ptr c_res_array; - if (function == NULL) { - return Status::ExecutionError("python function cannot be null"); - } +// Status ScalarUDFBuilder::MakeFunction() { +// Status st; +// auto func = std::make_shared(this->func_name_, this->arity_, &this->func_doc_); +// Py_XINCREF(this->function_); +// PyObject* function = this->function_; +// // lambda function +// auto call_back_lambda = [function](cp::KernelContext* ctx, const cp::ExecBatch& batch, +// Datum* out) -> Status { +// PyAcquireGIL lock; +// PyObject* py_array = NULLPTR; +// PyObject* arg_tuple = NULLPTR; +// PyObject* result = NULLPTR; +// std::shared_ptr c_res_array; +// if (function == NULL) { +// return Status::ExecutionError("python function cannot be null"); +// } - if (PyCallable_Check(function)) { - auto c_array = batch[0].make_array(); - Py_XINCREF(py_array); - Py_XINCREF(arg_tuple); - Py_XINCREF(result); - py_array = wrap_array(c_array); - arg_tuple = PyTuple_Pack(1, py_array); - result = PyObject_CallObject(function, arg_tuple); - Py_XDECREF(function); - if (result == NULL) { - return Status::ExecutionError("Error occured in computation"); - } - auto res = unwrap_array(result); - if (!res.status().ok()) { - return res.status(); - } - c_res_array = res.ValueOrDie(); - Py_XDECREF(py_array); - Py_XDECREF(arg_tuple); - Py_XDECREF(result); - } else { - return Status::ExecutionError("Error occured in computation"); - } - auto datum = new Datum(c_res_array); - *out->mutable_array() = *datum->array(); - return Status::OK(); - }; - cp::ScalarKernel kernel(in_types_, out_type_, call_back_lambda); - kernel.mem_allocation = cp::MemAllocation::NO_PREALLOCATE; - st = func->AddKernel(std::move(kernel)); - if (!st.ok()) { - return Status::ExecutionError("Kernel couldn't be added to the udf"); - } - auto registry = cp::GetFunctionRegistry(); - st = registry->AddFunction(std::move(func)); - if (!st.ok()) { - return Status::ExecutionError("udf registration failed"); - } - return Status::OK(); -} +// if (PyCallable_Check(function)) { +// //if the batch is an array +// auto c_array = batch[0].make_array(); +// Py_XINCREF(py_array); +// Py_XINCREF(arg_tuple); +// Py_XINCREF(result); +// py_array = wrap_array(c_array); +// arg_tuple = PyTuple_Pack(1, py_array); +// result = PyObject_CallObject(function, arg_tuple); +// Py_XDECREF(function); +// if (result == NULL) { +// return Status::ExecutionError("Error occured in computation"); +// } +// auto res = unwrap_array(result); +// if (!res.status().ok()) { +// return res.status(); +// } +// c_res_array = res.ValueOrDie(); +// Py_XDECREF(py_array); +// Py_XDECREF(arg_tuple); +// Py_XDECREF(result); +// Py_XDECREF(function); +// //if the batch is a scalar + +// } else { +// return Status::ExecutionError("Error occured in computation"); +// } +// auto datum = new Datum(c_res_array); +// *out->mutable_array() = *datum->array(); +// return Status::OK(); +// }; // lambda function + +// cp::ScalarKernel kernel(this->in_types_, this->out_type_, call_back_lambda); +// kernel.mem_allocation = cp::MemAllocation::NO_PREALLOCATE; +// st = func->AddKernel(std::move(kernel)); +// if (!st.ok()) { +// return Status::ExecutionError("Kernel couldn't be added to the udf"); +// } +// auto registry = cp::GetFunctionRegistry(); +// st = registry->AddFunction(std::move(func)); +// if (!st.ok()) { +// return Status::ExecutionError("udf registration failed"); +// } +// return Status::OK(); +// } } // namespace py diff --git a/cpp/src/arrow/python/udf.h b/cpp/src/arrow/python/udf.h index 6d360ffcce1..7275174b3e0 100644 --- a/cpp/src/arrow/python/udf.h +++ b/cpp/src/arrow/python/udf.h @@ -26,29 +26,40 @@ namespace arrow { namespace py { -class ARROW_PYTHON_EXPORT UDFSynthesizer { - public: - UDFSynthesizer(std::string func_name, cp::Arity arity, cp::FunctionDoc func_doc, - std::vector in_types, cp::OutputType out_type) - : func_name_(func_name), - arity_(arity), - func_doc_(func_doc), - in_types_(in_types), - out_type_(out_type) {} - - Status MakeFunction(PyObject* function); - - private: - std::string func_name_; - cp::Arity arity_; - cp::FunctionDoc func_doc_; - std::vector in_types_; - cp::OutputType out_type_; -}; - -class ARROW_PYTHON_EXPORT UDFScalarAggregator{ - -}; +// class ARROW_PYTHON_EXPORT UDFBuilder { +// public: +// virtual Status MakeFunction(); + +// protected: +// UDFBuilder(std::string func_name, cp::Function::Kind kind, cp::Arity arity, cp::FunctionDoc func_doc, +// std::vector in_types, cp::OutputType out_type) +// : func_name_(func_name), +// kind_(kind), +// arity_(arity), +// func_doc_(func_doc), +// in_types_(in_types), +// out_type_(out_type) {} + +// std::string func_name_; +// cp::Function::Kind kind_; +// cp::Arity arity_; +// cp::FunctionDoc func_doc_; +// std::vector in_types_; +// cp::OutputType out_type_; +// }; + +// class ARROW_PYTHON_EXPORT ScalarUDFBuilder : public UDFBuilder{ +// public: +// ScalarUDFBuilder(std::string func_name, cp::Arity arity, cp::FunctionDoc func_doc, +// std::vector in_types, cp::OutputType out_type, PyObject* function) +// : UDFBuilder(func_name, cp::Function::SCALAR, arity, func_doc, in_types, out_type), function_(function) {} + +// Status MakeFunction() override; + +// private: +// PyObject* function_; + +// }; } // namespace py diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 99da8449d7d..71fd7872121 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2413,6 +2413,6 @@ def register_function(func_name, arity, function_doc, in_types, out_type, callba c_callback = callback cdef COutputType* c_out_type = new COutputType(c_type) - cdef CUDFSynthesizer* c_udf_syn = new CUDFSynthesizer(c_func_name, - c_arity, c_func_doc, c_in_types, deref(c_out_type)) - c_udf_syn.MakeFunction(c_callback) + # cdef CScalarUDFBuilder* c_sc_builder = new CScalarUDFBuilder(c_func_name, + # c_arity, c_func_doc, c_in_types, deref(c_out_type), c_callback) + # c_sc_builder.MakeFunction() diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 08b1bc05f3f..02e07dafb95 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2700,10 +2700,12 @@ cdef extern from "arrow/util/byte_size.h" namespace "arrow::util" nogil: int64_t TotalBufferSize(const CRecordBatch& record_batch) int64_t TotalBufferSize(const CTable& table) -cdef extern from "arrow/python/udf.h" namespace "arrow::py" nogil: - # TODO: determine a better name. This may be confused for a cudf util - cdef cppclass CUDFSynthesizer "arrow::py::UDFSynthesizer": - CUDFSynthesizer(c_string func_name, CArity arity, CFunctionDoc func_doc, - vector[CInputType] in_types, COutputType out_type) - CStatus MakeFunction(PyObject* function) +# cdef extern from "arrow/python/udf.h" namespace "arrow::py" nogil: +# cdef cppclass CUDFBuilder "arrow::py::UDFBuilder": +# CUDFBuilder(c_string func_name, CArity arity, CFunctionDoc func_doc, +# vector[CInputType] in_types, COutputType out_type) + # cdef cppclass CScalarUDFBuilder "arrow::py::ScalarUDFBuilder"(CUDFBuilder): + # CScalarUDFBuilder(c_string func_name, CArity arity, CFunctionDoc func_doc, + # vector[CInputType] in_types, COutputType out_type, PyObject* function) + # CStatus MakeFunction() diff --git a/python/setup.py b/python/setup.py index 6a5e7372068..daf65ac15c7 100755 --- a/python/setup.py +++ b/python/setup.py @@ -287,6 +287,9 @@ def append_cmake_bool(value, varname): cmake_options.append('-DCMAKE_BUILD_TYPE={0}' .format(self.build_type.lower())) + + cmake_options.append('-DCMAKE_C_COMPILER={0}'.format(os.environ['CC'])) + cmake_options.append('-DCMAKE_CXX_COMPILER={0}'.format(os.environ['CXX'])) if self.boost_namespace != 'boost': cmake_options.append('-DBoost_NAMESPACE={}' From 460e2c90b6dc4657aa86a92ed42c10db4b6eaa52 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Sat, 5 Mar 2022 10:36:18 +0530 Subject: [PATCH 029/131] refactor code v1 --- cpp/examples/arrow/aggregate_example.cc | 22 ++--- cpp/examples/arrow/udf_example.cc | 1 - cpp/src/arrow/python/udf.cc | 119 ++++++++++++------------ cpp/src/arrow/python/udf.h | 78 +++++++++------- python/pyarrow/_compute.pyx | 9 +- python/pyarrow/includes/libarrow.pxd | 16 ++-- 6 files changed, 126 insertions(+), 119 deletions(-) diff --git a/cpp/examples/arrow/aggregate_example.cc b/cpp/examples/arrow/aggregate_example.cc index 38f8cb0e97c..a631d095375 100644 --- a/cpp/examples/arrow/aggregate_example.cc +++ b/cpp/examples/arrow/aggregate_example.cc @@ -71,10 +71,11 @@ arrow::Result> GetTable() { arrow::field("c", arrow::int64())}; ARROW_ASSIGN_OR_RAISE(auto int_array, GetArrayDataSample({0, 1, 2, 0, 4, 1, 0, 5})); - ARROW_ASSIGN_OR_RAISE(auto bool_array, GetArrayDataSample( - {false, true, false, true, true, false, true, false})); - ARROW_ASSIGN_OR_RAISE(auto data_array, - GetArrayDataSample({10, 11, 12, 10, 11, 11, 10, 15})); + ARROW_ASSIGN_OR_RAISE(auto bool_array, + GetArrayDataSample( + {false, true, false, true, true, false, true, false})); + ARROW_ASSIGN_OR_RAISE(auto data_array, GetArrayDataSample( + {10, 11, 12, 10, 11, 11, 10, 15})); auto schema = arrow::schema(field_vector); auto data_vector = {int_array, bool_array, data_array}; @@ -99,14 +100,11 @@ arrow::Status DoAggregate() { std::shared_ptr out; cp::CountOptions options(cp::CountOptions::ONLY_VALID); - auto aggregate_options = - cp::AggregateNodeOptions{/*aggregates=*/{{"sum", &options}}, - /*targets=*/{"c"}, - /*names=*/{"count(c)"}, - /*keys=*/{}}; - auto schema = arrow::schema({ - arrow::field("count(c)", arrow::int64()) - }); + auto aggregate_options = cp::AggregateNodeOptions{/*aggregates=*/{{"sum", &options}}, + /*targets=*/{"c"}, + /*names=*/{"count(c)"}, + /*keys=*/{}}; + auto schema = arrow::schema({arrow::field("count(c)", arrow::int64())}); ABORT_ON_FAILURE(cp::Declaration::Sequence( { diff --git a/cpp/examples/arrow/udf_example.cc b/cpp/examples/arrow/udf_example.cc index 4ad31a5a761..dfa2a1cec95 100644 --- a/cpp/examples/arrow/udf_example.cc +++ b/cpp/examples/arrow/udf_example.cc @@ -450,7 +450,6 @@ arrow::Status ExecuteVar() { return future.status(); } - int main(int argc, char** argv) { auto status = ExecuteVar(); if (!status.ok()) { diff --git a/cpp/src/arrow/python/udf.cc b/cpp/src/arrow/python/udf.cc index 8520da41484..2b88d1ff593 100644 --- a/cpp/src/arrow/python/udf.cc +++ b/cpp/src/arrow/python/udf.cc @@ -13,68 +13,67 @@ namespace arrow { namespace py { -// Status ScalarUDFBuilder::MakeFunction() { -// Status st; -// auto func = std::make_shared(this->func_name_, this->arity_, &this->func_doc_); -// Py_XINCREF(this->function_); -// PyObject* function = this->function_; -// // lambda function -// auto call_back_lambda = [function](cp::KernelContext* ctx, const cp::ExecBatch& batch, -// Datum* out) -> Status { -// PyAcquireGIL lock; -// PyObject* py_array = NULLPTR; -// PyObject* arg_tuple = NULLPTR; -// PyObject* result = NULLPTR; -// std::shared_ptr c_res_array; -// if (function == NULL) { -// return Status::ExecutionError("python function cannot be null"); -// } +Status ScalarUdfBuilder::MakeFunction(PyObject* function) { + Status st; + auto func = + std::make_shared(this->name(), this->arity(), &this->doc()); + // lambda function + auto call_back_lambda = [function](cp::KernelContext* ctx, const cp::ExecBatch& batch, + Datum* out) -> Status { + PyAcquireGIL lock; + PyObject* py_array = NULLPTR; + PyObject* arg_tuple = NULLPTR; + PyObject* result = NULLPTR; + std::shared_ptr c_res_array; + if (function == NULL) { + return Status::ExecutionError("python function cannot be null"); + } -// if (PyCallable_Check(function)) { -// //if the batch is an array -// auto c_array = batch[0].make_array(); -// Py_XINCREF(py_array); -// Py_XINCREF(arg_tuple); -// Py_XINCREF(result); -// py_array = wrap_array(c_array); -// arg_tuple = PyTuple_Pack(1, py_array); -// result = PyObject_CallObject(function, arg_tuple); -// Py_XDECREF(function); -// if (result == NULL) { -// return Status::ExecutionError("Error occured in computation"); -// } -// auto res = unwrap_array(result); -// if (!res.status().ok()) { -// return res.status(); -// } -// c_res_array = res.ValueOrDie(); -// Py_XDECREF(py_array); -// Py_XDECREF(arg_tuple); -// Py_XDECREF(result); -// Py_XDECREF(function); -// //if the batch is a scalar + if (PyCallable_Check(function)) { + // if the batch is an array + auto c_array = batch[0].make_array(); + Py_XINCREF(py_array); + Py_XINCREF(arg_tuple); + Py_XINCREF(result); + py_array = wrap_array(c_array); + arg_tuple = PyTuple_Pack(1, py_array); + result = PyObject_CallObject(function, arg_tuple); + Py_XDECREF(function); + if (result == NULL) { + return Status::ExecutionError("Error occured in computation"); + } + auto res = unwrap_array(result); + if (!res.status().ok()) { + return res.status(); + } + c_res_array = res.ValueOrDie(); + Py_XDECREF(py_array); + Py_XDECREF(arg_tuple); + Py_XDECREF(result); + Py_XDECREF(function); + // if the batch is a scalar + } else { + return Status::ExecutionError("Error occured in computation"); + } + auto datum = new Datum(c_res_array); + *out->mutable_array() = *datum->array(); + return Status::OK(); + }; + // lambda function -// } else { -// return Status::ExecutionError("Error occured in computation"); -// } -// auto datum = new Datum(c_res_array); -// *out->mutable_array() = *datum->array(); -// return Status::OK(); -// }; // lambda function - -// cp::ScalarKernel kernel(this->in_types_, this->out_type_, call_back_lambda); -// kernel.mem_allocation = cp::MemAllocation::NO_PREALLOCATE; -// st = func->AddKernel(std::move(kernel)); -// if (!st.ok()) { -// return Status::ExecutionError("Kernel couldn't be added to the udf"); -// } -// auto registry = cp::GetFunctionRegistry(); -// st = registry->AddFunction(std::move(func)); -// if (!st.ok()) { -// return Status::ExecutionError("udf registration failed"); -// } -// return Status::OK(); -// } + cp::ScalarKernel kernel(this->input_types(), this->output_type(), call_back_lambda); + kernel.mem_allocation = cp::MemAllocation::NO_PREALLOCATE; + st = func->AddKernel(std::move(kernel)); + if (!st.ok()) { + return Status::ExecutionError("Kernel couldn't be added to the udf"); + } + auto registry = cp::GetFunctionRegistry(); + st = registry->AddFunction(std::move(func)); + if (!st.ok()) { + return Status::ExecutionError("udf registration failed"); + } + return Status::OK(); +} } // namespace py diff --git a/cpp/src/arrow/python/udf.h b/cpp/src/arrow/python/udf.h index 7275174b3e0..083dacc65bf 100644 --- a/cpp/src/arrow/python/udf.h +++ b/cpp/src/arrow/python/udf.h @@ -26,40 +26,50 @@ namespace arrow { namespace py { -// class ARROW_PYTHON_EXPORT UDFBuilder { -// public: -// virtual Status MakeFunction(); - -// protected: -// UDFBuilder(std::string func_name, cp::Function::Kind kind, cp::Arity arity, cp::FunctionDoc func_doc, -// std::vector in_types, cp::OutputType out_type) -// : func_name_(func_name), -// kind_(kind), -// arity_(arity), -// func_doc_(func_doc), -// in_types_(in_types), -// out_type_(out_type) {} - -// std::string func_name_; -// cp::Function::Kind kind_; -// cp::Arity arity_; -// cp::FunctionDoc func_doc_; -// std::vector in_types_; -// cp::OutputType out_type_; -// }; - -// class ARROW_PYTHON_EXPORT ScalarUDFBuilder : public UDFBuilder{ -// public: -// ScalarUDFBuilder(std::string func_name, cp::Arity arity, cp::FunctionDoc func_doc, -// std::vector in_types, cp::OutputType out_type, PyObject* function) -// : UDFBuilder(func_name, cp::Function::SCALAR, arity, func_doc, in_types, out_type), function_(function) {} - -// Status MakeFunction() override; - -// private: -// PyObject* function_; - -// }; +class ARROW_PYTHON_EXPORT UdfBuilder { + public: + UdfBuilder(const std::string func_name, const cp::Function::Kind kind, const cp::Arity arity, + const cp::FunctionDoc* func_doc, const std::vector in_types, + const cp::OutputType out_type) + : func_name_(func_name), + kind_(kind), + arity_(arity), + func_doc_(func_doc), + in_types_(in_types), + out_type_(out_type) {} + + const std::string& name() const { return func_name_; } + + cp::Function::Kind kind() { return kind_; } + + const cp::Arity& arity() const { return arity_; } + + const cp::FunctionDoc& doc() const { return *func_doc_; } + + const std::vector& input_types() const { return in_types_; } + + const cp::OutputType& output_type() const { return out_type_; } + + private: + std::string func_name_; + cp::Function::Kind kind_; + cp::Arity arity_; + const cp::FunctionDoc* func_doc_; + std::vector in_types_; + cp::OutputType out_type_; +}; + +class ARROW_PYTHON_EXPORT ScalarUdfBuilder : public UdfBuilder { + public: + explicit ScalarUdfBuilder(const std::string func_name, const cp::Arity arity, + const cp::FunctionDoc* func_doc, + const std::vector in_types, + const cp::OutputType out_type) + : UdfBuilder(func_name, cp::Function::SCALAR, arity, func_doc, in_types, out_type) {} + + Status MakeFunction(PyObject* function); + +}; } // namespace py diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 71fd7872121..86023f70233 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2390,6 +2390,8 @@ def register_function(func_name, arity, function_doc, in_types, out_type, callba vector[CInputType] c_in_types PyObject* c_callback shared_ptr[CDataType] c_type + COutputType* c_out_type + CScalarUdfBuilder* c_sc_builder object obj if func_name and isinstance(func_name, str): @@ -2412,7 +2414,6 @@ def register_function(func_name, arity, function_doc, in_types, out_type, callba c_type = pyarrow_unwrap_data_type(out_type) c_callback = callback - cdef COutputType* c_out_type = new COutputType(c_type) - # cdef CScalarUDFBuilder* c_sc_builder = new CScalarUDFBuilder(c_func_name, - # c_arity, c_func_doc, c_in_types, deref(c_out_type), c_callback) - # c_sc_builder.MakeFunction() + c_out_type = new COutputType(c_type) + c_sc_builder = new CScalarUdfBuilder(c_func_name, c_arity, &c_func_doc, c_in_types, deref(c_out_type)) + c_sc_builder.MakeFunction(c_callback) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 02e07dafb95..606bd100c30 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2700,12 +2700,12 @@ cdef extern from "arrow/util/byte_size.h" namespace "arrow::util" nogil: int64_t TotalBufferSize(const CRecordBatch& record_batch) int64_t TotalBufferSize(const CTable& table) -# cdef extern from "arrow/python/udf.h" namespace "arrow::py" nogil: -# cdef cppclass CUDFBuilder "arrow::py::UDFBuilder": -# CUDFBuilder(c_string func_name, CArity arity, CFunctionDoc func_doc, -# vector[CInputType] in_types, COutputType out_type) - # cdef cppclass CScalarUDFBuilder "arrow::py::ScalarUDFBuilder"(CUDFBuilder): - # CScalarUDFBuilder(c_string func_name, CArity arity, CFunctionDoc func_doc, - # vector[CInputType] in_types, COutputType out_type, PyObject* function) - # CStatus MakeFunction() +cdef extern from "arrow/python/udf.h" namespace "arrow::py" nogil: + cdef cppclass CUdfBuilder" arrow::py::UdfBuilder": + CUdfBuilder(c_string func_name, FunctionKind kind, CArity arity, CFunctionDoc* func_doc, + vector[CInputType] in_types, COutputType out_type) + cdef cppclass CScalarUdfBuilder" arrow::py::ScalarUdfBuilder"(CUdfBuilder): + CScalarUdfBuilder(c_string func_name, CArity arity, CFunctionDoc* func_doc, + vector[CInputType] in_types, COutputType out_type) + CStatus MakeFunction(PyObject* function) From a82ecd949b83ea86ea418707c5958a1028466053 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Mon, 7 Mar 2022 15:26:45 +0530 Subject: [PATCH 030/131] adding scalar unary and array unary ops --- cpp/src/arrow/python/udf.cc | 129 +++++++++++++++++----- cpp/src/arrow/python/udf.h | 53 ++++++--- python/examples/statistics/udf_example.py | 22 ++++ python/pyarrow/_compute.pyx | 24 +++- python/pyarrow/includes/libarrow.pxd | 19 +++- 5 files changed, 199 insertions(+), 48 deletions(-) diff --git a/cpp/src/arrow/python/udf.cc b/cpp/src/arrow/python/udf.cc index 2b88d1ff593..e9475c321cf 100644 --- a/cpp/src/arrow/python/udf.cc +++ b/cpp/src/arrow/python/udf.cc @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + #include "arrow/python/udf.h" #include @@ -13,56 +30,110 @@ namespace arrow { namespace py { +Status VerifyArityAndInput(cp::Arity arity, const cp::ExecBatch& batch) { + bool match = (uint64_t)arity.num_args == batch.values.size(); + if (!match) { + return Status::Invalid( + "Function Arity and Input data shape doesn't match, expceted {}"); + } + return Status::OK(); +} + Status ScalarUdfBuilder::MakeFunction(PyObject* function) { Status st; auto func = std::make_shared(this->name(), this->arity(), &this->doc()); // lambda function - auto call_back_lambda = [function](cp::KernelContext* ctx, const cp::ExecBatch& batch, - Datum* out) -> Status { + auto call_back_lambda = [function, this](cp::KernelContext* ctx, + const cp::ExecBatch& batch, + Datum* out) -> Status { PyAcquireGIL lock; - PyObject* py_array = NULLPTR; PyObject* arg_tuple = NULLPTR; PyObject* result = NULLPTR; - std::shared_ptr c_res_array; if (function == NULL) { return Status::ExecutionError("python function cannot be null"); } if (PyCallable_Check(function)) { + RETURN_NOT_OK(VerifyArityAndInput(this->arity(), batch)); // if the batch is an array - auto c_array = batch[0].make_array(); - Py_XINCREF(py_array); - Py_XINCREF(arg_tuple); - Py_XINCREF(result); - py_array = wrap_array(c_array); - arg_tuple = PyTuple_Pack(1, py_array); - result = PyObject_CallObject(function, arg_tuple); - Py_XDECREF(function); - if (result == NULL) { - return Status::ExecutionError("Error occured in computation"); - } - auto res = unwrap_array(result); - if (!res.status().ok()) { - return res.status(); + auto num_args = this->arity().num_args; + if (num_args == 1) { // unary function + if (batch[0].is_array()) { + std::shared_ptr c_res_array; + PyObject* py_array = NULLPTR; + auto c_array = batch[0].make_array(); + Py_XINCREF(py_array); + Py_XINCREF(arg_tuple); + Py_XINCREF(result); + py_array = wrap_array(c_array); + arg_tuple = PyTuple_Pack(1, py_array); + result = PyObject_CallObject(function, arg_tuple); + Py_XDECREF(function); + if (result == NULL) { + return Status::ExecutionError("Error occured in computation"); + } + auto res = unwrap_array(result); + if (!res.status().ok()) { + return res.status(); + } + c_res_array = res.ValueOrDie(); + Py_XDECREF(py_array); + Py_XDECREF(arg_tuple); + Py_XDECREF(result); + auto datum = new Datum(c_res_array); + *out->mutable_array() = *datum->array(); + } else if (batch[0].is_scalar()) { + std::shared_ptr c_res_scalar; + PyObject* py_scalar = NULLPTR; + auto c_scalar = batch[0].scalar(); + Py_XINCREF(py_scalar); + Py_XINCREF(arg_tuple); + Py_XINCREF(result); + py_scalar = wrap_scalar(c_scalar); + arg_tuple = PyTuple_Pack(1, py_scalar); + result = PyObject_CallObject(function, arg_tuple); + Py_XDECREF(function); + if (result == NULL) { + return Status::ExecutionError("Error occured in computation"); + } + auto res = unwrap_scalar(result); + if (!res.status().ok()) { + return res.status(); + } + c_res_scalar = res.ValueOrDie(); + Py_XDECREF(py_scalar); + Py_XDECREF(arg_tuple); + Py_XDECREF(result); + auto datum = new Datum(c_res_scalar); + *out = *datum; + } else { + return Status::Invalid("Invalid type, expected scalar or array input"); + } + } else if (num_args == 2) { // binary function + if (batch[0].is_array() && batch[1].is_array()) { + } else if (batch[0].is_scalar() && batch[1].is_scalar()) { + return Status::NotImplemented("TODO:"); + } else { + return Status::Invalid("Invalid type, expected scalar or array input"); + } + } else if (num_args == 3) { // ternary function + return Status::NotImplemented("TODO:"); + } else if (num_args > 3) { // varargs function + return Status::NotImplemented("TODO:"); } - c_res_array = res.ValueOrDie(); - Py_XDECREF(py_array); - Py_XDECREF(arg_tuple); - Py_XDECREF(result); - Py_XDECREF(function); // if the batch is a scalar } else { return Status::ExecutionError("Error occured in computation"); } - auto datum = new Datum(c_res_array); - *out->mutable_array() = *datum->array(); + Py_XDECREF(function); return Status::OK(); - }; - // lambda function + }; + // lambda function cp::ScalarKernel kernel(this->input_types(), this->output_type(), call_back_lambda); - kernel.mem_allocation = cp::MemAllocation::NO_PREALLOCATE; + kernel.mem_allocation = this->mem_allocation(); + kernel.null_handling = this->null_handling(); st = func->AddKernel(std::move(kernel)); if (!st.ok()) { return Status::ExecutionError("Kernel couldn't be added to the udf"); @@ -77,4 +148,4 @@ Status ScalarUdfBuilder::MakeFunction(PyObject* function) { } // namespace py -} // namespace arrow \ No newline at end of file +} // namespace arrow diff --git a/cpp/src/arrow/python/udf.h b/cpp/src/arrow/python/udf.h index 083dacc65bf..bc6b53fb8ab 100644 --- a/cpp/src/arrow/python/udf.h +++ b/cpp/src/arrow/python/udf.h @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + #pragma once #include "arrow/python/platform.h" @@ -18,8 +35,6 @@ #include "arrow/python/pyarrow.h" #include "arrow/python/visibility.h" -#include - namespace cp = arrow::compute; namespace arrow { @@ -28,15 +43,19 @@ namespace py { class ARROW_PYTHON_EXPORT UdfBuilder { public: - UdfBuilder(const std::string func_name, const cp::Function::Kind kind, const cp::Arity arity, - const cp::FunctionDoc* func_doc, const std::vector in_types, - const cp::OutputType out_type) + UdfBuilder(const std::string func_name, const cp::Function::Kind kind, + const cp::Arity arity, const cp::FunctionDoc* func_doc, + const std::vector in_types, const cp::OutputType out_type, + const cp::MemAllocation::type mem_allocation, + const cp::NullHandling::type null_handling) : func_name_(func_name), kind_(kind), arity_(arity), func_doc_(func_doc), in_types_(in_types), - out_type_(out_type) {} + out_type_(out_type), + mem_allocation_(mem_allocation), + null_handling_(null_handling) {} const std::string& name() const { return func_name_; } @@ -50,6 +69,10 @@ class ARROW_PYTHON_EXPORT UdfBuilder { const cp::OutputType& output_type() const { return out_type_; } + cp::MemAllocation::type mem_allocation() { return mem_allocation_; } + + cp::NullHandling::type null_handling() { return null_handling_; } + private: std::string func_name_; cp::Function::Kind kind_; @@ -57,20 +80,24 @@ class ARROW_PYTHON_EXPORT UdfBuilder { const cp::FunctionDoc* func_doc_; std::vector in_types_; cp::OutputType out_type_; + cp::MemAllocation::type mem_allocation_; + cp::NullHandling::type null_handling_; }; class ARROW_PYTHON_EXPORT ScalarUdfBuilder : public UdfBuilder { public: - explicit ScalarUdfBuilder(const std::string func_name, const cp::Arity arity, - const cp::FunctionDoc* func_doc, - const std::vector in_types, - const cp::OutputType out_type) - : UdfBuilder(func_name, cp::Function::SCALAR, arity, func_doc, in_types, out_type) {} + ScalarUdfBuilder(const std::string func_name, const cp::Arity arity, + const cp::FunctionDoc* func_doc, + const std::vector in_types, + const cp::OutputType out_type, + const cp::MemAllocation::type mem_allocation, + const cp::NullHandling::type null_handling) + : UdfBuilder(func_name, cp::Function::SCALAR, arity, func_doc, in_types, out_type, + mem_allocation, null_handling) {} Status MakeFunction(PyObject* function); - }; } // namespace py -} // namespace arrow \ No newline at end of file +} // namespace arrow diff --git a/python/examples/statistics/udf_example.py b/python/examples/statistics/udf_example.py index 40255d8d24e..9e92ac71045 100644 --- a/python/examples/statistics/udf_example.py +++ b/python/examples/statistics/udf_example.py @@ -58,4 +58,26 @@ def add_constant(array): print(a2) +# unary scalar example + +def unary_scalar_function(scalar): + return pc.call_function("add", [scalar, 1]) + +print("=" * 80) +print("Example 2") +print("=" * 80) +callback = unary_scalar_function +func_name = "py_scalar_add_func" +in_types = [InputType.scalar(pa.int64())] +out_type = pa.int64() +register_function(func_name, arity, func_doc, in_types, out_type, callback) + +func2 = pc.get_function(func_name) + +a3 = pc.call_function(func_name, [pa.scalar(10)]) + +print(a3) + + + diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 86023f70233..3a79943f201 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2381,7 +2381,8 @@ cdef CFunctionDoc _make_function_doc(func_doc): else: raise TypeError(f"func_doc must be a dictionary") -def register_function(func_name, arity, function_doc, in_types, out_type, callback): +def register_function(func_name, arity, function_doc, in_types, + out_type, callback, mem_allocation="no_preallocate", null_handling="computed_no_preallocate"): cdef: c_string c_func_name CArity c_arity @@ -2392,8 +2393,22 @@ def register_function(func_name, arity, function_doc, in_types, out_type, callba shared_ptr[CDataType] c_type COutputType* c_out_type CScalarUdfBuilder* c_sc_builder + MemAllocation c_mem_allocation + NullHandling c_null_handling object obj - + + _mem_allocation_map = { + "preallocate": MemAllocation_PREALLOCATE, + "no_preallocate": MemAllocation_NO_PREALLOCATE + } + + _null_handling_map = { + "intersect": NullHandling_INTERSECTION, + "computed_preallocate": NullHandling_COMPUTED_PREALLOCATE, + "computed_no_preallocate": NullHandling_COMPUTED_NO_PREALLOCATE, + "output_not_null": NullHandling_OUTPUT_NOT_NULL + } + if func_name and isinstance(func_name, str): c_func_name = func_name.encode() else: @@ -2415,5 +2430,8 @@ def register_function(func_name, arity, function_doc, in_types, out_type, callba c_callback = callback c_out_type = new COutputType(c_type) - c_sc_builder = new CScalarUdfBuilder(c_func_name, c_arity, &c_func_doc, c_in_types, deref(c_out_type)) + c_mem_allocation = _mem_allocation_map[mem_allocation] + c_null_handling = _null_handling_map[null_handling] + c_sc_builder = new CScalarUdfBuilder(c_func_name, c_arity, &c_func_doc, + c_in_types, deref(c_out_type), c_mem_allocation, c_null_handling) c_sc_builder.MakeFunction(c_callback) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 606bd100c30..94264cc7645 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2700,12 +2700,25 @@ cdef extern from "arrow/util/byte_size.h" namespace "arrow::util" nogil: int64_t TotalBufferSize(const CRecordBatch& record_batch) int64_t TotalBufferSize(const CTable& table) +cdef extern from "arrow/compute/kernel.h" namespace "arrow::compute" nogil: + cdef enum MemAllocation" arrow::compute::MemAllocation::type": + MemAllocation_PREALLOCATE" arrow::compute::MemAllocation::PREALLOCATE" + MemAllocation_NO_PREALLOCATE" arrow::compute::MemAllocation::NO_PREALLOCATE" + + cdef enum NullHandling" arrow::compute::NullHandling::type": + NullHandling_INTERSECTION" arrow::compute::NullHandling::INTERSECTION" + NullHandling_COMPUTED_PREALLOCATE" arrow::compute::NullHandling::COMPUTED_PREALLOCATE" + NullHandling_COMPUTED_NO_PREALLOCATE" arrow::compute::NullHandling::COMPUTED_NO_PREALLOCATE" + NullHandling_OUTPUT_NOT_NULL" arrow::compute::NullHandling::OUTPUT_NOT_NULL" + cdef extern from "arrow/python/udf.h" namespace "arrow::py" nogil: cdef cppclass CUdfBuilder" arrow::py::UdfBuilder": CUdfBuilder(c_string func_name, FunctionKind kind, CArity arity, CFunctionDoc* func_doc, - vector[CInputType] in_types, COutputType out_type) + vector[CInputType] in_types, COutputType out_type, + MemAllocation mem_allocation, NullHandling null_handling) cdef cppclass CScalarUdfBuilder" arrow::py::ScalarUdfBuilder"(CUdfBuilder): CScalarUdfBuilder(c_string func_name, CArity arity, CFunctionDoc* func_doc, - vector[CInputType] in_types, COutputType out_type) + vector[CInputType] in_types, COutputType out_type, + MemAllocation mem_allocation, NullHandling null_handling) CStatus MakeFunction(PyObject* function) - + \ No newline at end of file From 846cb6cb8bbac5a9d848fde6e9a400d5ed0706d1 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Wed, 9 Mar 2022 09:27:18 +0530 Subject: [PATCH 031/131] adding initial macro component --- cpp/src/arrow/python/udf.cc | 90 +++++++++++++++++-------------------- cpp/src/arrow/python/udf.h | 8 ++++ 2 files changed, 50 insertions(+), 48 deletions(-) diff --git a/cpp/src/arrow/python/udf.cc b/cpp/src/arrow/python/udf.cc index e9475c321cf..925e1a9fcb7 100644 --- a/cpp/src/arrow/python/udf.cc +++ b/cpp/src/arrow/python/udf.cc @@ -30,6 +30,42 @@ namespace arrow { namespace py { +#define DEFINE_CALL_UDF(TYPE_NAME, FUNCTION_SUFFIX, CONVERT_SUFFIX) \ + Status exec_function_##FUNCTION_SUFFIX(const cp::ExecBatch& batch, PyObject* function, int num_args, Datum *out) { \ + std::shared_ptr c_res_data; \ + PyObject* result = NULLPTR; \ + PyObject* data = NULLPTR; \ + PyObject* arg_tuple = NULLPTR; \ + auto c_data = batch[0].CONVERT_SUFFIX(); \ + Py_XINCREF(data); \ + Py_XINCREF(arg_tuple); \ + Py_XINCREF(result); \ + data = wrap_##FUNCTION_SUFFIX(c_data); \ + arg_tuple = PyTuple_New(num_args); \ + PyTuple_SetItem(arg_tuple, 0, data); \ + result = PyObject_CallObject(function, arg_tuple); \ + Py_XDECREF(function); \ + if (result == NULL) { \ + return Status::ExecutionError("Error occured in computation"); \ + } \ + auto res = unwrap_##FUNCTION_SUFFIX(result); \ + if (!res.status().ok()) { \ + return res.status(); \ + } \ + c_res_data = res.ValueOrDie(); \ + Py_XDECREF(data); \ + Py_XDECREF(arg_tuple); \ + Py_XDECREF(result); \ + auto datum = new Datum(c_res_data); \ + *out = *datum; \ + return Status::OK(); \ + } + +DEFINE_CALL_UDF(Scalar, scalar, scalar) +DEFINE_CALL_UDF(Array, array, make_array) + +#undef DEFINE_CALL_UDF + Status VerifyArityAndInput(cp::Arity arity, const cp::ExecBatch& batch) { bool match = (uint64_t)arity.num_args == batch.values.size(); if (!match) { @@ -39,6 +75,10 @@ Status VerifyArityAndInput(cp::Arity arity, const cp::ExecBatch& batch) { return Status::OK(); } +bool CheckBatchValueTypes(const ExecBatch& batch, int num_args) { + +} + Status ScalarUdfBuilder::MakeFunction(PyObject* function) { Status st; auto func = @@ -48,8 +88,6 @@ Status ScalarUdfBuilder::MakeFunction(PyObject* function) { const cp::ExecBatch& batch, Datum* out) -> Status { PyAcquireGIL lock; - PyObject* arg_tuple = NULLPTR; - PyObject* result = NULLPTR; if (function == NULL) { return Status::ExecutionError("python function cannot be null"); } @@ -60,53 +98,9 @@ Status ScalarUdfBuilder::MakeFunction(PyObject* function) { auto num_args = this->arity().num_args; if (num_args == 1) { // unary function if (batch[0].is_array()) { - std::shared_ptr c_res_array; - PyObject* py_array = NULLPTR; - auto c_array = batch[0].make_array(); - Py_XINCREF(py_array); - Py_XINCREF(arg_tuple); - Py_XINCREF(result); - py_array = wrap_array(c_array); - arg_tuple = PyTuple_Pack(1, py_array); - result = PyObject_CallObject(function, arg_tuple); - Py_XDECREF(function); - if (result == NULL) { - return Status::ExecutionError("Error occured in computation"); - } - auto res = unwrap_array(result); - if (!res.status().ok()) { - return res.status(); - } - c_res_array = res.ValueOrDie(); - Py_XDECREF(py_array); - Py_XDECREF(arg_tuple); - Py_XDECREF(result); - auto datum = new Datum(c_res_array); - *out->mutable_array() = *datum->array(); + RETURN_NOT_OK(exec_function_array(batch, function, 1, out)); } else if (batch[0].is_scalar()) { - std::shared_ptr c_res_scalar; - PyObject* py_scalar = NULLPTR; - auto c_scalar = batch[0].scalar(); - Py_XINCREF(py_scalar); - Py_XINCREF(arg_tuple); - Py_XINCREF(result); - py_scalar = wrap_scalar(c_scalar); - arg_tuple = PyTuple_Pack(1, py_scalar); - result = PyObject_CallObject(function, arg_tuple); - Py_XDECREF(function); - if (result == NULL) { - return Status::ExecutionError("Error occured in computation"); - } - auto res = unwrap_scalar(result); - if (!res.status().ok()) { - return res.status(); - } - c_res_scalar = res.ValueOrDie(); - Py_XDECREF(py_scalar); - Py_XDECREF(arg_tuple); - Py_XDECREF(result); - auto datum = new Datum(c_res_scalar); - *out = *datum; + RETURN_NOT_OK(exec_function_scalar(batch, function, 1, out)); } else { return Status::Invalid("Invalid type, expected scalar or array input"); } diff --git a/cpp/src/arrow/python/udf.h b/cpp/src/arrow/python/udf.h index bc6b53fb8ab..c6facd94c8d 100644 --- a/cpp/src/arrow/python/udf.h +++ b/cpp/src/arrow/python/udf.h @@ -41,6 +41,14 @@ namespace arrow { namespace py { +#define DECLARE_CALL_UDF(TYPE_NAME, FUNCTION_SUFFIX, CONVERT_SUFFIX) \ + ARROW_PYTHON_EXPORT Status exec_function_##FUNCTION_SUFFIX(const cp::ExecBatch&, PyObject*, int, Datum*); + +DECLARE_CALL_UDF(Scalar, scalar, scalar) +DECLARE_CALL_UDF(Array, array, make_array) + +#undef DECLARE_CALL_UDF + class ARROW_PYTHON_EXPORT UdfBuilder { public: UdfBuilder(const std::string func_name, const cp::Function::Kind kind, From 2ef0e9bfbf9b8718a42649e541d5f9153b8e7e4e Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Wed, 9 Mar 2022 13:39:35 +0530 Subject: [PATCH 032/131] adding exceptions and refactor --- cpp/src/arrow/python/udf.cc | 102 ++++++++---------- cpp/src/arrow/python/udf.h | 5 +- python/examples/statistics/udf_example.py | 121 ++++++++++++++-------- python/pyarrow/_compute.pyx | 43 +++++++- python/pyarrow/includes/libarrow.pxd | 12 +++ 5 files changed, 181 insertions(+), 102 deletions(-) diff --git a/cpp/src/arrow/python/udf.cc b/cpp/src/arrow/python/udf.cc index 925e1a9fcb7..014a57bcb55 100644 --- a/cpp/src/arrow/python/udf.cc +++ b/cpp/src/arrow/python/udf.cc @@ -30,35 +30,41 @@ namespace arrow { namespace py { -#define DEFINE_CALL_UDF(TYPE_NAME, FUNCTION_SUFFIX, CONVERT_SUFFIX) \ - Status exec_function_##FUNCTION_SUFFIX(const cp::ExecBatch& batch, PyObject* function, int num_args, Datum *out) { \ - std::shared_ptr c_res_data; \ - PyObject* result = NULLPTR; \ - PyObject* data = NULLPTR; \ - PyObject* arg_tuple = NULLPTR; \ - auto c_data = batch[0].CONVERT_SUFFIX(); \ - Py_XINCREF(data); \ - Py_XINCREF(arg_tuple); \ - Py_XINCREF(result); \ - data = wrap_##FUNCTION_SUFFIX(c_data); \ - arg_tuple = PyTuple_New(num_args); \ - PyTuple_SetItem(arg_tuple, 0, data); \ - result = PyObject_CallObject(function, arg_tuple); \ - Py_XDECREF(function); \ - if (result == NULL) { \ - return Status::ExecutionError("Error occured in computation"); \ - } \ - auto res = unwrap_##FUNCTION_SUFFIX(result); \ - if (!res.status().ok()) { \ - return res.status(); \ - } \ - c_res_data = res.ValueOrDie(); \ - Py_XDECREF(data); \ - Py_XDECREF(arg_tuple); \ - Py_XDECREF(result); \ - auto datum = new Datum(c_res_data); \ - *out = *datum; \ - return Status::OK(); \ +#define DEFINE_CALL_UDF(TYPE_NAME, FUNCTION_SUFFIX, CONVERT_SUFFIX) \ + Status exec_function_##FUNCTION_SUFFIX(const cp::ExecBatch& batch, PyObject* function, \ + int num_args, Datum* out) { \ + std::shared_ptr c_res_data; \ + PyObject* result = NULLPTR; \ + PyObject* data = NULLPTR; \ + PyObject* arg_tuple = NULLPTR; \ + Py_XINCREF(data); \ + Py_XINCREF(arg_tuple); \ + Py_XINCREF(result); \ + arg_tuple = PyTuple_New(num_args); \ + for (int arg_id = 0; arg_id < num_args; arg_id++) { \ + if (!batch[arg_id].is_##FUNCTION_SUFFIX()) { \ + return Status::Invalid("Input type and data type doesn't match"); \ + } \ + auto c_data = batch[arg_id].CONVERT_SUFFIX(); \ + data = wrap_##FUNCTION_SUFFIX(c_data); \ + PyTuple_SetItem(arg_tuple, arg_id, data); \ + } \ + result = PyObject_CallObject(function, arg_tuple); \ + Py_XDECREF(function); \ + if (result == NULL) { \ + return Status::ExecutionError("Error occured in computation"); \ + } \ + auto res = unwrap_##FUNCTION_SUFFIX(result); \ + if (!res.status().ok()) { \ + return res.status(); \ + } \ + c_res_data = res.ValueOrDie(); \ + Py_XDECREF(data); \ + Py_XDECREF(arg_tuple); \ + Py_XDECREF(result); \ + auto datum = new Datum(c_res_data); \ + *out = *datum; \ + return Status::OK(); \ } DEFINE_CALL_UDF(Scalar, scalar, scalar) @@ -75,10 +81,6 @@ Status VerifyArityAndInput(cp::Arity arity, const cp::ExecBatch& batch) { return Status::OK(); } -bool CheckBatchValueTypes(const ExecBatch& batch, int num_args) { - -} - Status ScalarUdfBuilder::MakeFunction(PyObject* function) { Status st; auto func = @@ -94,31 +96,15 @@ Status ScalarUdfBuilder::MakeFunction(PyObject* function) { if (PyCallable_Check(function)) { RETURN_NOT_OK(VerifyArityAndInput(this->arity(), batch)); - // if the batch is an array - auto num_args = this->arity().num_args; - if (num_args == 1) { // unary function - if (batch[0].is_array()) { - RETURN_NOT_OK(exec_function_array(batch, function, 1, out)); - } else if (batch[0].is_scalar()) { - RETURN_NOT_OK(exec_function_scalar(batch, function, 1, out)); - } else { - return Status::Invalid("Invalid type, expected scalar or array input"); - } - } else if (num_args == 2) { // binary function - if (batch[0].is_array() && batch[1].is_array()) { - } else if (batch[0].is_scalar() && batch[1].is_scalar()) { - return Status::NotImplemented("TODO:"); - } else { - return Status::Invalid("Invalid type, expected scalar or array input"); - } - } else if (num_args == 3) { // ternary function - return Status::NotImplemented("TODO:"); - } else if (num_args > 3) { // varargs function - return Status::NotImplemented("TODO:"); + if (batch[0].is_array()) { // checke 0-th element to select array callable + RETURN_NOT_OK(exec_function_array(batch, function, this->arity().num_args, out)); + } else if (batch[0].is_scalar()) { // check 0-th element to select scalar callable + RETURN_NOT_OK(exec_function_scalar(batch, function, this->arity().num_args, out)); + } else { + return Status::Invalid("Unexpected input type, scalar or array type expected."); } - // if the batch is a scalar } else { - return Status::ExecutionError("Error occured in computation"); + return Status::ExecutionError("Expected a callable python object."); } Py_XDECREF(function); return Status::OK(); @@ -130,12 +116,12 @@ Status ScalarUdfBuilder::MakeFunction(PyObject* function) { kernel.null_handling = this->null_handling(); st = func->AddKernel(std::move(kernel)); if (!st.ok()) { - return Status::ExecutionError("Kernel couldn't be added to the udf"); + return Status::ExecutionError("Kernel couldn't be added to the udf : " + st.message()); } auto registry = cp::GetFunctionRegistry(); st = registry->AddFunction(std::move(func)); if (!st.ok()) { - return Status::ExecutionError("udf registration failed"); + return Status::ExecutionError("udf registration failed : " + st.message()); } return Status::OK(); } diff --git a/cpp/src/arrow/python/udf.h b/cpp/src/arrow/python/udf.h index c6facd94c8d..37c1086d979 100644 --- a/cpp/src/arrow/python/udf.h +++ b/cpp/src/arrow/python/udf.h @@ -41,8 +41,9 @@ namespace arrow { namespace py { -#define DECLARE_CALL_UDF(TYPE_NAME, FUNCTION_SUFFIX, CONVERT_SUFFIX) \ - ARROW_PYTHON_EXPORT Status exec_function_##FUNCTION_SUFFIX(const cp::ExecBatch&, PyObject*, int, Datum*); +#define DECLARE_CALL_UDF(TYPE_NAME, FUNCTION_SUFFIX, CONVERT_SUFFIX) \ + ARROW_PYTHON_EXPORT Status exec_function_##FUNCTION_SUFFIX(const cp::ExecBatch&, \ + PyObject*, int, Datum*); DECLARE_CALL_UDF(Scalar, scalar, scalar) DECLARE_CALL_UDF(Array, array, make_array) diff --git a/python/examples/statistics/udf_example.py b/python/examples/statistics/udf_example.py index 9e92ac71045..3a225aa42db 100644 --- a/python/examples/statistics/udf_example.py +++ b/python/examples/statistics/udf_example.py @@ -1,20 +1,26 @@ +from typing import List import pyarrow as pa from pyarrow import compute as pc from pyarrow.compute import call_function, register_function from pyarrow.compute import Arity, InputType -func_doc = {} -func_doc["summary"] = "summary" -func_doc["description"] = "desc" -func_doc["arg_names"] = ["number"] -func_doc["options_class"] = "SomeOptions" -func_doc["options_required"] = False -arity = Arity.unary() -func_name = "python_udf" + +def get_function_doc(summary:str, desc:str, arg_names:List[str], + options_class:str, options_required:bool=False): + func_doc = {} + func_doc["summary"] = summary + func_doc["description"] = desc + func_doc["arg_names"] = arg_names + func_doc["options_class"] = options_class + func_doc["options_required"] = False + return func_doc + +arity_1 = Arity.unary() +func_name_1 = "python_udf" # TODO: evaluate this properly, the input type can be a record_batch, array or a table # Caveat, a recordbatch or a table does not have type information. -in_types = [InputType.array(pa.int64())] +in_types_1 = [InputType.array(pa.int64())] # TODO: evaluate this properly, whether the output type can support table, array or recordbatch -out_type = pa.int64() +out_type_1 = pa.int64() def py_function(arrow_array): p_new_array = call_function("add", [arrow_array, 1]) @@ -27,57 +33,90 @@ def simple_function(args): print(args) return args -def add_constant(array): - return pc.call_function("add", [array, 1]) +# # example 1 +# print("=" * 80) +# print("Example 1") +# print("=" * 80) +# doc_1 = get_function_doc("simple function", "test simple function", +# ["message"], "None") +# register_function(func_name_1, arity_1, doc_1, in_types_1, out_type_1, simple_function) +# func1 = pc.get_function(func_name_1) -# example 1 -print("=" * 80) -print("Example 1") -print("=" * 80) -callback = simple_function -register_function(func_name, arity, func_doc, in_types, out_type, callback) +# a1 = pc.call_function(func_name_1, [pa.array([20])]) -func1 = pc.get_function(func_name) +# print(a1) -a1 = pc.call_function(func_name, [pa.array([20])]) +# # example 2 +# print("=" * 80) +# print("Example 2") +# print("=" * 80) -print(a1) +# def add_constant(array): +# return pc.call_function("add", [array, 1]) -# example 2 -print("=" * 80) -print("Example 2") -print("=" * 80) -callback = add_constant -func_name = "py_add_func" -register_function(func_name, arity, func_doc, in_types, out_type, callback) +# func_name_2 = "py_add_func" +# arity_2 = Arity.unary() +# in_types_2 = [InputType.array(pa.int64())] +# out_type_2 = pa.int64() +# doc_2 = get_function_doc("add function", "test add function", +# ["value"], "None") +# register_function(func_name_2, arity_2, doc_2, in_types_2, out_type_2, add_constant) -func2 = pc.get_function(func_name) +# func2 = pc.get_function(func_name_2) -a2 = pc.call_function(func_name, [pa.array([20])]) +# a2 = pc.call_function(func_name_2, [pa.array([20])]) -print(a2) +# print(a2) -# unary scalar example +# example 3 + +print("=" * 80) +print("Example 3") +print("=" * 80) def unary_scalar_function(scalar): return pc.call_function("add", [scalar, 1]) +arity_3 = Arity.unary() +func_name_3 = "py_scalar_add_func" +in_types_3 = [InputType.scalar(pa.int64())] +out_type_3 = pa.int64() +doc_3 = get_function_doc("scalar add function", "test scalar add function", + ["scalar_value"], "None") +register_function(func_name_3, arity_3, doc_3, in_types_3, out_type_3, unary_scalar_function) + +func3 = pc.get_function(func_name_3) + +a3 = pc.call_function(func_name_3, [pa.scalar(10)]) + +print(a3) + +## Binary Function print("=" * 80) -print("Example 2") +print("Scalar Binary Example 4") print("=" * 80) -callback = unary_scalar_function -func_name = "py_scalar_add_func" -in_types = [InputType.scalar(pa.int64())] -out_type = pa.int64() -register_function(func_name, arity, func_doc, in_types, out_type, callback) +arity_4 = Arity.binary() +func_name_4 = "scalar_udf_binary_add" +# TODO: evaluate this properly, the input type can be a record_batch, array or a table +# Caveat, a recordbatch or a table does not have type information. +in_types_4 = [InputType.scalar(pa.int64()), InputType.scalar(pa.int64())] +# TODO: evaluate this properly, whether the output type can support table, array or recordbatch +out_type_4 = pa.int64() +scalar_binary_add_function_doc = get_function_doc("scalar bin add function", + "test scalar bin add function", + ["scalar_value1", "scalar_value2"], "None") -func2 = pc.get_function(func_name) +def binary_scalar_function(scalar1, scalar2): + return pc.call_function("add", [scalar1, scalar2]) -a3 = pc.call_function(func_name, [pa.scalar(10)]) +register_function(func_name_4, arity_4, scalar_binary_add_function_doc, in_types_4, out_type_4, binary_scalar_function) -print(a3) +func4 = pc.get_function(func_name_4) + +a4 = pc.call_function(func_name_4, [pa.scalar(10), pa.scalar(20)]) +print(a4) diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 3a79943f201..ce023d102dd 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -261,6 +261,22 @@ cdef class Arity(_Weakrefable): def unary(): cdef CArity c_arity = CArity.Unary() return wrap_arity(c_arity) + + @staticmethod + def binary(): + cdef CArity c_arity = CArity.Binary() + return wrap_arity(c_arity) + + @staticmethod + def ternary(): + cdef CArity c_arity = CArity.Ternary() + return wrap_arity(c_arity) + + @staticmethod + def varargs(int num_args): + cdef CArity c_arity = CArity.VarArgs(num_args) + return wrap_arity(c_arity) + cdef class Function(_Weakrefable): """ @@ -2381,6 +2397,27 @@ cdef CFunctionDoc _make_function_doc(func_doc): else: raise TypeError(f"func_doc must be a dictionary") +cdef class UDFError(Exception): + cdef dict __dict__ + + def __init__(self, message='', extra_info=b''): + super().__init__(message) + self.extra_info = tobytes(extra_info) + + cdef CStatus to_status(self): + message = tobytes("UDF error: {}".format(str(self))) + return CStatus_UnknownError(message) + +cdef class UDFRegistrationError(UDFError): + + def __init__(self, message='', extra_info=b''): + super().__init__(message, extra_info) + + cdef CStatus to_status(self): + message = tobytes("UDF Registration error: {}".format(str(self))) + return CStatus_UnknownError(message) + + def register_function(func_name, arity, function_doc, in_types, out_type, callback, mem_allocation="no_preallocate", null_handling="computed_no_preallocate"): cdef: @@ -2395,6 +2432,7 @@ def register_function(func_name, arity, function_doc, in_types, CScalarUdfBuilder* c_sc_builder MemAllocation c_mem_allocation NullHandling c_null_handling + CStatus st object obj _mem_allocation_map = { @@ -2434,4 +2472,7 @@ def register_function(func_name, arity, function_doc, in_types, c_null_handling = _null_handling_map[null_handling] c_sc_builder = new CScalarUdfBuilder(c_func_name, c_arity, &c_func_doc, c_in_types, deref(c_out_type), c_mem_allocation, c_null_handling) - c_sc_builder.MakeFunction(c_callback) + st = c_sc_builder.MakeFunction(c_callback) + if not st.ok(): + error_msg = st.message().decode() + raise UDFRegistrationError(message = error_msg) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 94264cc7645..4ce1b5f44eb 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1852,9 +1852,21 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: int num_args c_bool is_varargs + @staticmethod + CArity Nullary() + @staticmethod CArity Unary() + @staticmethod + CArity Binary() + + @staticmethod + CArity Ternary() + + @staticmethod + CArity VarArgs(int min_args) + cdef cppclass CInputType" arrow::compute::InputType": @staticmethod CInputType Array(shared_ptr[CDataType] type) From c238ba16ee0a734421289849466dc16e8813fab0 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Wed, 9 Mar 2022 18:52:51 +0530 Subject: [PATCH 033/131] updating example --- cpp/src/arrow/python/udf.cc | 9 +- python/examples/statistics/nltk_example.py | 58 -------- python/examples/udf/udf_example.py | 163 +++++++++++++++++++++ 3 files changed, 167 insertions(+), 63 deletions(-) delete mode 100644 python/examples/statistics/nltk_example.py create mode 100644 python/examples/udf/udf_example.py diff --git a/cpp/src/arrow/python/udf.cc b/cpp/src/arrow/python/udf.cc index 014a57bcb55..8b6b2ee715a 100644 --- a/cpp/src/arrow/python/udf.cc +++ b/cpp/src/arrow/python/udf.cc @@ -50,7 +50,6 @@ namespace py { PyTuple_SetItem(arg_tuple, arg_id, data); \ } \ result = PyObject_CallObject(function, arg_tuple); \ - Py_XDECREF(function); \ if (result == NULL) { \ return Status::ExecutionError("Error occured in computation"); \ } \ @@ -59,11 +58,11 @@ namespace py { return res.status(); \ } \ c_res_data = res.ValueOrDie(); \ + auto datum = new Datum(c_res_data); \ + *out = *datum; \ Py_XDECREF(data); \ Py_XDECREF(arg_tuple); \ Py_XDECREF(result); \ - auto datum = new Datum(c_res_data); \ - *out = *datum; \ return Status::OK(); \ } @@ -106,7 +105,6 @@ Status ScalarUdfBuilder::MakeFunction(PyObject* function) { } else { return Status::ExecutionError("Expected a callable python object."); } - Py_XDECREF(function); return Status::OK(); }; @@ -116,7 +114,8 @@ Status ScalarUdfBuilder::MakeFunction(PyObject* function) { kernel.null_handling = this->null_handling(); st = func->AddKernel(std::move(kernel)); if (!st.ok()) { - return Status::ExecutionError("Kernel couldn't be added to the udf : " + st.message()); + return Status::ExecutionError("Kernel couldn't be added to the udf : " + + st.message()); } auto registry = cp::GetFunctionRegistry(); st = registry->AddFunction(std::move(func)); diff --git a/python/examples/statistics/nltk_example.py b/python/examples/statistics/nltk_example.py deleted file mode 100644 index 779de9c3046..00000000000 --- a/python/examples/statistics/nltk_example.py +++ /dev/null @@ -1,58 +0,0 @@ -from turtle import down -import nltk -import pandas as pd -import pyarrow as pa - -def download(): - nltk.download([ - "names", - "stopwords", - "state_union", - "twitter_samples", - "movie_reviews", - "averaged_perceptron_tagger", - "vader_lexicon", - "punkt", - ]) - -def test_nltk(): - from nltk.sentiment import SentimentIntensityAnalyzer - sia = SentimentIntensityAnalyzer() - score = sia.polarity_scores("Wow, NLTK is really powerful!") - for item in score: - print(item, score[item]) - - -def get_nltk_tweets(): - tweets = [t.replace("://", "//") for t in nltk.corpus.twitter_samples.strings()] - return tweets - - -def make_product_info(tweets): - import random - num_records = len(tweets) - product_names = [] - product_quantities = [] - regions = [] - region_types = {0: "US", 1: "UK", 2: "JPN", 3: "IND", 4: "AUS"} - for id in range(num_records): - product_name = "prod-" + str(id) - product_quantity = random.randint(0, 1000) - region = region_types[random.randint(0, 4)] - product_names.append(product_name) - product_quantities.append(product_quantity) - regions.append(region) - dict_data = {"product_name": product_names, - "product_quantity": product_quantities, - "region": regions, - "review" : tweets} - - data_table = pa.Table.from_pydict(dict_data) - return data_table - - -data_table = make_product_info(get_nltk_tweets()) - -print(data_table[0:5].to_pandas()) - - diff --git a/python/examples/udf/udf_example.py b/python/examples/udf/udf_example.py new file mode 100644 index 00000000000..b18f7bd036f --- /dev/null +++ b/python/examples/udf/udf_example.py @@ -0,0 +1,163 @@ +from typing import List +import pyarrow as pa +from pyarrow import compute as pc +from pyarrow.compute import call_function, register_function +from pyarrow.compute import Arity, InputType + +def get_function_doc(summary:str, desc:str, arg_names:List[str], + options_class:str, options_required:bool=False): + func_doc = {} + func_doc["summary"] = summary + func_doc["description"] = desc + func_doc["arg_names"] = arg_names + func_doc["options_class"] = options_class + func_doc["options_required"] = False + return func_doc + +arity_1 = Arity.unary() +func_name_1 = "python_udf" +# TODO: evaluate this properly, the input type can be a record_batch, array or a table +# Caveat, a recordbatch or a table does not have type information. +in_types_1 = [InputType.array(pa.int64())] +# TODO: evaluate this properly, whether the output type can support table, array or recordbatch +out_type_1 = pa.int64() + +def py_function(arrow_array): + p_new_array = call_function("add", [arrow_array, 1]) + return p_new_array + +def simple_function(args): + print("=" * 80) + print(f"Hello From Python : {args}") + print("=" * 80) + return args + +# # example 1 +# print("=" * 80) +# print("Example 1") +# print("=" * 80) +# doc_1 = get_function_doc("simple function", "test simple function", +# ["message"], "None") +# register_function(func_name_1, arity_1, doc_1, in_types_1, out_type_1, simple_function) + +# func1 = pc.get_function(func_name_1) + +# a1_1 = pc.call_function(func_name_1, [pa.array([20])]) + +# print(a1_1) + +# a1_2 = pc.call_function(func_name_1, [pa.array([30])]) + +# print(a1_2) + +# # example 2 +# print("=" * 80) +# print("Example 2") +# print("=" * 80) + +# def add_constant(array): +# return pc.call_function("add", [array, 1]) + +# func_name_2 = "py_add_func" +# arity_2 = Arity.unary() +# in_types_2 = [InputType.array(pa.int64())] +# out_type_2 = pa.int64() +# doc_2 = get_function_doc("add function", "test add function", +# ["value"], "None") +# register_function(func_name_2, arity_2, doc_2, in_types_2, out_type_2, add_constant) + +# func2 = pc.get_function(func_name_2) + +# a2 = pc.call_function(func_name_2, [pa.array([20])]) + +# print(a2) + +# # example 3 + +print("=" * 80) +print("Example 3") +print("=" * 80) + +def unary_scalar_function(scalar): + return pc.call_function("add", [scalar, 1]) + +arity_3 = Arity.unary() +func_name_3 = "py_scalar_add_func" +in_types_3 = [InputType.scalar(pa.int64())] +out_type_3 = pa.int64() +doc_3 = get_function_doc("scalar add function", "test scalar add function", + ["scalar_value"], "None") +register_function(func_name_3, arity_3, doc_3, in_types_3, out_type_3, unary_scalar_function) + +func3 = pc.get_function(func_name_3) + +a3_1 = pc.call_function(func_name_3, [pa.scalar(10)]) + +print(a3_1) + +a3_2 = pc.call_function(func_name_3, [pa.scalar(100)]) + +print(a3_2) + +## Binary Function [Scalar] +# print("=" * 80) +# print("Scalar Binary Example 4") +# print("=" * 80) +# arity_4 = Arity.binary() +# func_name_4 = "scalar_udf_binary_add" +# # TODO: evaluate this properly, the input type can be a record_batch, array or a table +# # Caveat, a recordbatch or a table does not have type information. +# in_types_4 = [InputType.scalar(pa.int64()), InputType.scalar(pa.int64())] +# # TODO: evaluate this properly, whether the output type can support table, array or recordbatch +# out_type_4 = pa.int64() +# scalar_binary_add_function_doc = get_function_doc("scalar bin add function", +# "test scalar bin add function", +# ["scalar_value1", "scalar_value2"], "None") + +# def binary_scalar_function(scalar1, scalar2): +# return pc.call_function("add", [scalar1, scalar2]) + +# register_function(func_name_4, arity_4, scalar_binary_add_function_doc, in_types_4, out_type_4, binary_scalar_function) + +# func4 = pc.get_function(func_name_4) + +# a4_1 = pc.call_function(func_name_4, [pa.scalar(10), pa.scalar(20)]) + +# print(a4_1) + +# a4_2 = pc.call_function(func_name_4, [pa.scalar(50), pa.scalar(30)]) + +# print(a4_2) + + +## Binary Function [Array] +# print("=" * 80) +# print("Array Data Binary Function Example 5") +# print("=" * 80) +# arity_5 = Arity.binary() +# func_name_5 = "array_udf_binary_add" +# in_types_5 = [InputType.array(pa.int64()), InputType.array(pa.int64())] +# out_type_5 = pa.int64() +# array_binary_add_function_doc = get_function_doc("array bin add function", +# "test array bin add function", +# ["array_value1", "array_value2"], "None") + +# def binary_array_function(array1, array2): +# return pc.call_function("add", [array1, array2]) + +# register_function(func_name_5, arity_5, array_binary_add_function_doc, in_types_5, out_type_5, binary_array_function) + +# func5 = pc.get_function(func_name_5) + +# a5_1 = pc.call_function(func_name_5, [pa.array([10, 11]), pa.array([20, 21])]) + +# print(a5_1) + +# a5_2 = pc.call_function(func_name_5, [pa.array([1, 2]), pa.array([10, 20])]) + +# print(a5_2) + + + + + From 7e0ea90a4d2d68ce4d5ae2c194be1f04a6f85d43 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Wed, 9 Mar 2022 18:53:38 +0530 Subject: [PATCH 034/131] moved udf example --- python/examples/statistics/udf_example.py | 122 ---------------------- 1 file changed, 122 deletions(-) delete mode 100644 python/examples/statistics/udf_example.py diff --git a/python/examples/statistics/udf_example.py b/python/examples/statistics/udf_example.py deleted file mode 100644 index 3a225aa42db..00000000000 --- a/python/examples/statistics/udf_example.py +++ /dev/null @@ -1,122 +0,0 @@ -from typing import List -import pyarrow as pa -from pyarrow import compute as pc -from pyarrow.compute import call_function, register_function -from pyarrow.compute import Arity, InputType - -def get_function_doc(summary:str, desc:str, arg_names:List[str], - options_class:str, options_required:bool=False): - func_doc = {} - func_doc["summary"] = summary - func_doc["description"] = desc - func_doc["arg_names"] = arg_names - func_doc["options_class"] = options_class - func_doc["options_required"] = False - return func_doc - -arity_1 = Arity.unary() -func_name_1 = "python_udf" -# TODO: evaluate this properly, the input type can be a record_batch, array or a table -# Caveat, a recordbatch or a table does not have type information. -in_types_1 = [InputType.array(pa.int64())] -# TODO: evaluate this properly, whether the output type can support table, array or recordbatch -out_type_1 = pa.int64() - -def py_function(arrow_array): - p_new_array = call_function("add", [arrow_array, 1]) - return p_new_array - -def simple_function(args): - print("=" * 80) - print("Hello From Python") - print("=" * 80) - print(args) - return args - -# # example 1 -# print("=" * 80) -# print("Example 1") -# print("=" * 80) -# doc_1 = get_function_doc("simple function", "test simple function", -# ["message"], "None") -# register_function(func_name_1, arity_1, doc_1, in_types_1, out_type_1, simple_function) - -# func1 = pc.get_function(func_name_1) - -# a1 = pc.call_function(func_name_1, [pa.array([20])]) - -# print(a1) - -# # example 2 -# print("=" * 80) -# print("Example 2") -# print("=" * 80) - -# def add_constant(array): -# return pc.call_function("add", [array, 1]) - -# func_name_2 = "py_add_func" -# arity_2 = Arity.unary() -# in_types_2 = [InputType.array(pa.int64())] -# out_type_2 = pa.int64() -# doc_2 = get_function_doc("add function", "test add function", -# ["value"], "None") -# register_function(func_name_2, arity_2, doc_2, in_types_2, out_type_2, add_constant) - -# func2 = pc.get_function(func_name_2) - -# a2 = pc.call_function(func_name_2, [pa.array([20])]) - -# print(a2) - -# example 3 - -print("=" * 80) -print("Example 3") -print("=" * 80) - -def unary_scalar_function(scalar): - return pc.call_function("add", [scalar, 1]) - -arity_3 = Arity.unary() -func_name_3 = "py_scalar_add_func" -in_types_3 = [InputType.scalar(pa.int64())] -out_type_3 = pa.int64() -doc_3 = get_function_doc("scalar add function", "test scalar add function", - ["scalar_value"], "None") -register_function(func_name_3, arity_3, doc_3, in_types_3, out_type_3, unary_scalar_function) - -func3 = pc.get_function(func_name_3) - -a3 = pc.call_function(func_name_3, [pa.scalar(10)]) - -print(a3) - -## Binary Function -print("=" * 80) -print("Scalar Binary Example 4") -print("=" * 80) -arity_4 = Arity.binary() -func_name_4 = "scalar_udf_binary_add" -# TODO: evaluate this properly, the input type can be a record_batch, array or a table -# Caveat, a recordbatch or a table does not have type information. -in_types_4 = [InputType.scalar(pa.int64()), InputType.scalar(pa.int64())] -# TODO: evaluate this properly, whether the output type can support table, array or recordbatch -out_type_4 = pa.int64() -scalar_binary_add_function_doc = get_function_doc("scalar bin add function", - "test scalar bin add function", - ["scalar_value1", "scalar_value2"], "None") - -def binary_scalar_function(scalar1, scalar2): - return pc.call_function("add", [scalar1, scalar2]) - -register_function(func_name_4, arity_4, scalar_binary_add_function_doc, in_types_4, out_type_4, binary_scalar_function) - -func4 = pc.get_function(func_name_4) - -a4 = pc.call_function(func_name_4, [pa.scalar(10), pa.scalar(20)]) - -print(a4) - - - From a9870681f41b46b29c1c29aa75ffe7f43d8bbe90 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 10 Mar 2022 17:58:33 +0530 Subject: [PATCH 035/131] fix varargs function registration issue --- cpp/src/arrow/python/udf.cc | 3 +- python/examples/udf/udf_example.py | 234 ++++++++++++++++++++--------- 2 files changed, 169 insertions(+), 68 deletions(-) diff --git a/cpp/src/arrow/python/udf.cc b/cpp/src/arrow/python/udf.cc index 8b6b2ee715a..138a8957bc5 100644 --- a/cpp/src/arrow/python/udf.cc +++ b/cpp/src/arrow/python/udf.cc @@ -109,7 +109,8 @@ Status ScalarUdfBuilder::MakeFunction(PyObject* function) { }; // lambda function - cp::ScalarKernel kernel(this->input_types(), this->output_type(), call_back_lambda); + cp::ScalarKernel kernel(cp::KernelSignature::Make(this->input_types(), this->output_type(), + this->arity().is_varargs), call_back_lambda); kernel.mem_allocation = this->mem_allocation(); kernel.null_handling = this->null_handling(); st = func->AddKernel(std::move(kernel)); diff --git a/python/examples/udf/udf_example.py b/python/examples/udf/udf_example.py index b18f7bd036f..df719fec0e6 100644 --- a/python/examples/udf/udf_example.py +++ b/python/examples/udf/udf_example.py @@ -32,76 +32,201 @@ def simple_function(args): print("=" * 80) return args -# # example 1 -# print("=" * 80) -# print("Example 1") -# print("=" * 80) -# doc_1 = get_function_doc("simple function", "test simple function", -# ["message"], "None") -# register_function(func_name_1, arity_1, doc_1, in_types_1, out_type_1, simple_function) -# func1 = pc.get_function(func_name_1) +""" +Array Usage +""" -# a1_1 = pc.call_function(func_name_1, [pa.array([20])]) +# example 1 +print("=" * 80) +print("Example 1") +print("=" * 80) +doc_1 = get_function_doc("simple function", "test simple function", + ["message"], "None") +register_function(func_name_1, arity_1, doc_1, in_types_1, out_type_1, simple_function) -# print(a1_1) +func1 = pc.get_function(func_name_1) -# a1_2 = pc.call_function(func_name_1, [pa.array([30])]) +a1_1 = pc.call_function(func_name_1, [pa.array([20])]) -# print(a1_2) +print(a1_1) -# # example 2 -# print("=" * 80) -# print("Example 2") -# print("=" * 80) +a1_2 = pc.call_function(func_name_1, [pa.array([30])]) -# def add_constant(array): -# return pc.call_function("add", [array, 1]) +print(a1_2) -# func_name_2 = "py_add_func" -# arity_2 = Arity.unary() -# in_types_2 = [InputType.array(pa.int64())] -# out_type_2 = pa.int64() -# doc_2 = get_function_doc("add function", "test add function", -# ["value"], "None") -# register_function(func_name_2, arity_2, doc_2, in_types_2, out_type_2, add_constant) +# example 2 +print("=" * 80) +print("Example 2") +print("=" * 80) -# func2 = pc.get_function(func_name_2) +def add_constant(array): + return pc.call_function("add", [array, 1]) -# a2 = pc.call_function(func_name_2, [pa.array([20])]) +func_name_2 = "py_add_func" +arity_2 = Arity.unary() +in_types_2 = [InputType.array(pa.int64())] +out_type_2 = pa.int64() +doc_2 = get_function_doc("add function", "test add function", + ["value"], "None") +register_function(func_name_2, arity_2, doc_2, in_types_2, out_type_2, add_constant) -# print(a2) +func2 = pc.get_function(func_name_2) -# # example 3 +a2_1 = pc.call_function(func_name_2, [pa.array([20])]) +print(a2_1) + +a2_2 = pc.call_function(func_name_2, [pa.array([30])]) + +print(a2_2) + + +# Binary Function [Array] print("=" * 80) -print("Example 3") +print("Array Data Binary Function Example 3") print("=" * 80) +arity_3 = Arity.binary() +func_name_3 = "array_udf_binary_add" +in_types_3 = [InputType.array(pa.int64()), InputType.array(pa.int64())] +out_type_3 = pa.int64() +array_binary_add_function_doc = get_function_doc("array bin add function", + "test array bin add function", + ["array_value1", "array_value2"], "None") -def unary_scalar_function(scalar): - return pc.call_function("add", [scalar, 1]) +def binary_array_function(array1, array2): + return pc.call_function("add", [array1, array2]) -arity_3 = Arity.unary() -func_name_3 = "py_scalar_add_func" -in_types_3 = [InputType.scalar(pa.int64())] -out_type_3 = pa.int64() -doc_3 = get_function_doc("scalar add function", "test scalar add function", - ["scalar_value"], "None") -register_function(func_name_3, arity_3, doc_3, in_types_3, out_type_3, unary_scalar_function) +register_function(func_name_3, arity_3, array_binary_add_function_doc, + in_types_3, out_type_3, binary_array_function) func3 = pc.get_function(func_name_3) -a3_1 = pc.call_function(func_name_3, [pa.scalar(10)]) +a3_1 = pc.call_function(func_name_3, [pa.array([10, 11]), pa.array([20, 21])]) print(a3_1) -a3_2 = pc.call_function(func_name_3, [pa.scalar(100)]) +a3_2 = pc.call_function(func_name_3, [pa.array([1, 2]), pa.array([10, 20])]) print(a3_2) + +# Ternary Function [Array] +print("=" * 80) +print("Array Data Ternary Function Example 4") +print("=" * 80) +arity_4 = Arity.ternary() +func_name_4 = "array_udf_ternary_add" +in_types_4 = [InputType.array(pa.int64()), + InputType.array(pa.int64()), + InputType.array(pa.int64())] +out_type_4 = pa.int64() +array_ternary_add_function_doc = get_function_doc("array ternary add function", + "test array ternary add function", + ["array_value1", "array_value2", "array_value3"], "None") + +def ternary_array_function(array1, array2, array3): + return pc.call_function("add", + [pc.call_function("add", [array1, array2]), + array3]) + +register_function(func_name_4, arity_4, array_ternary_add_function_doc, + in_types_4, out_type_4, ternary_array_function) + +func4 = pc.get_function(func_name_4) + +a4_1 = pc.call_function(func_name_4, [pa.array([10, 11]), + pa.array([20, 21]), + pa.array([30, 31])]) + +print(a4_1) + +a4_2 = pc.call_function(func_name_4, [pa.array([1, 2]), + pa.array([10, 20]), + pa.array([100, 200]) + ]) + +print(a4_2) + + +# VarArgs Function [Array] +print("=" * 80) +print("Array Data VarArgs Function Example 5") +print("=" * 80) +arity_5 = Arity.varargs(4) +func_name_5 = "array_udf_varargs_add" +in_types_5 = [InputType.array(pa.int64()), + InputType.array(pa.int64()), + InputType.array(pa.int64()), + InputType.array(pa.int64()) + ] +out_type_5 = pa.int64() +array_varargs_add_function_doc = get_function_doc("array varargs add function", + "test array varargs add function", + ["array_value1", "array_value2", + "array_value3", "array_value4"], + "None") + +def varargs_array_function(array1, array2, array3, array4): + array12 = pc.call_function("add", [array1, array2]) + array34 = pc.call_function("add", [array3, array4]) + return pc.call_function("add", [array12, array34]) + +register_function(func_name_5, arity_5, array_varargs_add_function_doc, + in_types_5, out_type_5, varargs_array_function) + +func5 = pc.get_function(func_name_5) + +a5_1 = pc.call_function(func_name_5, [pa.array([10, 11]), + pa.array([20, 21]), + pa.array([30, 31]), + pa.array([40, 41])]) + +print(a5_1) + +a5_2 = pc.call_function(func_name_5, [pa.array([1, 2]), + pa.array([10, 20]), + pa.array([100, 200]), + pa.array([1000, 2000]) + ]) + +print(a5_2) + + +""" +Scalar Usage +""" + +# # example 4 + +# print("=" * 80) +# print("Example 4") +# print("=" * 80) + +# def unary_scalar_function(scalar): +# return pc.call_function("add", [scalar, 1]) + +# arity_3 = Arity.unary() +# func_name_3 = "py_scalar_add_func" +# in_types_3 = [InputType.scalar(pa.int64())] +# out_type_3 = pa.int64() +# doc_3 = get_function_doc("scalar add function", "test scalar add function", +# ["scalar_value"], "None") +# register_function(func_name_3, arity_3, doc_3, in_types_3, out_type_3, unary_scalar_function) + +# func3 = pc.get_function(func_name_3) + +# a3_1 = pc.call_function(func_name_3, [pa.scalar(10)]) + +# print(a3_1) + +# a3_2 = pc.call_function(func_name_3, [pa.scalar(100)]) + +# print(a3_2) + ## Binary Function [Scalar] # print("=" * 80) -# print("Scalar Binary Example 4") +# print("Scalar Binary Example 5") # print("=" * 80) # arity_4 = Arity.binary() # func_name_4 = "scalar_udf_binary_add" @@ -130,32 +255,7 @@ def unary_scalar_function(scalar): # print(a4_2) -## Binary Function [Array] -# print("=" * 80) -# print("Array Data Binary Function Example 5") -# print("=" * 80) -# arity_5 = Arity.binary() -# func_name_5 = "array_udf_binary_add" -# in_types_5 = [InputType.array(pa.int64()), InputType.array(pa.int64())] -# out_type_5 = pa.int64() -# array_binary_add_function_doc = get_function_doc("array bin add function", -# "test array bin add function", -# ["array_value1", "array_value2"], "None") - -# def binary_array_function(array1, array2): -# return pc.call_function("add", [array1, array2]) - -# register_function(func_name_5, arity_5, array_binary_add_function_doc, in_types_5, out_type_5, binary_array_function) - -# func5 = pc.get_function(func_name_5) - -# a5_1 = pc.call_function(func_name_5, [pa.array([10, 11]), pa.array([20, 21])]) - -# print(a5_1) - -# a5_2 = pc.call_function(func_name_5, [pa.array([1, 2]), pa.array([10, 20])]) -# print(a5_2) From f73fe0cc8fd478bb2e857054d614339272d2bdd8 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 11 Mar 2022 07:28:26 +0530 Subject: [PATCH 036/131] fix memory issue --- cpp/src/arrow/python/udf.cc | 47 ++-- python/examples/udf/udf_example.py | 363 +++++++++++++++-------------- 2 files changed, 226 insertions(+), 184 deletions(-) diff --git a/cpp/src/arrow/python/udf.cc b/cpp/src/arrow/python/udf.cc index 138a8957bc5..20bc570b274 100644 --- a/cpp/src/arrow/python/udf.cc +++ b/cpp/src/arrow/python/udf.cc @@ -34,22 +34,16 @@ namespace py { Status exec_function_##FUNCTION_SUFFIX(const cp::ExecBatch& batch, PyObject* function, \ int num_args, Datum* out) { \ std::shared_ptr c_res_data; \ - PyObject* result = NULLPTR; \ - PyObject* data = NULLPTR; \ - PyObject* arg_tuple = NULLPTR; \ - Py_XINCREF(data); \ - Py_XINCREF(arg_tuple); \ - Py_XINCREF(result); \ - arg_tuple = PyTuple_New(num_args); \ + PyObject* arg_tuple = PyTuple_New(num_args); \ for (int arg_id = 0; arg_id < num_args; arg_id++) { \ if (!batch[arg_id].is_##FUNCTION_SUFFIX()) { \ return Status::Invalid("Input type and data type doesn't match"); \ } \ auto c_data = batch[arg_id].CONVERT_SUFFIX(); \ - data = wrap_##FUNCTION_SUFFIX(c_data); \ + PyObject* data = wrap_##FUNCTION_SUFFIX(c_data); \ PyTuple_SetItem(arg_tuple, arg_id, data); \ } \ - result = PyObject_CallObject(function, arg_tuple); \ + PyObject* result = PyObject_CallObject(function, arg_tuple); \ if (result == NULL) { \ return Status::ExecutionError("Error occured in computation"); \ } \ @@ -60,9 +54,6 @@ namespace py { c_res_data = res.ValueOrDie(); \ auto datum = new Datum(c_res_data); \ *out = *datum; \ - Py_XDECREF(data); \ - Py_XDECREF(arg_tuple); \ - Py_XDECREF(result); \ return Status::OK(); \ } @@ -71,6 +62,32 @@ DEFINE_CALL_UDF(Array, array, make_array) #undef DEFINE_CALL_UDF +// Status exec_function_scalar(const cp::ExecBatch& batch, PyObject* function, +// int num_args, Datum* out) { +// std::shared_ptr c_res_data; +// PyObject* arg_tuple = PyTuple_New(num_args); +// for (int arg_id = 0; arg_id < num_args; arg_id++) { +// if (!batch[arg_id].is_scalar()) { +// return Status::Invalid("Input type and data type doesn't match"); +// } +// auto c_data = batch[arg_id].scalar(); +// PyObject* data = wrap_scalar(c_data); +// PyTuple_SetItem(arg_tuple, arg_id, data); +// } +// PyObject* result = PyObject_CallObject(function, arg_tuple); +// if (result == NULL) { +// return Status::ExecutionError("Error occured in computation"); +// } +// auto res = unwrap_scalar(result); +// if (!res.status().ok()) { +// return res.status(); +// } +// c_res_data = res.ValueOrDie(); +// auto datum = new Datum(c_res_data); +// *out = *datum; +// return Status::OK(); +// } + Status VerifyArityAndInput(cp::Arity arity, const cp::ExecBatch& batch) { bool match = (uint64_t)arity.num_args == batch.values.size(); if (!match) { @@ -109,8 +126,10 @@ Status ScalarUdfBuilder::MakeFunction(PyObject* function) { }; // lambda function - cp::ScalarKernel kernel(cp::KernelSignature::Make(this->input_types(), this->output_type(), - this->arity().is_varargs), call_back_lambda); + cp::ScalarKernel kernel( + cp::KernelSignature::Make(this->input_types(), this->output_type(), + this->arity().is_varargs), + call_back_lambda); kernel.mem_allocation = this->mem_allocation(); kernel.null_handling = this->null_handling(); st = func->AddKernel(std::move(kernel)); diff --git a/python/examples/udf/udf_example.py b/python/examples/udf/udf_example.py index df719fec0e6..46ab5e2e073 100644 --- a/python/examples/udf/udf_example.py +++ b/python/examples/udf/udf_example.py @@ -14,248 +14,271 @@ def get_function_doc(summary:str, desc:str, arg_names:List[str], func_doc["options_required"] = False return func_doc -arity_1 = Arity.unary() -func_name_1 = "python_udf" -# TODO: evaluate this properly, the input type can be a record_batch, array or a table -# Caveat, a recordbatch or a table does not have type information. -in_types_1 = [InputType.array(pa.int64())] -# TODO: evaluate this properly, whether the output type can support table, array or recordbatch -out_type_1 = pa.int64() - -def py_function(arrow_array): - p_new_array = call_function("add", [arrow_array, 1]) - return p_new_array - -def simple_function(args): - print("=" * 80) - print(f"Hello From Python : {args}") - print("=" * 80) - return args - """ Array Usage """ -# example 1 -print("=" * 80) -print("Example 1") -print("=" * 80) -doc_1 = get_function_doc("simple function", "test simple function", - ["message"], "None") -register_function(func_name_1, arity_1, doc_1, in_types_1, out_type_1, simple_function) +# # Example 1: Array Unary +# print("=" * 80) +# print("Example 1: Array Unary") +# print("=" * 80) -func1 = pc.get_function(func_name_1) +# def add_constant(array): +# return pc.call_function("add", [array, 1]) -a1_1 = pc.call_function(func_name_1, [pa.array([20])]) +# func_name_1 = "py_add_func" +# arity_1 = Arity.unary() +# in_types_1 = [InputType.array(pa.int64())] +# out_type_1 = pa.int64() +# doc_1 = get_function_doc("add function", "test add function", +# ["value"], "None") +# register_function(func_name_1, arity_1, doc_1, in_types_1, out_type_1, add_constant) -print(a1_1) +# func1 = pc.get_function(func_name_1) -a1_2 = pc.call_function(func_name_1, [pa.array([30])]) +# a1_1 = pc.call_function(func_name_1, [pa.array([20])]) -print(a1_2) +# print(a1_1) -# example 2 -print("=" * 80) -print("Example 2") -print("=" * 80) +# a1_2 = pc.call_function(func_name_1, [pa.array([30])]) -def add_constant(array): - return pc.call_function("add", [array, 1]) +# print(a1_2) -func_name_2 = "py_add_func" -arity_2 = Arity.unary() -in_types_2 = [InputType.array(pa.int64())] -out_type_2 = pa.int64() -doc_2 = get_function_doc("add function", "test add function", - ["value"], "None") -register_function(func_name_2, arity_2, doc_2, in_types_2, out_type_2, add_constant) -func2 = pc.get_function(func_name_2) +# # Example 2: Array Binary +# print("=" * 80) +# print("Example 2: Array Binary") +# print("=" * 80) +# arity_2 = Arity.binary() +# func_name_2 = "array_udf_binary_add" +# in_types_2 = [InputType.array(pa.int64()), InputType.array(pa.int64())] +# out_type_2 = pa.int64() +# array_binary_add_function_doc = get_function_doc("array bin add function", +# "test array bin add function", +# ["array_value1", "array_value2"], "None") -a2_1 = pc.call_function(func_name_2, [pa.array([20])]) +# def binary_array_function(array1, array2): +# return pc.call_function("add", [array1, array2]) -print(a2_1) +# register_function(func_name_2, arity_2, array_binary_add_function_doc, +# in_types_2, out_type_2, binary_array_function) -a2_2 = pc.call_function(func_name_2, [pa.array([30])]) +# func2 = pc.get_function(func_name_2) -print(a2_2) +# a2_1 = pc.call_function(func_name_2, [pa.array([10, 11]), pa.array([20, 21])]) +# print(a2_1) -# Binary Function [Array] -print("=" * 80) -print("Array Data Binary Function Example 3") -print("=" * 80) -arity_3 = Arity.binary() -func_name_3 = "array_udf_binary_add" -in_types_3 = [InputType.array(pa.int64()), InputType.array(pa.int64())] -out_type_3 = pa.int64() -array_binary_add_function_doc = get_function_doc("array bin add function", - "test array bin add function", - ["array_value1", "array_value2"], "None") +# a2_2 = pc.call_function(func_name_3, [pa.array([1, 2]), pa.array([10, 20])]) -def binary_array_function(array1, array2): - return pc.call_function("add", [array1, array2]) +# print(a2_2) -register_function(func_name_3, arity_3, array_binary_add_function_doc, - in_types_3, out_type_3, binary_array_function) -func3 = pc.get_function(func_name_3) +# # Example 3: Array Ternary +# print("=" * 80) +# print("Example 3: Array Ternary") +# print("=" * 80) +# arity_3 = Arity.ternary() +# func_name_3 = "array_udf_ternary_add" +# in_types_3 = [InputType.array(pa.int64()), +# InputType.array(pa.int64()), +# InputType.array(pa.int64())] +# out_type_3 = pa.int64() +# array_ternary_add_function_doc = get_function_doc("array ternary add function", +# "test array ternary add function", +# ["array_value1", "array_value2", "array_value3"], "None") -a3_1 = pc.call_function(func_name_3, [pa.array([10, 11]), pa.array([20, 21])]) +# def ternary_array_function(array1, array2, array3): +# return pc.call_function("add", +# [pc.call_function("add", [array1, array2]), +# array3]) -print(a3_1) +# register_function(func_name_3, arity_3, array_ternary_add_function_doc, +# in_types_3, out_type_3, ternary_array_function) -a3_2 = pc.call_function(func_name_3, [pa.array([1, 2]), pa.array([10, 20])]) +# func3 = pc.get_function(func_name_3) -print(a3_2) +# a3_1 = pc.call_function(func_name_3, [pa.array([10, 11]), +# pa.array([20, 21]), +# pa.array([30, 31])]) +# print(a3_1) -# Ternary Function [Array] -print("=" * 80) -print("Array Data Ternary Function Example 4") -print("=" * 80) -arity_4 = Arity.ternary() -func_name_4 = "array_udf_ternary_add" -in_types_4 = [InputType.array(pa.int64()), - InputType.array(pa.int64()), - InputType.array(pa.int64())] -out_type_4 = pa.int64() -array_ternary_add_function_doc = get_function_doc("array ternary add function", - "test array ternary add function", - ["array_value1", "array_value2", "array_value3"], "None") +# a3_2 = pc.call_function(func_name_3, [pa.array([1, 2]), +# pa.array([10, 20]), +# pa.array([100, 200]) +# ]) -def ternary_array_function(array1, array2, array3): - return pc.call_function("add", - [pc.call_function("add", [array1, array2]), - array3]) +# print(a3_2) -register_function(func_name_4, arity_4, array_ternary_add_function_doc, - in_types_4, out_type_4, ternary_array_function) -func4 = pc.get_function(func_name_4) +# # Example 4: Array VarArgs +# print("=" * 80) +# print("Example 4: Array VarArgs") +# print("=" * 80) +# arity_4 = Arity.varargs(4) +# func_name_4 = "array_udf_varargs_add" +# in_types_4 = [InputType.array(pa.int64()), +# InputType.array(pa.int64()), +# InputType.array(pa.int64()), +# InputType.array(pa.int64()) +# ] +# out_type_4 = pa.int64() +# array_varargs_add_function_doc = get_function_doc("array varargs add function", +# "test array varargs add function", +# ["array_value1", "array_value2", +# "array_value3", "array_value4"], +# "None") + +# def varargs_array_function(array1, array2, array3, array4): +# array12 = pc.call_function("add", [array1, array2]) +# array34 = pc.call_function("add", [array3, array4]) +# return pc.call_function("add", [array12, array34]) + +# register_function(func_name_4, arity_4, array_varargs_add_function_doc, +# in_types_4, out_type_4, varargs_array_function) + +# func4 = pc.get_function(func_name_4) + +# a4_1 = pc.call_function(func_name_4, [pa.array([10, 11]), +# pa.array([20, 21]), +# pa.array([30, 31]), +# pa.array([40, 41])]) + +# print(a4_1) -a4_1 = pc.call_function(func_name_4, [pa.array([10, 11]), - pa.array([20, 21]), - pa.array([30, 31])]) +# a4_2 = pc.call_function(func_name_4, [pa.array([1, 2]), +# pa.array([10, 20]), +# pa.array([100, 200]), +# pa.array([1000, 2000]) +# ]) -print(a4_1) +# print(a4_2) -a4_2 = pc.call_function(func_name_4, [pa.array([1, 2]), - pa.array([10, 20]), - pa.array([100, 200]) - ]) -print(a4_2) +""" +Scalar Usage +""" +# Example 5: Scalar Unary -# VarArgs Function [Array] print("=" * 80) -print("Array Data VarArgs Function Example 5") +print("Example 5: Scalar Unary ") print("=" * 80) -arity_5 = Arity.varargs(4) -func_name_5 = "array_udf_varargs_add" -in_types_5 = [InputType.array(pa.int64()), - InputType.array(pa.int64()), - InputType.array(pa.int64()), - InputType.array(pa.int64()) - ] -out_type_5 = pa.int64() -array_varargs_add_function_doc = get_function_doc("array varargs add function", - "test array varargs add function", - ["array_value1", "array_value2", - "array_value3", "array_value4"], - "None") -def varargs_array_function(array1, array2, array3, array4): - array12 = pc.call_function("add", [array1, array2]) - array34 = pc.call_function("add", [array3, array4]) - return pc.call_function("add", [array12, array34]) +def unary_scalar_function(scalar): + return pc.call_function("add", [scalar, 1]) -register_function(func_name_5, arity_5, array_varargs_add_function_doc, - in_types_5, out_type_5, varargs_array_function) +arity_5 = Arity.unary() +func_name_5 = "py_scalar_add_func" +in_types_5 = [InputType.scalar(pa.int64())] +out_type_5 = pa.int64() +doc_5 = get_function_doc("scalar add function", "test scalar add function", + ["scalar_value"], "None") +register_function(func_name_5, arity_5, doc_5, in_types_5, out_type_5, unary_scalar_function) func5 = pc.get_function(func_name_5) -a5_1 = pc.call_function(func_name_5, [pa.array([10, 11]), - pa.array([20, 21]), - pa.array([30, 31]), - pa.array([40, 41])]) +a5_1 = pc.call_function(func_name_5, [pa.scalar(10)]) print(a5_1) -a5_2 = pc.call_function(func_name_5, [pa.array([1, 2]), - pa.array([10, 20]), - pa.array([100, 200]), - pa.array([1000, 2000]) - ]) +a5_2 = pc.call_function(func_name_5, [pa.scalar(1)]) print(a5_2) -""" -Scalar Usage -""" +# Example 6: Scalar Binary +print("=" * 80) +print("Example 6: Scalar Binary") +print("=" * 80) +arity_7 = Arity.binary() +func_name_7 = "scalar_udf_binary_add" +in_types_7 = [InputType.scalar(pa.int64()), InputType.scalar(pa.int64())] +out_type_7 = pa.int64() +scalar_binary_add_function_doc = get_function_doc("scalar bin add function", + "test scalar bin add function", + ["scalar_value1", "scalar_value2"], "None") -# # example 4 +def binary_scalar_function(scalar1, scalar2): + return pc.call_function("add", [scalar1, scalar2]) -# print("=" * 80) -# print("Example 4") -# print("=" * 80) +register_function(func_name_7, arity_7, scalar_binary_add_function_doc, + in_types_7, out_type_7, binary_scalar_function) -# def unary_scalar_function(scalar): -# return pc.call_function("add", [scalar, 1]) +func7 = pc.get_function(func_name_7) -# arity_3 = Arity.unary() -# func_name_3 = "py_scalar_add_func" -# in_types_3 = [InputType.scalar(pa.int64())] -# out_type_3 = pa.int64() -# doc_3 = get_function_doc("scalar add function", "test scalar add function", -# ["scalar_value"], "None") -# register_function(func_name_3, arity_3, doc_3, in_types_3, out_type_3, unary_scalar_function) +a7_1 = pc.call_function(func_name_7, [pa.scalar(10), pa.scalar(20)]) -# func3 = pc.get_function(func_name_3) +print(a7_1) -# a3_1 = pc.call_function(func_name_3, [pa.scalar(10)]) +a7_2 = pc.call_function(func_name_7, [pa.scalar(50), pa.scalar(30)]) -# print(a3_1) +print(a7_2) -# a3_2 = pc.call_function(func_name_3, [pa.scalar(100)]) +# Example 8: Scalar Ternary +print("=" * 80) +print("Example 8: Scalar Ternary") +print("=" * 80) +arity_8 = Arity.ternary() +func_name_8 = "scalar_udf_ternary_add" +in_types_8 = [InputType.scalar(pa.int64()), + InputType.scalar(pa.int64()), + InputType.scalar(pa.int64())] +out_type_8 = pa.int64() +scalar_ternary_add_function_doc = get_function_doc("scalar ternary add function", + "test scalar ternary add function", + ["scalar_value1", "scalar_value2", + "scalar_value3"], "None") -# print(a3_2) +def ternary_scalar_function(scalar1, scalar2, scalar3): + return pc.call_function("add", [pc.call_function("add", [scalar1, scalar2]), scalar3]) -## Binary Function [Scalar] -# print("=" * 80) -# print("Scalar Binary Example 5") -# print("=" * 80) -# arity_4 = Arity.binary() -# func_name_4 = "scalar_udf_binary_add" -# # TODO: evaluate this properly, the input type can be a record_batch, array or a table -# # Caveat, a recordbatch or a table does not have type information. -# in_types_4 = [InputType.scalar(pa.int64()), InputType.scalar(pa.int64())] -# # TODO: evaluate this properly, whether the output type can support table, array or recordbatch -# out_type_4 = pa.int64() -# scalar_binary_add_function_doc = get_function_doc("scalar bin add function", -# "test scalar bin add function", -# ["scalar_value1", "scalar_value2"], "None") +register_function(func_name_8, arity_8, scalar_ternary_add_function_doc, + in_types_8, out_type_8, ternary_scalar_function) -# def binary_scalar_function(scalar1, scalar2): -# return pc.call_function("add", [scalar1, scalar2]) +func8 = pc.get_function(func_name_8) -# register_function(func_name_4, arity_4, scalar_binary_add_function_doc, in_types_4, out_type_4, binary_scalar_function) +a8_1 = pc.call_function(func_name_8, [pa.scalar(10), pa.scalar(20), pa.scalar(30)]) -# func4 = pc.get_function(func_name_4) +print(a8_1) -# a4_1 = pc.call_function(func_name_4, [pa.scalar(10), pa.scalar(20)]) +a8_2 = pc.call_function(func_name_8, [pa.scalar(1), pa.scalar(2), pa.scalar(3)]) -# print(a4_1) +print(a8_2) -# a4_2 = pc.call_function(func_name_4, [pa.scalar(50), pa.scalar(30)]) -# print(a4_2) +# Example 8: Scalar VarArgs +print("=" * 80) +print("Example 8: Scalar VarArgs") +print("=" * 80) +arity_8 = Arity.ternary() +func_name_8 = "scalar_udf_ternary_add" +in_types_8 = [InputType.scalar(pa.int64()), + InputType.scalar(pa.int64()), + InputType.scalar(pa.int64())] +out_type_8 = pa.int64() +scalar_ternary_add_function_doc = get_function_doc("scalar ternary add function", + "test scalar ternary add function", + ["scalar_value1", "scalar_value2", + "scalar_value3"], "None") + +def ternary_scalar_function(scalar1, scalar2, scalar3): + return pc.call_function("add", [pc.call_function("add", [scalar1, scalar2]), scalar3]) + +register_function(func_name_8, arity_8, scalar_ternary_add_function_doc, + in_types_8, out_type_8, ternary_scalar_function) + +func8 = pc.get_function(func_name_8) + +a8_1 = pc.call_function(func_name_8, [pa.scalar(10), pa.scalar(20), pa.scalar(30)]) +print(a8_1) +a8_2 = pc.call_function(func_name_8, [pa.scalar(1), pa.scalar(2), pa.scalar(3)]) +print(a8_2) From e839616315be7d3043a246d9380efae548918723 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 11 Mar 2022 09:22:10 +0530 Subject: [PATCH 037/131] adding udf example --- python/examples/udf/udf_example.py | 68 +++++++++++++++--------------- 1 file changed, 33 insertions(+), 35 deletions(-) diff --git a/python/examples/udf/udf_example.py b/python/examples/udf/udf_example.py index 46ab5e2e073..0af8952d867 100644 --- a/python/examples/udf/udf_example.py +++ b/python/examples/udf/udf_example.py @@ -193,10 +193,10 @@ def unary_scalar_function(scalar): print("=" * 80) print("Example 6: Scalar Binary") print("=" * 80) -arity_7 = Arity.binary() -func_name_7 = "scalar_udf_binary_add" -in_types_7 = [InputType.scalar(pa.int64()), InputType.scalar(pa.int64())] -out_type_7 = pa.int64() +arity_6 = Arity.binary() +func_name_6 = "scalar_udf_binary_add" +in_types_6 = [InputType.scalar(pa.int64()), InputType.scalar(pa.int64())] +out_type_6 = pa.int64() scalar_binary_add_function_doc = get_function_doc("scalar bin add function", "test scalar bin add function", ["scalar_value1", "scalar_value2"], "None") @@ -204,29 +204,29 @@ def unary_scalar_function(scalar): def binary_scalar_function(scalar1, scalar2): return pc.call_function("add", [scalar1, scalar2]) -register_function(func_name_7, arity_7, scalar_binary_add_function_doc, - in_types_7, out_type_7, binary_scalar_function) +register_function(func_name_6, arity_6, scalar_binary_add_function_doc, + in_types_6, out_type_6, binary_scalar_function) -func7 = pc.get_function(func_name_7) +func6 = pc.get_function(func_name_6) -a7_1 = pc.call_function(func_name_7, [pa.scalar(10), pa.scalar(20)]) +a6_1 = pc.call_function(func_name_6, [pa.scalar(10), pa.scalar(20)]) -print(a7_1) +print(a6_1) -a7_2 = pc.call_function(func_name_7, [pa.scalar(50), pa.scalar(30)]) +a6_2 = pc.call_function(func_name_6, [pa.scalar(50), pa.scalar(30)]) -print(a7_2) +print(a6_2) # Example 8: Scalar Ternary print("=" * 80) -print("Example 8: Scalar Ternary") +print("Example 7: Scalar Ternary") print("=" * 80) -arity_8 = Arity.ternary() -func_name_8 = "scalar_udf_ternary_add" -in_types_8 = [InputType.scalar(pa.int64()), +arity_7 = Arity.ternary() +func_name_7 = "scalar_udf_ternary_add" +in_types_7 = [InputType.scalar(pa.int64()), InputType.scalar(pa.int64()), InputType.scalar(pa.int64())] -out_type_8 = pa.int64() +out_type_7 = pa.int64() scalar_ternary_add_function_doc = get_function_doc("scalar ternary add function", "test scalar ternary add function", ["scalar_value1", "scalar_value2", @@ -235,52 +235,50 @@ def binary_scalar_function(scalar1, scalar2): def ternary_scalar_function(scalar1, scalar2, scalar3): return pc.call_function("add", [pc.call_function("add", [scalar1, scalar2]), scalar3]) -register_function(func_name_8, arity_8, scalar_ternary_add_function_doc, - in_types_8, out_type_8, ternary_scalar_function) +register_function(func_name_7, arity_7, scalar_ternary_add_function_doc, + in_types_7, out_type_7, ternary_scalar_function) -func8 = pc.get_function(func_name_8) +func7 = pc.get_function(func_name_7) -a8_1 = pc.call_function(func_name_8, [pa.scalar(10), pa.scalar(20), pa.scalar(30)]) +a7_1 = pc.call_function(func_name_7, [pa.scalar(10), pa.scalar(20), pa.scalar(30)]) -print(a8_1) +print(a7_1) -a8_2 = pc.call_function(func_name_8, [pa.scalar(1), pa.scalar(2), pa.scalar(3)]) +a7_2 = pc.call_function(func_name_7, [pa.scalar(1), pa.scalar(2), pa.scalar(3)]) -print(a8_2) +print(a7_2) # Example 8: Scalar VarArgs print("=" * 80) print("Example 8: Scalar VarArgs") print("=" * 80) -arity_8 = Arity.ternary() -func_name_8 = "scalar_udf_ternary_add" +arity_8 = Arity.varargs(4) +func_name_8 = "scalar_udf_varargs_add" in_types_8 = [InputType.scalar(pa.int64()), + InputType.scalar(pa.int64()), InputType.scalar(pa.int64()), InputType.scalar(pa.int64())] out_type_8 = pa.int64() scalar_ternary_add_function_doc = get_function_doc("scalar ternary add function", "test scalar ternary add function", ["scalar_value1", "scalar_value2", - "scalar_value3"], "None") + "scalar_value3", "scalar_value4"], "None") -def ternary_scalar_function(scalar1, scalar2, scalar3): - return pc.call_function("add", [pc.call_function("add", [scalar1, scalar2]), scalar3]) +def ternary_scalar_function(scalar1, scalar2, scalar3, scalar4): + return pc.call_function("add", [pc.call_function("add", + [pc.call_function("add", [scalar1, scalar2]), + scalar3]), scalar4]) register_function(func_name_8, arity_8, scalar_ternary_add_function_doc, in_types_8, out_type_8, ternary_scalar_function) func8 = pc.get_function(func_name_8) -a8_1 = pc.call_function(func_name_8, [pa.scalar(10), pa.scalar(20), pa.scalar(30)]) +a8_1 = pc.call_function(func_name_8, [pa.scalar(10), pa.scalar(20), pa.scalar(30), pa.scalar(40)]) print(a8_1) -a8_2 = pc.call_function(func_name_8, [pa.scalar(1), pa.scalar(2), pa.scalar(3)]) +a8_2 = pc.call_function(func_name_8, [pa.scalar(1), pa.scalar(2), pa.scalar(3), pa.scalar(4)]) print(a8_2) - - - - - From 35a94c284ceb243a67a6abbaad0c145f16978ca1 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Mon, 14 Mar 2022 11:14:20 +0530 Subject: [PATCH 038/131] refactor code and adding python test cases --- cpp/src/arrow/python/udf.cc | 49 ++--- python/examples/udf/udf_example.py | 298 +++++++++++++++------------ python/pyarrow/_compute.pxd | 9 +- python/pyarrow/_compute.pyx | 274 ++++++++++++++++-------- python/pyarrow/compute.py | 4 +- python/pyarrow/includes/libarrow.pxd | 17 +- python/pyarrow/tests/test_udf.py | 287 ++++++++++++++++++++++++++ python/setup.py | 3 - 8 files changed, 664 insertions(+), 277 deletions(-) create mode 100644 python/pyarrow/tests/test_udf.py diff --git a/cpp/src/arrow/python/udf.cc b/cpp/src/arrow/python/udf.cc index 20bc570b274..838f0914d7e 100644 --- a/cpp/src/arrow/python/udf.cc +++ b/cpp/src/arrow/python/udf.cc @@ -62,32 +62,6 @@ DEFINE_CALL_UDF(Array, array, make_array) #undef DEFINE_CALL_UDF -// Status exec_function_scalar(const cp::ExecBatch& batch, PyObject* function, -// int num_args, Datum* out) { -// std::shared_ptr c_res_data; -// PyObject* arg_tuple = PyTuple_New(num_args); -// for (int arg_id = 0; arg_id < num_args; arg_id++) { -// if (!batch[arg_id].is_scalar()) { -// return Status::Invalid("Input type and data type doesn't match"); -// } -// auto c_data = batch[arg_id].scalar(); -// PyObject* data = wrap_scalar(c_data); -// PyTuple_SetItem(arg_tuple, arg_id, data); -// } -// PyObject* result = PyObject_CallObject(function, arg_tuple); -// if (result == NULL) { -// return Status::ExecutionError("Error occured in computation"); -// } -// auto res = unwrap_scalar(result); -// if (!res.status().ok()) { -// return res.status(); -// } -// c_res_data = res.ValueOrDie(); -// auto datum = new Datum(c_res_data); -// *out = *datum; -// return Status::OK(); -// } - Status VerifyArityAndInput(cp::Arity arity, const cp::ExecBatch& batch) { bool match = (uint64_t)arity.num_args == batch.values.size(); if (!match) { @@ -101,21 +75,23 @@ Status ScalarUdfBuilder::MakeFunction(PyObject* function) { Status st; auto func = std::make_shared(this->name(), this->arity(), &this->doc()); + // creating a copy of objects for the lambda function + auto py_function = function; + auto arity = this->arity(); // lambda function - auto call_back_lambda = [function, this](cp::KernelContext* ctx, - const cp::ExecBatch& batch, - Datum* out) -> Status { + auto call_back_lambda = [py_function, arity](cp::KernelContext* ctx, + const cp::ExecBatch& batch, + Datum* out) -> Status { PyAcquireGIL lock; - if (function == NULL) { + if (py_function == NULL) { return Status::ExecutionError("python function cannot be null"); } - - if (PyCallable_Check(function)) { - RETURN_NOT_OK(VerifyArityAndInput(this->arity(), batch)); + if (PyCallable_Check(py_function)) { + RETURN_NOT_OK(VerifyArityAndInput(arity, batch)); if (batch[0].is_array()) { // checke 0-th element to select array callable - RETURN_NOT_OK(exec_function_array(batch, function, this->arity().num_args, out)); + RETURN_NOT_OK(exec_function_array(batch, py_function, arity.num_args, out)); } else if (batch[0].is_scalar()) { // check 0-th element to select scalar callable - RETURN_NOT_OK(exec_function_scalar(batch, function, this->arity().num_args, out)); + RETURN_NOT_OK(exec_function_scalar(batch, py_function, arity.num_args, out)); } else { return Status::Invalid("Unexpected input type, scalar or array type expected."); } @@ -123,9 +99,8 @@ Status ScalarUdfBuilder::MakeFunction(PyObject* function) { return Status::ExecutionError("Expected a callable python object."); } return Status::OK(); - }; + }; // lambda function - // lambda function cp::ScalarKernel kernel( cp::KernelSignature::Make(this->input_types(), this->output_type(), this->arity().is_varargs), diff --git a/python/examples/udf/udf_example.py b/python/examples/udf/udf_example.py index 0af8952d867..905cd0dd294 100644 --- a/python/examples/udf/udf_example.py +++ b/python/examples/udf/udf_example.py @@ -1,11 +1,12 @@ from typing import List import pyarrow as pa from pyarrow import compute as pc -from pyarrow.compute import call_function, register_function +from pyarrow.compute import register_function from pyarrow.compute import Arity, InputType -def get_function_doc(summary:str, desc:str, arg_names:List[str], - options_class:str, options_required:bool=False): + +def get_function_doc(summary: str, desc: str, arg_names: List[str], + options_class: str, options_required: bool = False): func_doc = {} func_doc["summary"] = summary func_doc["description"] = desc @@ -19,164 +20,179 @@ def get_function_doc(summary:str, desc:str, arg_names:List[str], Array Usage """ -# # Example 1: Array Unary -# print("=" * 80) -# print("Example 1: Array Unary") -# print("=" * 80) +# Example 1: Array Unary +print("=" * 80) +print("Example 1: Array Unary") +print("=" * 80) + + +def add_constant(array): + return pc.call_function("add", [array, 1]) + + +func_name_1 = "py_add_func" +arity_1 = Arity.unary() +in_types_1 = [InputType.array(pa.int64())] +out_type_1 = pa.int64() +doc_1 = get_function_doc("add function", "test add function", + ["value"], "None") +register_function(func_name_1, arity_1, doc_1, + in_types_1, out_type_1, add_constant) -# def add_constant(array): -# return pc.call_function("add", [array, 1]) +func1 = pc.get_function(func_name_1) -# func_name_1 = "py_add_func" -# arity_1 = Arity.unary() -# in_types_1 = [InputType.array(pa.int64())] -# out_type_1 = pa.int64() -# doc_1 = get_function_doc("add function", "test add function", -# ["value"], "None") -# register_function(func_name_1, arity_1, doc_1, in_types_1, out_type_1, add_constant) +a1_1 = pc.call_function(func_name_1, [pa.array([20])]) -# func1 = pc.get_function(func_name_1) +print(a1_1) + +a1_2 = pc.call_function(func_name_1, [pa.array([30])]) + +print(a1_2) + + +# Example 2: Array Binary +print("=" * 80) +print("Example 2: Array Binary") +print("=" * 80) +arity_2 = Arity.binary() +func_name_2 = "array_udf_binary_add" +in_types_2 = [InputType.array(pa.int64()), InputType.array(pa.int64())] +out_type_2 = pa.int64() +array_binary_add_function_doc = get_function_doc( + "array bin add function", + "test array bin add function", + ["array_value1", "array_value2"], "None") -# a1_1 = pc.call_function(func_name_1, [pa.array([20])]) -# print(a1_1) +def binary_array_function(array1, array2): + return pc.call_function("add", [array1, array2]) -# a1_2 = pc.call_function(func_name_1, [pa.array([30])]) -# print(a1_2) +register_function(func_name_2, arity_2, array_binary_add_function_doc, + in_types_2, out_type_2, binary_array_function) +func2 = pc.get_function(func_name_2) -# # Example 2: Array Binary -# print("=" * 80) -# print("Example 2: Array Binary") -# print("=" * 80) -# arity_2 = Arity.binary() -# func_name_2 = "array_udf_binary_add" -# in_types_2 = [InputType.array(pa.int64()), InputType.array(pa.int64())] -# out_type_2 = pa.int64() -# array_binary_add_function_doc = get_function_doc("array bin add function", -# "test array bin add function", -# ["array_value1", "array_value2"], "None") +a2_1 = pc.call_function(func_name_2, [pa.array([10, 11]), pa.array([20, 21])]) -# def binary_array_function(array1, array2): -# return pc.call_function("add", [array1, array2]) +print(a2_1) -# register_function(func_name_2, arity_2, array_binary_add_function_doc, -# in_types_2, out_type_2, binary_array_function) +a2_2 = pc.call_function(func_name_2, [pa.array([1, 2]), pa.array([10, 20])]) -# func2 = pc.get_function(func_name_2) +print(a2_2) -# a2_1 = pc.call_function(func_name_2, [pa.array([10, 11]), pa.array([20, 21])]) -# print(a2_1) +# Example 3: Array Ternary +print("=" * 80) +print("Example 3: Array Ternary") +print("=" * 80) +arity_3 = Arity.ternary() +func_name_3 = "array_udf_ternary_add" +in_types_3 = [InputType.array(pa.int64()), + InputType.array(pa.int64()), + InputType.array(pa.int64())] +out_type_3 = pa.int64() +array_ternary_add_function_doc = get_function_doc( + "array ternary add function", + "test array ternary add function", + ["array_value1", "array_value2", "array_value3"], "None") -# a2_2 = pc.call_function(func_name_3, [pa.array([1, 2]), pa.array([10, 20])]) -# print(a2_2) +def ternary_array_function(array1, array2, array3): + return pc.call_function("add", + [pc.call_function("add", [array1, array2]), + array3]) -# # Example 3: Array Ternary -# print("=" * 80) -# print("Example 3: Array Ternary") -# print("=" * 80) -# arity_3 = Arity.ternary() -# func_name_3 = "array_udf_ternary_add" -# in_types_3 = [InputType.array(pa.int64()), -# InputType.array(pa.int64()), -# InputType.array(pa.int64())] -# out_type_3 = pa.int64() -# array_ternary_add_function_doc = get_function_doc("array ternary add function", -# "test array ternary add function", -# ["array_value1", "array_value2", "array_value3"], "None") +register_function(func_name_3, arity_3, array_ternary_add_function_doc, + in_types_3, out_type_3, ternary_array_function) -# def ternary_array_function(array1, array2, array3): -# return pc.call_function("add", -# [pc.call_function("add", [array1, array2]), -# array3]) +func3 = pc.get_function(func_name_3) -# register_function(func_name_3, arity_3, array_ternary_add_function_doc, -# in_types_3, out_type_3, ternary_array_function) +a3_1 = pc.call_function(func_name_3, [pa.array([10, 11]), + pa.array([20, 21]), + pa.array([30, 31])]) -# func3 = pc.get_function(func_name_3) +print(a3_1) -# a3_1 = pc.call_function(func_name_3, [pa.array([10, 11]), -# pa.array([20, 21]), -# pa.array([30, 31])]) +a3_2 = pc.call_function(func_name_3, [pa.array([1, 2]), + pa.array([10, 20]), + pa.array([100, 200]) + ]) -# print(a3_1) +print(a3_2) -# a3_2 = pc.call_function(func_name_3, [pa.array([1, 2]), -# pa.array([10, 20]), -# pa.array([100, 200]) -# ]) -# print(a3_2) +# Example 4: Array VarArgs +print("=" * 80) +print("Example 4: Array VarArgs") +print("=" * 80) +arity_4 = Arity.varargs(4) +func_name_4 = "array_udf_varargs_add" +in_types_4 = [InputType.array(pa.int64()), + InputType.array(pa.int64()), + InputType.array(pa.int64()), + InputType.array(pa.int64()) + ] +out_type_4 = pa.int64() +array_varargs_add_function_doc = get_function_doc( + "array varargs add function", + "test array varargs add function", + ["array_value1", "array_value2", + "array_value3", "array_value4"], + "None") -# # Example 4: Array VarArgs -# print("=" * 80) -# print("Example 4: Array VarArgs") -# print("=" * 80) -# arity_4 = Arity.varargs(4) -# func_name_4 = "array_udf_varargs_add" -# in_types_4 = [InputType.array(pa.int64()), -# InputType.array(pa.int64()), -# InputType.array(pa.int64()), -# InputType.array(pa.int64()) -# ] -# out_type_4 = pa.int64() -# array_varargs_add_function_doc = get_function_doc("array varargs add function", -# "test array varargs add function", -# ["array_value1", "array_value2", -# "array_value3", "array_value4"], -# "None") +def varargs_array_function(array1, array2, array3, array4): + array12 = pc.call_function("add", [array1, array2]) + array34 = pc.call_function("add", [array3, array4]) + return pc.call_function("add", [array12, array34]) -# def varargs_array_function(array1, array2, array3, array4): -# array12 = pc.call_function("add", [array1, array2]) -# array34 = pc.call_function("add", [array3, array4]) -# return pc.call_function("add", [array12, array34]) -# register_function(func_name_4, arity_4, array_varargs_add_function_doc, -# in_types_4, out_type_4, varargs_array_function) +register_function(func_name_4, arity_4, array_varargs_add_function_doc, + in_types_4, out_type_4, varargs_array_function) -# func4 = pc.get_function(func_name_4) +func4 = pc.get_function(func_name_4) -# a4_1 = pc.call_function(func_name_4, [pa.array([10, 11]), -# pa.array([20, 21]), -# pa.array([30, 31]), -# pa.array([40, 41])]) +a4_1 = pc.call_function(func_name_4, [pa.array([10, 11]), + pa.array([20, 21]), + pa.array([30, 31]), + pa.array([40, 41])]) -# print(a4_1) +print(a4_1) -# a4_2 = pc.call_function(func_name_4, [pa.array([1, 2]), -# pa.array([10, 20]), -# pa.array([100, 200]), -# pa.array([1000, 2000]) -# ]) +a4_2 = pc.call_function(func_name_4, [pa.array([1, 2]), + pa.array([10, 20]), + pa.array([100, 200]), + pa.array([1000, 2000]) + ]) -# print(a4_2) +print(a4_2) """ Scalar Usage """ -# Example 5: Scalar Unary +# Example 5: Scalar Unary print("=" * 80) print("Example 5: Scalar Unary ") print("=" * 80) + def unary_scalar_function(scalar): return pc.call_function("add", [scalar, 1]) + arity_5 = Arity.unary() func_name_5 = "py_scalar_add_func" in_types_5 = [InputType.scalar(pa.int64())] out_type_5 = pa.int64() doc_5 = get_function_doc("scalar add function", "test scalar add function", - ["scalar_value"], "None") -register_function(func_name_5, arity_5, doc_5, in_types_5, out_type_5, unary_scalar_function) + ["scalar_value"], "None") +register_function(func_name_5, arity_5, doc_5, in_types_5, + out_type_5, unary_scalar_function) func5 = pc.get_function(func_name_5) @@ -197,15 +213,18 @@ def unary_scalar_function(scalar): func_name_6 = "scalar_udf_binary_add" in_types_6 = [InputType.scalar(pa.int64()), InputType.scalar(pa.int64())] out_type_6 = pa.int64() -scalar_binary_add_function_doc = get_function_doc("scalar bin add function", - "test scalar bin add function", - ["scalar_value1", "scalar_value2"], "None") +scalar_binary_add_function_doc = get_function_doc( + "scalar bin add function", + "test scalar bin add function", + ["scalar_value1", "scalar_value2"], "None") + def binary_scalar_function(scalar1, scalar2): return pc.call_function("add", [scalar1, scalar2]) + register_function(func_name_6, arity_6, scalar_binary_add_function_doc, - in_types_6, out_type_6, binary_scalar_function) + in_types_6, out_type_6, binary_scalar_function) func6 = pc.get_function(func_name_6) @@ -227,24 +246,32 @@ def binary_scalar_function(scalar1, scalar2): InputType.scalar(pa.int64()), InputType.scalar(pa.int64())] out_type_7 = pa.int64() -scalar_ternary_add_function_doc = get_function_doc("scalar ternary add function", - "test scalar ternary add function", - ["scalar_value1", "scalar_value2", - "scalar_value3"], "None") +scalar_ternary_add_function_doc = get_function_doc( + "scalar ternary add function", + "test scalar ternary add function", + ["scalar_value1", "scalar_value2", + "scalar_value3"], "None") + def ternary_scalar_function(scalar1, scalar2, scalar3): - return pc.call_function("add", [pc.call_function("add", [scalar1, scalar2]), scalar3]) + return pc.call_function("add", + [pc.call_function("add", + [scalar1, scalar2]), + scalar3]) + register_function(func_name_7, arity_7, scalar_ternary_add_function_doc, - in_types_7, out_type_7, ternary_scalar_function) + in_types_7, out_type_7, ternary_scalar_function) func7 = pc.get_function(func_name_7) -a7_1 = pc.call_function(func_name_7, [pa.scalar(10), pa.scalar(20), pa.scalar(30)]) +a7_1 = pc.call_function( + func_name_7, [pa.scalar(10), pa.scalar(20), pa.scalar(30)]) print(a7_1) -a7_2 = pc.call_function(func_name_7, [pa.scalar(1), pa.scalar(2), pa.scalar(3)]) +a7_2 = pc.call_function( + func_name_7, [pa.scalar(1), pa.scalar(2), pa.scalar(3)]) print(a7_2) @@ -260,25 +287,36 @@ def ternary_scalar_function(scalar1, scalar2, scalar3): InputType.scalar(pa.int64()), InputType.scalar(pa.int64())] out_type_8 = pa.int64() -scalar_ternary_add_function_doc = get_function_doc("scalar ternary add function", - "test scalar ternary add function", - ["scalar_value1", "scalar_value2", - "scalar_value3", "scalar_value4"], "None") + +scalar_ternary_add_function_doc = get_function_doc( + "scalar ternary add function", + "test scalar ternary add function", + ["scalar_value1", + "scalar_value2", + "scalar_value3", + "scalar_value4"], "None") + def ternary_scalar_function(scalar1, scalar2, scalar3, scalar4): - return pc.call_function("add", [pc.call_function("add", - [pc.call_function("add", [scalar1, scalar2]), - scalar3]), scalar4]) + return pc.call_function("add", + [pc.call_function("add", + [pc.call_function("add", + [scalar1, + scalar2]), + scalar3]), scalar4]) + register_function(func_name_8, arity_8, scalar_ternary_add_function_doc, - in_types_8, out_type_8, ternary_scalar_function) + in_types_8, out_type_8, ternary_scalar_function) func8 = pc.get_function(func_name_8) -a8_1 = pc.call_function(func_name_8, [pa.scalar(10), pa.scalar(20), pa.scalar(30), pa.scalar(40)]) +a8_1 = pc.call_function(func_name_8, [pa.scalar( + 10), pa.scalar(20), pa.scalar(30), pa.scalar(40)]) print(a8_1) -a8_2 = pc.call_function(func_name_8, [pa.scalar(1), pa.scalar(2), pa.scalar(3), pa.scalar(4)]) +a8_2 = pc.call_function(func_name_8, [pa.scalar( + 1), pa.scalar(2), pa.scalar(3), pa.scalar(4)]) print(a8_2) diff --git a/python/pyarrow/_compute.pxd b/python/pyarrow/_compute.pxd index 1274eb0aeeb..6fa1899fb66 100644 --- a/python/pyarrow/_compute.pxd +++ b/python/pyarrow/_compute.pxd @@ -24,21 +24,16 @@ from pyarrow.includes.libarrow cimport * cdef class Arity(_Weakrefable): cdef: CArity arity - + cdef void init(self, const CArity &arity) cdef class InputType(_Weakrefable): cdef: CInputType input_type - + cdef void init(self, const CInputType &input_type) -# cdef class FunctionDoc(_Weakrefable): -# cdef: -# CFunctionDoc function_doc - -# cdef void init(self, const CFunctionDoc &function_doc) cdef class FunctionOptions(_Weakrefable): cdef: diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index ce023d102dd..81a5c550109 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -200,29 +200,30 @@ FunctionDoc = namedtuple( ("summary", "description", "arg_names", "options_class", "options_required")) + cdef wrap_arity(const CArity c_arity): + """ + Wrap a C++ Arity in an Arity object + """ cdef Arity arity = Arity.__new__(Arity) arity.init(c_arity) return arity + cdef wrap_input_type(const CInputType c_input_type): + """ + Wrap a C++ InputType in an InputType object + """ cdef InputType input_type = InputType.__new__(InputType) input_type.init(c_input_type) return input_type -# cdef class FunctionDoc(_Weakrefable): - -# def __init__(self): -# raise TypeError("Cannot use constructor to initialize FunctionDoc") - -# cdef void init(self, const CFunctionDoc &function_doc): -# self.function_doc = function_doc - -# @staticmethod -# def create(self): -# pass cdef class InputType(_Weakrefable): + """ + An interface for defining input-types for streaming execution engine + applications. + """ def __init__(self): raise TypeError("Cannot use constructor to initialize InputType") @@ -232,6 +233,21 @@ cdef class InputType(_Weakrefable): @staticmethod def scalar(data_type): + """ + create a scalar input type of the given data type + + Parameter + --------- + data_type: DataType + + Examples + -------- + + >>> import pyarrow as pa + >>> from pyarrow.compute import InputType + >>> in_type = InputType.scalar(pa.int32()) + + """ cdef: shared_ptr[CDataType] c_data_type CInputType c_input_type @@ -241,6 +257,21 @@ cdef class InputType(_Weakrefable): @staticmethod def array(data_type): + """ + create an array input type of the given data type + + Parameter + --------- + data_type: DataType + + Examples + -------- + + >>> import pyarrow as pa + >>> from pyarrow.compute import InputType + >>> in_type = InputType.array(pa.int32()) + + """ cdef: shared_ptr[CDataType] c_data_type CInputType c_input_type @@ -250,6 +281,9 @@ cdef class InputType(_Weakrefable): cdef class Arity(_Weakrefable): + """ + An Arity object. + """ def __init__(self): raise TypeError("Cannot use constructor to initialize Arity") @@ -259,21 +293,39 @@ cdef class Arity(_Weakrefable): @staticmethod def unary(): + """ + create a unary arity object + """ cdef CArity c_arity = CArity.Unary() return wrap_arity(c_arity) - + @staticmethod def binary(): + """ + create a binary arity object + """ cdef CArity c_arity = CArity.Binary() return wrap_arity(c_arity) @staticmethod def ternary(): + """ + create a ternary arity object + """ cdef CArity c_arity = CArity.Ternary() return wrap_arity(c_arity) @staticmethod def varargs(int num_args): + """ + create a varargs arity object with defined number of arguments + + Parameter + --------- + + num_args: int + number of arguments + """ cdef CArity c_arity = CArity.VarArgs(num_args) return wrap_arity(c_arity) @@ -2358,45 +2410,51 @@ cdef CExpression _bind(Expression filter, Schema schema) except *: return GetResultValue(filter.unwrap().Bind( deref(pyarrow_unwrap_schema(schema).get()))) + cdef CFunctionDoc _make_function_doc(func_doc): + """ + Helper function to generate the FunctionDoc + """ cdef: CFunctionDoc f_doc vector[c_string] c_arg_names if isinstance(func_doc, dict): - if func_doc["summary"] and isinstance(func_doc["summary"], str): - f_doc.summary = func_doc["summary"].encode() - else: + if func_doc["summary"] and isinstance(func_doc["summary"], str): + f_doc.summary = func_doc["summary"].encode() + else: raise ValueError("key `summary` cannot be None") - - if func_doc["description"] and isinstance(func_doc["description"], str): - f_doc.description = func_doc["description"].encode() - else: + + if func_doc["description"] and isinstance(func_doc["description"], str): + f_doc.description = func_doc["description"].encode() + else: raise ValueError("key `description` cannot be None") - - if func_doc["arg_names"] and isinstance(func_doc["arg_names"], list): + + if func_doc["arg_names"] and isinstance(func_doc["arg_names"], list): for arg_name in func_doc["arg_names"]: if isinstance(arg_name, str): c_arg_names.push_back(arg_name.encode()) else: - raise ValueError("key `arg_names` must be a list of strings") + raise ValueError( + "key `arg_names` must be a list of strings") f_doc.arg_names = c_arg_names - else: + else: raise ValueError("key `arg_names` cannot be None") - - if func_doc["options_class"] and isinstance(func_doc["options_class"], str): + + if func_doc["options_class"] and isinstance(func_doc["options_class"], str): f_doc.options_class = func_doc["options_class"].encode() - else: + else: raise ValueError("key `options_class` cannot be None") - - if isinstance(func_doc["options_required"], bool): + + if isinstance(func_doc["options_required"], bool): f_doc.options_required = func_doc["options_required"] - else: + else: raise ValueError("key `options_required` cannot must be bool") return f_doc else: raise TypeError(f"func_doc must be a dictionary") + cdef class UDFError(Exception): cdef dict __dict__ @@ -2408,71 +2466,111 @@ cdef class UDFError(Exception): message = tobytes("UDF error: {}".format(str(self))) return CStatus_UnknownError(message) + cdef class UDFRegistrationError(UDFError): - + def __init__(self, message='', extra_info=b''): super().__init__(message, extra_info) cdef CStatus to_status(self): message = tobytes("UDF Registration error: {}".format(str(self))) return CStatus_UnknownError(message) - + def register_function(func_name, arity, function_doc, in_types, - out_type, callback, mem_allocation="no_preallocate", null_handling="computed_no_preallocate"): - cdef: - c_string c_func_name - CArity c_arity - CFunctionDoc c_func_doc - CInputType in_tmp - vector[CInputType] c_in_types - PyObject* c_callback - shared_ptr[CDataType] c_type - COutputType* c_out_type - CScalarUdfBuilder* c_sc_builder - MemAllocation c_mem_allocation - NullHandling c_null_handling - CStatus st - object obj - - _mem_allocation_map = { - "preallocate": MemAllocation_PREALLOCATE, - "no_preallocate": MemAllocation_NO_PREALLOCATE - } - - _null_handling_map = { - "intersect": NullHandling_INTERSECTION, - "computed_preallocate": NullHandling_COMPUTED_PREALLOCATE, - "computed_no_preallocate": NullHandling_COMPUTED_NO_PREALLOCATE, - "output_not_null": NullHandling_OUTPUT_NOT_NULL - } - - if func_name and isinstance(func_name, str): - c_func_name = func_name.encode() - else: - raise ValueError("func_name should be str") - - if arity and isinstance(arity, Arity): - c_arity = ( arity).arity - else: - raise ValueError("arity must be an instance of Arity") - - c_func_doc = _make_function_doc(function_doc) - - if in_types and isinstance(in_types, list): - for in_type in in_types: - in_tmp = ( in_type).input_type - c_in_types.push_back(in_tmp) - - c_type = pyarrow_unwrap_data_type(out_type) - c_callback = callback - - c_out_type = new COutputType(c_type) - c_mem_allocation = _mem_allocation_map[mem_allocation] - c_null_handling = _null_handling_map[null_handling] - c_sc_builder = new CScalarUdfBuilder(c_func_name, c_arity, &c_func_doc, - c_in_types, deref(c_out_type), c_mem_allocation, c_null_handling) - st = c_sc_builder.MakeFunction(c_callback) - if not st.ok(): - error_msg = st.message().decode() - raise UDFRegistrationError(message = error_msg) + out_type, callback, mem_allocation="no_preallocate", + null_handling="computed_no_preallocate"): + """ + Register a user-defined-function (function) + + Parameters + ---------- + + func_name: str + function name + arity: Arity + arity of the function + function_doc: dict + a dictionary object with keys + ("summary", + "description", + "arg_names", + "options_class", (not supported yet) + "options_required" (not supported yet) + ) + in_types: List[InputType] + list of InputType objects which defines the input + types for the function + out_type: DataType + output type of the function + callback: callable + user defined function + mem_allocation: str + memory allocation mode + "preallocate" or "no_preallocate" + null_handling: str + null handling mode + one of "intersect", "computed_preallocate", + "computed_no_preallocate", + "output_not_null" + """ + cdef: + c_string c_func_name + CArity c_arity + CFunctionDoc c_func_doc + CInputType in_tmp + vector[CInputType] c_in_types + PyObject* c_callback + shared_ptr[CDataType] c_type + COutputType* c_out_type + CScalarUdfBuilder* c_sc_builder + MemAllocation c_mem_allocation + NullHandling c_null_handling + CStatus st + object obj + + _mem_allocation_map = { + "preallocate": MemAllocation_PREALLOCATE, + "no_preallocate": MemAllocation_NO_PREALLOCATE + } + + _null_handling_map = { + "intersect": NullHandling_INTERSECTION, + "computed_preallocate": NullHandling_COMPUTED_PREALLOCATE, + "computed_no_preallocate": NullHandling_COMPUTED_NO_PREALLOCATE, + "output_not_null": NullHandling_OUTPUT_NOT_NULL + } + + if func_name and isinstance(func_name, str): + c_func_name = func_name.encode() + else: + raise ValueError("func_name should be str") + + if arity and isinstance(arity, Arity): + c_arity = ( arity).arity + else: + raise ValueError("arity must be an instance of Arity") + + c_func_doc = _make_function_doc(function_doc) + + if in_types and isinstance(in_types, list): + for in_type in in_types: + in_tmp = ( in_type).input_type + c_in_types.push_back(in_tmp) + + c_type = pyarrow_unwrap_data_type(out_type) + c_callback = callback + + c_out_type = new COutputType(c_type) + c_mem_allocation = _mem_allocation_map[mem_allocation] + c_null_handling = _null_handling_map[null_handling] + # Note: The VectorUDF, TableUDF and AggregatorUDFs will be defined + # when they are implemented. Only ScalarUDFBuilder is supported at the + # moment. + c_sc_builder = new CScalarUdfBuilder(c_func_name, c_arity, &c_func_doc, + c_in_types, deref(c_out_type), + c_mem_allocation, c_null_handling) + st = c_sc_builder.MakeFunction(c_callback) + if not st.ok(): + error_msg = st.message().decode() + raise UDFRegistrationError(message=error_msg) diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index e0b010293fb..6ce52255a22 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -16,13 +16,13 @@ # under the License. from pyarrow._compute import ( # noqa - Arity, + Arity, Function, FunctionOptions, FunctionRegistry, HashAggregateFunction, HashAggregateKernel, - InputType, + InputType, Kernel, ScalarAggregateFunction, ScalarAggregateKernel, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 4ce1b5f44eb..0b714118f0b 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1809,19 +1809,17 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: CExecutor* executor() cdef cppclass CExecBatch" arrow::compute::ExecBatch": - CExecBatch(const CRecordBatch& batch); + CExecBatch(const CRecordBatch& batch) - @staticmethod CResult[CExecBatch] Make(vector[CDatum] values) CResult[shared_ptr[CRecordBatch]] ToRecordBatch( shared_ptr[CSchema] schema, CMemoryPool* pool) const - #inline const CDatum& operator[](i) const + # inline const CDatum& operator[](i) const vector[CDatum] values c_string ToString() const - cdef cppclass CKernelContext" arrow::compute::KernelContext": CKernelContext(CExecContext* exec_ctx) @@ -1851,7 +1849,7 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: cdef cppclass CArity" arrow::compute::Arity": int num_args c_bool is_varargs - + @staticmethod CArity Nullary() @@ -2726,11 +2724,10 @@ cdef extern from "arrow/compute/kernel.h" namespace "arrow::compute" nogil: cdef extern from "arrow/python/udf.h" namespace "arrow::py" nogil: cdef cppclass CUdfBuilder" arrow::py::UdfBuilder": CUdfBuilder(c_string func_name, FunctionKind kind, CArity arity, CFunctionDoc* func_doc, - vector[CInputType] in_types, COutputType out_type, - MemAllocation mem_allocation, NullHandling null_handling) + vector[CInputType] in_types, COutputType out_type, + MemAllocation mem_allocation, NullHandling null_handling) cdef cppclass CScalarUdfBuilder" arrow::py::ScalarUdfBuilder"(CUdfBuilder): CScalarUdfBuilder(c_string func_name, CArity arity, CFunctionDoc* func_doc, - vector[CInputType] in_types, COutputType out_type, - MemAllocation mem_allocation, NullHandling null_handling) + vector[CInputType] in_types, COutputType out_type, + MemAllocation mem_allocation, NullHandling null_handling) CStatus MakeFunction(PyObject* function) - \ No newline at end of file diff --git a/python/pyarrow/tests/test_udf.py b/python/pyarrow/tests/test_udf.py new file mode 100644 index 00000000000..eb94867b727 --- /dev/null +++ b/python/pyarrow/tests/test_udf.py @@ -0,0 +1,287 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import List + +import pytest + +import pyarrow as pa +from pyarrow import compute as pc +from pyarrow.compute import register_function +from pyarrow.compute import Arity, InputType + + +def get_function_doc(summary: str, desc: str, arg_names: List[str], + options_class: str, options_required: bool = False): + func_doc = {} + func_doc["summary"] = summary + func_doc["description"] = desc + func_doc["arg_names"] = arg_names + func_doc["options_class"] = options_class + func_doc["options_required"] = False + return func_doc + +# scalar unary function data + + +unary_doc = get_function_doc("add function", + "test add function", + ["scalar1"], + "None") + + +def unary_function(scalar1): + return pc.call_function("add", [scalar1, 1]) + +# scalar binary function data + + +binary_doc = get_function_doc("y=mx", + "find y from y = mx", + ["m", "x"], + "None") + + +def binary_function(m, x): + return pc.call_function("multiply", [m, x]) + +# scalar ternary function data + + +ternary_doc = get_function_doc("y=mx+c", + "find y from y = mx + c", + ["m", "x", "c"], + "None") + + +def ternary_function(m, x, c): + mx = pc.call_function("multiply", [m, x]) + return pc.call_function("add", [mx, c]) + +# scalar varargs function data + + +varargs_doc = get_function_doc("z=ax+by+c", + "find z from z = ax + by + c", + ["a", "x", "b", "y", "c"], + "None") + + +def varargs_function(a, x, b, y, c): + ax = pc.call_function("multiply", [a, x]) + by = pc.call_function("multiply", [b, y]) + ax_by = pc.call_function("add", [ax, by]) + return pc.call_function("add", [ax_by, c]) + + +@pytest.fixture +def function_input_types(): + return [ + [ + InputType.scalar(pa.int64()) + ], + [ + InputType.scalar(pa.int64()), + InputType.scalar(pa.int64()) + ], + [ + InputType.scalar(pa.int64()), + InputType.scalar(pa.int64()), + InputType.scalar(pa.int64()) + ], + [ + InputType.scalar(pa.int64()), + InputType.scalar(pa.int64()), + InputType.scalar(pa.int64()), + InputType.scalar(pa.int64()), + InputType.scalar(pa.int64()) + ], + [ + InputType.array(pa.int64()) + ], + [ + InputType.array(pa.int64()), + InputType.array(pa.int64()) + ], + [ + InputType.array(pa.int64()), + InputType.array(pa.int64()), + InputType.array(pa.int64()) + ], + [ + InputType.array(pa.int64()), + InputType.array(pa.int64()), + InputType.array(pa.int64()), + InputType.array(pa.int64()), + InputType.array(pa.int64()) + ] + ] + + +@pytest.fixture +def function_output_types(): + return [ + pa.int64(), + pa.int64(), + pa.int64(), + pa.int64() + ] + + +@pytest.fixture +def function_names(): + return [ + # scalar data function + "scalar_y=x+k", + "scalar_y=mx", + "scalar_y=mx+c", + "scalar_z=ax+by+c", + # array data function names + "array_y=x+k", + "array_y=mx", + "array_y=mx+c", + "array_z=ax+by+c" + ] + + +@pytest.fixture +def function_arities(): + return [ + Arity.unary(), + Arity.binary(), + Arity.ternary(), + Arity.varargs(5), + ] + + +@pytest.fixture +def function_docs(): + return [ + unary_doc, + binary_doc, + ternary_doc, + varargs_doc + ] + + +@pytest.fixture +def functions(): + return [ + unary_function, + binary_function, + ternary_function, + varargs_function + ] + + +@pytest.fixture +def function_inputs(): + return [ + # scalar input data + [ + pa.scalar(10, pa.int64()) + ], + [ + pa.scalar(10, pa.int64()), + pa.scalar(2, pa.int64()) + ], + [ + pa.scalar(10, pa.int64()), + pa.scalar(2, pa.int64()), + pa.scalar(5, pa.int64()) + ], + [ + pa.scalar(2, pa.int64()), + pa.scalar(10, pa.int64()), + pa.scalar(3, pa.int64()), + pa.scalar(20, pa.int64()), + pa.scalar(5, pa.int64()) + ], + # array input data + [ + pa.array([10, 20], pa.int64()) + ], + [ + pa.array([10, 20], pa.int64()), + pa.array([2, 4], pa.int64()) + ], + [ + pa.array([10, 20], pa.int64()), + pa.array([2, 4], pa.int64()), + pa.array([5, 10], pa.int64()) + ], + [ + pa.array([2, 3], pa.int64()), + pa.array([10, 20], pa.int64()), + pa.array([3, 7], pa.int64()), + pa.array([20, 30], pa.int64()), + pa.array([5, 10], pa.int64()) + ] + ] + + +@pytest.fixture +def expected_outputs(): + return [ + # scalar data + pa.scalar(11, pa.int64()), # 10 + 1 + pa.scalar(20, pa.int64()), # 10 * 2 + pa.scalar(25, pa.int64()), # 10 * 2 + 5 + pa.scalar(85, pa.int64()), # (2 * 10) + (3 * 20) + 5 + # array data + pa.array([11, 21], pa.int64()), # [10 + 1, 20 + 1] + pa.array([20, 80], pa.int64()), # [10 * 2, 20 * 4] + pa.array([25, 90], pa.int64()), # [(10 * 2) + 5, (20 * 4) + 10] + # [(2 * 10) + (3 * 20) + 5, (3 * 20) + (7 * 30) + 10] + pa.array([85, 280], pa.int64()) + ] + + +def test_udf_function_with_scalar_data(function_names, + function_arities, + function_input_types, + function_output_types, + function_docs, + functions, + function_inputs, + expected_outputs): + + # Note: 2 * -> used to duplicate the list + # Because the values are same irrespective of the type i.e scalar or array + for name, \ + arity, \ + in_types, \ + out_type, \ + doc, \ + function, \ + input, \ + expected_output in zip(function_names, + 2 * function_arities, + function_input_types, + 2 * function_output_types, + 2 * function_docs, + 2 * functions, + function_inputs, + expected_outputs): + + register_function(name, arity, doc, in_types, out_type, function) + + func = pc.get_function(name) + assert func.name == name + + result = pc.call_function(name, input) + assert result == expected_output diff --git a/python/setup.py b/python/setup.py index daf65ac15c7..6a5e7372068 100755 --- a/python/setup.py +++ b/python/setup.py @@ -287,9 +287,6 @@ def append_cmake_bool(value, varname): cmake_options.append('-DCMAKE_BUILD_TYPE={0}' .format(self.build_type.lower())) - - cmake_options.append('-DCMAKE_C_COMPILER={0}'.format(os.environ['CC'])) - cmake_options.append('-DCMAKE_CXX_COMPILER={0}'.format(os.environ['CXX'])) if self.boost_namespace != 'boost': cmake_options.append('-DBoost_NAMESPACE={}' From f1f96871eb14d716fd1d8748370d6196c0883583 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Mon, 14 Mar 2022 12:15:46 +0530 Subject: [PATCH 039/131] cleaning up udf C++ example --- cpp/examples/arrow/CMakeLists.txt | 5 +- cpp/examples/arrow/udf_example.cc | 331 ++++++------------------------ 2 files changed, 68 insertions(+), 268 deletions(-) diff --git a/cpp/examples/arrow/CMakeLists.txt b/cpp/examples/arrow/CMakeLists.txt index b3232e9735e..39e59cc192d 100644 --- a/cpp/examples/arrow/CMakeLists.txt +++ b/cpp/examples/arrow/CMakeLists.txt @@ -138,9 +138,8 @@ if(ARROW_PARQUET AND ARROW_DATASET) add_arrow_example(join_example EXTRA_LINK_LIBS ${DATASET_EXAMPLES_LINK_LIBS}) add_dependencies(join-example parquet) - add_arrow_example(udf_example EXTRA_LINK_LIBS - ${DATASET_EXAMPLES_LINK_LIBS} ${PYTHON_LIBRARIES} ${PYTHON_OTHER_LIBS}) - add_dependencies(udf-example parquet arrow_python) + add_arrow_example(udf_example EXTRA_LINK_LIBS ${DATASET_EXAMPLES_LINK_LIBS}) + add_dependencies(udf-example parquet) add_arrow_example(aggregate_example EXTRA_LINK_LIBS ${DATASET_EXAMPLES_LINK_LIBS}) add_dependencies(aggregate-example parquet) diff --git a/cpp/examples/arrow/udf_example.cc b/cpp/examples/arrow/udf_example.cc index dfa2a1cec95..bb9929b1bf8 100644 --- a/cpp/examples/arrow/udf_example.cc +++ b/cpp/examples/arrow/udf_example.cc @@ -33,10 +33,8 @@ #include #include -#include -#include - -// Demonstrate registering an Arrow compute function outside of the Arrow source tree +// Demonstrate registering an user-defined Arrow compute function outside of the Arrow +// source tree namespace cp = ::arrow::compute; @@ -49,21 +47,6 @@ namespace cp = ::arrow::compute; } \ } while (0); -struct BatchesWithSchema { - std::vector batches; - std::shared_ptr schema; - // // This method uses internal arrow utilities to - // // convert a vector of record batches to an AsyncGenerator of optional batches - arrow::AsyncGenerator> gen() const { - auto opt_batches = ::arrow::internal::MapVector( - [](cp::ExecBatch batch) { return arrow::util::make_optional(std::move(batch)); }, - batches); - arrow::AsyncGenerator> gen; - gen = arrow::MakeVectorGenerator(std::move(opt_batches)); - return gen; - } -}; - template ::value | arrow::is_boolean_type::value | @@ -88,133 +71,57 @@ arrow::Result> GetSampleRecordBatch( return record_batch->FromStructArray(struct_result); } -arrow::Result GetExecBatchFromVectors( - const arrow::FieldVector& field_vector, const arrow::ArrayVector& array_vector) { - std::shared_ptr record_batch; - ARROW_ASSIGN_OR_RAISE(auto res_batch, GetSampleRecordBatch(array_vector, field_vector)); - cp::ExecBatch batch{*res_batch}; - return batch; -} - -arrow::Result MakeBasicBatches() { - BatchesWithSchema out; - auto field_vector = {arrow::field("a", arrow::int64()), - arrow::field("b", arrow::boolean())}; - ARROW_ASSIGN_OR_RAISE(auto b1_int, GetArrayDataSample({0, 4})); - ARROW_ASSIGN_OR_RAISE(auto b2_int, GetArrayDataSample({5, 6, 7})); - ARROW_ASSIGN_OR_RAISE(auto b3_int, GetArrayDataSample({8, 9, 10})); - - ARROW_ASSIGN_OR_RAISE(auto b1_bool, - GetArrayDataSample({false, true})); - ARROW_ASSIGN_OR_RAISE(auto b2_bool, - GetArrayDataSample({true, false, true})); - ARROW_ASSIGN_OR_RAISE(auto b3_bool, - GetArrayDataSample({false, true, false})); - - ARROW_ASSIGN_OR_RAISE(auto b1, - GetExecBatchFromVectors(field_vector, {b1_int, b1_bool})); - ARROW_ASSIGN_OR_RAISE(auto b2, - GetExecBatchFromVectors(field_vector, {b2_int, b2_bool})); - ARROW_ASSIGN_OR_RAISE(auto b3, - GetExecBatchFromVectors(field_vector, {b3_int, b3_bool})); - - out.batches = {b1, b2, b3}; - out.schema = arrow::schema(field_vector); - return out; -} - arrow::Result> GetTable() { std::shared_ptr table; - auto field_vector = {arrow::field("a", arrow::int64()), - arrow::field("b", arrow::boolean())}; + auto field_vector = { + arrow::field("a", arrow::int64()), arrow::field("x", arrow::int64()), + arrow::field("y", arrow::int64()), arrow::field("z", arrow::int64()), + arrow::field("b", arrow::boolean())}; + ARROW_ASSIGN_OR_RAISE(auto int_array, - GetArrayDataSample({0, 4, 10, 20, 30})); + GetArrayDataSample({1, 2, 3, 4, 5, 6})); + ARROW_ASSIGN_OR_RAISE(auto x, + GetArrayDataSample({21, 22, 23, 24, 25, 26})); + ARROW_ASSIGN_OR_RAISE(auto y, + GetArrayDataSample({31, 32, 33, 34, 35, 36})); + ARROW_ASSIGN_OR_RAISE(auto z, + GetArrayDataSample({41, 42, 43, 44, 45, 46})); ARROW_ASSIGN_OR_RAISE(auto bool_array, GetArrayDataSample( - {false, true, false, true, true})); + {false, true, false, true, true, false})); auto schema = arrow::schema(field_vector); - auto data_vector = {int_array, bool_array}; + auto data_vector = {int_array, x, y, z, bool_array}; - table = arrow::Table::Make(schema, data_vector, 5); + table = arrow::Table::Make(schema, data_vector, 6); return table; } -class ExampleFunctionOptionsType : public cp::FunctionOptionsType { - const char* type_name() const override { return "ExampleFunctionOptionsType"; } +class UDFOptionsType : public cp::FunctionOptionsType { + const char* type_name() const override { return "UDFOptionsType"; } std::string Stringify(const cp::FunctionOptions&) const override { - return "ExampleFunctionOptionsType"; + return "UDFOptionsType"; } bool Compare(const cp::FunctionOptions&, const cp::FunctionOptions&) const override { return true; } std::unique_ptr Copy(const cp::FunctionOptions&) const override; - // optional: support for serialization - // Result> Serialize(const FunctionOptions&) const override; - // Result> Deserialize(const Buffer&) const override; }; -cp::FunctionOptionsType* GetExampleFunctionOptionsType() { - static ExampleFunctionOptionsType options_type; +cp::FunctionOptionsType* GetUDFOptionsType() { + static UDFOptionsType options_type; return &options_type; } -class ExampleFunctionOptions : public cp::FunctionOptions { +class UDFOptions : public cp::FunctionOptions { public: - ExampleFunctionOptions() : cp::FunctionOptions(GetExampleFunctionOptionsType()) {} + UDFOptions() : cp::FunctionOptions(GetUDFOptionsType()) {} }; -std::unique_ptr ExampleFunctionOptionsType::Copy( +std::unique_ptr UDFOptionsType::Copy( const cp::FunctionOptions&) const { - return std::unique_ptr(new ExampleFunctionOptions()); -} - -PyObject* SimpleFunction() { - PyObject* out = Py_BuildValue("s", "hello"); - std::cout << "HELLO FROM PYTHON FUNCTION IN C++" << std::endl; - return std::move(out); -} - -arrow::Status rb_test() { - auto datasource = MakeBasicBatches(); - auto batches = datasource->batches; - - ARROW_ASSIGN_OR_RAISE(auto rb, batches[0].ToRecordBatch(datasource->schema, - arrow::default_memory_pool())); - ARROW_ASSIGN_OR_RAISE(auto result, cp::CallFunction("add", {rb, rb})); - return arrow::Status::OK(); -} - -// PyObject* objectsRepresentation = PyObject_Repr(yourObject); -// const char* s = PyString_AsString(objectsRepresentation); - -arrow::Status ExampleFunctionImpl(cp::KernelContext* ctx, const cp::ExecBatch& batch, - arrow::Datum* out) { - std::cout << "calling udf :" << batch.length << std::endl; - Py_Initialize(); - PyObject* res = SimpleFunction(); - PyObject* objectsRepresentation = PyObject_Repr(res); - const char* s = PyUnicode_AsUTF8(objectsRepresentation); - std::cout << "Message :: " << s << std::endl; - Py_Finalize(); - auto result = cp::CallFunction("add", {batch[0].array(), batch[0].array()}); - *out->mutable_array() = *result.ValueOrDie().array(); - return arrow::Status::OK(); -} -// cp::KernelContext*, const cp::ExecBatch&, Datum*, PyObject* func -arrow::Status ExamplePyFunctionImpl(cp::KernelContext* ctx, const cp::ExecBatch& batch, - arrow::Datum* out, PyObject* func) { - std::cout << "H" << std::endl; - auto result = cp::CallFunction("add", {batch[0].array(), batch[0].array()}); - *out->mutable_array() = *result.ValueOrDie().array(); - // PyObject* res = SimpleFunction(); - // PyObject* objectsRepresentation = PyObject_Repr(res); - // const char* s = PyUnicode_AsUTF8(objectsRepresentation); - std::cout << "Message :: " - << "s" << std::endl; - - return arrow::Status::OK(); + return std::unique_ptr(new UDFOptions()); } class ExampleNodeOptions : public cp::ExecNodeOptions {}; @@ -257,147 +164,34 @@ arrow::Result ExampleExecNodeFactory(cp::ExecPlan* plan, } const cp::FunctionDoc func_doc{ - "Example function to demonstrate registering an out-of-tree function", - "", - {"x", "y"}, - "ExampleFunctionOptions"}; - -const cp::FunctionDoc func_doc2{ - "Example function to demonstrate registering an out-of-tree function", - "", - {"x"}, - "ExampleFunctionOptions2"}; - -PyObject* MultiplyFunction(PyObject* scalar) { - PyObject* constant = PyLong_FromLong(2); - PyObject* res = PyNumber_Multiply(constant, scalar); - return std::move(res); -} - -class ScalarUDF { - public: - ScalarUDF(); - explicit ScalarUDF(cp::Arity arity, std::vector input_types, - cp::OutputType output_type, PyObject* (*function)(PyObject*)) - : arity_(std::move(arity)), - input_types_(std::move(input_types)), - output_type_(output_type), - function_(function) {} - - arrow::Status Make(cp::KernelContext* ctx, const cp::ExecBatch& batch, - arrow::Datum* out) { - Py_Initialize(); - PyObject* args = PyTuple_Pack(1, PyLong_FromLong(2)); - PyObject* myResult = function_(args); - int64_t result = PyLong_AsLong(myResult); - Py_Finalize(); - std::cout << "Value : " << result << std::endl; - arrow::Result maybe_result; - arrow::Int64Builder builder(arrow::default_memory_pool()); - std::shared_ptr arr; - ABORT_ON_FAILURE(builder.Append(result)); - ABORT_ON_FAILURE(builder.Finish(&arr)); - maybe_result = cp::CallFunction("add", {batch[0].array(), arr}); - *out->mutable_array() = *maybe_result.ValueOrDie().array(); - return arrow::Status::OK(); - } - - private: - cp::Arity arity_; - std::vector input_types_; - cp::OutputType output_type_; - PyObject* (*function_)(PyObject*); -}; + "User-defined-function usage to demonstrate registering an out-of-tree function", + "returns x + y + z", + {"x", "y", "z"}, + "UDFOptions"}; arrow::Status Execute() { const std::string name = "x+x"; - auto func = std::make_shared(name, cp::Arity::Unary(), &func_doc2); - cp::ScalarKernel kernel({cp::InputType::Array(arrow::int64())}, arrow::int64(), - ExampleFunctionImpl); - - kernel.mem_allocation = cp::MemAllocation::NO_PREALLOCATE; - - ABORT_ON_FAILURE(func->AddKernel(std::move(kernel))); - - auto registry = cp::GetFunctionRegistry(); - ABORT_ON_FAILURE(registry->AddFunction(std::move(func))); - - arrow::Int64Builder builder(arrow::default_memory_pool()); - std::shared_ptr arr1, arr2; - ABORT_ON_FAILURE(builder.Append(42)); - ABORT_ON_FAILURE(builder.Finish(&arr1)); - ABORT_ON_FAILURE(builder.Append(58)); - ABORT_ON_FAILURE(builder.Finish(&arr2)); - auto options = std::make_shared(); - auto maybe_result = cp::CallFunction(name, {arr1}, options.get()); - ABORT_ON_FAILURE(maybe_result.status()); - - std::cout << "Result 1: " << maybe_result->make_array()->ToString() << std::endl; - - // Expression serialization will raise NotImplemented if an expression includes - // FunctionOptions for which serialization is not supported. - // auto expr = cp::call(name, {}, options); - // auto maybe_serialized = cp::Serialize(expr); - // std::cerr << maybe_serialized.status().ToString() << std::endl; + auto func = std::make_shared(name, cp::Arity::Ternary(), &func_doc); - auto exec_registry = cp::default_exec_factory_registry(); - ABORT_ON_FAILURE( - exec_registry->AddFactory("compute_register_example", ExampleExecNodeFactory)); - - auto maybe_plan = cp::ExecPlan::Make(); - ABORT_ON_FAILURE(maybe_plan.status()); - auto plan = maybe_plan.ValueOrDie(); - cp::ExecContext exec_context(arrow::default_memory_pool(), - ::arrow::internal::GetCpuThreadPool()); - arrow::AsyncGenerator> source_gen, sink_gen; - ARROW_ASSIGN_OR_RAISE(auto basic_data, MakeBasicBatches()); - - cp::Expression a_times_10 = cp::call("multiply", {cp::field_ref("a"), cp::literal(10)}); - cp::Expression custom_exp = cp::call(name, {cp::field_ref("a")}, options); - - auto source_node_options = cp::SourceNodeOptions{basic_data.schema, basic_data.gen()}; - auto project_node_options = cp::ProjectNodeOptions{{ - cp::field_ref("a"), - custom_exp, - cp::field_ref("b"), - }}; - auto output_schema = arrow::schema({arrow::field("a", arrow::int64()), - arrow::field("a + a", arrow::int64()), - arrow::field("b", arrow::boolean())}); - std::shared_ptr out; - ABORT_ON_FAILURE(cp::Declaration::Sequence( - { - {"source", source_node_options}, - {"project", project_node_options}, - {"table_sink", cp::TableSinkNodeOptions{&out, output_schema}}, - }) - .AddToPlan(plan.get()) - .status()); - - ARROW_RETURN_NOT_OK(plan->StartProducing()); - - std::cout << "Output Table Data : " << std::endl; - std::cout << out->ToString() << std::endl; - - auto future = plan->finished(); - - return future.status(); -} - -arrow::Status ExecuteVar() { - const std::string name = "x+x"; - auto func = std::make_shared(name, cp::Arity::Unary(), &func_doc2); auto exec_func = [](cp::KernelContext* ctx, const cp::ExecBatch& batch, arrow::Datum* out) -> arrow::Status { - std::cout << "Batch as Table " << std::endl; - *out->mutable_array() = *batch[0].array(); + auto in_res = cp::CallFunction("add", {batch[0].array(), batch[1].array()}); + auto in_arr = in_res.ValueOrDie().make_array(); + auto final_res = cp::CallFunction("add", {in_arr, batch[2].array()}); + auto final_arr = final_res.ValueOrDie().array(); + auto datum = new arrow::Datum(final_arr); + *out = *datum; return arrow::Status::OK(); }; - auto options = std::make_shared(); - cp::ScalarKernel kernel({cp::InputType::Array(arrow::int64())}, arrow::int64(), - exec_func); + + auto options = std::make_shared(); + cp::ScalarKernel kernel( + {cp::InputType::Array(arrow::int64()), cp::InputType::Array(arrow::int64()), + cp::InputType::Array(arrow::int64())}, + arrow::int64(), exec_func); kernel.mem_allocation = cp::MemAllocation::NO_PREALLOCATE; + kernel.null_handling = cp::NullHandling::COMPUTED_NO_PREALLOCATE; ABORT_ON_FAILURE(func->AddKernel(std::move(kernel))); @@ -406,7 +200,7 @@ arrow::Status ExecuteVar() { auto exec_registry = cp::default_exec_factory_registry(); ABORT_ON_FAILURE( - exec_registry->AddFactory("compute_register_example", ExampleExecNodeFactory)); + exec_registry->AddFactory("udf_register_example", ExampleExecNodeFactory)); auto maybe_plan = cp::ExecPlan::Make(); ABORT_ON_FAILURE(maybe_plan.status()); @@ -414,12 +208,10 @@ arrow::Status ExecuteVar() { cp::ExecContext exec_context(arrow::default_memory_pool(), ::arrow::internal::GetCpuThreadPool()); arrow::AsyncGenerator> source_gen, sink_gen; - ARROW_ASSIGN_OR_RAISE(auto basic_data, MakeBasicBatches()); - cp::Expression a_times_10 = cp::call("multiply", {cp::field_ref("a"), cp::literal(10)}); - cp::Expression custom_exp = cp::call(name, {cp::field_ref("a")}, options); + cp::Expression custom_exp = cp::call( + name, {cp::field_ref("x"), cp::field_ref("y"), cp::field_ref("z")}, options); - auto source_node_options = cp::SourceNodeOptions{basic_data.schema, basic_data.gen()}; ARROW_ASSIGN_OR_RAISE(auto table, GetTable()); auto table_source_node_options = cp::TableSourceNodeOptions{table, 2}; auto project_node_options = cp::ProjectNodeOptions{{ @@ -428,21 +220,30 @@ arrow::Status ExecuteVar() { cp::field_ref("b"), }}; auto output_schema = arrow::schema({arrow::field("a", arrow::int64()), - arrow::field("a + a", arrow::int64()), + arrow::field("x + y + z", arrow::int64()), arrow::field("b", arrow::boolean())}); std::shared_ptr out; - ABORT_ON_FAILURE(cp::Declaration::Sequence( - { - {"table_source", table_source_node_options}, - {"project", project_node_options}, - {"table_sink", cp::TableSinkNodeOptions{&out, output_schema}}, - }) - .AddToPlan(plan.get()) - .status()); + auto table_sink_node_options = cp::TableSinkNodeOptions{&out, output_schema}; + ABORT_ON_FAILURE( + cp::Declaration::Sequence({ + {"table_source", table_source_node_options}, + {"project", project_node_options}, + {"table_sink", table_sink_node_options}, + }) + .AddToPlan(plan.get()) + .status()); ARROW_RETURN_NOT_OK(plan->StartProducing()); + constexpr int print_len = 25; + std::cout << std::string(print_len, '#') << std::endl; + std::cout << "Input Table Data : " << std::endl; + std::cout << std::string(print_len, '#') << std::endl; + + std::cout << table->ToString() << std::endl; + std::cout << std::string(print_len, '#') << std::endl; std::cout << "Output Table Data : " << std::endl; + std::cout << std::string(print_len, '#') << std::endl; std::cout << out->ToString() << std::endl; auto future = plan->finished(); @@ -451,7 +252,7 @@ arrow::Status ExecuteVar() { } int main(int argc, char** argv) { - auto status = ExecuteVar(); + auto status = Execute(); if (!status.ok()) { std::cerr << "Error occurred : " << status.message() << std::endl; return EXIT_FAILURE; From 534592ef45994060f92e59cb9a42ae117251d583 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Mon, 14 Mar 2022 12:18:10 +0530 Subject: [PATCH 040/131] cleaning up examples cmake file --- cpp/examples/arrow/CMakeLists.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cpp/examples/arrow/CMakeLists.txt b/cpp/examples/arrow/CMakeLists.txt index 39e59cc192d..c6dd8f54592 100644 --- a/cpp/examples/arrow/CMakeLists.txt +++ b/cpp/examples/arrow/CMakeLists.txt @@ -15,8 +15,6 @@ # specific language governing permissions and limitations # under the License. -find_package(Python3Alt REQUIRED) - add_arrow_example(row_wise_conversion_example) if(ARROW_COMPUTE) @@ -114,8 +112,6 @@ if(ARROW_FLIGHT) endif() endif() -include_directories(${NUMPY_INCLUDE_DIRS} ${PYTHON_INCLUDE_DIRS}) - if(ARROW_PARQUET AND ARROW_DATASET) if(ARROW_BUILD_SHARED) set(DATASET_EXAMPLES_LINK_LIBS arrow_dataset_shared) From a7abaf8a59890622e6156d4cc383a27fa19b88cd Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Mon, 14 Mar 2022 12:22:50 +0530 Subject: [PATCH 041/131] cleaning up temp test --- cpp/src/arrow/python/python_test.cc | 57 ----------------------------- 1 file changed, 57 deletions(-) diff --git a/cpp/src/arrow/python/python_test.cc b/cpp/src/arrow/python/python_test.cc index a1cd0b1e07d..1bb4487d2df 100644 --- a/cpp/src/arrow/python/python_test.cc +++ b/cpp/src/arrow/python/python_test.cc @@ -603,62 +603,5 @@ TEST_F(DecimalTest, UpdateWithNaN) { ASSERT_EQ(std::numeric_limits::min(), metadata.scale()); } -PyObject* SimpleFunction() { - PyObject* obj = Py_BuildValue("s", "hello"); - return obj; -} - -class ExampleFunctionOptionsType : public cp::FunctionOptionsType { - const char* type_name() const override { return "ExampleFunctionOptionsType"; } - std::string Stringify(const cp::FunctionOptions&) const override { - return "ExampleFunctionOptionsType"; - } - bool Compare(const cp::FunctionOptions&, const cp::FunctionOptions&) const override { - return true; - } - std::unique_ptr Copy(const cp::FunctionOptions&) const override; - // optional: support for serialization - // Result> Serialize(const FunctionOptions&) const override; - // Result> Deserialize(const Buffer&) const override; -}; - -cp::FunctionOptionsType* GetExampleFunctionOptionsType() { - static ExampleFunctionOptionsType options_type; - return &options_type; -} - -class ExampleFunctionOptions : public cp::FunctionOptions { - public: - ExampleFunctionOptions() : cp::FunctionOptions(GetExampleFunctionOptionsType()) {} -}; - -std::unique_ptr ExampleFunctionOptionsType::Copy( - const cp::FunctionOptions&) const { - return std::unique_ptr(new ExampleFunctionOptions()); -} - -arrow::Status ExamplePyFunctionImpl(cp::KernelContext* ctx, const cp::ExecBatch& batch, - arrow::Datum* out, PyObject* func) { - std::cout << "H" << std::endl; - auto result = cp::CallFunction("add", {batch[0].array(), batch[0].array()}); - *out->mutable_array() = *result.ValueOrDie().array(); - // PyObject* res = SimpleFunction(); - // PyObject* objectsRepresentation = PyObject_Repr(res); - // const char* s = PyUnicode_AsUTF8(objectsRepresentation); - std::cout << "Message :: " - << "s" << std::endl; - return arrow::Status::OK(); -} - -TEST(UDF, Initialization) { - const cp::FunctionDoc func_doc{ - "Example function to demonstrate registering an out-of-tree function", - "", - {"x"}, - "ExampleFunctionOptions"}; - arrow::Status st; - const std::string name = "x+x"; -} - } // namespace py } // namespace arrow From bff2be881bba637fe34553702c1250ad41d017d4 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Mon, 14 Mar 2022 12:38:27 +0530 Subject: [PATCH 042/131] reformat tests --- python/pyarrow/tests/test_udf.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/tests/test_udf.py b/python/pyarrow/tests/test_udf.py index eb94867b727..ab4e4db8363 100644 --- a/python/pyarrow/tests/test_udf.py +++ b/python/pyarrow/tests/test_udf.py @@ -91,6 +91,7 @@ def varargs_function(a, x, b, y, c): @pytest.fixture def function_input_types(): return [ + # scalar data input types [ InputType.scalar(pa.int64()) ], @@ -110,6 +111,7 @@ def function_input_types(): InputType.scalar(pa.int64()), InputType.scalar(pa.int64()) ], + # array data input types [ InputType.array(pa.int64()) ], @@ -145,7 +147,7 @@ def function_output_types(): @pytest.fixture def function_names(): return [ - # scalar data function + # scalar data function names "scalar_y=x+k", "scalar_y=mx", "scalar_y=mx+c", @@ -237,12 +239,12 @@ def function_inputs(): @pytest.fixture def expected_outputs(): return [ - # scalar data + # scalar output data pa.scalar(11, pa.int64()), # 10 + 1 pa.scalar(20, pa.int64()), # 10 * 2 pa.scalar(25, pa.int64()), # 10 * 2 + 5 pa.scalar(85, pa.int64()), # (2 * 10) + (3 * 20) + 5 - # array data + # array output data pa.array([11, 21], pa.int64()), # [10 + 1, 20 + 1] pa.array([20, 80], pa.int64()), # [10 * 2, 20 * 4] pa.array([25, 90], pa.int64()), # [(10 * 2) + 5, (20 * 4) + 10] @@ -251,7 +253,7 @@ def expected_outputs(): ] -def test_udf_function_with_scalar_data(function_names, +def test_scalar_udf_function_with_scalar_data(function_names, function_arities, function_input_types, function_output_types, From b2df0c38a78c59ab747e41662f045fbb808bb6d5 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Mon, 14 Mar 2022 12:40:27 +0530 Subject: [PATCH 043/131] cleaning code --- cpp/src/arrow/python/python_test.cc | 9 --------- 1 file changed, 9 deletions(-) diff --git a/cpp/src/arrow/python/python_test.cc b/cpp/src/arrow/python/python_test.cc index 1bb4487d2df..ba94f39ae11 100644 --- a/cpp/src/arrow/python/python_test.cc +++ b/cpp/src/arrow/python/python_test.cc @@ -39,18 +39,9 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" -#include -#include -#include -#include -#include - -#include "arrow/python/udf.h" - namespace arrow { using internal::checked_cast; -namespace cp = arrow::compute; namespace py { TEST(OwnedRef, TestMoves) { From 4c6efc2d87a2246a8ed75e46065fe63ca0295527 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Mon, 14 Mar 2022 12:42:04 +0530 Subject: [PATCH 044/131] acleaning spacing --- cpp/src/arrow/python/python_test.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/python/python_test.cc b/cpp/src/arrow/python/python_test.cc index ba94f39ae11..c465fabc680 100644 --- a/cpp/src/arrow/python/python_test.cc +++ b/cpp/src/arrow/python/python_test.cc @@ -42,6 +42,7 @@ namespace arrow { using internal::checked_cast; + namespace py { TEST(OwnedRef, TestMoves) { From 286629ee60dcf5931ceffd9d988bd74a3e6cd020 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Mon, 14 Mar 2022 13:13:36 +0530 Subject: [PATCH 045/131] adding doc string for registration function --- python/pyarrow/_compute.pyx | 34 ++++++++++++++++++++++++++++++++ python/pyarrow/tests/test_udf.py | 14 ++++++------- 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 81a5c550109..a35194ecbb9 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2513,6 +2513,40 @@ def register_function(func_name, arity, function_doc, in_types, one of "intersect", "computed_preallocate", "computed_no_preallocate", "output_not_null" + + Example + ------- + >>> from pyarrow import compute as pc + >>> from pyarrow.compute import register_function + >>> from pyarrow.compute import Arity, InputType + >>> + >>> func_doc = {} + >>> func_doc["summary"] = "simple udf" + >>> func_doc["description"] = "add a constant to a scalar" + >>> func_doc["arg_names"] = ["x"] + >>> func_doc["options_class"] = "None" + >>> func_doc["options_required"] = False + >>> + >>> def add_constant(array): + ... return pc.call_function("add", [array, 1]) + ... + >>> + >>> func_name = "py_add_func" + >>> arity = Arity.unary() + >>> in_types = [InputType.array(pa.int64())] + >>> out_type = pa.int64() + >>> register_function(func_name, arity, func_doc, + ... in_types, out_type, add_constant) + >>> + >>> func = pc.get_function(func_name) + >>> func.name + 'py_add_func' + >>> ans = pc.call_function(func_name, [pa.array([20])]) + >>> ans + + [ + 21 + ] """ cdef: c_string c_func_name diff --git a/python/pyarrow/tests/test_udf.py b/python/pyarrow/tests/test_udf.py index ab4e4db8363..7ae9d5e4850 100644 --- a/python/pyarrow/tests/test_udf.py +++ b/python/pyarrow/tests/test_udf.py @@ -254,13 +254,13 @@ def expected_outputs(): def test_scalar_udf_function_with_scalar_data(function_names, - function_arities, - function_input_types, - function_output_types, - function_docs, - functions, - function_inputs, - expected_outputs): + function_arities, + function_input_types, + function_output_types, + function_docs, + functions, + function_inputs, + expected_outputs): # Note: 2 * -> used to duplicate the list # Because the values are same irrespective of the type i.e scalar or array From facf36bbfeb56e9642074736d575694a1b615f34 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Mon, 14 Mar 2022 13:53:55 +0530 Subject: [PATCH 046/131] update function call --- python/pyarrow/tests/test_udf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_udf.py b/python/pyarrow/tests/test_udf.py index 7ae9d5e4850..51f7199af5f 100644 --- a/python/pyarrow/tests/test_udf.py +++ b/python/pyarrow/tests/test_udf.py @@ -285,5 +285,5 @@ def test_scalar_udf_function_with_scalar_data(function_names, func = pc.get_function(name) assert func.name == name - result = pc.call_function(name, input) + result = pc.call_function(name, input, options=None, memory_pool=None) assert result == expected_output From d8ac3a6b19a2f80045e85dbf781544ac7252cbb2 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Mon, 14 Mar 2022 16:09:26 +0530 Subject: [PATCH 047/131] updating registration code --- python/pyarrow/_compute.pyx | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index a35194ecbb9..dae54971e6f 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -30,6 +30,7 @@ from pyarrow.includes.libarrow cimport * import pyarrow.lib as lib from cpython.ref cimport PyObject +from libcpp cimport bool as c_bool import numpy as np @@ -2418,6 +2419,7 @@ cdef CFunctionDoc _make_function_doc(func_doc): cdef: CFunctionDoc f_doc vector[c_string] c_arg_names + c_bool c_options_required if isinstance(func_doc, dict): if func_doc["summary"] and isinstance(func_doc["summary"], str): f_doc.summary = func_doc["summary"].encode() @@ -2446,9 +2448,10 @@ cdef CFunctionDoc _make_function_doc(func_doc): raise ValueError("key `options_class` cannot be None") if isinstance(func_doc["options_required"], bool): - f_doc.options_required = func_doc["options_required"] + c_options_required = func_doc["options_required"] + f_doc.options_required = c_options_required else: - raise ValueError("key `options_required` cannot must be bool") + raise ValueError("key `options_required` must be bool") return f_doc else: From 06e042d94a9dccd5e3c48446f4fb6c3268f2e05d Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Mon, 14 Mar 2022 19:27:37 +0530 Subject: [PATCH 048/131] refactor python bindings and func docs --- cpp/examples/arrow/udf_example.cc | 7 +++++-- cpp/src/arrow/python/udf.cc | 3 ++- python/pyarrow/_compute.pyx | 3 ++- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/cpp/examples/arrow/udf_example.cc b/cpp/examples/arrow/udf_example.cc index bb9929b1bf8..cbb752afeb5 100644 --- a/cpp/examples/arrow/udf_example.cc +++ b/cpp/examples/arrow/udf_example.cc @@ -186,8 +186,11 @@ arrow::Status Execute() { auto options = std::make_shared(); cp::ScalarKernel kernel( - {cp::InputType::Array(arrow::int64()), cp::InputType::Array(arrow::int64()), - cp::InputType::Array(arrow::int64())}, + { + cp::InputType::Array(arrow::int64()), + cp::InputType::Array(arrow::int64()), + cp::InputType::Array(arrow::int64()) + }, arrow::int64(), exec_func); kernel.mem_allocation = cp::MemAllocation::NO_PREALLOCATE; diff --git a/cpp/src/arrow/python/udf.cc b/cpp/src/arrow/python/udf.cc index 838f0914d7e..c967dd045ad 100644 --- a/cpp/src/arrow/python/udf.cc +++ b/cpp/src/arrow/python/udf.cc @@ -73,8 +73,9 @@ Status VerifyArityAndInput(cp::Arity arity, const cp::ExecBatch& batch) { Status ScalarUdfBuilder::MakeFunction(PyObject* function) { Status st; + auto func_doc = this->doc(); auto func = - std::make_shared(this->name(), this->arity(), &this->doc()); + std::make_shared(this->name(), this->arity(), &func_doc); // creating a copy of objects for the lambda function auto py_function = function; auto arity = this->arity(); diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index dae54971e6f..bf58998c08a 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -317,7 +317,7 @@ cdef class Arity(_Weakrefable): return wrap_arity(c_arity) @staticmethod - def varargs(int num_args): + def varargs(num_args): """ create a varargs arity object with defined number of arguments @@ -2519,6 +2519,7 @@ def register_function(func_name, arity, function_doc, in_types, Example ------- + >>> from pyarrow import compute as pc >>> from pyarrow.compute import register_function >>> from pyarrow.compute import Arity, InputType From a35066f4f494f3a244d481a41f9ea696163ae5d1 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Mon, 28 Mar 2022 15:07:22 +0530 Subject: [PATCH 049/131] addressing reviews --- cpp/examples/arrow/aggregate_example.cc | 14 +- cpp/examples/arrow/udf_example.cc | 63 ++-- cpp/src/arrow/python/udf.cc | 2 +- python/examples/udf/udf_example.py | 474 +++++++++++++----------- python/pyarrow/_compute.pyx | 102 +++-- python/pyarrow/public-api.pxi | 1 + python/pyarrow/tests/test_udf.py | 27 +- 7 files changed, 367 insertions(+), 316 deletions(-) diff --git a/cpp/examples/arrow/aggregate_example.cc b/cpp/examples/arrow/aggregate_example.cc index a631d095375..aa116c063e4 100644 --- a/cpp/examples/arrow/aggregate_example.cc +++ b/cpp/examples/arrow/aggregate_example.cc @@ -15,19 +15,9 @@ // specific language governing permissions and limitations // under the License. -// This example showcases various ways to work with Datasets. It's -// intended to be paired with the documentation. - #include #include -#include -#include -#include -#include -#include -#include -#include -#include +#include // ARROW-15263 #include #include #include @@ -35,6 +25,8 @@ #include #include #include +#include +#include namespace cp = arrow::compute; diff --git a/cpp/examples/arrow/udf_example.cc b/cpp/examples/arrow/udf_example.cc index cbb752afeb5..4733e311535 100644 --- a/cpp/examples/arrow/udf_example.cc +++ b/cpp/examples/arrow/udf_example.cc @@ -17,23 +17,21 @@ #include #include -#include -#include -#include -#include -#include -#include -#include -#include +#include // ARROW-15263 #include #include +#include #include #include #include #include +#include +#include +#include +#include -// Demonstrate registering an user-defined Arrow compute function outside of the Arrow +// Demonstrate registering a user-defined Arrow compute function outside of the Arrow // source tree namespace cp = ::arrow::compute; @@ -63,14 +61,6 @@ arrow::Result> GetArrayDataSample( return array; } -arrow::Result> GetSampleRecordBatch( - const arrow::ArrayVector array_vector, const arrow::FieldVector& field_vector) { - std::shared_ptr record_batch; - ARROW_ASSIGN_OR_RAISE(auto struct_result, - arrow::StructArray::Make(array_vector, field_vector)); - return record_batch->FromStructArray(struct_result); -} - arrow::Result> GetTable() { std::shared_ptr table; @@ -103,10 +93,12 @@ class UDFOptionsType : public cp::FunctionOptionsType { std::string Stringify(const cp::FunctionOptions&) const override { return "UDFOptionsType"; } - bool Compare(const cp::FunctionOptions&, const cp::FunctionOptions&) const override { + bool Compare(const cp::FunctionOptions& options, + const cp::FunctionOptions& other) const override { return true; } - std::unique_ptr Copy(const cp::FunctionOptions&) const override; + std::unique_ptr Copy( + const cp::FunctionOptions& options) const override; }; cp::FunctionOptionsType* GetUDFOptionsType() { @@ -169,29 +161,26 @@ const cp::FunctionDoc func_doc{ {"x", "y", "z"}, "UDFOptions"}; +arrow::Status SampleFunction(cp::KernelContext* ctx, const cp::ExecBatch& batch, + arrow::Datum* out) { + auto in_res = cp::CallFunction("add", {batch[0].array(), batch[1].array()}); + auto in_arr = in_res.ValueOrDie().make_array(); + auto final_res = cp::CallFunction("add", {in_arr, batch[2].array()}); + auto final_arr = final_res.ValueOrDie().array(); + auto datum = new arrow::Datum(final_arr); + *out = *datum; + return arrow::Status::OK(); +} + arrow::Status Execute() { - const std::string name = "x+x"; + const std::string name = "add_three"; auto func = std::make_shared(name, cp::Arity::Ternary(), &func_doc); - auto exec_func = [](cp::KernelContext* ctx, const cp::ExecBatch& batch, - arrow::Datum* out) -> arrow::Status { - auto in_res = cp::CallFunction("add", {batch[0].array(), batch[1].array()}); - auto in_arr = in_res.ValueOrDie().make_array(); - auto final_res = cp::CallFunction("add", {in_arr, batch[2].array()}); - auto final_arr = final_res.ValueOrDie().array(); - auto datum = new arrow::Datum(final_arr); - *out = *datum; - return arrow::Status::OK(); - }; - auto options = std::make_shared(); cp::ScalarKernel kernel( - { - cp::InputType::Array(arrow::int64()), - cp::InputType::Array(arrow::int64()), - cp::InputType::Array(arrow::int64()) - }, - arrow::int64(), exec_func); + {cp::InputType::Array(arrow::int64()), cp::InputType::Array(arrow::int64()), + cp::InputType::Array(arrow::int64())}, + arrow::int64(), SampleFunction); kernel.mem_allocation = cp::MemAllocation::NO_PREALLOCATE; kernel.null_handling = cp::NullHandling::COMPUTED_NO_PREALLOCATE; diff --git a/cpp/src/arrow/python/udf.cc b/cpp/src/arrow/python/udf.cc index c967dd045ad..293e27f1415 100644 --- a/cpp/src/arrow/python/udf.cc +++ b/cpp/src/arrow/python/udf.cc @@ -66,7 +66,7 @@ Status VerifyArityAndInput(cp::Arity arity, const cp::ExecBatch& batch) { bool match = (uint64_t)arity.num_args == batch.values.size(); if (!match) { return Status::Invalid( - "Function Arity and Input data shape doesn't match, expceted {}"); + "Function Arity and Input data shape doesn't match, expected {}"); } return Status::OK(); } diff --git a/python/examples/udf/udf_example.py b/python/examples/udf/udf_example.py index 905cd0dd294..47c8fac64ab 100644 --- a/python/examples/udf/udf_example.py +++ b/python/examples/udf/udf_example.py @@ -2,7 +2,7 @@ import pyarrow as pa from pyarrow import compute as pc from pyarrow.compute import register_function -from pyarrow.compute import Arity, InputType +from pyarrow.compute import InputType def get_function_doc(summary: str, desc: str, arg_names: List[str], @@ -21,153 +21,161 @@ def get_function_doc(summary: str, desc: str, arg_names: List[str], """ # Example 1: Array Unary -print("=" * 80) -print("Example 1: Array Unary") -print("=" * 80) -def add_constant(array): - return pc.call_function("add", [array, 1]) +def array_unary_example(): + print("=" * 80) + print("Example 1: Array Unary") + print("=" * 80) + def add_one(array): + return pc.call_function("add", [array, 1]) -func_name_1 = "py_add_func" -arity_1 = Arity.unary() -in_types_1 = [InputType.array(pa.int64())] -out_type_1 = pa.int64() -doc_1 = get_function_doc("add function", "test add function", - ["value"], "None") -register_function(func_name_1, arity_1, doc_1, - in_types_1, out_type_1, add_constant) + func_name = "py_add_one_func" + arity = 1 + in_types = [InputType.array(pa.int64())] + out_type = pa.int64() + doc = get_function_doc("add function", "add function", + ["value"], "None") + register_function(func_name, arity, doc, + in_types, out_type, add_one) -func1 = pc.get_function(func_name_1) + func = pc.get_function(func_name) -a1_1 = pc.call_function(func_name_1, [pa.array([20])]) + assert func.name == func_name -print(a1_1) + res = pc.call_function(func_name, [pa.array([20])]) -a1_2 = pc.call_function(func_name_1, [pa.array([30])]) + print(res) -print(a1_2) + res = pc.call_function(func_name, [pa.array([30])]) + + print(res) # Example 2: Array Binary -print("=" * 80) -print("Example 2: Array Binary") -print("=" * 80) -arity_2 = Arity.binary() -func_name_2 = "array_udf_binary_add" -in_types_2 = [InputType.array(pa.int64()), InputType.array(pa.int64())] -out_type_2 = pa.int64() -array_binary_add_function_doc = get_function_doc( - "array bin add function", - "test array bin add function", - ["array_value1", "array_value2"], "None") +def array_binary_example(): + print("=" * 80) + print("Example 2: Array Binary") + print("=" * 80) + arity = 2 + func_name = "array_udf_binary_add" + in_types = [InputType.array(pa.int64()), InputType.array(pa.int64())] + out_type = pa.int64() + doc = get_function_doc( + "array bin add function", + "array bin add function", + ["array_value1", "array_value2"], "None") -def binary_array_function(array1, array2): - return pc.call_function("add", [array1, array2]) + def binary_array_function(array1, array2): + return pc.call_function("add", [array1, array2]) + register_function(func_name, arity, doc, + in_types, out_type, binary_array_function) -register_function(func_name_2, arity_2, array_binary_add_function_doc, - in_types_2, out_type_2, binary_array_function) + func = pc.get_function(func_name) -func2 = pc.get_function(func_name_2) + assert func.name == func_name -a2_1 = pc.call_function(func_name_2, [pa.array([10, 11]), pa.array([20, 21])]) + ans = pc.call_function(func_name, [pa.array([10, 11]), pa.array([20, 21])]) -print(a2_1) + print(ans) -a2_2 = pc.call_function(func_name_2, [pa.array([1, 2]), pa.array([10, 20])]) + ans = pc.call_function(func_name, [pa.array([1, 2]), pa.array([10, 20])]) -print(a2_2) + print(ans) # Example 3: Array Ternary -print("=" * 80) -print("Example 3: Array Ternary") -print("=" * 80) -arity_3 = Arity.ternary() -func_name_3 = "array_udf_ternary_add" -in_types_3 = [InputType.array(pa.int64()), - InputType.array(pa.int64()), - InputType.array(pa.int64())] -out_type_3 = pa.int64() -array_ternary_add_function_doc = get_function_doc( - "array ternary add function", - "test array ternary add function", - ["array_value1", "array_value2", "array_value3"], "None") +def array_ternary_example(): + print("=" * 80) + print("Example 3: Array Ternary") + print("=" * 80) + arity = 3 + func_name = "array_udf_ternary_add" + in_types = [InputType.array(pa.int64()), + InputType.array(pa.int64()), + InputType.array(pa.int64())] + out_type = pa.int64() + doc = get_function_doc( + "array ternary add function", + "array ternary add function", + ["array_value1", "array_value2", "array_value3"], "None") -def ternary_array_function(array1, array2, array3): - return pc.call_function("add", - [pc.call_function("add", [array1, array2]), - array3]) + def ternary_array_function(array1, array2, array3): + return pc.call_function("add", + [pc.call_function("add", [array1, array2]), + array3]) + register_function(func_name, arity, doc, + in_types, out_type, ternary_array_function) -register_function(func_name_3, arity_3, array_ternary_add_function_doc, - in_types_3, out_type_3, ternary_array_function) + func = pc.get_function(func_name) -func3 = pc.get_function(func_name_3) + assert func.name == func_name -a3_1 = pc.call_function(func_name_3, [pa.array([10, 11]), - pa.array([20, 21]), - pa.array([30, 31])]) + ans = pc.call_function(func_name, [pa.array([10, 11]), + pa.array([20, 21]), + pa.array([30, 31])]) -print(a3_1) + print(ans) -a3_2 = pc.call_function(func_name_3, [pa.array([1, 2]), - pa.array([10, 20]), - pa.array([100, 200]) - ]) + ans = pc.call_function(func_name, [pa.array([1, 2]), + pa.array([10, 20]), + pa.array([100, 200]) + ]) -print(a3_2) + print(ans) # Example 4: Array VarArgs -print("=" * 80) -print("Example 4: Array VarArgs") -print("=" * 80) -arity_4 = Arity.varargs(4) -func_name_4 = "array_udf_varargs_add" -in_types_4 = [InputType.array(pa.int64()), - InputType.array(pa.int64()), - InputType.array(pa.int64()), - InputType.array(pa.int64()) - ] -out_type_4 = pa.int64() -array_varargs_add_function_doc = get_function_doc( - "array varargs add function", - "test array varargs add function", - ["array_value1", "array_value2", - "array_value3", "array_value4"], - "None") - - -def varargs_array_function(array1, array2, array3, array4): - array12 = pc.call_function("add", [array1, array2]) - array34 = pc.call_function("add", [array3, array4]) - return pc.call_function("add", [array12, array34]) - - -register_function(func_name_4, arity_4, array_varargs_add_function_doc, - in_types_4, out_type_4, varargs_array_function) - -func4 = pc.get_function(func_name_4) - -a4_1 = pc.call_function(func_name_4, [pa.array([10, 11]), - pa.array([20, 21]), - pa.array([30, 31]), - pa.array([40, 41])]) - -print(a4_1) - -a4_2 = pc.call_function(func_name_4, [pa.array([1, 2]), - pa.array([10, 20]), - pa.array([100, 200]), - pa.array([1000, 2000]) - ]) - -print(a4_2) +def array_varargs_example(): + print("=" * 80) + print("Example 4: Array VarArgs") + print("=" * 80) + arity = 4 + func_name = "array_udf_varargs_add" + in_types = [InputType.array(pa.int64()), + InputType.array(pa.int64()), + InputType.array(pa.int64()), + InputType.array(pa.int64()) + ] + out_type = pa.int64() + doc = get_function_doc( + "array varargs add function", + "array varargs add function", + ["array_value1", "array_value2", + "array_value3", "array_value4"], + "None") + + def varargs_array_function(array1, array2, array3, array4): + array12 = pc.call_function("add", [array1, array2]) + array34 = pc.call_function("add", [array3, array4]) + return pc.call_function("add", [array12, array34]) + + register_function(func_name, arity, doc, + in_types, out_type, varargs_array_function) + + func = pc.get_function(func_name) + + assert func.name == func_name + + ans = pc.call_function(func_name, [pa.array([10, 11]), + pa.array([20, 21]), + pa.array([30, 31]), + pa.array([40, 41])]) + + print(ans) + + ans = pc.call_function(func_name, [pa.array([1, 2]), + pa.array([10, 20]), + pa.array([100, 200]), + pa.array([1000, 2000]) + ]) + + print(ans) """ @@ -176,147 +184,169 @@ def varargs_array_function(array1, array2, array3, array4): # Example 5: Scalar Unary -print("=" * 80) -print("Example 5: Scalar Unary ") -print("=" * 80) +def scalar_unary_example(): + print("=" * 80) + print("Example 5: Scalar Unary ") + print("=" * 80) -def unary_scalar_function(scalar): - return pc.call_function("add", [scalar, 1]) + def unary_scalar_function(scalar): + return pc.call_function("add", [scalar, 1]) + arity = 1 + func_name = "py_scalar_add_func" + in_types = [InputType.scalar(pa.int64())] + out_type = pa.int64() + doc = get_function_doc("scalar add function", "scalar add function", + ["scalar_value"], "None") + register_function(func_name, arity, doc, in_types, + out_type, unary_scalar_function) -arity_5 = Arity.unary() -func_name_5 = "py_scalar_add_func" -in_types_5 = [InputType.scalar(pa.int64())] -out_type_5 = pa.int64() -doc_5 = get_function_doc("scalar add function", "test scalar add function", - ["scalar_value"], "None") -register_function(func_name_5, arity_5, doc_5, in_types_5, - out_type_5, unary_scalar_function) + func = pc.get_function(func_name) -func5 = pc.get_function(func_name_5) + assert func.name == func_name -a5_1 = pc.call_function(func_name_5, [pa.scalar(10)]) + ans = pc.call_function(func_name, [pa.scalar(10)]) -print(a5_1) + print(ans) -a5_2 = pc.call_function(func_name_5, [pa.scalar(1)]) + ans = pc.call_function(func_name, [pa.scalar(1)]) -print(a5_2) + print(ans) # Example 6: Scalar Binary -print("=" * 80) -print("Example 6: Scalar Binary") -print("=" * 80) -arity_6 = Arity.binary() -func_name_6 = "scalar_udf_binary_add" -in_types_6 = [InputType.scalar(pa.int64()), InputType.scalar(pa.int64())] -out_type_6 = pa.int64() -scalar_binary_add_function_doc = get_function_doc( - "scalar bin add function", - "test scalar bin add function", - ["scalar_value1", "scalar_value2"], "None") - +def scalar_binary_example(): + print("=" * 80) + print("Example 6: Scalar Binary") + print("=" * 80) + arity = 2 + func_name = "scalar_udf_binary_add" + in_types = [InputType.scalar(pa.int64()), InputType.scalar(pa.int64())] + out_type = pa.int64() + doc = get_function_doc( + "scalar bin add function", + "scalar bin add function", + ["scalar_value1", "scalar_value2"], "None") -def binary_scalar_function(scalar1, scalar2): - return pc.call_function("add", [scalar1, scalar2]) + def binary_scalar_function(scalar1, scalar2): + return pc.call_function("add", [scalar1, scalar2]) + register_function(func_name, arity, doc, + in_types, out_type, binary_scalar_function) -register_function(func_name_6, arity_6, scalar_binary_add_function_doc, - in_types_6, out_type_6, binary_scalar_function) + func = pc.get_function(func_name) -func6 = pc.get_function(func_name_6) + assert func.name == func_name -a6_1 = pc.call_function(func_name_6, [pa.scalar(10), pa.scalar(20)]) + ans = pc.call_function(func_name, [pa.scalar(10), pa.scalar(20)]) -print(a6_1) + print(ans) -a6_2 = pc.call_function(func_name_6, [pa.scalar(50), pa.scalar(30)]) + ans = pc.call_function(func_name, [pa.scalar(50), pa.scalar(30)]) -print(a6_2) + print(ans) # Example 8: Scalar Ternary -print("=" * 80) -print("Example 7: Scalar Ternary") -print("=" * 80) -arity_7 = Arity.ternary() -func_name_7 = "scalar_udf_ternary_add" -in_types_7 = [InputType.scalar(pa.int64()), - InputType.scalar(pa.int64()), - InputType.scalar(pa.int64())] -out_type_7 = pa.int64() -scalar_ternary_add_function_doc = get_function_doc( - "scalar ternary add function", - "test scalar ternary add function", - ["scalar_value1", "scalar_value2", - "scalar_value3"], "None") -def ternary_scalar_function(scalar1, scalar2, scalar3): - return pc.call_function("add", - [pc.call_function("add", - [scalar1, scalar2]), - scalar3]) +def scalar_ternary_function(): + print("=" * 80) + print("Example 7: Scalar Ternary") + print("=" * 80) + arity = 3 + func_name = "scalar_udf_ternary_add" + in_types = [InputType.scalar(pa.int64()), + InputType.scalar(pa.int64()), + InputType.scalar(pa.int64())] + out_type = pa.int64() + doc = get_function_doc( + "scalar ternary add function", + "scalar ternary add function", + ["scalar_value1", "scalar_value2", + "scalar_value3"], "None") + def ternary_scalar_function(scalar1, scalar2, scalar3): + return pc.call_function("add", + [pc.call_function("add", + [scalar1, scalar2]), + scalar3]) -register_function(func_name_7, arity_7, scalar_ternary_add_function_doc, - in_types_7, out_type_7, ternary_scalar_function) + register_function(func_name, arity, doc, + in_types, out_type, ternary_scalar_function) -func7 = pc.get_function(func_name_7) + func = pc.get_function(func_name) -a7_1 = pc.call_function( - func_name_7, [pa.scalar(10), pa.scalar(20), pa.scalar(30)]) + assert func.name == func_name -print(a7_1) + ans = pc.call_function( + func_name, [pa.scalar(10), pa.scalar(20), pa.scalar(30)]) -a7_2 = pc.call_function( - func_name_7, [pa.scalar(1), pa.scalar(2), pa.scalar(3)]) + print(ans) -print(a7_2) + ans = pc.call_function( + func_name, [pa.scalar(1), pa.scalar(2), pa.scalar(3)]) + print(ans) -# Example 8: Scalar VarArgs -print("=" * 80) -print("Example 8: Scalar VarArgs") -print("=" * 80) -arity_8 = Arity.varargs(4) -func_name_8 = "scalar_udf_varargs_add" -in_types_8 = [InputType.scalar(pa.int64()), - InputType.scalar(pa.int64()), - InputType.scalar(pa.int64()), - InputType.scalar(pa.int64())] -out_type_8 = pa.int64() - -scalar_ternary_add_function_doc = get_function_doc( - "scalar ternary add function", - "test scalar ternary add function", - ["scalar_value1", - "scalar_value2", - "scalar_value3", - "scalar_value4"], "None") - - -def ternary_scalar_function(scalar1, scalar2, scalar3, scalar4): - return pc.call_function("add", - [pc.call_function("add", - [pc.call_function("add", - [scalar1, - scalar2]), - scalar3]), scalar4]) - - -register_function(func_name_8, arity_8, scalar_ternary_add_function_doc, - in_types_8, out_type_8, ternary_scalar_function) -func8 = pc.get_function(func_name_8) - -a8_1 = pc.call_function(func_name_8, [pa.scalar( - 10), pa.scalar(20), pa.scalar(30), pa.scalar(40)]) - -print(a8_1) - -a8_2 = pc.call_function(func_name_8, [pa.scalar( - 1), pa.scalar(2), pa.scalar(3), pa.scalar(4)]) - -print(a8_2) +# Example 8: Scalar VarArgs +def scalar_varargs_function(): + print("=" * 80) + print("Example 8: Scalar VarArgs") + print("=" * 80) + arity = 4 + func_name = "scalar_udf_varargs_add" + in_types = [InputType.scalar(pa.int64()), + InputType.scalar(pa.int64()), + InputType.scalar(pa.int64()), + InputType.scalar(pa.int64())] + out_type = pa.int64() + + doc = get_function_doc( + "scalar ternary add function", + "scalar ternary add function", + ["scalar_value1", + "scalar_value2", + "scalar_value3", + "scalar_value4"], "None") + + def varargs_scalar_function(scalar1, scalar2, scalar3, scalar4): + return pc.call_function("add", + [pc.call_function("add", + [pc.call_function("add", + [scalar1, + scalar2]), + scalar3]), scalar4]) + + register_function(func_name, arity, doc, + in_types, out_type, varargs_scalar_function) + + func = pc.get_function(func_name) + + assert func.name == func_name + + ans = pc.call_function(func_name, [pa.scalar( + 10), pa.scalar(20), pa.scalar(30), pa.scalar(40)]) + + print(ans) + + ans = pc.call_function(func_name, [pa.scalar( + 1), pa.scalar(2), pa.scalar(3), pa.scalar(4)]) + + print(ans) + + +if __name__ == '__main__': + + # scalar function examples + scalar_unary_example() + scalar_binary_example() + scalar_ternary_function() + scalar_varargs_function() + + # array function examples + array_unary_example() + array_binary_example() + array_ternary_example() + array_varargs_example() diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index bf58998c08a..a0bc1ff6eab 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -292,6 +292,14 @@ cdef class Arity(_Weakrefable): cdef void init(self, const CArity &arity): self.arity = arity + @staticmethod + def nullary(): + """ + create a nullary arity object + """ + cdef CArity c_arity = CArity.Nullary() + return wrap_arity(c_arity) + @staticmethod def unary(): """ @@ -622,9 +630,6 @@ cdef class FunctionRegistry(_Weakrefable): func = GetResultValue(self.registry.GetFunction(c_name)) return wrap_function(func) - def register_function(self, name, arity, input_types, output_type, function_kind): - pass - cdef FunctionRegistry _global_func_registry = FunctionRegistry() @@ -2442,16 +2447,12 @@ cdef CFunctionDoc _make_function_doc(func_doc): else: raise ValueError("key `arg_names` cannot be None") - if func_doc["options_class"] and isinstance(func_doc["options_class"], str): - f_doc.options_class = func_doc["options_class"].encode() - else: - raise ValueError("key `options_class` cannot be None") + # UDFOptions integration: + # TODO: https://issues.apache.org/jira/browse/ARROW-16041 + f_doc.options_class = tobytes("None") - if isinstance(func_doc["options_required"], bool): - c_options_required = func_doc["options_required"] - f_doc.options_required = c_options_required - else: - raise ValueError("key `options_required` must be bool") + c_options_required = False + f_doc.options_required = c_options_required return f_doc else: @@ -2480,26 +2481,24 @@ cdef class UDFRegistrationError(UDFError): return CStatus_UnknownError(message) -def register_function(func_name, arity, function_doc, in_types, +def register_function(func_name, num_args, function_doc, in_types, out_type, callback, mem_allocation="no_preallocate", null_handling="computed_no_preallocate"): """ - Register a user-defined-function (function) + Register a user-defined-function Parameters ---------- func_name: str function name - arity: Arity - arity of the function + num_args: int + number of arguments in the function function_doc: dict a dictionary object with keys ("summary", "description", - "arg_names", - "options_class", (not supported yet) - "options_required" (not supported yet) + "arg_names" ) in_types: List[InputType] list of InputType objects which defines the input @@ -2507,15 +2506,54 @@ def register_function(func_name, arity, function_doc, in_types, out_type: DataType output type of the function callback: callable - user defined function + user defined function + function includes arguments equal to the number + of input_types defined. The return type of the + function is of the type defined as output_type. + The output is a datum object which can be + an Array or a ChunkedArray or a Table or a RecordBatch. mem_allocation: str - memory allocation mode - "preallocate" or "no_preallocate" + For data types that support pre-allocation (i.e. fixed-width), the + kernel expects to be provided a pre-allocated data buffer to write + into. Non-fixed-width types must always allocate their own data + buffers. The allocation made for the same length as the execution batch, + so vector kernels yielding differently sized output should not use this. + It is valid for the data to not be preallocated but the validity bitmap + is (or is computed using the intersection/bitwise-and method). + + memory allocation mode + + "preallocate" + For variable-size output types like BinaryType or StringType, or for + nested types, this option has no effect. + "no_preallocate" + The kernel is responsible for allocating its own data buffer for + fixed-width type outputs. + null_handling: str + null handling mode - one of "intersect", "computed_preallocate", - "computed_no_preallocate", + + "intersect" + Compute the output validity bitmap by intersecting the validity bitmaps + of the arguments using bitwise-and operations. This means that values + in the output are valid/non-null only if the corresponding values in + all input arguments were valid/non-null. Kernel generally need not + touch the bitmap thereafter, but a kernel's exec function is permitted + to alter the bitmap after the null intersection is computed if it needs + to. + + "computed_preallocate" + Kernel expects a pre-allocated buffer to write the result bitmap + into. The preallocated memory is not zeroed (except for the last byte), + so the kernel should ensure to completely populate the bitmap. + + "computed_no_preallocate" + Kernel allocates and sets the validity bitmap of the output. + "output_not_null" + Kernel output is never null and a validity bitmap does not need to be + allocated. Example ------- @@ -2528,8 +2566,6 @@ def register_function(func_name, arity, function_doc, in_types, >>> func_doc["summary"] = "simple udf" >>> func_doc["description"] = "add a constant to a scalar" >>> func_doc["arg_names"] = ["x"] - >>> func_doc["options_class"] = "None" - >>> func_doc["options_required"] = False >>> >>> def add_constant(array): ... return pc.call_function("add", [array, 1]) @@ -2584,8 +2620,18 @@ def register_function(func_name, arity, function_doc, in_types, else: raise ValueError("func_name should be str") - if arity and isinstance(arity, Arity): - c_arity = ( arity).arity + if num_args and isinstance(num_args, int): + assert num_args > 0 + if num_args == 0: + c_arity = ( Arity.nullary()).arity + elif num_args == 1: + c_arity = ( Arity.unary()).arity + elif num_args == 2: + c_arity = ( Arity.binary()).arity + elif num_args == 3: + c_arity = ( Arity.ternary()).arity + elif num_args > 3: + c_arity = ( Arity.varargs(num_args)).arity else: raise ValueError("arity must be an instance of Arity") diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 1b095c52e55..fa15b943f02 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -25,6 +25,7 @@ from pyarrow.includes.libarrow cimport (CArray, CDataType, CField, # You cannot assign something to a dereferenced pointer in Cython thus these # methods don't use Status to indicate a successful operation. + cdef api bint pyarrow_is_buffer(object buffer): return isinstance(buffer, Buffer) diff --git a/python/pyarrow/tests/test_udf.py b/python/pyarrow/tests/test_udf.py index 51f7199af5f..bacd91fe595 100644 --- a/python/pyarrow/tests/test_udf.py +++ b/python/pyarrow/tests/test_udf.py @@ -22,17 +22,14 @@ import pyarrow as pa from pyarrow import compute as pc from pyarrow.compute import register_function -from pyarrow.compute import Arity, InputType +from pyarrow.compute import InputType -def get_function_doc(summary: str, desc: str, arg_names: List[str], - options_class: str, options_required: bool = False): +def get_function_doc(summary: str, desc: str, arg_names: List[str]): func_doc = {} func_doc["summary"] = summary func_doc["description"] = desc func_doc["arg_names"] = arg_names - func_doc["options_class"] = options_class - func_doc["options_required"] = False return func_doc # scalar unary function data @@ -40,8 +37,7 @@ def get_function_doc(summary: str, desc: str, arg_names: List[str], unary_doc = get_function_doc("add function", "test add function", - ["scalar1"], - "None") + ["scalar1"]) def unary_function(scalar1): @@ -52,8 +48,7 @@ def unary_function(scalar1): binary_doc = get_function_doc("y=mx", "find y from y = mx", - ["m", "x"], - "None") + ["m", "x"]) def binary_function(m, x): @@ -64,8 +59,7 @@ def binary_function(m, x): ternary_doc = get_function_doc("y=mx+c", "find y from y = mx + c", - ["m", "x", "c"], - "None") + ["m", "x", "c"]) def ternary_function(m, x, c): @@ -77,8 +71,7 @@ def ternary_function(m, x, c): varargs_doc = get_function_doc("z=ax+by+c", "find z from z = ax + by + c", - ["a", "x", "b", "y", "c"], - "None") + ["a", "x", "b", "y", "c"]) def varargs_function(a, x, b, y, c): @@ -163,10 +156,10 @@ def function_names(): @pytest.fixture def function_arities(): return [ - Arity.unary(), - Arity.binary(), - Arity.ternary(), - Arity.varargs(5), + 1, + 2, + 3, + 5, ] From 427ef1d09487f7279ef25ffc9102c0ea73788a97 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Mon, 28 Mar 2022 16:37:41 +0530 Subject: [PATCH 050/131] adding test cases for negative cases --- python/pyarrow/_compute.pyx | 17 ++++++-- python/pyarrow/compute.py | 4 +- python/pyarrow/tests/test_udf.py | 70 +++++++++++++++++++++++++++++++- 3 files changed, 85 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index a0bc1ff6eab..ff35ab1a34f 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2425,7 +2425,7 @@ cdef CFunctionDoc _make_function_doc(func_doc): CFunctionDoc f_doc vector[c_string] c_arg_names c_bool c_options_required - if isinstance(func_doc, dict): + if func_doc and isinstance(func_doc, dict): if func_doc["summary"] and isinstance(func_doc["summary"], str): f_doc.summary = func_doc["summary"].encode() else: @@ -2456,7 +2456,7 @@ cdef CFunctionDoc _make_function_doc(func_doc): return f_doc else: - raise TypeError(f"func_doc must be a dictionary") + raise ValueError(f"func_doc must be a dictionary") cdef class UDFError(Exception): @@ -2641,9 +2641,18 @@ def register_function(func_name, num_args, function_doc, in_types, for in_type in in_types: in_tmp = ( in_type).input_type c_in_types.push_back(in_tmp) + else: + raise ValueError("input types must be of type InputType") + + if out_type: + c_type = pyarrow_unwrap_data_type(out_type) + else: + raise ValueError("Output value type must be defined") - c_type = pyarrow_unwrap_data_type(out_type) - c_callback = callback + if callback and callable(callback): + c_callback = callback + else: + raise ValueError("callback must be a callable") c_out_type = new COutputType(c_type) c_mem_allocation = _mem_allocation_map[mem_allocation] diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 6ce52255a22..46dc91e2814 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -80,7 +80,9 @@ _group_by, register_function, # Expressions - Expression + Expression, + # Exceptions + UDFRegistrationError, ) from collections import namedtuple diff --git a/python/pyarrow/tests/test_udf.py b/python/pyarrow/tests/test_udf.py index bacd91fe595..63a05476382 100644 --- a/python/pyarrow/tests/test_udf.py +++ b/python/pyarrow/tests/test_udf.py @@ -22,7 +22,7 @@ import pyarrow as pa from pyarrow import compute as pc from pyarrow.compute import register_function -from pyarrow.compute import InputType +from pyarrow.compute import InputType, UDFRegistrationError def get_function_doc(summary: str, desc: str, arg_names: List[str]): @@ -280,3 +280,71 @@ def test_scalar_udf_function_with_scalar_data(function_names, result = pc.call_function(name, input, options=None, memory_pool=None) assert result == expected_output + + +def test_udf_input(): + def unary_scalar_function(scalar): + return pc.call_function("add", [scalar, 1]) + + # validate arity + arity = -1 + func_name = "py_scalar_add_func" + in_types = [InputType.scalar(pa.int64())] + out_type = pa.int64() + doc = get_function_doc("scalar add function", "scalar add function", + ["scalar_value"]) + try: + register_function(func_name, arity, doc, in_types, + out_type, unary_scalar_function) + except Exception as ex: + assert isinstance(ex, AssertionError) + + # validate function name + try: + register_function(None, 1, doc, in_types, + out_type, unary_scalar_function) + except Exception as ex: + assert isinstance(ex, ValueError) + + # validate docs + try: + register_function(func_name, 1, None, in_types, + out_type, unary_scalar_function) + except Exception as ex: + assert isinstance(ex, ValueError) + + # validate function not matching defined arity config + def invalid_function(array1, array2): + return pc.call_function("add", [array1, array2]) + + try: + register_function("invalid_function", 1, doc, in_types, + out_type, invalid_function) + pc.call_function("invalid_function", [pa.array([10]), pa.array([20])], + options=None, memory_pool=None) + except Exception as ex: + assert isinstance(ex, pa.lib.ArrowInvalid) + + # validate function + try: + register_function("none_function", 1, doc, in_types, + out_type, None) + except Exception as ex: + assert isinstance(ex, ValueError) + assert "callback must be a callable" == str(ex) + + # validate output type + try: + register_function(func_name, 1, doc, in_types, + None, unary_scalar_function) + except Exception as ex: + assert isinstance(ex, ValueError) + assert "Output value type must be defined" == str(ex) + + # validate input type + try: + register_function(func_name, 1, doc, None, + out_type, unary_scalar_function) + except Exception as ex: + assert isinstance(ex, ValueError) + assert "input types must be of type InputType" == str(ex) From bd1e74b974168f0a1263df5f7ed3ebba151c53f2 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Mon, 28 Mar 2022 18:45:22 +0530 Subject: [PATCH 051/131] fixing an issue in func docs passing --- cpp/src/arrow/python/udf.cc | 7 +++---- cpp/src/arrow/python/udf.h | 13 ++++++++----- python/examples/udf/udf_example.py | 18 +++++++++--------- python/pyarrow/_compute.pyx | 6 ++++-- python/pyarrow/includes/libarrow.pxd | 8 +++++--- python/pyarrow/tests/test_udf.py | 4 ++-- 6 files changed, 31 insertions(+), 25 deletions(-) diff --git a/cpp/src/arrow/python/udf.cc b/cpp/src/arrow/python/udf.cc index 293e27f1415..d9259f0c5b4 100644 --- a/cpp/src/arrow/python/udf.cc +++ b/cpp/src/arrow/python/udf.cc @@ -71,11 +71,10 @@ Status VerifyArityAndInput(cp::Arity arity, const cp::ExecBatch& batch) { return Status::OK(); } -Status ScalarUdfBuilder::MakeFunction(PyObject* function) { +Status ScalarUdfBuilder::MakeFunction(PyObject* function, UDFOptions* options) { Status st; - auto func_doc = this->doc(); - auto func = - std::make_shared(this->name(), this->arity(), &func_doc); + auto doc = this->doc(); + auto func = std::make_shared(this->name(), this->arity(), &doc); // creating a copy of objects for the lambda function auto py_function = function; auto arity = this->arity(); diff --git a/cpp/src/arrow/python/udf.h b/cpp/src/arrow/python/udf.h index 37c1086d979..0fb64e4b780 100644 --- a/cpp/src/arrow/python/udf.h +++ b/cpp/src/arrow/python/udf.h @@ -50,10 +50,13 @@ DECLARE_CALL_UDF(Array, array, make_array) #undef DECLARE_CALL_UDF +// Exposing the UDFOptions: https://issues.apache.org/jira/browse/ARROW-16041 +struct UDFOptions {}; + class ARROW_PYTHON_EXPORT UdfBuilder { public: UdfBuilder(const std::string func_name, const cp::Function::Kind kind, - const cp::Arity arity, const cp::FunctionDoc* func_doc, + const cp::Arity arity, const cp::FunctionDoc func_doc, const std::vector in_types, const cp::OutputType out_type, const cp::MemAllocation::type mem_allocation, const cp::NullHandling::type null_handling) @@ -72,7 +75,7 @@ class ARROW_PYTHON_EXPORT UdfBuilder { const cp::Arity& arity() const { return arity_; } - const cp::FunctionDoc& doc() const { return *func_doc_; } + const cp::FunctionDoc doc() const { return func_doc_; } const std::vector& input_types() const { return in_types_; } @@ -86,7 +89,7 @@ class ARROW_PYTHON_EXPORT UdfBuilder { std::string func_name_; cp::Function::Kind kind_; cp::Arity arity_; - const cp::FunctionDoc* func_doc_; + const cp::FunctionDoc func_doc_; std::vector in_types_; cp::OutputType out_type_; cp::MemAllocation::type mem_allocation_; @@ -96,7 +99,7 @@ class ARROW_PYTHON_EXPORT UdfBuilder { class ARROW_PYTHON_EXPORT ScalarUdfBuilder : public UdfBuilder { public: ScalarUdfBuilder(const std::string func_name, const cp::Arity arity, - const cp::FunctionDoc* func_doc, + const cp::FunctionDoc func_doc, const std::vector in_types, const cp::OutputType out_type, const cp::MemAllocation::type mem_allocation, @@ -104,7 +107,7 @@ class ARROW_PYTHON_EXPORT ScalarUdfBuilder : public UdfBuilder { : UdfBuilder(func_name, cp::Function::SCALAR, arity, func_doc, in_types, out_type, mem_allocation, null_handling) {} - Status MakeFunction(PyObject* function); + Status MakeFunction(PyObject* function, UDFOptions* options = NULLPTR); }; } // namespace py diff --git a/python/examples/udf/udf_example.py b/python/examples/udf/udf_example.py index 47c8fac64ab..67bc8a5bd41 100644 --- a/python/examples/udf/udf_example.py +++ b/python/examples/udf/udf_example.py @@ -341,12 +341,12 @@ def varargs_scalar_function(scalar1, scalar2, scalar3, scalar4): # scalar function examples scalar_unary_example() - scalar_binary_example() - scalar_ternary_function() - scalar_varargs_function() - - # array function examples - array_unary_example() - array_binary_example() - array_ternary_example() - array_varargs_example() + # scalar_binary_example() + # scalar_ternary_function() + # scalar_varargs_function() + + # # array function examples + # array_unary_example() + # array_binary_example() + # array_ternary_example() + # array_varargs_example() diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index ff35ab1a34f..e2040aea6ae 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2601,6 +2601,7 @@ def register_function(func_name, num_args, function_doc, in_types, MemAllocation c_mem_allocation NullHandling c_null_handling CStatus st + CUDFOptions c_options object obj _mem_allocation_map = { @@ -2660,10 +2661,11 @@ def register_function(func_name, num_args, function_doc, in_types, # Note: The VectorUDF, TableUDF and AggregatorUDFs will be defined # when they are implemented. Only ScalarUDFBuilder is supported at the # moment. - c_sc_builder = new CScalarUdfBuilder(c_func_name, c_arity, &c_func_doc, + c_sc_builder = new CScalarUdfBuilder(c_func_name, c_arity, c_func_doc, c_in_types, deref(c_out_type), c_mem_allocation, c_null_handling) - st = c_sc_builder.MakeFunction(c_callback) + + st = c_sc_builder.MakeFunction(c_callback, &c_options) if not st.ok(): error_msg = st.message().decode() raise UDFRegistrationError(message=error_msg) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 0b714118f0b..9a9f7b49184 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2722,12 +2722,14 @@ cdef extern from "arrow/compute/kernel.h" namespace "arrow::compute" nogil: NullHandling_OUTPUT_NOT_NULL" arrow::compute::NullHandling::OUTPUT_NOT_NULL" cdef extern from "arrow/python/udf.h" namespace "arrow::py" nogil: + cdef cppclass CUDFOptions" arrow::py::UDFOptions": + pass cdef cppclass CUdfBuilder" arrow::py::UdfBuilder": - CUdfBuilder(c_string func_name, FunctionKind kind, CArity arity, CFunctionDoc* func_doc, + CUdfBuilder(c_string func_name, FunctionKind kind, CArity arity, CFunctionDoc func_doc, vector[CInputType] in_types, COutputType out_type, MemAllocation mem_allocation, NullHandling null_handling) cdef cppclass CScalarUdfBuilder" arrow::py::ScalarUdfBuilder"(CUdfBuilder): - CScalarUdfBuilder(c_string func_name, CArity arity, CFunctionDoc* func_doc, + CScalarUdfBuilder(c_string func_name, CArity arity, CFunctionDoc func_doc, vector[CInputType] in_types, COutputType out_type, MemAllocation mem_allocation, NullHandling null_handling) - CStatus MakeFunction(PyObject* function) + CStatus MakeFunction(PyObject* function, CUDFOptions* options) diff --git a/python/pyarrow/tests/test_udf.py b/python/pyarrow/tests/test_udf.py index 63a05476382..05e8b07c343 100644 --- a/python/pyarrow/tests/test_udf.py +++ b/python/pyarrow/tests/test_udf.py @@ -22,7 +22,7 @@ import pyarrow as pa from pyarrow import compute as pc from pyarrow.compute import register_function -from pyarrow.compute import InputType, UDFRegistrationError +from pyarrow.compute import InputType def get_function_doc(summary: str, desc: str, arg_names: List[str]): @@ -278,7 +278,7 @@ def test_scalar_udf_function_with_scalar_data(function_names, func = pc.get_function(name) assert func.name == name - result = pc.call_function(name, input, options=None, memory_pool=None) + result = pc.call_function(name, input) assert result == expected_output From be1e59f4b8195bdf72ffb57a89bb137011aab88b Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Mon, 28 Mar 2022 20:15:48 +0530 Subject: [PATCH 052/131] minor check for appveyour --- python/examples/udf/udf_example.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/python/examples/udf/udf_example.py b/python/examples/udf/udf_example.py index 67bc8a5bd41..47c8fac64ab 100644 --- a/python/examples/udf/udf_example.py +++ b/python/examples/udf/udf_example.py @@ -341,12 +341,12 @@ def varargs_scalar_function(scalar1, scalar2, scalar3, scalar4): # scalar function examples scalar_unary_example() - # scalar_binary_example() - # scalar_ternary_function() - # scalar_varargs_function() - - # # array function examples - # array_unary_example() - # array_binary_example() - # array_ternary_example() - # array_varargs_example() + scalar_binary_example() + scalar_ternary_function() + scalar_varargs_function() + + # array function examples + array_unary_example() + array_binary_example() + array_ternary_example() + array_varargs_example() From a9228e9657db1897749be98e0f48753172c76fd4 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 31 Mar 2022 10:51:05 +0530 Subject: [PATCH 053/131] removing aggregate example --- cpp/examples/arrow/aggregate_example.cc | 127 ------------------------ 1 file changed, 127 deletions(-) delete mode 100644 cpp/examples/arrow/aggregate_example.cc diff --git a/cpp/examples/arrow/aggregate_example.cc b/cpp/examples/arrow/aggregate_example.cc deleted file mode 100644 index aa116c063e4..00000000000 --- a/cpp/examples/arrow/aggregate_example.cc +++ /dev/null @@ -1,127 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include -#include -#include // ARROW-15263 -#include -#include -#include - -#include -#include -#include -#include -#include - -namespace cp = arrow::compute; - -#define ABORT_ON_FAILURE(expr) \ - do { \ - arrow::Status status_ = (expr); \ - if (!status_.ok()) { \ - std::cerr << status_.message() << std::endl; \ - abort(); \ - } \ - } while (0); - -template ::value | - arrow::is_boolean_type::value | - arrow::is_temporal_type::value>::type> -arrow::Result> GetArrayDataSample( - const std::vector& values) { - using ARROW_ARRAY_TYPE = typename arrow::TypeTraits::ArrayType; - using ARROW_BUILDER_TYPE = typename arrow::TypeTraits::BuilderType; - ARROW_BUILDER_TYPE builder; - ARROW_RETURN_NOT_OK(builder.Reserve(values.size())); - std::shared_ptr array; - ARROW_RETURN_NOT_OK(builder.AppendValues(values)); - ARROW_RETURN_NOT_OK(builder.Finish(&array)); - return array; -} - -arrow::Result> GetTable() { - std::shared_ptr table; - - auto field_vector = {arrow::field("a", arrow::int64()), - arrow::field("b", arrow::boolean()), - arrow::field("c", arrow::int64())}; - ARROW_ASSIGN_OR_RAISE(auto int_array, - GetArrayDataSample({0, 1, 2, 0, 4, 1, 0, 5})); - ARROW_ASSIGN_OR_RAISE(auto bool_array, - GetArrayDataSample( - {false, true, false, true, true, false, true, false})); - ARROW_ASSIGN_OR_RAISE(auto data_array, GetArrayDataSample( - {10, 11, 12, 10, 11, 11, 10, 15})); - - auto schema = arrow::schema(field_vector); - auto data_vector = {int_array, bool_array, data_array}; - - table = arrow::Table::Make(schema, data_vector, 8); - - return table; -} - -arrow::Status DoAggregate() { - auto maybe_plan = cp::ExecPlan::Make(); - ABORT_ON_FAILURE(maybe_plan.status()); - auto plan = maybe_plan.ValueOrDie(); - cp::ExecContext exec_context(arrow::default_memory_pool(), - ::arrow::internal::GetCpuThreadPool()); - - ARROW_ASSIGN_OR_RAISE(auto table, GetTable()); - - std::cout << "Source Table" << std::endl; - - std::cout << table->ToString() << std::endl; - - std::shared_ptr out; - cp::CountOptions options(cp::CountOptions::ONLY_VALID); - auto aggregate_options = cp::AggregateNodeOptions{/*aggregates=*/{{"sum", &options}}, - /*targets=*/{"c"}, - /*names=*/{"count(c)"}, - /*keys=*/{}}; - auto schema = arrow::schema({arrow::field("count(c)", arrow::int64())}); - - ABORT_ON_FAILURE(cp::Declaration::Sequence( - { - {"table_source", cp::TableSourceNodeOptions{table, 2}}, - {"aggregate", aggregate_options}, - {"table_sink", cp::TableSinkNodeOptions{&out, schema}}, - }) - .AddToPlan(plan.get()) - .status()); - - ARROW_RETURN_NOT_OK(plan->StartProducing()); - - std::cout << "Output Table Data : " << std::endl; - std::cout << out->ToString() << std::endl; - - auto future = plan->finished(); - - return future.status(); -} - -int main(int argc, char** argv) { - auto status = DoAggregate(); - if (!status.ok()) { - std::cerr << "Error occurred: " << status.message() << std::endl; - return EXIT_FAILURE; - } - return EXIT_SUCCESS; -} From a34a455a4b90dba2200ac30c728904cdb0a3509d Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 31 Mar 2022 11:31:57 +0530 Subject: [PATCH 054/131] addressing reviews on udf example --- cpp/examples/arrow/CMakeLists.txt | 6 +- cpp/examples/arrow/udf_example.cc | 163 ++---------------------------- 2 files changed, 12 insertions(+), 157 deletions(-) diff --git a/cpp/examples/arrow/CMakeLists.txt b/cpp/examples/arrow/CMakeLists.txt index c6dd8f54592..0ef268f2562 100644 --- a/cpp/examples/arrow/CMakeLists.txt +++ b/cpp/examples/arrow/CMakeLists.txt @@ -134,10 +134,6 @@ if(ARROW_PARQUET AND ARROW_DATASET) add_arrow_example(join_example EXTRA_LINK_LIBS ${DATASET_EXAMPLES_LINK_LIBS}) add_dependencies(join-example parquet) - add_arrow_example(udf_example EXTRA_LINK_LIBS ${DATASET_EXAMPLES_LINK_LIBS}) - add_dependencies(udf-example parquet) - - add_arrow_example(aggregate_example EXTRA_LINK_LIBS ${DATASET_EXAMPLES_LINK_LIBS}) - add_dependencies(aggregate-example parquet) + add_arrow_example(udf_example) endif() diff --git a/cpp/examples/arrow/udf_example.cc b/cpp/examples/arrow/udf_example.cc index 4733e311535..86db3e27002 100644 --- a/cpp/examples/arrow/udf_example.cc +++ b/cpp/examples/arrow/udf_example.cc @@ -51,108 +51,11 @@ template ::value>::type> arrow::Result> GetArrayDataSample( const std::vector& values) { - using ARROW_ARRAY_TYPE = typename arrow::TypeTraits::ArrayType; - using ARROW_BUILDER_TYPE = typename arrow::TypeTraits::BuilderType; - ARROW_BUILDER_TYPE builder; + using ArrowBuilderType = typename arrow::TypeTraits::BuilderType; + ArrowBuilderType builder; ARROW_RETURN_NOT_OK(builder.Reserve(values.size())); - std::shared_ptr array; ARROW_RETURN_NOT_OK(builder.AppendValues(values)); - ARROW_RETURN_NOT_OK(builder.Finish(&array)); - return array; -} - -arrow::Result> GetTable() { - std::shared_ptr table; - - auto field_vector = { - arrow::field("a", arrow::int64()), arrow::field("x", arrow::int64()), - arrow::field("y", arrow::int64()), arrow::field("z", arrow::int64()), - arrow::field("b", arrow::boolean())}; - - ARROW_ASSIGN_OR_RAISE(auto int_array, - GetArrayDataSample({1, 2, 3, 4, 5, 6})); - ARROW_ASSIGN_OR_RAISE(auto x, - GetArrayDataSample({21, 22, 23, 24, 25, 26})); - ARROW_ASSIGN_OR_RAISE(auto y, - GetArrayDataSample({31, 32, 33, 34, 35, 36})); - ARROW_ASSIGN_OR_RAISE(auto z, - GetArrayDataSample({41, 42, 43, 44, 45, 46})); - ARROW_ASSIGN_OR_RAISE(auto bool_array, GetArrayDataSample( - {false, true, false, true, true, false})); - - auto schema = arrow::schema(field_vector); - auto data_vector = {int_array, x, y, z, bool_array}; - - table = arrow::Table::Make(schema, data_vector, 6); - - return table; -} - -class UDFOptionsType : public cp::FunctionOptionsType { - const char* type_name() const override { return "UDFOptionsType"; } - std::string Stringify(const cp::FunctionOptions&) const override { - return "UDFOptionsType"; - } - bool Compare(const cp::FunctionOptions& options, - const cp::FunctionOptions& other) const override { - return true; - } - std::unique_ptr Copy( - const cp::FunctionOptions& options) const override; -}; - -cp::FunctionOptionsType* GetUDFOptionsType() { - static UDFOptionsType options_type; - return &options_type; -} - -class UDFOptions : public cp::FunctionOptions { - public: - UDFOptions() : cp::FunctionOptions(GetUDFOptionsType()) {} -}; - -std::unique_ptr UDFOptionsType::Copy( - const cp::FunctionOptions&) const { - return std::unique_ptr(new UDFOptions()); -} - -class ExampleNodeOptions : public cp::ExecNodeOptions {}; - -// a basic ExecNode which ignores all input batches -class ExampleNode : public cp::ExecNode { - public: - ExampleNode(ExecNode* input, const ExampleNodeOptions&) - : ExecNode(/*plan=*/input->plan(), /*inputs=*/{input}, - /*input_labels=*/{"ignored"}, - /*output_schema=*/input->output_schema(), /*num_outputs=*/1) {} - - const char* kind_name() const override { return "ExampleNode"; } - - arrow::Status StartProducing() override { - outputs_[0]->InputFinished(this, 0); - return arrow::Status::OK(); - } - - void ResumeProducing(ExecNode* output) override {} - void PauseProducing(ExecNode* output) override {} - - void StopProducing(ExecNode* output) override { inputs_[0]->StopProducing(this); } - void StopProducing() override { inputs_[0]->StopProducing(); } - - void InputReceived(ExecNode* input, cp::ExecBatch batch) override {} - void ErrorReceived(ExecNode* input, arrow::Status error) override {} - void InputFinished(ExecNode* input, int total_batches) override {} - - arrow::Future<> finished() override { return inputs_[0]->finished(); } -}; - -arrow::Result ExampleExecNodeFactory(cp::ExecPlan* plan, - std::vector inputs, - const cp::ExecNodeOptions& options) { - const auto& example_options = - arrow::internal::checked_cast(options); - - return plan->EmplaceNode(inputs[0], example_options); + return builder.Finish(); } const cp::FunctionDoc func_doc{ @@ -175,8 +78,6 @@ arrow::Status SampleFunction(cp::KernelContext* ctx, const cp::ExecBatch& batch, arrow::Status Execute() { const std::string name = "add_three"; auto func = std::make_shared(name, cp::Arity::Ternary(), &func_doc); - - auto options = std::make_shared(); cp::ScalarKernel kernel( {cp::InputType::Array(arrow::int64()), cp::InputType::Array(arrow::int64()), cp::InputType::Array(arrow::int64())}, @@ -190,57 +91,15 @@ arrow::Status Execute() { auto registry = cp::GetFunctionRegistry(); ABORT_ON_FAILURE(registry->AddFunction(std::move(func))); - auto exec_registry = cp::default_exec_factory_registry(); - ABORT_ON_FAILURE( - exec_registry->AddFactory("udf_register_example", ExampleExecNodeFactory)); - - auto maybe_plan = cp::ExecPlan::Make(); - ABORT_ON_FAILURE(maybe_plan.status()); - auto plan = maybe_plan.ValueOrDie(); - cp::ExecContext exec_context(arrow::default_memory_pool(), - ::arrow::internal::GetCpuThreadPool()); - arrow::AsyncGenerator> source_gen, sink_gen; - - cp::Expression custom_exp = cp::call( - name, {cp::field_ref("x"), cp::field_ref("y"), cp::field_ref("z")}, options); - - ARROW_ASSIGN_OR_RAISE(auto table, GetTable()); - auto table_source_node_options = cp::TableSourceNodeOptions{table, 2}; - auto project_node_options = cp::ProjectNodeOptions{{ - cp::field_ref("a"), - custom_exp, - cp::field_ref("b"), - }}; - auto output_schema = arrow::schema({arrow::field("a", arrow::int64()), - arrow::field("x + y + z", arrow::int64()), - arrow::field("b", arrow::boolean())}); - std::shared_ptr out; - auto table_sink_node_options = cp::TableSinkNodeOptions{&out, output_schema}; - ABORT_ON_FAILURE( - cp::Declaration::Sequence({ - {"table_source", table_source_node_options}, - {"project", project_node_options}, - {"table_sink", table_sink_node_options}, - }) - .AddToPlan(plan.get()) - .status()); - - ARROW_RETURN_NOT_OK(plan->StartProducing()); - constexpr int print_len = 25; - std::cout << std::string(print_len, '#') << std::endl; - std::cout << "Input Table Data : " << std::endl; - std::cout << std::string(print_len, '#') << std::endl; + ARROW_ASSIGN_OR_RAISE(auto x, GetArrayDataSample({1, 2, 3})); + ARROW_ASSIGN_OR_RAISE(auto y, GetArrayDataSample({4, 5, 6})); + ARROW_ASSIGN_OR_RAISE(auto z, GetArrayDataSample({7, 8, 9})); - std::cout << table->ToString() << std::endl; - - std::cout << std::string(print_len, '#') << std::endl; - std::cout << "Output Table Data : " << std::endl; - std::cout << std::string(print_len, '#') << std::endl; - std::cout << out->ToString() << std::endl; - - auto future = plan->finished(); - - return future.status(); + ARROW_ASSIGN_OR_RAISE(auto res, cp::CallFunction(name, {x, y, z})); + auto res_array = res.make_array(); + std::cout << "Result" << std::endl; + std::cout << res_array->ToString() << std::endl; + return arrow::Status::OK(); } int main(int argc, char** argv) { From 09d126b8942906d6ddb1d18ba2cfddeab7a44281 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 31 Mar 2022 12:00:25 +0530 Subject: [PATCH 055/131] addressing reviews p1 --- cpp/examples/arrow/udf_example.cc | 13 +++---- cpp/src/arrow/python/api.h | 1 - cpp/src/arrow/python/udf.cc | 12 +++--- cpp/src/arrow/python/udf.h | 61 +++++++++++++++---------------- 4 files changed, 41 insertions(+), 46 deletions(-) diff --git a/cpp/examples/arrow/udf_example.cc b/cpp/examples/arrow/udf_example.cc index 86db3e27002..81a1a2b36ab 100644 --- a/cpp/examples/arrow/udf_example.cc +++ b/cpp/examples/arrow/udf_example.cc @@ -66,12 +66,9 @@ const cp::FunctionDoc func_doc{ arrow::Status SampleFunction(cp::KernelContext* ctx, const cp::ExecBatch& batch, arrow::Datum* out) { - auto in_res = cp::CallFunction("add", {batch[0].array(), batch[1].array()}); - auto in_arr = in_res.ValueOrDie().make_array(); - auto final_res = cp::CallFunction("add", {in_arr, batch[2].array()}); - auto final_arr = final_res.ValueOrDie().array(); - auto datum = new arrow::Datum(final_arr); - *out = *datum; + // temp = x + y; return temp + z + ARROW_ASSIGN_OR_RAISE(auto temp, cp::CallFunction("add", {batch[0], batch[1]})); + ARROW_ASSIGN_OR_RAISE(*out, cp::CallFunction("add", {temp, batch[2]})); return arrow::Status::OK(); } @@ -86,10 +83,10 @@ arrow::Status Execute() { kernel.mem_allocation = cp::MemAllocation::NO_PREALLOCATE; kernel.null_handling = cp::NullHandling::COMPUTED_NO_PREALLOCATE; - ABORT_ON_FAILURE(func->AddKernel(std::move(kernel))); + ARROW_RETURN_NOT_OK(func->AddKernel(std::move(kernel))); auto registry = cp::GetFunctionRegistry(); - ABORT_ON_FAILURE(registry->AddFunction(std::move(func))); + ARROW_RETURN_NOT_OK(registry->AddFunction(std::move(func))); ARROW_ASSIGN_OR_RAISE(auto x, GetArrayDataSample({1, 2, 3})); ARROW_ASSIGN_OR_RAISE(auto y, GetArrayDataSample({4, 5, 6})); diff --git a/cpp/src/arrow/python/api.h b/cpp/src/arrow/python/api.h index 7737d791e31..a0b13d6d130 100644 --- a/cpp/src/arrow/python/api.h +++ b/cpp/src/arrow/python/api.h @@ -28,4 +28,3 @@ #include "arrow/python/numpy_to_arrow.h" #include "arrow/python/python_to_arrow.h" #include "arrow/python/serialize.h" -#include "arrow/python/udf.h" diff --git a/cpp/src/arrow/python/udf.cc b/cpp/src/arrow/python/udf.cc index d9259f0c5b4..0b6d6027bd9 100644 --- a/cpp/src/arrow/python/udf.cc +++ b/cpp/src/arrow/python/udf.cc @@ -63,7 +63,7 @@ DEFINE_CALL_UDF(Array, array, make_array) #undef DEFINE_CALL_UDF Status VerifyArityAndInput(cp::Arity arity, const cp::ExecBatch& batch) { - bool match = (uint64_t)arity.num_args == batch.values.size(); + bool match = static_cast(arity.num_args) == batch.values.size(); if (!match) { return Status::Invalid( "Function Arity and Input data shape doesn't match, expected {}"); @@ -79,9 +79,9 @@ Status ScalarUdfBuilder::MakeFunction(PyObject* function, UDFOptions* options) { auto py_function = function; auto arity = this->arity(); // lambda function - auto call_back_lambda = [py_function, arity](cp::KernelContext* ctx, - const cp::ExecBatch& batch, - Datum* out) -> Status { + auto call_back = [py_function, arity](cp::KernelContext* ctx, + const cp::ExecBatch& batch, + Datum* out) -> Status { PyAcquireGIL lock; if (py_function == NULL) { return Status::ExecutionError("python function cannot be null"); @@ -96,7 +96,7 @@ Status ScalarUdfBuilder::MakeFunction(PyObject* function, UDFOptions* options) { return Status::Invalid("Unexpected input type, scalar or array type expected."); } } else { - return Status::ExecutionError("Expected a callable python object."); + return Status::TypeError("Expected a callable python object."); } return Status::OK(); }; // lambda function @@ -104,7 +104,7 @@ Status ScalarUdfBuilder::MakeFunction(PyObject* function, UDFOptions* options) { cp::ScalarKernel kernel( cp::KernelSignature::Make(this->input_types(), this->output_type(), this->arity().is_varargs), - call_back_lambda); + call_back); kernel.mem_allocation = this->mem_allocation(); kernel.null_handling = this->null_handling(); st = func->AddKernel(std::move(kernel)); diff --git a/cpp/src/arrow/python/udf.h b/cpp/src/arrow/python/udf.h index 0fb64e4b780..a22bba9fd9f 100644 --- a/cpp/src/arrow/python/udf.h +++ b/cpp/src/arrow/python/udf.h @@ -35,14 +35,12 @@ #include "arrow/python/pyarrow.h" #include "arrow/python/visibility.h" -namespace cp = arrow::compute; - namespace arrow { namespace py { -#define DECLARE_CALL_UDF(TYPE_NAME, FUNCTION_SUFFIX, CONVERT_SUFFIX) \ - ARROW_PYTHON_EXPORT Status exec_function_##FUNCTION_SUFFIX(const cp::ExecBatch&, \ +#define DECLARE_CALL_UDF(TYPE_NAME, FUNCTION_SUFFIX, CONVERT_SUFFIX) \ + ARROW_PYTHON_EXPORT Status exec_function_##FUNCTION_SUFFIX(const compute::ExecBatch&, \ PyObject*, int, Datum*); DECLARE_CALL_UDF(Scalar, scalar, scalar) @@ -55,11 +53,12 @@ struct UDFOptions {}; class ARROW_PYTHON_EXPORT UdfBuilder { public: - UdfBuilder(const std::string func_name, const cp::Function::Kind kind, - const cp::Arity arity, const cp::FunctionDoc func_doc, - const std::vector in_types, const cp::OutputType out_type, - const cp::MemAllocation::type mem_allocation, - const cp::NullHandling::type null_handling) + UdfBuilder(const std::string func_name, const compute::Function::Kind kind, + const compute::Arity arity, const compute::FunctionDoc func_doc, + const std::vector in_types, + const compute::OutputType out_type, + const compute::MemAllocation::type mem_allocation, + const compute::NullHandling::type null_handling) : func_name_(func_name), kind_(kind), arity_(arity), @@ -71,41 +70,41 @@ class ARROW_PYTHON_EXPORT UdfBuilder { const std::string& name() const { return func_name_; } - cp::Function::Kind kind() { return kind_; } + compute::Function::Kind kind() { return kind_; } - const cp::Arity& arity() const { return arity_; } + const compute::Arity& arity() const { return arity_; } - const cp::FunctionDoc doc() const { return func_doc_; } + const compute::FunctionDoc doc() const { return func_doc_; } - const std::vector& input_types() const { return in_types_; } + const std::vector& input_types() const { return in_types_; } - const cp::OutputType& output_type() const { return out_type_; } + const compute::OutputType& output_type() const { return out_type_; } - cp::MemAllocation::type mem_allocation() { return mem_allocation_; } + compute::MemAllocation::type mem_allocation() { return mem_allocation_; } - cp::NullHandling::type null_handling() { return null_handling_; } + compute::NullHandling::type null_handling() { return null_handling_; } private: std::string func_name_; - cp::Function::Kind kind_; - cp::Arity arity_; - const cp::FunctionDoc func_doc_; - std::vector in_types_; - cp::OutputType out_type_; - cp::MemAllocation::type mem_allocation_; - cp::NullHandling::type null_handling_; + compute::Function::Kind kind_; + compute::Arity arity_; + const compute::FunctionDoc func_doc_; + std::vector in_types_; + compute::OutputType out_type_; + compute::MemAllocation::type mem_allocation_; + compute::NullHandling::type null_handling_; }; class ARROW_PYTHON_EXPORT ScalarUdfBuilder : public UdfBuilder { public: - ScalarUdfBuilder(const std::string func_name, const cp::Arity arity, - const cp::FunctionDoc func_doc, - const std::vector in_types, - const cp::OutputType out_type, - const cp::MemAllocation::type mem_allocation, - const cp::NullHandling::type null_handling) - : UdfBuilder(func_name, cp::Function::SCALAR, arity, func_doc, in_types, out_type, - mem_allocation, null_handling) {} + ScalarUdfBuilder(const std::string func_name, const compute::Arity arity, + const compute::FunctionDoc func_doc, + const std::vector in_types, + const compute::OutputType out_type, + const compute::MemAllocation::type mem_allocation, + const compute::NullHandling::type null_handling) + : UdfBuilder(func_name, compute::Function::SCALAR, arity, func_doc, in_types, + out_type, mem_allocation, null_handling) {} Status MakeFunction(PyObject* function, UDFOptions* options = NULLPTR); }; From 113d35f0628e8cd7d579c15b899fbb2bbe3d1bd4 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 31 Mar 2022 16:24:15 +0530 Subject: [PATCH 056/131] addressing reviews p2 --- cpp/src/arrow/python/udf.cc | 140 +++++++++++++++-------------- cpp/src/arrow/python/udf.h | 13 +-- python/examples/udf/udf_example.py | 4 + 3 files changed, 83 insertions(+), 74 deletions(-) diff --git a/cpp/src/arrow/python/udf.cc b/cpp/src/arrow/python/udf.cc index 0b6d6027bd9..5bb306b8817 100644 --- a/cpp/src/arrow/python/udf.cc +++ b/cpp/src/arrow/python/udf.cc @@ -24,45 +24,63 @@ #include "arrow/compute/function.h" #include "arrow/python/common.h" -namespace cp = arrow::compute; - namespace arrow { namespace py { -#define DEFINE_CALL_UDF(TYPE_NAME, FUNCTION_SUFFIX, CONVERT_SUFFIX) \ - Status exec_function_##FUNCTION_SUFFIX(const cp::ExecBatch& batch, PyObject* function, \ - int num_args, Datum* out) { \ - std::shared_ptr c_res_data; \ - PyObject* arg_tuple = PyTuple_New(num_args); \ - for (int arg_id = 0; arg_id < num_args; arg_id++) { \ - if (!batch[arg_id].is_##FUNCTION_SUFFIX()) { \ - return Status::Invalid("Input type and data type doesn't match"); \ - } \ - auto c_data = batch[arg_id].CONVERT_SUFFIX(); \ - PyObject* data = wrap_##FUNCTION_SUFFIX(c_data); \ - PyTuple_SetItem(arg_tuple, arg_id, data); \ - } \ - PyObject* result = PyObject_CallObject(function, arg_tuple); \ - if (result == NULL) { \ - return Status::ExecutionError("Error occured in computation"); \ - } \ - auto res = unwrap_##FUNCTION_SUFFIX(result); \ - if (!res.status().ok()) { \ - return res.status(); \ - } \ - c_res_data = res.ValueOrDie(); \ - auto datum = new Datum(c_res_data); \ - *out = *datum; \ - return Status::OK(); \ +Status exec_function_scalar(const compute::ExecBatch& batch, PyObject* function, + int num_args, Datum* out) { + std::shared_ptr c_res_data; + PyObject* arg_tuple = PyTuple_New(num_args); + for (int arg_id = 0; arg_id < num_args; arg_id++) { + if (!batch[arg_id].is_scalar()) { + return Status::Invalid("Input type and data type doesn't match"); + } + auto c_data = batch[arg_id].scalar(); + PyObject* data = wrap_scalar(c_data); + PyTuple_SetItem(arg_tuple, arg_id, data); } + PyObject* result = PyObject_CallObject(function, arg_tuple); + if (result == NULL) { + return Status::ExecutionError("Error occured in computation"); + } + auto res = unwrap_scalar(result); + if (!res.status().ok()) { + return res.status(); + } + c_res_data = res.ValueOrDie(); + auto datum = new Datum(c_res_data); + *out = *datum; + return Status::OK(); +} -DEFINE_CALL_UDF(Scalar, scalar, scalar) -DEFINE_CALL_UDF(Array, array, make_array) - -#undef DEFINE_CALL_UDF +Status exec_function_array(const compute::ExecBatch& batch, PyObject* function, + int num_args, Datum* out) { + std::shared_ptr c_res_data; + PyObject* arg_tuple = PyTuple_New(num_args); + for (int arg_id = 0; arg_id < num_args; arg_id++) { + if (!batch[arg_id].is_array()) { + return Status::Invalid("Input type and data type doesn't match"); + } + auto c_data = batch[arg_id].make_array(); + PyObject* data = wrap_array(c_data); + PyTuple_SetItem(arg_tuple, arg_id, data); + } + PyObject* result = PyObject_CallObject(function, arg_tuple); + if (result == NULL) { + return Status::ExecutionError("Error occured in computation"); + } + auto res = unwrap_array(result); + if (!res.status().ok()) { + return res.status(); + } + c_res_data = res.ValueOrDie(); + auto datum = new Datum(c_res_data); + *out = *datum; + return Status::OK(); +} -Status VerifyArityAndInput(cp::Arity arity, const cp::ExecBatch& batch) { +Status VerifyArityAndInput(compute::Arity arity, const compute::ExecBatch& batch) { bool match = static_cast(arity.num_args) == batch.values.size(); if (!match) { return Status::Invalid( @@ -72,51 +90,43 @@ Status VerifyArityAndInput(cp::Arity arity, const cp::ExecBatch& batch) { } Status ScalarUdfBuilder::MakeFunction(PyObject* function, UDFOptions* options) { - Status st; - auto doc = this->doc(); - auto func = std::make_shared(this->name(), this->arity(), &doc); // creating a copy of objects for the lambda function - auto py_function = function; + Py_INCREF(function); + function_.reset(function); + if (function_.obj() == NULL) { + return Status::ExecutionError("python function cannot be null"); + } + if (!PyCallable_Check(function_.obj())) { + return Status::TypeError("Expected a callable python object."); + } + auto doc = this->doc(); + scalar_func_ = + std::make_shared(this->name(), this->arity(), &doc); auto arity = this->arity(); // lambda function - auto call_back = [py_function, arity](cp::KernelContext* ctx, - const cp::ExecBatch& batch, - Datum* out) -> Status { + auto call_back = [&, arity](compute::KernelContext* ctx, + const compute::ExecBatch& batch, Datum* out) -> Status { PyAcquireGIL lock; - if (py_function == NULL) { - return Status::ExecutionError("python function cannot be null"); - } - if (PyCallable_Check(py_function)) { - RETURN_NOT_OK(VerifyArityAndInput(arity, batch)); - if (batch[0].is_array()) { // checke 0-th element to select array callable - RETURN_NOT_OK(exec_function_array(batch, py_function, arity.num_args, out)); - } else if (batch[0].is_scalar()) { // check 0-th element to select scalar callable - RETURN_NOT_OK(exec_function_scalar(batch, py_function, arity.num_args, out)); - } else { - return Status::Invalid("Unexpected input type, scalar or array type expected."); - } + RETURN_NOT_OK(VerifyArityAndInput(arity, batch)); + if (batch[0].is_array()) { // checke 0-th element to select array callable + RETURN_NOT_OK(exec_function_array(batch, function_.obj(), arity.num_args, out)); + } else if (batch[0].is_scalar()) { // check 0-th element to select scalar callable + RETURN_NOT_OK(exec_function_scalar(batch, function_.obj(), arity.num_args, out)); } else { - return Status::TypeError("Expected a callable python object."); + return Status::Invalid("Unexpected input type, scalar or array type expected."); } return Status::OK(); }; // lambda function - cp::ScalarKernel kernel( - cp::KernelSignature::Make(this->input_types(), this->output_type(), - this->arity().is_varargs), + compute::ScalarKernel kernel( + compute::KernelSignature::Make(this->input_types(), this->output_type(), + this->arity().is_varargs), call_back); kernel.mem_allocation = this->mem_allocation(); kernel.null_handling = this->null_handling(); - st = func->AddKernel(std::move(kernel)); - if (!st.ok()) { - return Status::ExecutionError("Kernel couldn't be added to the udf : " + - st.message()); - } - auto registry = cp::GetFunctionRegistry(); - st = registry->AddFunction(std::move(func)); - if (!st.ok()) { - return Status::ExecutionError("udf registration failed : " + st.message()); - } + RETURN_NOT_OK(scalar_func_->AddKernel(std::move(kernel))); + auto registry = compute::GetFunctionRegistry(); + RETURN_NOT_OK(registry->AddFunction(std::move(func))); return Status::OK(); } diff --git a/cpp/src/arrow/python/udf.h b/cpp/src/arrow/python/udf.h index a22bba9fd9f..09121f1f4f3 100644 --- a/cpp/src/arrow/python/udf.h +++ b/cpp/src/arrow/python/udf.h @@ -39,15 +39,6 @@ namespace arrow { namespace py { -#define DECLARE_CALL_UDF(TYPE_NAME, FUNCTION_SUFFIX, CONVERT_SUFFIX) \ - ARROW_PYTHON_EXPORT Status exec_function_##FUNCTION_SUFFIX(const compute::ExecBatch&, \ - PyObject*, int, Datum*); - -DECLARE_CALL_UDF(Scalar, scalar, scalar) -DECLARE_CALL_UDF(Array, array, make_array) - -#undef DECLARE_CALL_UDF - // Exposing the UDFOptions: https://issues.apache.org/jira/browse/ARROW-16041 struct UDFOptions {}; @@ -107,6 +98,10 @@ class ARROW_PYTHON_EXPORT ScalarUdfBuilder : public UdfBuilder { out_type, mem_allocation, null_handling) {} Status MakeFunction(PyObject* function, UDFOptions* options = NULLPTR); + + private: + OwnedRefNoGIL function_; + std::shared_ptr scalar_func_; }; } // namespace py diff --git a/python/examples/udf/udf_example.py b/python/examples/udf/udf_example.py index 47c8fac64ab..501d21e828b 100644 --- a/python/examples/udf/udf_example.py +++ b/python/examples/udf/udf_example.py @@ -41,6 +41,8 @@ def add_one(array): in_types, out_type, add_one) func = pc.get_function(func_name) + + print(func) assert func.name == func_name @@ -159,6 +161,8 @@ def varargs_array_function(array1, array2, array3, array4): in_types, out_type, varargs_array_function) func = pc.get_function(func_name) + + print(func) assert func.name == func_name From 1b8183a0e71d2f069315febc997c44fe60a5ebc4 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 31 Mar 2022 17:31:30 +0530 Subject: [PATCH 057/131] fixing a typo --- cpp/src/arrow/python/udf.cc | 2 +- python/examples/udf/udf_example.py | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/cpp/src/arrow/python/udf.cc b/cpp/src/arrow/python/udf.cc index 5bb306b8817..e3c1f1fafbe 100644 --- a/cpp/src/arrow/python/udf.cc +++ b/cpp/src/arrow/python/udf.cc @@ -126,7 +126,7 @@ Status ScalarUdfBuilder::MakeFunction(PyObject* function, UDFOptions* options) { kernel.null_handling = this->null_handling(); RETURN_NOT_OK(scalar_func_->AddKernel(std::move(kernel))); auto registry = compute::GetFunctionRegistry(); - RETURN_NOT_OK(registry->AddFunction(std::move(func))); + RETURN_NOT_OK(registry->AddFunction(std::move(scalar_func_))); return Status::OK(); } diff --git a/python/examples/udf/udf_example.py b/python/examples/udf/udf_example.py index 501d21e828b..47c8fac64ab 100644 --- a/python/examples/udf/udf_example.py +++ b/python/examples/udf/udf_example.py @@ -41,8 +41,6 @@ def add_one(array): in_types, out_type, add_one) func = pc.get_function(func_name) - - print(func) assert func.name == func_name @@ -161,8 +159,6 @@ def varargs_array_function(array1, array2, array3, array4): in_types, out_type, varargs_array_function) func = pc.get_function(func_name) - - print(func) assert func.name == func_name From 58e8b90e351d462ed5694b1a9947b56105478778 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 31 Mar 2022 17:33:57 +0530 Subject: [PATCH 058/131] fixing typo --- python/pyarrow/public-api.pxi | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index fa15b943f02..f8bf6709f35 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -37,6 +37,7 @@ cdef api shared_ptr[CBuffer] pyarrow_unwrap_buffer(object buffer): return shared_ptr[CBuffer]() + cdef api object pyarrow_wrap_buffer(const shared_ptr[CBuffer]& buf): cdef Buffer result = Buffer.__new__(Buffer) result.init(buf) From 9c68525858a22a400a739882e29605219e962215 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 31 Mar 2022 17:34:56 +0530 Subject: [PATCH 059/131] fixing a formatting typo --- python/pyarrow/public-api.pxi | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index f8bf6709f35..c427fb9f5db 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -29,6 +29,7 @@ from pyarrow.includes.libarrow cimport (CArray, CDataType, CField, cdef api bint pyarrow_is_buffer(object buffer): return isinstance(buffer, Buffer) + cdef api shared_ptr[CBuffer] pyarrow_unwrap_buffer(object buffer): cdef Buffer buf if pyarrow_is_buffer(buffer): From 0eff94773e3c5d38b1da86a719b6d305f437ab0a Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 31 Mar 2022 17:48:27 +0530 Subject: [PATCH 060/131] removing custom exceptions --- cpp/src/arrow/python/udf.cc | 2 +- python/pyarrow/_compute.pyx | 24 +----------------------- python/pyarrow/compute.py | 2 -- 3 files changed, 2 insertions(+), 26 deletions(-) diff --git a/cpp/src/arrow/python/udf.cc b/cpp/src/arrow/python/udf.cc index e3c1f1fafbe..085d449af6c 100644 --- a/cpp/src/arrow/python/udf.cc +++ b/cpp/src/arrow/python/udf.cc @@ -100,7 +100,7 @@ Status ScalarUdfBuilder::MakeFunction(PyObject* function, UDFOptions* options) { return Status::TypeError("Expected a callable python object."); } auto doc = this->doc(); - scalar_func_ = + scalar_func_ = std::make_shared(this->name(), this->arity(), &doc); auto arity = this->arity(); // lambda function diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index e2040aea6ae..d2c3ded5792 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2459,28 +2459,6 @@ cdef CFunctionDoc _make_function_doc(func_doc): raise ValueError(f"func_doc must be a dictionary") -cdef class UDFError(Exception): - cdef dict __dict__ - - def __init__(self, message='', extra_info=b''): - super().__init__(message) - self.extra_info = tobytes(extra_info) - - cdef CStatus to_status(self): - message = tobytes("UDF error: {}".format(str(self))) - return CStatus_UnknownError(message) - - -cdef class UDFRegistrationError(UDFError): - - def __init__(self, message='', extra_info=b''): - super().__init__(message, extra_info) - - cdef CStatus to_status(self): - message = tobytes("UDF Registration error: {}".format(str(self))) - return CStatus_UnknownError(message) - - def register_function(func_name, num_args, function_doc, in_types, out_type, callback, mem_allocation="no_preallocate", null_handling="computed_no_preallocate"): @@ -2668,4 +2646,4 @@ def register_function(func_name, num_args, function_doc, in_types, st = c_sc_builder.MakeFunction(c_callback, &c_options) if not st.ok(): error_msg = st.message().decode() - raise UDFRegistrationError(message=error_msg) + raise RuntimeError(message=error_msg) diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 46dc91e2814..67df9df7bd7 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -81,8 +81,6 @@ register_function, # Expressions Expression, - # Exceptions - UDFRegistrationError, ) from collections import namedtuple From 493426dac404f7c99bb7a85bb6f2364386fae712 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 31 Mar 2022 17:52:21 +0530 Subject: [PATCH 061/131] cmake formatting --- cpp/examples/arrow/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/examples/arrow/CMakeLists.txt b/cpp/examples/arrow/CMakeLists.txt index 0ef268f2562..229373665d5 100644 --- a/cpp/examples/arrow/CMakeLists.txt +++ b/cpp/examples/arrow/CMakeLists.txt @@ -135,5 +135,5 @@ if(ARROW_PARQUET AND ARROW_DATASET) add_dependencies(join-example parquet) add_arrow_example(udf_example) - + endif() From 24c1d40d919a2d2c81fdc38be833f9d6bf4d99d0 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 1 Apr 2022 07:53:29 +0530 Subject: [PATCH 062/131] removing arity python interface --- python/pyarrow/_compute.pxd | 6 --- python/pyarrow/_compute.pyx | 81 ++++--------------------------------- python/pyarrow/compute.py | 1 - 3 files changed, 7 insertions(+), 81 deletions(-) diff --git a/python/pyarrow/_compute.pxd b/python/pyarrow/_compute.pxd index 6fa1899fb66..b43a4c84065 100644 --- a/python/pyarrow/_compute.pxd +++ b/python/pyarrow/_compute.pxd @@ -21,12 +21,6 @@ from pyarrow.lib cimport * from pyarrow.includes.common cimport * from pyarrow.includes.libarrow cimport * -cdef class Arity(_Weakrefable): - cdef: - CArity arity - - cdef void init(self, const CArity &arity) - cdef class InputType(_Weakrefable): cdef: diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index d2c3ded5792..6344874f138 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -202,15 +202,6 @@ FunctionDoc = namedtuple( "options_required")) -cdef wrap_arity(const CArity c_arity): - """ - Wrap a C++ Arity in an Arity object - """ - cdef Arity arity = Arity.__new__(Arity) - arity.init(c_arity) - return arity - - cdef wrap_input_type(const CInputType c_input_type): """ Wrap a C++ InputType in an InputType object @@ -281,64 +272,6 @@ cdef class InputType(_Weakrefable): return wrap_input_type(c_input_type) -cdef class Arity(_Weakrefable): - """ - An Arity object. - """ - - def __init__(self): - raise TypeError("Cannot use constructor to initialize Arity") - - cdef void init(self, const CArity &arity): - self.arity = arity - - @staticmethod - def nullary(): - """ - create a nullary arity object - """ - cdef CArity c_arity = CArity.Nullary() - return wrap_arity(c_arity) - - @staticmethod - def unary(): - """ - create a unary arity object - """ - cdef CArity c_arity = CArity.Unary() - return wrap_arity(c_arity) - - @staticmethod - def binary(): - """ - create a binary arity object - """ - cdef CArity c_arity = CArity.Binary() - return wrap_arity(c_arity) - - @staticmethod - def ternary(): - """ - create a ternary arity object - """ - cdef CArity c_arity = CArity.Ternary() - return wrap_arity(c_arity) - - @staticmethod - def varargs(num_args): - """ - create a varargs arity object with defined number of arguments - - Parameter - --------- - - num_args: int - number of arguments - """ - cdef CArity c_arity = CArity.VarArgs(num_args) - return wrap_arity(c_arity) - - cdef class Function(_Weakrefable): """ A compute function. @@ -2538,7 +2471,7 @@ def register_function(func_name, num_args, function_doc, in_types, >>> from pyarrow import compute as pc >>> from pyarrow.compute import register_function - >>> from pyarrow.compute import Arity, InputType + >>> from pyarrow.compute import InputType >>> >>> func_doc = {} >>> func_doc["summary"] = "simple udf" @@ -2550,7 +2483,7 @@ def register_function(func_name, num_args, function_doc, in_types, ... >>> >>> func_name = "py_add_func" - >>> arity = Arity.unary() + >>> arity = 1 >>> in_types = [InputType.array(pa.int64())] >>> out_type = pa.int64() >>> register_function(func_name, arity, func_doc, @@ -2602,15 +2535,15 @@ def register_function(func_name, num_args, function_doc, in_types, if num_args and isinstance(num_args, int): assert num_args > 0 if num_args == 0: - c_arity = ( Arity.nullary()).arity + c_arity = CArity.Nullary() elif num_args == 1: - c_arity = ( Arity.unary()).arity + c_arity = CArity.Unary() elif num_args == 2: - c_arity = ( Arity.binary()).arity + c_arity = CArity.Binary() elif num_args == 3: - c_arity = ( Arity.ternary()).arity + c_arity = CArity.Ternary() elif num_args > 3: - c_arity = ( Arity.varargs(num_args)).arity + c_arity = CArity.VarArgs(num_args) else: raise ValueError("arity must be an instance of Arity") diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 67df9df7bd7..340b872e275 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -16,7 +16,6 @@ # under the License. from pyarrow._compute import ( # noqa - Arity, Function, FunctionOptions, FunctionRegistry, From 20ebc30252e8cc0257aab4b579ffba1d4910be21 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 1 Apr 2022 15:07:08 +0530 Subject: [PATCH 063/131] refactor the udf builder API and add options --- cpp/src/arrow/python/udf.cc | 18 ++++++------- cpp/src/arrow/python/udf.h | 40 +++++++++++++++++----------- python/pyarrow/_compute.pyx | 14 +++++----- python/pyarrow/includes/libarrow.pxd | 20 +++++++++----- 4 files changed, 54 insertions(+), 38 deletions(-) diff --git a/cpp/src/arrow/python/udf.cc b/cpp/src/arrow/python/udf.cc index 085d449af6c..a682e6bdd20 100644 --- a/cpp/src/arrow/python/udf.cc +++ b/cpp/src/arrow/python/udf.cc @@ -89,7 +89,7 @@ Status VerifyArityAndInput(compute::Arity arity, const compute::ExecBatch& batch return Status::OK(); } -Status ScalarUdfBuilder::MakeFunction(PyObject* function, UDFOptions* options) { +Status ScalarUdfBuilder::MakeFunction(PyObject* function, ScalarUdfOptions* options) { // creating a copy of objects for the lambda function Py_INCREF(function); function_.reset(function); @@ -99,10 +99,10 @@ Status ScalarUdfBuilder::MakeFunction(PyObject* function, UDFOptions* options) { if (!PyCallable_Check(function_.obj())) { return Status::TypeError("Expected a callable python object."); } - auto doc = this->doc(); - scalar_func_ = - std::make_shared(this->name(), this->arity(), &doc); - auto arity = this->arity(); + auto doc = options->doc(); + auto arity = options->arity(); + scalar_func_ = std::make_shared(options->name(), arity, &doc); + // lambda function auto call_back = [&, arity](compute::KernelContext* ctx, const compute::ExecBatch& batch, Datum* out) -> Status { @@ -119,11 +119,11 @@ Status ScalarUdfBuilder::MakeFunction(PyObject* function, UDFOptions* options) { }; // lambda function compute::ScalarKernel kernel( - compute::KernelSignature::Make(this->input_types(), this->output_type(), - this->arity().is_varargs), + compute::KernelSignature::Make(options->input_types(), options->output_type(), + arity.is_varargs), call_back); - kernel.mem_allocation = this->mem_allocation(); - kernel.null_handling = this->null_handling(); + kernel.mem_allocation = options->mem_allocation(); + kernel.null_handling = options->null_handling(); RETURN_NOT_OK(scalar_func_->AddKernel(std::move(kernel))); auto registry = compute::GetFunctionRegistry(); RETURN_NOT_OK(registry->AddFunction(std::move(scalar_func_))); diff --git a/cpp/src/arrow/python/udf.h b/cpp/src/arrow/python/udf.h index 09121f1f4f3..44e5330a5b9 100644 --- a/cpp/src/arrow/python/udf.h +++ b/cpp/src/arrow/python/udf.h @@ -40,18 +40,15 @@ namespace arrow { namespace py { // Exposing the UDFOptions: https://issues.apache.org/jira/browse/ARROW-16041 -struct UDFOptions {}; - -class ARROW_PYTHON_EXPORT UdfBuilder { +class ARROW_PYTHON_EXPORT UdfOptions { public: - UdfBuilder(const std::string func_name, const compute::Function::Kind kind, - const compute::Arity arity, const compute::FunctionDoc func_doc, + UdfOptions(const compute::Function::Kind kind, const compute::Arity arity, + const compute::FunctionDoc func_doc, const std::vector in_types, const compute::OutputType out_type, const compute::MemAllocation::type mem_allocation, const compute::NullHandling::type null_handling) - : func_name_(func_name), - kind_(kind), + : kind_(kind), arity_(arity), func_doc_(func_doc), in_types_(in_types), @@ -59,8 +56,6 @@ class ARROW_PYTHON_EXPORT UdfBuilder { mem_allocation_(mem_allocation), null_handling_(null_handling) {} - const std::string& name() const { return func_name_; } - compute::Function::Kind kind() { return kind_; } const compute::Arity& arity() const { return arity_; } @@ -76,7 +71,6 @@ class ARROW_PYTHON_EXPORT UdfBuilder { compute::NullHandling::type null_handling() { return null_handling_; } private: - std::string func_name_; compute::Function::Kind kind_; compute::Arity arity_; const compute::FunctionDoc func_doc_; @@ -86,18 +80,34 @@ class ARROW_PYTHON_EXPORT UdfBuilder { compute::NullHandling::type null_handling_; }; -class ARROW_PYTHON_EXPORT ScalarUdfBuilder : public UdfBuilder { +class ARROW_PYTHON_EXPORT ScalarUdfOptions : public UdfOptions { public: - ScalarUdfBuilder(const std::string func_name, const compute::Arity arity, + ScalarUdfOptions(const std::string func_name, const compute::Arity arity, const compute::FunctionDoc func_doc, const std::vector in_types, const compute::OutputType out_type, const compute::MemAllocation::type mem_allocation, const compute::NullHandling::type null_handling) - : UdfBuilder(func_name, compute::Function::SCALAR, arity, func_doc, in_types, - out_type, mem_allocation, null_handling) {} + : UdfOptions(compute::Function::SCALAR, arity, func_doc, in_types, out_type, + mem_allocation, null_handling), + func_name_(func_name) {} + + const std::string& name() const { return func_name_; } + + private: + std::string func_name_; +}; + +class ARROW_PYTHON_EXPORT UdfBuilder { + public: + UdfBuilder() {} +}; + +class ARROW_PYTHON_EXPORT ScalarUdfBuilder : public UdfBuilder { + public: + ScalarUdfBuilder() : UdfBuilder() {} - Status MakeFunction(PyObject* function, UDFOptions* options = NULLPTR); + Status MakeFunction(PyObject* function, ScalarUdfOptions* options = NULLPTR); private: OwnedRefNoGIL function_; diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 6344874f138..8110e09836b 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2512,7 +2512,7 @@ def register_function(func_name, num_args, function_doc, in_types, MemAllocation c_mem_allocation NullHandling c_null_handling CStatus st - CUDFOptions c_options + CScalarUdfOptions* c_options object obj _mem_allocation_map = { @@ -2528,7 +2528,7 @@ def register_function(func_name, num_args, function_doc, in_types, } if func_name and isinstance(func_name, str): - c_func_name = func_name.encode() + c_func_name = tobytes(func_name) else: raise ValueError("func_name should be str") @@ -2572,11 +2572,11 @@ def register_function(func_name, num_args, function_doc, in_types, # Note: The VectorUDF, TableUDF and AggregatorUDFs will be defined # when they are implemented. Only ScalarUDFBuilder is supported at the # moment. - c_sc_builder = new CScalarUdfBuilder(c_func_name, c_arity, c_func_doc, - c_in_types, deref(c_out_type), - c_mem_allocation, c_null_handling) - - st = c_sc_builder.MakeFunction(c_callback, &c_options) + c_options = new CScalarUdfOptions(c_func_name, c_arity, c_func_doc, + c_in_types, deref(c_out_type), + c_mem_allocation, c_null_handling) + c_sc_builder = new CScalarUdfBuilder() + st = c_sc_builder.MakeFunction(c_callback, c_options) if not st.ok(): error_msg = st.message().decode() raise RuntimeError(message=error_msg) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 9a9f7b49184..0babe342026 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2722,14 +2722,20 @@ cdef extern from "arrow/compute/kernel.h" namespace "arrow::compute" nogil: NullHandling_OUTPUT_NOT_NULL" arrow::compute::NullHandling::OUTPUT_NOT_NULL" cdef extern from "arrow/python/udf.h" namespace "arrow::py" nogil: - cdef cppclass CUDFOptions" arrow::py::UDFOptions": - pass - cdef cppclass CUdfBuilder" arrow::py::UdfBuilder": - CUdfBuilder(c_string func_name, FunctionKind kind, CArity arity, CFunctionDoc func_doc, + cdef cppclass CUdfOptions" arrow::py::UdfOptions": + CUdfOptions(FunctionKind kind, CArity arity, CFunctionDoc func_doc, vector[CInputType] in_types, COutputType out_type, MemAllocation mem_allocation, NullHandling null_handling) - cdef cppclass CScalarUdfBuilder" arrow::py::ScalarUdfBuilder"(CUdfBuilder): - CScalarUdfBuilder(c_string func_name, CArity arity, CFunctionDoc func_doc, + + cdef cppclass CScalarUdfOptions" arrow::py::ScalarUdfOptions"(CUdfOptions): + + CScalarUdfOptions(c_string func_name, CArity arity, CFunctionDoc func_doc, vector[CInputType] in_types, COutputType out_type, MemAllocation mem_allocation, NullHandling null_handling) - CStatus MakeFunction(PyObject* function, CUDFOptions* options) + + cdef cppclass CUdfBuilder" arrow::py::UdfBuilder": + CUdfBuilder() + + cdef cppclass CScalarUdfBuilder" arrow::py::ScalarUdfBuilder"(CUdfBuilder): + CScalarUdfBuilder() + CStatus MakeFunction(PyObject* function, CScalarUdfOptions* options) From 6d0215f334aa13be50a8e01286c7be6f585ea3c4 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Sat, 2 Apr 2022 11:26:43 +0530 Subject: [PATCH 064/131] rebase --- .../arrow/compute_register_example.cc | 2 +- cpp/examples/arrow/udf_example.cc | 2 +- cpp/src/arrow/compute/cast.cc | 4 +- cpp/src/arrow/compute/exec_test.cc | 12 +- cpp/src/arrow/compute/function.cc | 8 +- cpp/src/arrow/compute/function.h | 20 +-- cpp/src/arrow/compute/function_test.cc | 18 +-- .../arrow/compute/kernels/aggregate_basic.cc | 22 +-- .../arrow/compute/kernels/aggregate_mode.cc | 2 +- .../compute/kernels/aggregate_quantile.cc | 2 +- .../compute/kernels/aggregate_tdigest.cc | 4 +- .../compute/kernels/aggregate_var_std.cc | 4 +- .../arrow/compute/kernels/hash_aggregate.cc | 34 ++--- .../compute/kernels/scalar_arithmetic.cc | 130 +++++++++--------- .../arrow/compute/kernels/scalar_boolean.cc | 18 +-- .../arrow/compute/kernels/scalar_compare.cc | 22 +-- .../arrow/compute/kernels/scalar_if_else.cc | 8 +- .../arrow/compute/kernels/scalar_nested.cc | 10 +- .../arrow/compute/kernels/scalar_random.cc | 2 +- .../compute/kernels/scalar_set_lookup.cc | 8 +- .../compute/kernels/scalar_string_ascii.cc | 90 ++++++------ .../compute/kernels/scalar_string_internal.h | 6 +- .../compute/kernels/scalar_string_utf8.cc | 62 ++++----- .../compute/kernels/scalar_temporal_binary.cc | 28 ++-- .../compute/kernels/scalar_temporal_unary.cc | 60 ++++---- .../arrow/compute/kernels/scalar_validity.cc | 28 ++-- .../compute/kernels/vector_array_sort.cc | 4 +- cpp/src/arrow/compute/kernels/vector_hash.cc | 6 +- .../arrow/compute/kernels/vector_nested.cc | 4 +- .../arrow/compute/kernels/vector_replace.cc | 6 +- .../arrow/compute/kernels/vector_selection.cc | 16 +-- cpp/src/arrow/compute/kernels/vector_sort.cc | 4 +- cpp/src/arrow/compute/registry_test.cc | 6 +- cpp/src/arrow/python/udf.cc | 2 +- 34 files changed, 324 insertions(+), 330 deletions(-) diff --git a/cpp/examples/arrow/compute_register_example.cc b/cpp/examples/arrow/compute_register_example.cc index 0f6165a0646..f089b910ec4 100644 --- a/cpp/examples/arrow/compute_register_example.cc +++ b/cpp/examples/arrow/compute_register_example.cc @@ -126,7 +126,7 @@ const cp::FunctionDoc func_doc{ int main(int argc, char** argv) { const std::string name = "compute_register_example"; - auto func = std::make_shared(name, cp::Arity::Unary(), &func_doc); + auto func = std::make_shared(name, cp::Arity::Unary(), func_doc); cp::ScalarKernel kernel({cp::InputType::Array(arrow::int64())}, arrow::int64(), ExampleFunctionImpl); kernel.mem_allocation = cp::MemAllocation::NO_PREALLOCATE; diff --git a/cpp/examples/arrow/udf_example.cc b/cpp/examples/arrow/udf_example.cc index 81a1a2b36ab..f525bf859ff 100644 --- a/cpp/examples/arrow/udf_example.cc +++ b/cpp/examples/arrow/udf_example.cc @@ -74,7 +74,7 @@ arrow::Status SampleFunction(cp::KernelContext* ctx, const cp::ExecBatch& batch, arrow::Status Execute() { const std::string name = "add_three"; - auto func = std::make_shared(name, cp::Arity::Ternary(), &func_doc); + auto func = std::make_shared(name, cp::Arity::Ternary(), func_doc); cp::ScalarKernel kernel( {cp::InputType::Array(arrow::int64()), cp::InputType::Array(arrow::int64()), cp::InputType::Array(arrow::int64())}, diff --git a/cpp/src/arrow/compute/cast.cc b/cpp/src/arrow/compute/cast.cc index 4de68ba8d90..bd49041b4f3 100644 --- a/cpp/src/arrow/compute/cast.cc +++ b/cpp/src/arrow/compute/cast.cc @@ -95,7 +95,7 @@ const FunctionDoc cast_doc{"Cast values to another data type", // to the standard SQL CAST(expr AS target_type) class CastMetaFunction : public MetaFunction { public: - CastMetaFunction() : MetaFunction("cast", Arity::Unary(), &cast_doc) {} + CastMetaFunction() : MetaFunction("cast", Arity::Unary(), cast_doc) {} Result ValidateOptions(const FunctionOptions* options) const { auto cast_options = static_cast(options); @@ -153,7 +153,7 @@ CastOptions::CastOptions(bool safe) constexpr char CastOptions::kTypeName[]; CastFunction::CastFunction(std::string name, Type::type out_type_id) - : ScalarFunction(std::move(name), Arity::Unary(), /*doc=*/nullptr), + : ScalarFunction(std::move(name), Arity::Unary(), FunctionDoc::Empty()), out_type_id_(out_type_id) {} Status CastFunction::AddKernel(Type::type in_type_id, ScalarKernel kernel) { diff --git a/cpp/src/arrow/compute/exec_test.cc b/cpp/src/arrow/compute/exec_test.cc index 198cb84ff5e..7bf5a0ead1c 100644 --- a/cpp/src/arrow/compute/exec_test.cc +++ b/cpp/src/arrow/compute/exec_test.cc @@ -682,7 +682,7 @@ class TestCallScalarFunction : public TestComputeInternals { // This function simply copies memory from the input argument into the // (preallocated) output auto func = - std::make_shared("test_copy", Arity::Unary(), /*doc=*/nullptr); + std::make_shared("test_copy", Arity::Unary(), /*doc=*/FunctionDoc::Empty()); // Add a few kernels. Our implementation only accepts arrays ASSERT_OK(func->AddKernel({InputType::Array(uint8())}, uint8(), ExecCopy)); @@ -692,7 +692,7 @@ class TestCallScalarFunction : public TestComputeInternals { // A version which doesn't want the executor to call PropagateNulls auto func2 = std::make_shared("test_copy_computed_bitmap", - Arity::Unary(), /*doc=*/nullptr); + Arity::Unary(), /*doc=*/FunctionDoc::Empty()); ScalarKernel kernel({InputType::Array(uint8())}, uint8(), ExecComputedBitmap); kernel.null_handling = NullHandling::COMPUTED_PREALLOCATE; ASSERT_OK(func2->AddKernel(kernel)); @@ -705,9 +705,9 @@ class TestCallScalarFunction : public TestComputeInternals { // A function that allocates its own output memory. We have cases for both // non-preallocated data and non-preallocated validity bitmap auto f1 = std::make_shared("test_nopre_data", Arity::Unary(), - /*doc=*/nullptr); + /*doc=*/FunctionDoc::Empty()); auto f2 = std::make_shared("test_nopre_validity_or_data", - Arity::Unary(), /*doc=*/nullptr); + Arity::Unary(), /*doc=*/FunctionDoc::Empty()); ScalarKernel kernel({InputType::Array(uint8())}, uint8(), ExecNoPreallocatedData); kernel.mem_allocation = MemAllocation::NO_PREALLOCATE; @@ -727,7 +727,7 @@ class TestCallScalarFunction : public TestComputeInternals { // This function's behavior depends on a static parameter that is made // available to the kernel's execution function through its Options object auto func = std::make_shared("test_stateful", Arity::Unary(), - /*doc=*/nullptr); + /*doc=*/FunctionDoc::Empty()); ScalarKernel kernel({InputType::Array(int32())}, int32(), ExecStateful, InitStateful); ASSERT_OK(func->AddKernel(kernel)); @@ -738,7 +738,7 @@ class TestCallScalarFunction : public TestComputeInternals { auto registry = GetFunctionRegistry(); auto func = std::make_shared("test_scalar_add_int32", Arity::Binary(), - /*doc=*/nullptr); + /*doc=*/FunctionDoc::Empty()); ASSERT_OK(func->AddKernel({InputType::Scalar(int32()), InputType::Scalar(int32())}, int32(), ExecAddInt32)); ASSERT_OK(registry->AddFunction(func)); diff --git a/cpp/src/arrow/compute/function.cc b/cpp/src/arrow/compute/function.cc index 1a7f36862dd..f1b3fcbccf4 100644 --- a/cpp/src/arrow/compute/function.cc +++ b/cpp/src/arrow/compute/function.cc @@ -290,9 +290,9 @@ Status ValidateFunctionDescription(const std::string& s) { } // namespace Status Function::Validate() const { - if (!doc_->summary.empty()) { + if (!doc_.summary.empty()) { // Documentation given, check its contents - int arg_count = static_cast(doc_->arg_names.size()); + int arg_count = static_cast(doc_.arg_names.size()); // Some varargs functions allow 0 vararg, others expect at least 1, // hence the two possible values below. bool arg_count_match = (arg_count == arity_.num_args) || @@ -302,9 +302,9 @@ Status Function::Validate() const { "In function '", name_, "': ", "number of argument names for function documentation != function arity"); } - Status st = ValidateFunctionSummary(doc_->summary); + Status st = ValidateFunctionSummary(doc_.summary); if (st.ok()) { - st &= ValidateFunctionDescription(doc_->description); + st &= ValidateFunctionDescription(doc_.description); } if (!st.ok()) { return st.WithMessage("In function '", name_, "': ", st.message()); diff --git a/cpp/src/arrow/compute/function.h b/cpp/src/arrow/compute/function.h index 1273ab09c4f..a841e18d36c 100644 --- a/cpp/src/arrow/compute/function.h +++ b/cpp/src/arrow/compute/function.h @@ -205,7 +205,7 @@ class ARROW_EXPORT Function { const Arity& arity() const { return arity_; } /// \brief Return the function documentation - const FunctionDoc& doc() const { return *doc_; } + const FunctionDoc doc() const { return doc_; } /// \brief Returns the number of registered kernels for this function. virtual int num_kernels() const = 0; @@ -245,11 +245,11 @@ class ARROW_EXPORT Function { protected: Function(std::string name, Function::Kind kind, const Arity& arity, - const FunctionDoc* doc, const FunctionOptions* default_options) + const FunctionDoc doc, const FunctionOptions* default_options) : name_(std::move(name)), kind_(kind), arity_(arity), - doc_(doc ? doc : &FunctionDoc::Empty()), + doc_(doc), default_options_(default_options) {} Status CheckArity(const std::vector&) const; @@ -258,7 +258,7 @@ class ARROW_EXPORT Function { std::string name_; Function::Kind kind_; Arity arity_; - const FunctionDoc* doc_; + const FunctionDoc doc_; const FunctionOptions* default_options_ = NULLPTR; }; @@ -280,7 +280,7 @@ class FunctionImpl : public Function { protected: FunctionImpl(std::string name, Function::Kind kind, const Arity& arity, - const FunctionDoc* doc, const FunctionOptions* default_options) + const FunctionDoc doc, const FunctionOptions* default_options) : Function(std::move(name), kind, arity, doc, default_options) {} std::vector kernels_; @@ -305,7 +305,7 @@ class ARROW_EXPORT ScalarFunction : public detail::FunctionImpl { public: using KernelType = ScalarKernel; - ScalarFunction(std::string name, const Arity& arity, const FunctionDoc* doc, + ScalarFunction(std::string name, const Arity& arity, const FunctionDoc doc, const FunctionOptions* default_options = NULLPTR) : detail::FunctionImpl(std::move(name), Function::SCALAR, arity, doc, default_options) {} @@ -329,7 +329,7 @@ class ARROW_EXPORT VectorFunction : public detail::FunctionImpl { public: using KernelType = VectorKernel; - VectorFunction(std::string name, const Arity& arity, const FunctionDoc* doc, + VectorFunction(std::string name, const Arity& arity, const FunctionDoc doc, const FunctionOptions* default_options = NULLPTR) : detail::FunctionImpl(std::move(name), Function::VECTOR, arity, doc, default_options) {} @@ -350,7 +350,7 @@ class ARROW_EXPORT ScalarAggregateFunction public: using KernelType = ScalarAggregateKernel; - ScalarAggregateFunction(std::string name, const Arity& arity, const FunctionDoc* doc, + ScalarAggregateFunction(std::string name, const Arity& arity, const FunctionDoc doc, const FunctionOptions* default_options = NULLPTR) : detail::FunctionImpl( std::move(name), Function::SCALAR_AGGREGATE, arity, doc, default_options) {} @@ -365,7 +365,7 @@ class ARROW_EXPORT HashAggregateFunction public: using KernelType = HashAggregateKernel; - HashAggregateFunction(std::string name, const Arity& arity, const FunctionDoc* doc, + HashAggregateFunction(std::string name, const Arity& arity, const FunctionDoc doc, const FunctionOptions* default_options = NULLPTR) : detail::FunctionImpl( std::move(name), Function::HASH_AGGREGATE, arity, doc, default_options) {} @@ -392,7 +392,7 @@ class ARROW_EXPORT MetaFunction : public Function { const FunctionOptions* options, ExecContext* ctx) const = 0; - MetaFunction(std::string name, const Arity& arity, const FunctionDoc* doc, + MetaFunction(std::string name, const Arity& arity, const FunctionDoc doc, const FunctionOptions* default_options = NULLPTR) : Function(std::move(name), Function::META, arity, doc, default_options) {} }; diff --git a/cpp/src/arrow/compute/function_test.cc b/cpp/src/arrow/compute/function_test.cc index 13de2a29ab8..5b7a90dbf29 100644 --- a/cpp/src/arrow/compute/function_test.cc +++ b/cpp/src/arrow/compute/function_test.cc @@ -179,8 +179,8 @@ TEST(Arity, Basics) { } TEST(ScalarFunction, Basics) { - ScalarFunction func("scalar_test", Arity::Binary(), /*doc=*/nullptr); - ScalarFunction varargs_func("varargs_test", Arity::VarArgs(1), /*doc=*/nullptr); + ScalarFunction func("scalar_test", Arity::Binary(), /*doc=*/FunctionDoc::Empty()); + ScalarFunction varargs_func("varargs_test", Arity::VarArgs(1), /*doc=*/FunctionDoc::Empty()); ASSERT_EQ("scalar_test", func.name()); ASSERT_EQ(2, func.arity().num_args); @@ -194,8 +194,8 @@ TEST(ScalarFunction, Basics) { } TEST(VectorFunction, Basics) { - VectorFunction func("vector_test", Arity::Binary(), /*doc=*/nullptr); - VectorFunction varargs_func("varargs_test", Arity::VarArgs(1), /*doc=*/nullptr); + VectorFunction func("vector_test", Arity::Binary(), /*doc=*/FunctionDoc::Empty()); + VectorFunction varargs_func("varargs_test", Arity::VarArgs(1), /*doc=*/FunctionDoc::Empty()); ASSERT_EQ("vector_test", func.name()); ASSERT_EQ(2, func.arity().num_args); @@ -260,15 +260,15 @@ void CheckAddDispatch(FunctionType* func) { } TEST(ScalarVectorFunction, DispatchExact) { - ScalarFunction func1("scalar_test", Arity::Binary(), /*doc=*/nullptr); - VectorFunction func2("vector_test", Arity::Binary(), /*doc=*/nullptr); + ScalarFunction func1("scalar_test", Arity::Binary(), /*doc=*/FunctionDoc::Empty()); + VectorFunction func2("vector_test", Arity::Binary(), /*doc=*/FunctionDoc::Empty()); CheckAddDispatch(&func1); CheckAddDispatch(&func2); } TEST(ArrayFunction, VarArgs) { - ScalarFunction va_func("va_test", Arity::VarArgs(1), /*doc=*/nullptr); + ScalarFunction va_func("va_test", Arity::VarArgs(1), /*doc=*/FunctionDoc::Empty()); std::vector va_args = {int8()}; @@ -294,7 +294,7 @@ TEST(ArrayFunction, VarArgs) { } TEST(ScalarAggregateFunction, Basics) { - ScalarAggregateFunction func("agg_test", Arity::Unary(), /*doc=*/nullptr); + ScalarAggregateFunction func("agg_test", Arity::Unary(), /*doc=*/FunctionDoc::Empty()); ASSERT_EQ("agg_test", func.name()); ASSERT_EQ(1, func.arity().num_args); @@ -313,7 +313,7 @@ Status NoopMerge(KernelContext*, const KernelState&, KernelState*) { Status NoopFinalize(KernelContext*, Datum*) { return Status::OK(); } TEST(ScalarAggregateFunction, DispatchExact) { - ScalarAggregateFunction func("agg_test", Arity::Unary(), /*doc=*/nullptr); + ScalarAggregateFunction func("agg_test", Arity::Unary(), FunctionDoc::Empty()); std::vector in_args = {ValueDescr::Array(int8())}; ScalarAggregateKernel kernel(std::move(in_args), int64(), NoopInit, NoopConsume, diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index c9e2d85a26b..25e838f2999 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -918,7 +918,7 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { static auto default_count_options = CountOptions::Defaults(); auto func = std::make_shared( - "count", Arity::Unary(), &count_doc, &default_count_options); + "count", Arity::Unary(), count_doc, &default_count_options); // Takes any input, outputs int64 scalar InputType any_input; @@ -927,12 +927,12 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::move(func))); func = std::make_shared( - "count_distinct", Arity::Unary(), &count_distinct_doc, &default_count_options); + "count_distinct", Arity::Unary(), count_distinct_doc, &default_count_options); // Takes any input, outputs int64 scalar AddCountDistinctKernels(func.get()); DCHECK_OK(registry->AddFunction(std::move(func))); - func = std::make_shared("sum", Arity::Unary(), &sum_doc, + func = std::make_shared("sum", Arity::Unary(), sum_doc, &default_scalar_aggregate_options); AddArrayScalarAggKernels(SumInit, {boolean()}, uint64(), func.get()); AddAggKernel( @@ -961,7 +961,7 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { #endif DCHECK_OK(registry->AddFunction(std::move(func))); - func = std::make_shared("mean", Arity::Unary(), &mean_doc, + func = std::make_shared("mean", Arity::Unary(), mean_doc, &default_scalar_aggregate_options); AddArrayScalarAggKernels(MeanInit, {boolean()}, float64(), func.get()); AddArrayScalarAggKernels(MeanInit, NumericTypes(), float64(), func.get()); @@ -986,7 +986,7 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::move(func))); func = std::make_shared( - "min_max", Arity::Unary(), &min_max_doc, &default_scalar_aggregate_options); + "min_max", Arity::Unary(), min_max_doc, &default_scalar_aggregate_options); AddMinMaxKernels(MinMaxInit, {null(), boolean()}, func.get()); AddMinMaxKernels(MinMaxInit, NumericTypes(), func.get()); AddMinMaxKernels(MinMaxInit, TemporalTypes(), func.get()); @@ -1011,18 +1011,18 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::move(func))); // Add min/max as convenience functions - func = std::make_shared("min", Arity::Unary(), &min_or_max_doc, + func = std::make_shared("min", Arity::Unary(), min_or_max_doc, &default_scalar_aggregate_options); AddMinOrMaxAggKernel(func.get(), min_max_func); DCHECK_OK(registry->AddFunction(std::move(func))); - func = std::make_shared("max", Arity::Unary(), &min_or_max_doc, + func = std::make_shared("max", Arity::Unary(), min_or_max_doc, &default_scalar_aggregate_options); AddMinOrMaxAggKernel(func.get(), min_max_func); DCHECK_OK(registry->AddFunction(std::move(func))); func = std::make_shared( - "product", Arity::Unary(), &product_doc, &default_scalar_aggregate_options); + "product", Arity::Unary(), product_doc, &default_scalar_aggregate_options); AddArrayScalarAggKernels(ProductInit::Init, {boolean()}, uint64(), func.get()); AddArrayScalarAggKernels(ProductInit::Init, SignedIntTypes(), int64(), func.get()); AddArrayScalarAggKernels(ProductInit::Init, UnsignedIntTypes(), uint64(), func.get()); @@ -1038,19 +1038,19 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::move(func))); // any - func = std::make_shared("any", Arity::Unary(), &any_doc, + func = std::make_shared("any", Arity::Unary(), any_doc, &default_scalar_aggregate_options); AddArrayScalarAggKernels(AnyInit, {boolean()}, boolean(), func.get()); DCHECK_OK(registry->AddFunction(std::move(func))); // all - func = std::make_shared("all", Arity::Unary(), &all_doc, + func = std::make_shared("all", Arity::Unary(), all_doc, &default_scalar_aggregate_options); AddArrayScalarAggKernels(AllInit, {boolean()}, boolean(), func.get()); DCHECK_OK(registry->AddFunction(std::move(func))); // index - func = std::make_shared("index", Arity::Unary(), &index_doc); + func = std::make_shared("index", Arity::Unary(), index_doc); AddBasicAggKernels(IndexInit::Init, BaseBinaryTypes(), int64(), func.get()); AddBasicAggKernels(IndexInit::Init, PrimitiveTypes(), int64(), func.get()); AddBasicAggKernels(IndexInit::Init, TemporalTypes(), int64(), func.get()); diff --git a/cpp/src/arrow/compute/kernels/aggregate_mode.cc b/cpp/src/arrow/compute/kernels/aggregate_mode.cc index 287c2c5d368..7d3440cbef3 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_mode.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_mode.cc @@ -428,7 +428,7 @@ const FunctionDoc mode_doc{ void RegisterScalarAggregateMode(FunctionRegistry* registry) { static auto default_options = ModeOptions::Defaults(); - auto func = std::make_shared("mode", Arity::Unary(), &mode_doc, + auto func = std::make_shared("mode", Arity::Unary(), mode_doc, &default_options); DCHECK_OK(func->AddKernel( NewModeKernel(boolean(), ModeExecutor::Exec))); diff --git a/cpp/src/arrow/compute/kernels/aggregate_quantile.cc b/cpp/src/arrow/compute/kernels/aggregate_quantile.cc index 1ca030130b0..810fb539913 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_quantile.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_quantile.cc @@ -531,7 +531,7 @@ const FunctionDoc quantile_doc{ void RegisterScalarAggregateQuantile(FunctionRegistry* registry) { static QuantileOptions default_options; - auto func = std::make_shared("quantile", Arity::Unary(), &quantile_doc, + auto func = std::make_shared("quantile", Arity::Unary(), quantile_doc, &default_options); AddQuantileKernels(func.get()); DCHECK_OK(registry->AddFunction(std::move(func))); diff --git a/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc b/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc index 7c86267d940..037bba42f16 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc @@ -196,7 +196,7 @@ const FunctionDoc approximate_median_doc{ std::shared_ptr AddTDigestAggKernels() { static auto default_tdigest_options = TDigestOptions::Defaults(); auto func = std::make_shared( - "tdigest", Arity::Unary(), &tdigest_doc, &default_tdigest_options); + "tdigest", Arity::Unary(), tdigest_doc, &default_tdigest_options); AddTDigestKernels(TDigestInit, NumericTypes(), func.get()); AddTDigestKernels(TDigestInit, {decimal128(1, 1), decimal256(1, 1)}, func.get()); return func; @@ -207,7 +207,7 @@ std::shared_ptr AddApproximateMedianAggKernels( static ScalarAggregateOptions default_scalar_aggregate_options; auto median = std::make_shared( - "approximate_median", Arity::Unary(), &approximate_median_doc, + "approximate_median", Arity::Unary(), approximate_median_doc, &default_scalar_aggregate_options); auto sig = diff --git a/cpp/src/arrow/compute/kernels/aggregate_var_std.cc b/cpp/src/arrow/compute/kernels/aggregate_var_std.cc index feb98718aee..5404c089a50 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_var_std.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_var_std.cc @@ -290,7 +290,7 @@ const FunctionDoc variance_doc{ std::shared_ptr AddStddevAggKernels() { static auto default_std_options = VarianceOptions::Defaults(); auto func = std::make_shared( - "stddev", Arity::Unary(), &stddev_doc, &default_std_options); + "stddev", Arity::Unary(), stddev_doc, &default_std_options); AddVarStdKernels(StddevInit, NumericTypes(), func.get()); AddVarStdKernels(StddevInit, {decimal128(1, 1), decimal256(1, 1)}, func.get()); return func; @@ -299,7 +299,7 @@ std::shared_ptr AddStddevAggKernels() { std::shared_ptr AddVarianceAggKernels() { static auto default_var_options = VarianceOptions::Defaults(); auto func = std::make_shared( - "variance", Arity::Unary(), &variance_doc, &default_var_options); + "variance", Arity::Unary(), variance_doc, &default_var_options); AddVarStdKernels(VarianceInit, NumericTypes(), func.get()); AddVarStdKernels(VarianceInit, {decimal128(1, 1), decimal256(1, 1)}, func.get()); return func; diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc index db34ee6c596..d9ffcda5962 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc @@ -3554,7 +3554,7 @@ void RegisterHashAggregateBasic(FunctionRegistry* registry) { { auto func = std::make_shared( - "hash_count", Arity::Binary(), &hash_count_doc, &default_count_options); + "hash_count", Arity::Binary(), hash_count_doc, &default_count_options); DCHECK_OK(func->AddKernel( MakeKernel(ValueDescr::ARRAY, HashAggregateInit))); @@ -3563,7 +3563,7 @@ void RegisterHashAggregateBasic(FunctionRegistry* registry) { { auto func = std::make_shared( - "hash_sum", Arity::Binary(), &hash_sum_doc, &default_scalar_aggregate_options); + "hash_sum", Arity::Binary(), hash_sum_doc, &default_scalar_aggregate_options); DCHECK_OK(AddHashAggKernels({boolean()}, GroupedSumFactory::Make, func.get())); DCHECK_OK(AddHashAggKernels(SignedIntTypes(), GroupedSumFactory::Make, func.get())); DCHECK_OK(AddHashAggKernels(UnsignedIntTypes(), GroupedSumFactory::Make, func.get())); @@ -3578,7 +3578,7 @@ void RegisterHashAggregateBasic(FunctionRegistry* registry) { { auto func = std::make_shared( - "hash_product", Arity::Binary(), &hash_product_doc, + "hash_product", Arity::Binary(), hash_product_doc, &default_scalar_aggregate_options); DCHECK_OK(AddHashAggKernels({boolean()}, GroupedProductFactory::Make, func.get())); DCHECK_OK( @@ -3596,7 +3596,7 @@ void RegisterHashAggregateBasic(FunctionRegistry* registry) { { auto func = std::make_shared( - "hash_mean", Arity::Binary(), &hash_mean_doc, &default_scalar_aggregate_options); + "hash_mean", Arity::Binary(), hash_mean_doc, &default_scalar_aggregate_options); DCHECK_OK(AddHashAggKernels({boolean()}, GroupedMeanFactory::Make, func.get())); DCHECK_OK(AddHashAggKernels(SignedIntTypes(), GroupedMeanFactory::Make, func.get())); DCHECK_OK( @@ -3612,7 +3612,7 @@ void RegisterHashAggregateBasic(FunctionRegistry* registry) { { auto func = std::make_shared( - "hash_stddev", Arity::Binary(), &hash_stddev_doc, &default_variance_options); + "hash_stddev", Arity::Binary(), hash_stddev_doc, &default_variance_options); DCHECK_OK(AddHashAggKernels(SignedIntTypes(), GroupedVarStdFactory::Make, func.get())); DCHECK_OK(AddHashAggKernels(UnsignedIntTypes(), @@ -3626,7 +3626,7 @@ void RegisterHashAggregateBasic(FunctionRegistry* registry) { { auto func = std::make_shared( - "hash_variance", Arity::Binary(), &hash_variance_doc, &default_variance_options); + "hash_variance", Arity::Binary(), hash_variance_doc, &default_variance_options); DCHECK_OK(AddHashAggKernels(SignedIntTypes(), GroupedVarStdFactory::Make, func.get())); DCHECK_OK(AddHashAggKernels(UnsignedIntTypes(), @@ -3641,7 +3641,7 @@ void RegisterHashAggregateBasic(FunctionRegistry* registry) { HashAggregateFunction* tdigest_func = nullptr; { auto func = std::make_shared( - "hash_tdigest", Arity::Binary(), &hash_tdigest_doc, &default_tdigest_options); + "hash_tdigest", Arity::Binary(), hash_tdigest_doc, &default_tdigest_options); DCHECK_OK( AddHashAggKernels(SignedIntTypes(), GroupedTDigestFactory::Make, func.get())); DCHECK_OK( @@ -3657,7 +3657,7 @@ void RegisterHashAggregateBasic(FunctionRegistry* registry) { { auto func = std::make_shared( - "hash_approximate_median", Arity::Binary(), &hash_approximate_median_doc, + "hash_approximate_median", Arity::Binary(), hash_approximate_median_doc, &default_scalar_aggregate_options); DCHECK_OK(func->AddKernel(MakeApproximateMedianKernel(tdigest_func))); DCHECK_OK(registry->AddFunction(std::move(func))); @@ -3666,7 +3666,7 @@ void RegisterHashAggregateBasic(FunctionRegistry* registry) { HashAggregateFunction* min_max_func = nullptr; { auto func = std::make_shared( - "hash_min_max", Arity::Binary(), &hash_min_max_doc, + "hash_min_max", Arity::Binary(), hash_min_max_doc, &default_scalar_aggregate_options); DCHECK_OK(AddHashAggKernels(NumericTypes(), GroupedMinMaxFactory::Make, func.get())); DCHECK_OK(AddHashAggKernels(TemporalTypes(), GroupedMinMaxFactory::Make, func.get())); @@ -3682,7 +3682,7 @@ void RegisterHashAggregateBasic(FunctionRegistry* registry) { { auto func = std::make_shared( - "hash_min", Arity::Binary(), &hash_min_or_max_doc, + "hash_min", Arity::Binary(), hash_min_or_max_doc, &default_scalar_aggregate_options); DCHECK_OK(func->AddKernel(MakeMinOrMaxKernel(min_max_func))); DCHECK_OK(registry->AddFunction(std::move(func))); @@ -3690,7 +3690,7 @@ void RegisterHashAggregateBasic(FunctionRegistry* registry) { { auto func = std::make_shared( - "hash_max", Arity::Binary(), &hash_min_or_max_doc, + "hash_max", Arity::Binary(), hash_min_or_max_doc, &default_scalar_aggregate_options); DCHECK_OK(func->AddKernel(MakeMinOrMaxKernel(min_max_func))); DCHECK_OK(registry->AddFunction(std::move(func))); @@ -3698,21 +3698,21 @@ void RegisterHashAggregateBasic(FunctionRegistry* registry) { { auto func = std::make_shared( - "hash_any", Arity::Binary(), &hash_any_doc, &default_scalar_aggregate_options); + "hash_any", Arity::Binary(), hash_any_doc, &default_scalar_aggregate_options); DCHECK_OK(func->AddKernel(MakeKernel(boolean(), HashAggregateInit))); DCHECK_OK(registry->AddFunction(std::move(func))); } { auto func = std::make_shared( - "hash_all", Arity::Binary(), &hash_all_doc, &default_scalar_aggregate_options); + "hash_all", Arity::Binary(), hash_all_doc, &default_scalar_aggregate_options); DCHECK_OK(func->AddKernel(MakeKernel(boolean(), HashAggregateInit))); DCHECK_OK(registry->AddFunction(std::move(func))); } { auto func = std::make_shared( - "hash_count_distinct", Arity::Binary(), &hash_count_distinct_doc, + "hash_count_distinct", Arity::Binary(), hash_count_distinct_doc, &default_count_options); DCHECK_OK(func->AddKernel( MakeKernel(ValueDescr::ARRAY, GroupedDistinctInit))); @@ -3721,7 +3721,7 @@ void RegisterHashAggregateBasic(FunctionRegistry* registry) { { auto func = std::make_shared( - "hash_distinct", Arity::Binary(), &hash_distinct_doc, &default_count_options); + "hash_distinct", Arity::Binary(), hash_distinct_doc, &default_count_options); DCHECK_OK(func->AddKernel( MakeKernel(ValueDescr::ARRAY, GroupedDistinctInit))); DCHECK_OK(registry->AddFunction(std::move(func))); @@ -3729,7 +3729,7 @@ void RegisterHashAggregateBasic(FunctionRegistry* registry) { { auto func = std::make_shared("hash_one", Arity::Binary(), - &hash_one_doc); + hash_one_doc); DCHECK_OK(AddHashAggKernels(NumericTypes(), GroupedOneFactory::Make, func.get())); DCHECK_OK(AddHashAggKernels(TemporalTypes(), GroupedOneFactory::Make, func.get())); DCHECK_OK(AddHashAggKernels(BaseBinaryTypes(), GroupedOneFactory::Make, func.get())); @@ -3741,7 +3741,7 @@ void RegisterHashAggregateBasic(FunctionRegistry* registry) { { auto func = std::make_shared("hash_list", Arity::Binary(), - &hash_list_doc); + hash_list_doc); DCHECK_OK(AddHashAggKernels(NumericTypes(), GroupedListFactory::Make, func.get())); DCHECK_OK(AddHashAggKernels(TemporalTypes(), GroupedListFactory::Make, func.get())); DCHECK_OK(AddHashAggKernels(BaseBinaryTypes(), GroupedListFactory::Make, func.get())); diff --git a/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc b/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc index f53d9f0c7f0..bfbc3cedf88 100644 --- a/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc +++ b/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc @@ -1935,7 +1935,7 @@ void AddNullExec(ScalarFunction* func) { template std::shared_ptr MakeArithmeticFunction(std::string name, - const FunctionDoc* doc) { + const FunctionDoc doc) { auto func = std::make_shared(name, Arity::Binary(), doc); for (const auto& ty : NumericTypes()) { auto exec = ArithmeticExecFromOp(ty); @@ -1949,7 +1949,7 @@ std::shared_ptr MakeArithmeticFunction(std::string name, // only on non-null output. template std::shared_ptr MakeArithmeticFunctionNotNull(std::string name, - const FunctionDoc* doc) { + const FunctionDoc doc) { auto func = std::make_shared(name, Arity::Binary(), doc); for (const auto& ty : NumericTypes()) { auto exec = ArithmeticExecFromOp(ty); @@ -1961,7 +1961,7 @@ std::shared_ptr MakeArithmeticFunctionNotNull(std::string name, template std::shared_ptr MakeUnaryArithmeticFunction(std::string name, - const FunctionDoc* doc) { + const FunctionDoc doc) { auto func = std::make_shared(name, Arity::Unary(), doc); for (const auto& ty : NumericTypes()) { auto exec = ArithmeticExecFromOp(ty); @@ -1975,7 +1975,7 @@ std::shared_ptr MakeUnaryArithmeticFunction(std::string name, // output type for integral inputs. template std::shared_ptr MakeUnaryArithmeticFunctionWithFixedIntOutType( - std::string name, const FunctionDoc* doc) { + std::string name, const FunctionDoc doc) { auto int_out_ty = TypeTraits::type_singleton(); auto func = std::make_shared(name, Arity::Unary(), doc); for (const auto& ty : NumericTypes()) { @@ -1997,7 +1997,7 @@ std::shared_ptr MakeUnaryArithmeticFunctionWithFixedIntOutType( // only on non-null output. template std::shared_ptr MakeUnaryArithmeticFunctionNotNull( - std::string name, const FunctionDoc* doc) { + std::string name, const FunctionDoc doc) { auto func = std::make_shared(name, Arity::Unary(), doc); for (const auto& ty : NumericTypes()) { auto exec = ArithmeticExecFromOp(ty); @@ -2075,7 +2075,7 @@ Status ExecRound(KernelContext* ctx, const ExecBatch& batch, Datum* out) { // kernel dispatch based on RoundMode, only on non-null output. template