From f54fd1ca0c30b4b46e8d21d1f445b140c4271f7e Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 4 Mar 2022 18:57:55 +0530 Subject: [PATCH 01/26] updating submodule --- testing | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testing b/testing index 5bab2f264a2..53b49804710 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit 5bab2f264a23f5af68f69ea93d24ef1e8e77fc88 +Subproject commit 53b498047109d9940fcfab388bd9d6aeb8c57425 From 98cd4539ec9dcc0f3bd9d600875b39407e2f9ef1 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Sat, 19 Mar 2022 19:14:39 +0530 Subject: [PATCH 02/26] temp commit to remove files in submodule --- .gitmodules | 3 --- testing | 1 - 2 files changed, 4 deletions(-) delete mode 160000 testing diff --git a/.gitmodules b/.gitmodules index 6efc4871542..71722b21777 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,3 @@ [submodule "cpp/submodules/parquet-testing"] path = cpp/submodules/parquet-testing url = https://github.com/apache/parquet-testing.git -[submodule "testing"] - path = testing - url = https://github.com/apache/arrow-testing diff --git a/testing b/testing deleted file mode 160000 index 53b49804710..00000000000 --- a/testing +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 53b498047109d9940fcfab388bd9d6aeb8c57425 From a72cf546156059c90e011642dc9f54c6ac5a1fc1 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Sat, 19 Mar 2022 19:15:36 +0530 Subject: [PATCH 03/26] adding submodule --- .gitmodules | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitmodules b/.gitmodules index 71722b21777..6efc4871542 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "cpp/submodules/parquet-testing"] path = cpp/submodules/parquet-testing url = https://github.com/apache/parquet-testing.git +[submodule "testing"] + path = testing + url = https://github.com/apache/arrow-testing From 7c0d11e475421e9d877e09bc028db6c0c4eb8ce9 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Sun, 20 Mar 2022 23:09:21 +0530 Subject: [PATCH 04/26] updating testing submodule --- testing | 1 + 1 file changed, 1 insertion(+) create mode 160000 testing diff --git a/testing b/testing new file mode 160000 index 00000000000..d315f798520 --- /dev/null +++ b/testing @@ -0,0 +1 @@ +Subproject commit d315f7985207d2d67fc2c8e41053e9d97d573f4b From 25862d810da8d390346487f8eedb36382a576fb2 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Sun, 20 Mar 2022 23:11:24 +0530 Subject: [PATCH 05/26] revert to uupstream version --- testing | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testing b/testing index d315f798520..53b49804710 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit d315f7985207d2d67fc2c8e41053e9d97d573f4b +Subproject commit 53b498047109d9940fcfab388bd9d6aeb8c57425 From 06e06e8e85ffccffc4dea8963b2ccc5ca60113b2 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 22 Jul 2022 13:37:33 +0530 Subject: [PATCH 06/26] adding initial scalar udf docs for python --- docs/source/python/compute.rst | 77 ++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index bcbca9dff36..c282329edc2 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -370,3 +370,80 @@ our ``even_filter`` with a ``pc.field("nums") > 5`` filter: :class:`.Dataset` currently can be filtered using :meth:`.Dataset.to_table` method passing a ``filter`` argument. See :ref:`py-filter-dataset` in Dataset documentation. + + +User-Defined Functions +====================== + +.. warning:: + User-defined functions only supports scalar functions and the current version is experimental. + +To use a user-defined-function (UDF), either the experimental `dataset` API options can be used or the +function can be directly called using :func:`pyarrow.compute.call_function`. + +To register a UDF, a function name, function docs and input types and output type needs to be defined. + +.. code-block:: python + + import pyarrow.compute as pc + function_name = "regression" + function_docs = { + "summary": "Calculate y based on m, x and c values", + "description": "Obtaining output of a linear scalar function" + } + input_types = { + "m" : pa.int64(), + "x" : pa.int64(), + "c" : pa.int64(), + } + output_type = pa.int64() + + def linear_calculation(ctx, m, x, c): + return pc.add(pc.multiply(m, x), c) + + pc.register_scalar_function(linear_calculation, + function_name, + function_docs, + input_types, + output_type) + +.. note:: + There is a default parameter, `ctx` which is refers to a context object and it should be the + first parameter of any user-defined-function. The idea is to make available passing required + meta-data across an application which would be important for UDFs. + +Calling a UDF directly using :func:`pyarrow.compute.call_function`, + +.. code-block:: python + + >>> res = pc.call_function("regression", [pa.scalar(2), pa.scalar(10), pa.scalar(5)]) + 25 + +.. warning:: + Note that when the passed values to a function are all scalars, internally each scalar + is passed as an array of size 1. + +UDFs can be used with tabular data by using `dataset` API and apply a UDF function on the +dataset. + +.. code-block:: python + + >>> sample_data = {'trip_name': ['A', 'B', 'C', 'D'], 'total_amount($)': [10, 20, 45, 15]} + >>> data_table = pa.Table.from_pydict(sample_data) + >>> import pyarrow.dataset as ds + >>> dataset = ds.dataset(data_table) + >>> func_args = [pc.scalar(5), ds.field("total_amount($)"), pc.scalar(2)] + >>> result_table = dataset.to_table( + ... columns={ + ... 'total_amount_projected($)': ds.field('')._call(function_name, func_args), + ... 'total_amount($)': ds.field('total_amount($)'), + ... 'trip_name': ds.field('trip_name') + ... }) + pyarrow.Table + total_amount_projected($): int64 + total_amount($): int64 + trip_name: string + ---- + total_amount_projected($): [[52,102,227,77]] + total_amount($): [[10,20,45,15]] + trip_name: [["A","B","C","D"]] From 79eedd842050a671296574d28c73d5df0d8841e5 Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Fri, 22 Jul 2022 18:12:10 +0530 Subject: [PATCH 07/26] update warnings Co-authored-by: Antoine Pitrou --- docs/source/python/compute.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index c282329edc2..2cb27a578cd 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -376,7 +376,8 @@ User-Defined Functions ====================== .. warning:: - User-defined functions only supports scalar functions and the current version is experimental. + This API is **experimental**. + Also, only scalar functions can currently be user-defined. To use a user-defined-function (UDF), either the experimental `dataset` API options can be used or the function can be directly called using :func:`pyarrow.compute.call_function`. From 88d46cc8860680bdd4502912aa1bfdb59541d968 Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Fri, 22 Jul 2022 18:12:46 +0530 Subject: [PATCH 08/26] update udf description Co-authored-by: Antoine Pitrou --- docs/source/python/compute.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index 2cb27a578cd..ecc86458aba 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -379,8 +379,10 @@ User-Defined Functions This API is **experimental**. Also, only scalar functions can currently be user-defined. -To use a user-defined-function (UDF), either the experimental `dataset` API options can be used or the -function can be directly called using :func:`pyarrow.compute.call_function`. +PyArrow allows defining and registering custom compute functions in Python. +Those functions can then be called from Python as well as C++ (and potentially +any other implementation wrapping Arrow C++, such as the R ``arrow`` package`) +using their registered function name. To register a UDF, a function name, function docs and input types and output type needs to be defined. From 0497de031356fc3d54e5d84cde259434a7da8566 Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Fri, 22 Jul 2022 18:13:08 +0530 Subject: [PATCH 09/26] grammar fix Co-authored-by: Antoine Pitrou --- docs/source/python/compute.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index ecc86458aba..c47c123f384 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -384,7 +384,7 @@ Those functions can then be called from Python as well as C++ (and potentially any other implementation wrapping Arrow C++, such as the R ``arrow`` package`) using their registered function name. -To register a UDF, a function name, function docs and input types and output type needs to be defined. +To register a UDF, a function name, function docs and input types and output type need to be defined. .. code-block:: python From 100ad56441eb99aec419a4b18f1e758ee17b669c Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Fri, 22 Jul 2022 18:14:59 +0530 Subject: [PATCH 10/26] update udf function doc Co-authored-by: Antoine Pitrou --- docs/source/python/compute.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index c47c123f384..0f19ec81733 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -391,7 +391,7 @@ To register a UDF, a function name, function docs and input types and output typ import pyarrow.compute as pc function_name = "regression" function_docs = { - "summary": "Calculate y based on m, x and c values", + "summary": "Calculate y = mx + c", "description": "Obtaining output of a linear scalar function" } input_types = { From b0221c92a15894db6331dbab9b2fea76966205f4 Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Fri, 22 Jul 2022 18:15:45 +0530 Subject: [PATCH 11/26] update udf func doc description Co-authored-by: Antoine Pitrou --- docs/source/python/compute.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index 0f19ec81733..51283fb75ad 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -392,7 +392,9 @@ To register a UDF, a function name, function docs and input types and output typ function_name = "regression" function_docs = { "summary": "Calculate y = mx + c", - "description": "Obtaining output of a linear scalar function" + "description": + "Compute the affine function y = mx + c.\n" + "This function takes three inputs, m, x and c, in order." } input_types = { "m" : pa.int64(), From eec2473337c1f81944005c804c1774730d6c2b5f Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Fri, 22 Jul 2022 18:17:25 +0530 Subject: [PATCH 12/26] update description Co-authored-by: Antoine Pitrou --- docs/source/python/compute.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index 51283fb75ad..fccedd916de 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -417,7 +417,7 @@ To register a UDF, a function name, function docs and input types and output typ first parameter of any user-defined-function. The idea is to make available passing required meta-data across an application which would be important for UDFs. -Calling a UDF directly using :func:`pyarrow.compute.call_function`, +You can call a user-defined function directly using :func:`pyarrow.compute.call_function`: .. code-block:: python From c48c90afba09a7fe717805133864be4d7f216806 Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Fri, 22 Jul 2022 18:19:29 +0530 Subject: [PATCH 13/26] update the how the udf can be called with datasets. Co-authored-by: Antoine Pitrou --- docs/source/python/compute.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index fccedd916de..211a28a21a9 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -428,8 +428,9 @@ You can call a user-defined function directly using :func:`pyarrow.compute.call_ Note that when the passed values to a function are all scalars, internally each scalar is passed as an array of size 1. -UDFs can be used with tabular data by using `dataset` API and apply a UDF function on the -dataset. +More generally, user-defined functions are usable everywhere a compute function +can be referred to by its name. For example, they can be called on a dataset's +column using :meth:`Expression._call`: .. code-block:: python From 60abbc6bc9532b75c5316d890e8ad7537e75db54 Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Fri, 22 Jul 2022 18:22:08 +0530 Subject: [PATCH 14/26] update info on context Co-authored-by: Antoine Pitrou --- docs/source/python/compute.rst | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index 211a28a21a9..85c33daccaf 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -412,10 +412,12 @@ To register a UDF, a function name, function docs and input types and output typ input_types, output_type) -.. note:: - There is a default parameter, `ctx` which is refers to a context object and it should be the - first parameter of any user-defined-function. The idea is to make available passing required - meta-data across an application which would be important for UDFs. +The implementation of a user-defined function always takes a first *context* +parameter (named ``ctx`` in the example above) which is an instance of +:class:`pyarrow.compute.ScalarUdfContext`. +This context exposes several useful attributes, particularly a +:attr:`~pyarrow.compute.ScalarUdfContext.memory_pool` to be used for +allocations in the context of the user-defined function. You can call a user-defined function directly using :func:`pyarrow.compute.call_function`: From d803a4f2e398d68e2cb88f38534ccd4072808960 Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Fri, 22 Jul 2022 18:22:38 +0530 Subject: [PATCH 15/26] adding addequate memory Co-authored-by: Antoine Pitrou --- docs/source/python/compute.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index 85c33daccaf..c7b59e2469b 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -403,8 +403,9 @@ To register a UDF, a function name, function docs and input types and output typ } output_type = pa.int64() - def linear_calculation(ctx, m, x, c): - return pc.add(pc.multiply(m, x), c) + def affine_calculation(ctx, m, x, c): + temp = pc.multiply(m, x, memory_pool=ctx.memory_pool) + return pc.add(temp, c, memory_pool=ctx.memory_pool) pc.register_scalar_function(linear_calculation, function_name, From b4959a749a82132fc4d713cb1cdb2687be990257 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 22 Jul 2022 18:45:26 +0530 Subject: [PATCH 16/26] adding a new example to show case the scalar limitations' --- docs/source/python/compute.rst | 69 ++++++++++++++++++++++++++++------ 1 file changed, 57 insertions(+), 12 deletions(-) diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index c7b59e2469b..f34f3600444 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -389,7 +389,7 @@ To register a UDF, a function name, function docs and input types and output typ .. code-block:: python import pyarrow.compute as pc - function_name = "regression" + function_name = "affine" function_docs = { "summary": "Calculate y = mx + c", "description": @@ -397,17 +397,22 @@ To register a UDF, a function name, function docs and input types and output typ "This function takes three inputs, m, x and c, in order." } input_types = { - "m" : pa.int64(), - "x" : pa.int64(), - "c" : pa.int64(), + "m" : pa.float64(), + "x" : pa.float64(), + "c" : pa.float64(), } - output_type = pa.int64() + output_type = pa.float64() +<<<<<<< HEAD def affine_calculation(ctx, m, x, c): temp = pc.multiply(m, x, memory_pool=ctx.memory_pool) return pc.add(temp, c, memory_pool=ctx.memory_pool) +======= + def affine(ctx, m, x, c): + return pc.add(pc.multiply(m, x), c) +>>>>>>> 5a2c81946 (adding a new example to show case the scalar limitations') - pc.register_scalar_function(linear_calculation, + pc.register_scalar_function(affine, function_name, function_docs, input_types, @@ -424,27 +429,67 @@ You can call a user-defined function directly using :func:`pyarrow.compute.call_ .. code-block:: python - >>> res = pc.call_function("regression", [pa.scalar(2), pa.scalar(10), pa.scalar(5)]) + >>> res = pc.call_function("affine", [pa.scalar(2), pa.scalar(10), pa.scalar(5)]) 25 -.. warning:: +.. note:: Note that when the passed values to a function are all scalars, internally each scalar is passed as an array of size 1. More generally, user-defined functions are usable everywhere a compute function can be referred to by its name. For example, they can be called on a dataset's column using :meth:`Expression._call`: +Considering a series of scalar inputs, + +.. code-block:: python + + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> function_name = "affine_with_python" + >>> function_docs = { + ... "summary": "Calculate y = mx + c with Python", + ... "description": + ... "Compute the affine function y = mx + c.\n" + ... "This function takes three inputs, m, x and c, in order." + ... } + >>> input_types = { + ... "m" : pa.float64(), + ... "x" : pa.float64(), + ... "c" : pa.float64(), + ... } + >>> output_type = pa.float64() + >>> + >>> def affine_with_python(ctx, m, x, c): + ... m = m[0].as_py() + ... x = x[0].as_py() + ... c = c[0].as_py() + ... return pa.array([m * x + c]) + ... + >>> pc.register_scalar_function(affine_with_python, + ... function_name, + ... function_docs, + ... input_types, + ... output_type) + >>> + >>> pc.call_function(function_name, [pa.scalar(10.1), pa.scalar(10.2), pa.scalar(20.2)]) + + +When all the inputs are scalar, the input is a size=1 array and the values has to be properly +treated within the UDF. And also make sure to include the final output as a size=1 array. + +UDFs can be used with tabular data by using `dataset` API and apply a UDF function on the +dataset. .. code-block:: python - >>> sample_data = {'trip_name': ['A', 'B', 'C', 'D'], 'total_amount($)': [10, 20, 45, 15]} - >>> data_table = pa.Table.from_pydict(sample_data) >>> import pyarrow.dataset as ds + >>> sample_data = {'trip_name': ['A', 'B', 'C', 'D'], 'total_amount($)': [10.21, 20.12, 45.32, 15.12]} + >>> data_table = pa.Table.from_pydict(sample_data) >>> dataset = ds.dataset(data_table) - >>> func_args = [pc.scalar(5), ds.field("total_amount($)"), pc.scalar(2)] + >>> func_args = [pc.scalar(5.2), ds.field("total_amount($)"), pc.scalar(2.1)] >>> result_table = dataset.to_table( ... columns={ - ... 'total_amount_projected($)': ds.field('')._call(function_name, func_args), + ... 'total_amount_projected($)': ds.field('')._call("affine", func_args), ... 'total_amount($)': ds.field('total_amount($)'), ... 'trip_name': ds.field('trip_name') ... }) From 93323d9012158d58a08aa5d735fd8e53181db4df Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 22 Jul 2022 19:05:12 +0530 Subject: [PATCH 17/26] updating example and rebase --- docs/source/python/compute.rst | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index f34f3600444..eadb7f00c50 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -403,14 +403,9 @@ To register a UDF, a function name, function docs and input types and output typ } output_type = pa.float64() -<<<<<<< HEAD - def affine_calculation(ctx, m, x, c): + def affine(ctx, m, x, c): temp = pc.multiply(m, x, memory_pool=ctx.memory_pool) return pc.add(temp, c, memory_pool=ctx.memory_pool) -======= - def affine(ctx, m, x, c): - return pc.add(pc.multiply(m, x), c) ->>>>>>> 5a2c81946 (adding a new example to show case the scalar limitations') pc.register_scalar_function(affine, function_name, @@ -429,8 +424,8 @@ You can call a user-defined function directly using :func:`pyarrow.compute.call_ .. code-block:: python - >>> res = pc.call_function("affine", [pa.scalar(2), pa.scalar(10), pa.scalar(5)]) - 25 + >>> pc.call_function("affine", [pa.scalar(2.5), pa.scalar(10.5), pa.scalar(5.5)]) + .. note:: Note that when the passed values to a function are all scalars, internally each scalar @@ -487,7 +482,7 @@ dataset. >>> data_table = pa.Table.from_pydict(sample_data) >>> dataset = ds.dataset(data_table) >>> func_args = [pc.scalar(5.2), ds.field("total_amount($)"), pc.scalar(2.1)] - >>> result_table = dataset.to_table( + >>> dataset.to_table( ... columns={ ... 'total_amount_projected($)': ds.field('')._call("affine", func_args), ... 'total_amount($)': ds.field('total_amount($)'), From feb6bb65593b0e4e9e939364093e2456abb984f8 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 22 Jul 2022 19:12:08 +0530 Subject: [PATCH 18/26] adding docs --- python/pyarrow/src/udf.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pyarrow/src/udf.h b/python/pyarrow/src/udf.h index 52f22b4cb4f..da380acea1d 100644 --- a/python/pyarrow/src/udf.h +++ b/python/pyarrow/src/udf.h @@ -41,6 +41,8 @@ struct ARROW_PYTHON_EXPORT ScalarUdfOptions { std::shared_ptr output_type; }; +/// \brief A context defined to hold meta-data required in +/// scalar UDF execution. struct ARROW_PYTHON_EXPORT ScalarUdfContext { MemoryPool* pool; int64_t batch_length; From 1203d80e9c65bd1b25c56e26498e780be8d0be8c Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Sat, 23 Jul 2022 06:58:08 +0530 Subject: [PATCH 19/26] addressing review comments --- docs/source/python/compute.rst | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index eadb7f00c50..1b0c445a84d 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -438,7 +438,6 @@ Considering a series of scalar inputs, .. code-block:: python - >>> import pyarrow as pa >>> import pyarrow.compute as pc >>> function_name = "affine_with_python" >>> function_docs = { @@ -469,10 +468,9 @@ Considering a series of scalar inputs, >>> pc.call_function(function_name, [pa.scalar(10.1), pa.scalar(10.2), pa.scalar(20.2)]) -When all the inputs are scalar, the input is a size=1 array and the values has to be properly -treated within the UDF. And also make sure to include the final output as a size=1 array. +In case of all scalar inputs, make sure to return the final output as an array. -UDFs can be used with tabular data by using `dataset` API and apply a UDF function on the +More generally, UDFs can be used with tabular data by using `dataset` API and apply a UDF function on a dataset. .. code-block:: python From 6a04e0945c0a45725960ece2b13a811cdc419cd0 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 28 Jul 2022 14:07:32 +0530 Subject: [PATCH 20/26] fix(reviews): address reviews --- docs/source/python/api/compute.rst | 9 ++++ docs/source/python/compute.rst | 70 +++++++++++++++++++++++------- 2 files changed, 63 insertions(+), 16 deletions(-) diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index 4a9208fd31b..ce4a76208bf 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -555,3 +555,12 @@ Compute Options TrimOptions VarianceOptions WeekOptions + +Custom Functions +---------------- + +.. autosummary:: + :toctree: ../generated/ + + register_scalar_function + ScalarUdfContext diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index 1b0c445a84d..4dd5f652f55 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -384,7 +384,8 @@ Those functions can then be called from Python as well as C++ (and potentially any other implementation wrapping Arrow C++, such as the R ``arrow`` package`) using their registered function name. -To register a UDF, a function name, function docs and input types and output type need to be defined. +To register a UDF, a function name, function docs, input types and +output type need to be defined. Using :func:`pyarrow.compute.register_scalar_function`, .. code-block:: python @@ -427,14 +428,19 @@ You can call a user-defined function directly using :func:`pyarrow.compute.call_ >>> pc.call_function("affine", [pa.scalar(2.5), pa.scalar(10.5), pa.scalar(5.5)]) -.. note:: - Note that when the passed values to a function are all scalars, internally each scalar - is passed as an array of size 1. +Generalizing Usage +------------------ -More generally, user-defined functions are usable everywhere a compute function -can be referred to by its name. For example, they can be called on a dataset's -column using :meth:`Expression._call`: -Considering a series of scalar inputs, +PyArrow UDFs accept input types of both scalar and array. Also it can have +vivid combinations of these types. It is important that the UDF author must make sure, +the UDF is defined such that it can handle such combinations well. + +For instance, when the passed values to a function are all scalars, internally +each scalar is passed as an array of size 1. + +To elaborate on this, let's consider a scenario where we have a function +which computes a scalar `y` value based on scalar inputs +`m`, `x` and `c` using python arithmetic operations. .. code-block:: python @@ -468,23 +474,35 @@ Considering a series of scalar inputs, >>> pc.call_function(function_name, [pa.scalar(10.1), pa.scalar(10.2), pa.scalar(20.2)]) -In case of all scalar inputs, make sure to return the final output as an array. +Note that here the the final output is returned as an array. Depending the usage of vivid libraries +inside the UDF, make sure it is generalized to support the passed input values and return suitable +values. -More generally, UDFs can be used with tabular data by using `dataset` API and apply a UDF function on a -dataset. +Working with Datasets +--------------------- + +More generally, user-defined functions are usable everywhere a compute function +can be referred to by its name. For example, they can be called on a dataset's +column using :meth:`Expression._call`: + +Consider an instance where the data is in a table and you need to create a new +column using existing values in a column by using a mathematical formula. +For instance, let's consider a simple affine operation on values using the +mathematical expression, `y = mx + c`. We will be re-using the registered `affine` +function. .. code-block:: python >>> import pyarrow.dataset as ds - >>> sample_data = {'trip_name': ['A', 'B', 'C', 'D'], 'total_amount($)': [10.21, 20.12, 45.32, 15.12]} + >>> sample_data = {'category': ['A', 'B', 'C', 'D'], 'value': [10.21, 20.12, 45.32, 15.12]} >>> data_table = pa.Table.from_pydict(sample_data) >>> dataset = ds.dataset(data_table) - >>> func_args = [pc.scalar(5.2), ds.field("total_amount($)"), pc.scalar(2.1)] + >>> func_args = [pc.scalar(5.2), ds.field("value"), pc.scalar(2.1)] >>> dataset.to_table( ... columns={ - ... 'total_amount_projected($)': ds.field('')._call("affine", func_args), - ... 'total_amount($)': ds.field('total_amount($)'), - ... 'trip_name': ds.field('trip_name') + ... 'projected_value': ds.field('')._call("affine", func_args), + ... 'value': ds.field('value'), + ... 'category': ds.field('category') ... }) pyarrow.Table total_amount_projected($): int64 @@ -494,3 +512,23 @@ dataset. total_amount_projected($): [[52,102,227,77]] total_amount($): [[10,20,45,15]] trip_name: [["A","B","C","D"]] + +Here note that the `ds.field('')_call()` returns an expression. The passed arguments +to this function call are expressions not scalar values +(i.e `pc.scalar(5.2), ds.field("value"), pc.scalar(2.1)`). This expression is evaluated +when the project operator uses this expression. + +Support +------- + +It is defined that the current support is only for scalar functions. +A scalar function (:class:`arrow::compute::ScalarFunction`) executes elementwise operations +on arrays or scalars. Generally, the result of such an execution doesn't +depend on the order of values. + +There is a limitation in the support to UDFs in the current API. +For instance, with project node, if a UDF is used as the compute function, +it expects the function to be a scalar function. Although, this doesn't stop the user +registering a non-scalar function and using it in a programme. +But it could lead to unexpected behaviors or errors when it is applied in such occasions. +The current UDF support could enhance with the addition of more settings to the API (i.e aggregate UDFs). From f702c51800d833cba6e0be38dc5925bc5a48d5e0 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 26 Aug 2022 07:25:02 +0530 Subject: [PATCH 21/26] fix(reviews): updated write up and example --- docs/source/python/compute.rst | 99 ++++++++++++++++++---------------- 1 file changed, 54 insertions(+), 45 deletions(-) diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index 4dd5f652f55..1e1fbd5e002 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -381,7 +381,7 @@ User-Defined Functions PyArrow allows defining and registering custom compute functions in Python. Those functions can then be called from Python as well as C++ (and potentially -any other implementation wrapping Arrow C++, such as the R ``arrow`` package`) +any other implementation wrapping Arrow C++, such as the R ``arrow`` package) using their registered function name. To register a UDF, a function name, function docs, input types and @@ -414,7 +414,7 @@ output type need to be defined. Using :func:`pyarrow.compute.register_scalar_fun input_types, output_type) -The implementation of a user-defined function always takes a first *context* +The implementation of a user-defined function always takes first *context* parameter (named ``ctx`` in the example above) which is an instance of :class:`pyarrow.compute.ScalarUdfContext`. This context exposes several useful attributes, particularly a @@ -432,22 +432,21 @@ Generalizing Usage ------------------ PyArrow UDFs accept input types of both scalar and array. Also it can have -vivid combinations of these types. It is important that the UDF author must make sure, -the UDF is defined such that it can handle such combinations well. +any combination of these types. It is important that the UDF author ensures +the UDF can handle such combinations correctly. Also the ability to use UDFs +with existing data processing libraries is very useful. -For instance, when the passed values to a function are all scalars, internally -each scalar is passed as an array of size 1. - -To elaborate on this, let's consider a scenario where we have a function -which computes a scalar `y` value based on scalar inputs -`m`, `x` and `c` using python arithmetic operations. +Let's consider a scenario where we have a function +which computes a scalar `y` value based on scalar/array inputs +`m`, `x` and `c` using Numpy arithmetic operations. .. code-block:: python - >>> import pyarrow.compute as pc - >>> function_name = "affine_with_python" + >>> import pyarrow as pa + >>> import numpy as np + >>> function_name = "affine_with_numpy" >>> function_docs = { - ... "summary": "Calculate y = mx + c with Python", + ... "summary": "Calculate y = mx + c with Numpy", ... "description": ... "Compute the affine function y = mx + c.\n" ... "This function takes three inputs, m, x and c, in order." @@ -459,24 +458,36 @@ which computes a scalar `y` value based on scalar inputs ... } >>> output_type = pa.float64() >>> - >>> def affine_with_python(ctx, m, x, c): - ... m = m[0].as_py() - ... x = x[0].as_py() - ... c = c[0].as_py() - ... return pa.array([m * x + c]) + >>> def to_numpy(val): + ... if isinstance(val, pa.Scalar): + ... return val.as_py() + ... else: + ... return np.array(val) + ... + >>> def affine_with_numpy(ctx, m, x, c): + ... m = to_numpy(m) + ... x = to_numpy(x) + ... c = to_numpy(c) + ... return pa.array(m * x + c) ... - >>> pc.register_scalar_function(affine_with_python, + >>> pc.register_scalar_function(affine_with_numpy, ... function_name, - ... function_docs, + ... function_docs, ... input_types, ... output_type) - >>> >>> pc.call_function(function_name, [pa.scalar(10.1), pa.scalar(10.2), pa.scalar(20.2)]) + >>> pc.call_function(function_name, [pa.scalar(10.1), pa.array([10.2, 20.2]), pa.scalar(20.2)]) + + [ + 123.22, + 224.21999999999997 + ] -Note that here the the final output is returned as an array. Depending the usage of vivid libraries -inside the UDF, make sure it is generalized to support the passed input values and return suitable -values. +Note that there is a helper function `to_numpy` to handle the conversion of scalar an array inputs +to the UDf. Also, the final output is returned as a scalr or an array depending on the inputs. +Depending on the usage of any libraries inside the UDF, make sure it is generalized to support +the passed input values and return suitable values. Working with Datasets --------------------- @@ -486,10 +497,10 @@ can be referred to by its name. For example, they can be called on a dataset's column using :meth:`Expression._call`: Consider an instance where the data is in a table and you need to create a new -column using existing values in a column by using a mathematical formula. +column using existing values in another column by using a mathematical formula. For instance, let's consider a simple affine operation on values using the -mathematical expression, `y = mx + c`. We will be re-using the registered `affine` -function. +mathematical expression, `y = mx + c`. Here, we will be re-using the registered +`affine` function. .. code-block:: python @@ -513,22 +524,20 @@ function. total_amount($): [[10,20,45,15]] trip_name: [["A","B","C","D"]] -Here note that the `ds.field('')_call()` returns an expression. The passed arguments +Note that the `ds.field('')_call()` returns an expression. The passed arguments to this function call are expressions not scalar values -(i.e `pc.scalar(5.2), ds.field("value"), pc.scalar(2.1)`). This expression is evaluated -when the project operator uses this expression. - -Support -------- - -It is defined that the current support is only for scalar functions. -A scalar function (:class:`arrow::compute::ScalarFunction`) executes elementwise operations -on arrays or scalars. Generally, the result of such an execution doesn't -depend on the order of values. - -There is a limitation in the support to UDFs in the current API. -For instance, with project node, if a UDF is used as the compute function, -it expects the function to be a scalar function. Although, this doesn't stop the user -registering a non-scalar function and using it in a programme. -But it could lead to unexpected behaviors or errors when it is applied in such occasions. -The current UDF support could enhance with the addition of more settings to the API (i.e aggregate UDFs). +(i.e `pc.scalar(5.2), ds.field("value"), pc.scalar(2.1)`, notice the difference +of `pa.scalar` vs `pc.scalar`, the latter produces an expression). This expression is evaluated +when the project operator executes it. + +Projection Expressions +^^^^^^^^^^^^^^^^^^^^^^ +In the above example we used an expression to add a new column (`total_amount_projected`) +to our table. Adding new, dynamically computed, columns to a table is known as "projection" +and there are limitations on what kinds of functions can be used in projection expressions. +A projection function must emit a single output value for each input row. That output value +should be calculated entirely from the input row and should not depend on any other row. +For example, the "affine" function that we've been using as an example above is a valid +function to use in a projection. A "cumulative sum" function would not be a valid function +since the result of each input rows depends on the rows that came before. A "drop nulls" +function would also be invalid because it doesn't emit a value for some rows. From e86fadda0ba29aa3d2e62206e282938436f01aa3 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Fri, 16 Sep 2022 07:18:46 +0530 Subject: [PATCH 22/26] fix(reviews): replaced affine function with gcd and update docs --- docs/source/python/compute.rst | 154 +++++++++++++-------------------- 1 file changed, 58 insertions(+), 96 deletions(-) diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index 1e1fbd5e002..ae7a8e33a79 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -379,8 +379,8 @@ User-Defined Functions This API is **experimental**. Also, only scalar functions can currently be user-defined. -PyArrow allows defining and registering custom compute functions in Python. -Those functions can then be called from Python as well as C++ (and potentially +PyArrow allows defining and registering custom compute functions. +These functions can then be called from Python as well as C++ (and potentially any other implementation wrapping Arrow C++, such as the R ``arrow`` package) using their registered function name. @@ -389,30 +389,43 @@ output type need to be defined. Using :func:`pyarrow.compute.register_scalar_fun .. code-block:: python + import numpy as np + + import pyarrow as pa import pyarrow.compute as pc - function_name = "affine" + + function_name = "numpy_gcd" function_docs = { - "summary": "Calculate y = mx + c", - "description": - "Compute the affine function y = mx + c.\n" - "This function takes three inputs, m, x and c, in order." + "summary": "Calculates the greatest common divisor", + "description": + "Given 'x' and 'y' find the greatest number that divides\n" + "evenly into both x and y." } + input_types = { - "m" : pa.float64(), - "x" : pa.float64(), - "c" : pa.float64(), + "x" : pa.int64(), + "y" : pa.int64() } - output_type = pa.float64() - def affine(ctx, m, x, c): - temp = pc.multiply(m, x, memory_pool=ctx.memory_pool) - return pc.add(temp, c, memory_pool=ctx.memory_pool) + output_type = pa.int64() + + def to_np(val): + if isinstance(val, pa.Scalar): + return val.as_py() + else: + return np.array(val) - pc.register_scalar_function(affine, - function_name, - function_docs, - input_types, - output_type) + def gcd_numpy(ctx, x, y): + np_x = to_np(x) + np_y = to_np(y) + return pa.array(np.gcd(np_x, np_y)) + + pc.register_scalar_function(gcd_numpy, + function_name, + function_docs, + input_types, + output_type) + The implementation of a user-defined function always takes first *context* parameter (named ``ctx`` in the example above) which is an instance of @@ -421,123 +434,72 @@ This context exposes several useful attributes, particularly a :attr:`~pyarrow.compute.ScalarUdfContext.memory_pool` to be used for allocations in the context of the user-defined function. -You can call a user-defined function directly using :func:`pyarrow.compute.call_function`: - -.. code-block:: python - - >>> pc.call_function("affine", [pa.scalar(2.5), pa.scalar(10.5), pa.scalar(5.5)]) - - -Generalizing Usage ------------------- - PyArrow UDFs accept input types of both scalar and array. Also it can have any combination of these types. It is important that the UDF author ensures the UDF can handle such combinations correctly. Also the ability to use UDFs with existing data processing libraries is very useful. -Let's consider a scenario where we have a function -which computes a scalar `y` value based on scalar/array inputs -`m`, `x` and `c` using Numpy arithmetic operations. +Note that there is a helper function `to_np` to handle the conversion +of scalar and array inputs to the UDF. Also, the final output is returned +as a scalr or an array depending on the inputs. Based on the usage of any +libraries inside the UDF, make sure it is generalized to support the passed +input values and return suitable values. -.. code-block:: python +You can call a user-defined function directly using :func:`pyarrow.compute.call_function`: - >>> import pyarrow as pa - >>> import numpy as np - >>> function_name = "affine_with_numpy" - >>> function_docs = { - ... "summary": "Calculate y = mx + c with Numpy", - ... "description": - ... "Compute the affine function y = mx + c.\n" - ... "This function takes three inputs, m, x and c, in order." - ... } - >>> input_types = { - ... "m" : pa.float64(), - ... "x" : pa.float64(), - ... "c" : pa.float64(), - ... } - >>> output_type = pa.float64() - >>> - >>> def to_numpy(val): - ... if isinstance(val, pa.Scalar): - ... return val.as_py() - ... else: - ... return np.array(val) - ... - >>> def affine_with_numpy(ctx, m, x, c): - ... m = to_numpy(m) - ... x = to_numpy(x) - ... c = to_numpy(c) - ... return pa.array(m * x + c) - ... - >>> pc.register_scalar_function(affine_with_numpy, - ... function_name, - ... function_docs, - ... input_types, - ... output_type) - >>> pc.call_function(function_name, [pa.scalar(10.1), pa.scalar(10.2), pa.scalar(20.2)]) - - >>> pc.call_function(function_name, [pa.scalar(10.1), pa.array([10.2, 20.2]), pa.scalar(20.2)]) - - [ - 123.22, - 224.21999999999997 - ] +.. code-block:: python -Note that there is a helper function `to_numpy` to handle the conversion of scalar an array inputs -to the UDf. Also, the final output is returned as a scalr or an array depending on the inputs. -Depending on the usage of any libraries inside the UDF, make sure it is generalized to support -the passed input values and return suitable values. + >>> pc.call_function(""numpy_gcd", [pa.scalar(27), pa.scalar(63)]) + 9 Working with Datasets --------------------- More generally, user-defined functions are usable everywhere a compute function -can be referred to by its name. For example, they can be called on a dataset's +can be referred by its name. For example, they can be called on a dataset's column using :meth:`Expression._call`: Consider an instance where the data is in a table and you need to create a new column using existing values in another column by using a mathematical formula. -For instance, let's consider a simple affine operation on values using the -mathematical expression, `y = mx + c`. Here, we will be re-using the registered -`affine` function. +For instance, let's consider applying `gcd` math operation. +Here, we will be re-using the registered `numpy_gcd` function. .. code-block:: python >>> import pyarrow.dataset as ds - >>> sample_data = {'category': ['A', 'B', 'C', 'D'], 'value': [10.21, 20.12, 45.32, 15.12]} + >>> sample_data = {'category': ['A', 'B', 'C', 'D'], 'value': [90, 630, 1827, 2709]} >>> data_table = pa.Table.from_pydict(sample_data) >>> dataset = ds.dataset(data_table) - >>> func_args = [pc.scalar(5.2), ds.field("value"), pc.scalar(2.1)] + >>> func_args = [pc.scalar(30), ds.field("value")] >>> dataset.to_table( ... columns={ - ... 'projected_value': ds.field('')._call("affine", func_args), + ... 'gcd_value': ds.field('')._call("numpy_gcd", func_args), ... 'value': ds.field('value'), ... 'category': ds.field('category') ... }) pyarrow.Table - total_amount_projected($): int64 - total_amount($): int64 - trip_name: string + gcd_value: int64 + value: int64 + category: string ---- - total_amount_projected($): [[52,102,227,77]] - total_amount($): [[10,20,45,15]] - trip_name: [["A","B","C","D"]] + gcd_value: [[30,30,3,3]] + value: [[90,630,1827,2709]] + category: [["A","B","C","D"]] Note that the `ds.field('')_call()` returns an expression. The passed arguments to this function call are expressions not scalar values -(i.e `pc.scalar(5.2), ds.field("value"), pc.scalar(2.1)`, notice the difference -of `pa.scalar` vs `pc.scalar`, the latter produces an expression). This expression is evaluated -when the project operator executes it. +(i.e `pc.scalar(30), ds.field("value")`, notice the difference +of `pa.scalar` vs `pc.scalar`, the latter produces an expression). +This expression is evaluated when the project operator executes it. Projection Expressions ^^^^^^^^^^^^^^^^^^^^^^ -In the above example we used an expression to add a new column (`total_amount_projected`) +In the above example we used an expression to add a new column (`gcd_value`) to our table. Adding new, dynamically computed, columns to a table is known as "projection" and there are limitations on what kinds of functions can be used in projection expressions. A projection function must emit a single output value for each input row. That output value should be calculated entirely from the input row and should not depend on any other row. -For example, the "affine" function that we've been using as an example above is a valid +For example, the "numpy_gcd" function that we've been using as an example above is a valid function to use in a projection. A "cumulative sum" function would not be a valid function since the result of each input rows depends on the rows that came before. A "drop nulls" function would also be invalid because it doesn't emit a value for some rows. From 4aa2902a79a06d25ab449fcaeaeedf265f30ef71 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Thu, 22 Sep 2022 20:14:39 +0530 Subject: [PATCH 23/26] fix(reviews): updated review comments --- docs/source/python/api/compute.rst | 4 +-- docs/source/python/compute.rst | 48 ++++++++++++++---------------- python/pyarrow/src/udf.h | 3 +- 3 files changed, 26 insertions(+), 29 deletions(-) diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index ce4a76208bf..574bcbc659f 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -556,8 +556,8 @@ Compute Options VarianceOptions WeekOptions -Custom Functions ----------------- +User-Defined Functions +---------------------- .. autosummary:: :toctree: ../generated/ diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index ae7a8e33a79..5fd26ef3d29 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -377,7 +377,6 @@ User-Defined Functions .. warning:: This API is **experimental**. - Also, only scalar functions can currently be user-defined. PyArrow allows defining and registering custom compute functions. These functions can then be called from Python as well as C++ (and potentially @@ -434,35 +433,34 @@ This context exposes several useful attributes, particularly a :attr:`~pyarrow.compute.ScalarUdfContext.memory_pool` to be used for allocations in the context of the user-defined function. -PyArrow UDFs accept input types of both scalar and array. Also it can have -any combination of these types. It is important that the UDF author ensures -the UDF can handle such combinations correctly. Also the ability to use UDFs -with existing data processing libraries is very useful. - -Note that there is a helper function `to_np` to handle the conversion -of scalar and array inputs to the UDF. Also, the final output is returned -as a scalr or an array depending on the inputs. Based on the usage of any -libraries inside the UDF, make sure it is generalized to support the passed -input values and return suitable values. +PyArrow UDFs accept input types of both :class:`~pyarrow.Scalar` and :class:`~pyarrow.Array`, +and there will always be at least one input of type :class:`~pyarrow.Array`. +The output should always be a :class:`~pyarrow.Array`. You can call a user-defined function directly using :func:`pyarrow.compute.call_function`: .. code-block:: python - >>> pc.call_function(""numpy_gcd", [pa.scalar(27), pa.scalar(63)]) - 9 + >>> pc.call_function("numpy_gcd", [pa.scalar(27), pa.scalar(63)]) + + >>> pc.call_function("numpy_gcd", [pa.scalar(27), pa.array([81, 12, 5])]) + + [ + 27, + 3, + 1 + ] Working with Datasets --------------------- More generally, user-defined functions are usable everywhere a compute function can be referred by its name. For example, they can be called on a dataset's -column using :meth:`Expression._call`: +column using :meth:`Expression._call`. -Consider an instance where the data is in a table and you need to create a new -column using existing values in another column by using a mathematical formula. -For instance, let's consider applying `gcd` math operation. -Here, we will be re-using the registered `numpy_gcd` function. +Consider an instance where the data is in a table and we want to compute +the GCD of one column with the scalar value 30. We will be re-using the +"numpy_gcd" user-defined function that was created above: .. code-block:: python @@ -486,20 +484,20 @@ Here, we will be re-using the registered `numpy_gcd` function. value: [[90,630,1827,2709]] category: [["A","B","C","D"]] -Note that the `ds.field('')_call()` returns an expression. The passed arguments -to this function call are expressions not scalar values -(i.e `pc.scalar(30), ds.field("value")`, notice the difference -of `pa.scalar` vs `pc.scalar`, the latter produces an expression). -This expression is evaluated when the project operator executes it. +Note that ``ds.field('')_call(...)`` returns a :func:`pyarrow.compute.Expression`. +The arguments passed to this function call are expressions, not scalar values +(notice the difference between :func:`pyarrow.scalar` and :func:`pyarrow.compute.scalar`, +the latter produces an expression). +This expression is evaluated when the projection operator executes it. Projection Expressions ^^^^^^^^^^^^^^^^^^^^^^ -In the above example we used an expression to add a new column (`gcd_value`) +In the above example we used an expression to add a new column (``gcd_value``) to our table. Adding new, dynamically computed, columns to a table is known as "projection" and there are limitations on what kinds of functions can be used in projection expressions. A projection function must emit a single output value for each input row. That output value should be calculated entirely from the input row and should not depend on any other row. For example, the "numpy_gcd" function that we've been using as an example above is a valid function to use in a projection. A "cumulative sum" function would not be a valid function -since the result of each input rows depends on the rows that came before. A "drop nulls" +since the result of each input row depends on the rows that came before. A "drop nulls" function would also be invalid because it doesn't emit a value for some rows. diff --git a/python/pyarrow/src/udf.h b/python/pyarrow/src/udf.h index da380acea1d..a110440315a 100644 --- a/python/pyarrow/src/udf.h +++ b/python/pyarrow/src/udf.h @@ -41,8 +41,7 @@ struct ARROW_PYTHON_EXPORT ScalarUdfOptions { std::shared_ptr output_type; }; -/// \brief A context defined to hold meta-data required in -/// scalar UDF execution. +/// \brief A context passed as the first argument of scalar UDF functions. struct ARROW_PYTHON_EXPORT ScalarUdfContext { MemoryPool* pool; int64_t batch_length; From 7e88faab8d77c513dcb7f5dadeaebc98cc3d4e77 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Sat, 24 Sep 2022 06:50:04 +0530 Subject: [PATCH 24/26] fix(reviews): updated the docs and code --- docs/source/python/compute.rst | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index 5fd26ef3d29..8d47e6dc7f5 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -381,7 +381,12 @@ User-Defined Functions PyArrow allows defining and registering custom compute functions. These functions can then be called from Python as well as C++ (and potentially any other implementation wrapping Arrow C++, such as the R ``arrow`` package) -using their registered function name. +using their registered function name. + +UDF support is limited to scalar functions. A scalar function is a function which +executes elementwise operations on arrays or scalars. In general, the output of a +scalar function do not depend on the order of values in the arguments. Note that +such functions have a rough correspondence to the functions used in SQL expressions. To register a UDF, a function name, function docs, input types and output type need to be defined. Using :func:`pyarrow.compute.register_scalar_function`, @@ -409,15 +414,15 @@ output type need to be defined. Using :func:`pyarrow.compute.register_scalar_fun output_type = pa.int64() def to_np(val): - if isinstance(val, pa.Scalar): - return val.as_py() - else: - return np.array(val) + if isinstance(val, pa.Scalar): + return val.as_py() + else: + return np.array(val) def gcd_numpy(ctx, x, y): - np_x = to_np(x) - np_y = to_np(y) - return pa.array(np.gcd(np_x, np_y)) + np_x = to_np(x) + np_y = to_np(y) + return pa.array(np.gcd(np_x, np_y)) pc.register_scalar_function(gcd_numpy, function_name, @@ -433,10 +438,6 @@ This context exposes several useful attributes, particularly a :attr:`~pyarrow.compute.ScalarUdfContext.memory_pool` to be used for allocations in the context of the user-defined function. -PyArrow UDFs accept input types of both :class:`~pyarrow.Scalar` and :class:`~pyarrow.Array`, -and there will always be at least one input of type :class:`~pyarrow.Array`. -The output should always be a :class:`~pyarrow.Array`. - You can call a user-defined function directly using :func:`pyarrow.compute.call_function`: .. code-block:: python @@ -465,8 +466,7 @@ the GCD of one column with the scalar value 30. We will be re-using the .. code-block:: python >>> import pyarrow.dataset as ds - >>> sample_data = {'category': ['A', 'B', 'C', 'D'], 'value': [90, 630, 1827, 2709]} - >>> data_table = pa.Table.from_pydict(sample_data) + >>> data_table = pa.table({'category': ['A', 'B', 'C', 'D'], 'value': [90, 630, 1827, 2709]}) >>> dataset = ds.dataset(data_table) >>> func_args = [pc.scalar(30), ds.field("value")] >>> dataset.to_table( From 227db701791baa72846b12367e8ae774e03e0f12 Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Wed, 28 Sep 2022 21:07:26 +0530 Subject: [PATCH 25/26] fix(submodule) --- testing | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testing b/testing index 53b49804710..5bab2f264a2 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit 53b498047109d9940fcfab388bd9d6aeb8c57425 +Subproject commit 5bab2f264a23f5af68f69ea93d24ef1e8e77fc88 From a8555bfa1f798a4c7f26ab554c190da7958795fe Mon Sep 17 00:00:00 2001 From: Vibhatha Abeykoon Date: Wed, 28 Sep 2022 21:32:26 +0530 Subject: [PATCH 26/26] fix(reviews): doc update --- docs/source/python/compute.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index 8d47e6dc7f5..fe7f333300f 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -385,8 +385,9 @@ using their registered function name. UDF support is limited to scalar functions. A scalar function is a function which executes elementwise operations on arrays or scalars. In general, the output of a -scalar function do not depend on the order of values in the arguments. Note that -such functions have a rough correspondence to the functions used in SQL expressions. +scalar function does not depend on the order of values in the arguments. Note that +such functions have a rough correspondence to the functions used in SQL expressions, +or to NumPy `universal functions `_. To register a UDF, a function name, function docs, input types and output type need to be defined. Using :func:`pyarrow.compute.register_scalar_function`, @@ -431,7 +432,7 @@ output type need to be defined. Using :func:`pyarrow.compute.register_scalar_fun output_type) -The implementation of a user-defined function always takes first *context* +The implementation of a user-defined function always takes a first *context* parameter (named ``ctx`` in the example above) which is an instance of :class:`pyarrow.compute.ScalarUdfContext`. This context exposes several useful attributes, particularly a