From 1da3a1732805b6b479d57d456721020fe7ffce39 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Wed, 1 Dec 2021 16:11:03 +0100 Subject: [PATCH 01/11] Improve compute documentation --- docs/source/conf.py | 33 +++++++++++++++++++ docs/source/python/compute.rst | 58 ++++++++++++++++++++++++++++++---- 2 files changed, 85 insertions(+), 6 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 827f7109c88..032f4936757 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -36,6 +36,7 @@ import sys import warnings from unittest import mock +from docutils.parsers.rst import Directive, directives import pyarrow @@ -463,3 +464,35 @@ def setup(app): # This will also rebuild appropriately when the value changes. app.add_config_value('cuda_enabled', cuda_enabled, 'env') app.add_config_value('flight_enabled', flight_enabled, 'env') + app.add_directive('computefuncs', ComputeFunctionsTableDirective) + + +class ComputeFunctionsTableDirective(Directive): + has_content = True + option_spec = { + "kind": directives.unchanged + } + + def run(self): + from docutils.statemachine import ViewList + from docutils import nodes + import pyarrow._compute + + result = ViewList() + print("OPTIONS", self.options) + function_kind = self.options.get('kind', None) + + result.append(".. 
csv-table::", "") + result.append(" :widths: 30, 70", "") + result.append(" ", "") + funcs_reg = pyarrow._compute.function_registry() + for fname in funcs_reg.list_functions(): + f = funcs_reg.get_function(fname) + if not function_kind or f.kind == function_kind: + result.append(' "{}", "{}"'.format(fname, f._doc.summary), + "") + + node = nodes.section() + node.document = self.state.document + self.state.nested_parse(result, 0, node) + return node.children diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index 133520de970..81f0cf248b7 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -23,17 +23,32 @@ Compute Functions ================= Arrow supports logical compute operations over inputs of possibly -varying types. Many compute functions support both array (chunked or not) -and scalar inputs, but some will mandate either. For example, -``sort_indices`` requires its first and only input to be an array. +varying types. -Below are a few simple examples: +The standard compute operations are provided by the :mod:`pyarrow.compute` +module and can be used directly:: >>> import pyarrow as pa >>> import pyarrow.compute as pc >>> a = pa.array([1, 1, 2, 3]) >>> pc.sum(a) + +The grouped aggregation functions are instead an exception +and need to be used through the :meth:`pyarrow.Table.group_by` capabilities. + +Standard Compute Functions +========================== + +Many compute functions support both array (chunked or not) +and scalar inputs, but some will mandate either. For example, +``sort_indices`` requires its first and only input to be an array. + +Below are a few simple examples:: + + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> a = pa.array([1, 1, 2, 3]) >>> b = pa.array([4, 1, 2, 8]) >>> pc.equal(a, b) @@ -48,7 +63,7 @@ Below are a few simple examples: These functions can do more than just element-by-element operations. 
-Here is an example of sorting a table: +Here is an example of sorting a table:: >>> import pyarrow as pa >>> import pyarrow.compute as pc @@ -62,8 +77,39 @@ Here is an example of sorting a table: 0 ] - +For a complete list of the compute functions that PyArrow provides +you can refer to :ref:`api.compute` reference. .. seealso:: :ref:`Available compute functions (C++ documentation) `. + +Grouped Aggregations +==================== + +PyArrow supports grouped aggregations over :class:`pyarrow.Table` through the +:meth:`pyarrow.Table.group_by` method. +The method will return a grouping declaration +to which the hash aggregation functions can be applied:: + + >>> import pyarrow as pa + >>> t = pa.table([ + ... pa.array(["a", "a", "b", "b", "c"]), + ... pa.array([1, 2, 3, 4, 5]), + ... ], names=["keys", "values"]) + >>> t.group_by("keys").aggregate([("values", "sum")]) + pyarrow.Table + values_sum: int64 + keys: string + ---- + values_sum: [[3,7,5]] + keys: [["a","b","c"]] + +The ``"sum"`` aggregation passed to the ``aggregate`` method in the previous +example is the :func:`hash_sum` compute function. + +Following is a list of all supported grouped aggregation functions. +You can use them with our without the ``"hash_"`` prefix. + +.. computefuncs:: + :kind: hash_aggregate \ No newline at end of file From 5416038ed99a6219506c3c28bd7e29af7963ce60 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Wed, 1 Dec 2021 16:15:09 +0100 Subject: [PATCH 02/11] oops --- docs/source/conf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 032f4936757..cd51641eecb 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -479,7 +479,6 @@ def run(self): import pyarrow._compute result = ViewList() - print("OPTIONS", self.options) function_kind = self.options.get('kind', None) result.append(".. 
csv-table::", "") From aa6e0337afd26b689c8fd64aa9237eed06c824c1 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Wed, 1 Dec 2021 16:19:30 +0100 Subject: [PATCH 03/11] typo --- docs/source/python/compute.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index 81f0cf248b7..0e983f21b28 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -109,7 +109,7 @@ The ``"sum"`` aggregation passed to the ``aggregate`` method in the previous example is the :func:`hash_sum` compute function. Following is a list of all supported grouped aggregation functions. -You can use them with our without the ``"hash_"`` prefix. +You can use them with or without the ``"hash_"`` prefix. .. computefuncs:: :kind: hash_aggregate \ No newline at end of file From adde63314a6f2228119e3a307d375d6e83a99fea Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Thu, 2 Dec 2021 10:35:32 +0100 Subject: [PATCH 04/11] Extend doc --- docs/source/python/compute.rst | 50 ++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index 0e983f21b28..b63b9e424f2 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -108,6 +108,56 @@ to which the hash aggregation functions can be applied:: The ``"sum"`` aggregation passed to the ``aggregate`` method in the previous example is the :func:`hash_sum` compute function. +Multiple aggregations can be performed at the same time by providing them +to the ``aggregate`` method:: + + >>> import pyarrow as pa + >>> t = pa.table([ + ... pa.array(["a", "a", "b", "b", "c"]), + ... pa.array([1, 2, 3, 4, 5]), + ... ], names=["keys", "values"]) + >>> t.group_by("keys").aggregate([ + ... ("values", "sum"), + ... ("keys", "count") + ... 
]) + pyarrow.Table + values_sum: int64 + keys_count: int64 + keys: string + ---- + values_sum: [[3,7,5]] + keys_count: [[2,2,1]] + keys: [["a","b","c"]] + +Aggregation options can also be provided for each aggregation function, +for example we can use :class:`CountOptions` to change how we count +null values:: + + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> table_with_nulls = pa.table([ + ... pa.array(["a", "a", "a"]), + ... pa.array([1, None, None]) + ... ], names=["keys", "values"]) + >>> table_with_nulls.group_by(["keys"]).aggregate([ + ... ("values", "count", pc.CountOptions(mode="all")) + ... ]) + pyarrow.Table + values_count: int64 + keys: string + ---- + values_count: [[3]] + keys: [["a"]] + >>> table_with_nulls.group_by(["keys"]).aggregate([ + ... ("values", "count", pc.CountOptions(mode="only_valid")) + ... ]) + pyarrow.Table + values_count: int64 + keys: string + ---- + values_count: [[1]] + keys: [["a"]] + Following is a list of all supported grouped aggregation functions. You can use them with or without the ``"hash_"`` prefix. From 38463e936eb138833701dd6e200ad92ff0dfd886 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Thu, 2 Dec 2021 10:51:23 +0100 Subject: [PATCH 05/11] Document options --- docs/source/conf.py | 12 ++++++-- docs/source/python/api/compute.rst | 47 ++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index cd51641eecb..a53586454cd 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -482,14 +482,20 @@ def run(self): function_kind = self.options.get('kind', None) result.append(".. 
csv-table::", "") - result.append(" :widths: 30, 70", "") + result.append(" :widths: 20, 60, 20", "") result.append(" ", "") funcs_reg = pyarrow._compute.function_registry() for fname in funcs_reg.list_functions(): f = funcs_reg.get_function(fname) + option_class = "" + if f._doc.options_class: + option_class = ":class:`{}`".format( + f._doc.options_class + ) if not function_kind or f.kind == function_kind: - result.append(' "{}", "{}"'.format(fname, f._doc.summary), - "") + result.append(' "{}", "{}", "{}"'.format( + fname, f._doc.summary, option_class + ), "") node = nodes.section() node.document = self.state.document diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index 00897a24983..ffe7d7dbc32 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -498,3 +498,50 @@ Structural Transforms make_struct replace_with_mask struct_field + +Compute Options +--------------- + +.. autosummary:: + :toctree: ../generated/ + + ScalarAggregateOptions + CountOptions + TDigestOptions + ArraySortOptions + AssumeTimezoneOptions + CastOptions + CountOptions + DayOfWeekOptions + DictionaryEncodeOptions + ElementWiseAggregateOptions + ExtractRegexOptions + FilterOptions + IndexOptions + JoinOptions + MakeStructOptions + MatchSubstringOptions + ModeOptions + NullOptions + PadOptions + PartitionNthOptions + QuantileOptions + ReplaceSliceOptions + ReplaceSubstringOptions + RoundOptions + RoundToMultipleOptions + ScalarAggregateOptions + SelectKOptions + SetLookupOptions + SliceOptions + SortOptions + SplitOptions + SplitPatternOptions + StrftimeOptions + StrptimeOptions + StructFieldOptions + TakeOptions + TDigestOptions + TrimOptions + VarianceOptions + WeekOptions \ No newline at end of file From 4b71d18f1dcfd779aa0230106fb1fe5038704b33 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Thu, 2 Dec 2021 16:08:44 +0100 Subject: [PATCH 06/11] Sort lines --- docs/source/python/api/compute.rst | 6 +++--- 1 file 
changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index ffe7d7dbc32..b75f8ef45dd 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -505,13 +505,11 @@ Compute Options .. autosummary:: :toctree: ../generated/ - ScalarAggregateOptions - CountOptions - TDigestOptions ArraySortOptions AssumeTimezoneOptions CastOptions CountOptions + CountOptions DayOfWeekOptions DictionaryEncodeOptions ElementWiseAggregateOptions @@ -531,6 +529,7 @@ Compute Options RoundOptions RoundToMultipleOptions ScalarAggregateOptions + ScalarAggregateOptions SelectKOptions SetLookupOptions SliceOptions @@ -542,6 +541,7 @@ Compute Options StructFieldOptions TakeOptions TDigestOptions + TDigestOptions TrimOptions VarianceOptions WeekOptions \ No newline at end of file From bdabeab4d87f4f699cf89c5023b431ad9870fbfc Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Thu, 2 Dec 2021 16:19:57 +0100 Subject: [PATCH 07/11] code review feedbacks --- docs/source/conf.py | 33 +++++++++++++++++++++------------ docs/source/python/compute.rst | 4 ++-- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index a53586454cd..37e3336e743 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -464,10 +464,21 @@ def setup(app): # This will also rebuild appropriately when the value changes. app.add_config_value('cuda_enabled', cuda_enabled, 'env') app.add_config_value('flight_enabled', flight_enabled, 'env') - app.add_directive('computefuncs', ComputeFunctionsTableDirective) + app.add_directive('arrow-computefuncs', ComputeFunctionsTableDirective) class ComputeFunctionsTableDirective(Directive): + """Generate a table of Arrow compute functions. + + .. arrow-computefuncs:: + :kind: hash_aggregate + + The generated table will include function name, + description and option class reference. 
+ + The functions listed in the table can be restricted + with the :kind: option. + """ has_content = True option_spec = { "kind": directives.unchanged @@ -476,7 +487,7 @@ class ComputeFunctionsTableDirective(Directive): def run(self): from docutils.statemachine import ViewList from docutils import nodes - import pyarrow._compute + import pyarrow.compute as pc result = ViewList() function_kind = self.options.get('kind', None) @@ -484,18 +495,16 @@ def run(self): result.append(".. csv-table::", "") result.append(" :widths: 20, 60, 20", "") result.append(" ", "") - funcs_reg = pyarrow._compute.function_registry() - for fname in funcs_reg.list_functions(): - f = funcs_reg.get_function(fname) + for fname in pc.list_functions(): + func = pc.get_function(fname) option_class = "" - if f._doc.options_class: - option_class = ":class:`{}`".format( - f._doc.options_class + if func._doc.options_class: + option_class = f":class:`{func._doc.options_class}`" + if not function_kind or func.kind == function_kind: + result.append( + f' "{fname}", "{func._doc.summary}", "{option_class}"', + "" ) - if not function_kind or f.kind == function_kind: - result.append(' "{}", "{}", "{}"'.format( - fname, f._doc.summary, option_class - ), "") node = nodes.section() node.document = self.state.document diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index b63b9e424f2..f5e8370abdb 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -106,7 +106,7 @@ to which the hash aggregation functions can be applied:: keys: [["a","b","c"]] The ``"sum"`` aggregation passed to the ``aggregate`` method in the previous -example is the :func:`hash_sum` compute function. +example is the ``hash_sum`` compute function. Multiple aggregations can be performed at the same time by providing them to the ``aggregate`` method:: @@ -161,5 +161,5 @@ null values:: Following is a list of all supported grouped aggregation functions. 
You can use them with or without the ``"hash_"`` prefix. -.. computefuncs:: +.. arrow-computefuncs:: :kind: hash_aggregate \ No newline at end of file From 42833c510868eda6eda577057a46f3d6ba491122 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Thu, 2 Dec 2021 17:07:00 +0100 Subject: [PATCH 08/11] Update docs/source/python/compute.rst Co-authored-by: Joris Van den Bossche --- docs/source/python/compute.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index f5e8370abdb..3a82f2adcde 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -34,7 +34,7 @@ module and can be used directly:: >>> pc.sum(a) -The grouped aggregation functions are instead an exception +The grouped aggregation functions raise an exception instead and need to be used through the :meth:`pyarrow.Table.group_by` capabilities. Standard Compute Functions From c9bb76ad519b3594c608d792153d9808bc744de6 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Mon, 6 Dec 2021 15:22:35 +0100 Subject: [PATCH 09/11] Add reference to grouped aggrs --- docs/source/python/compute.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index 3a82f2adcde..f6fc37a3796 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -36,6 +36,7 @@ module and can be used directly:: The grouped aggregation functions raise an exception instead and need to be used through the :meth:`pyarrow.Table.group_by` capabilities. +See :ref:`py-grouped-aggrs` for more details. Standard Compute Functions ========================== @@ -84,6 +85,8 @@ you can refer to :ref:`api.compute` reference. :ref:`Available compute functions (C++ documentation) `. +.. 
_py-grouped-aggrs: + Grouped Aggregations ==================== From c49eaae9df98ffb598783eb36a6a404ca84e06a4 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Mon, 6 Dec 2021 15:24:49 +0100 Subject: [PATCH 10/11] Remove reference to grouped aggregation functions as they are not callable --- cpp/submodules/parquet-testing | 2 +- docs/source/python/api/compute.rst | 22 ---------------------- testing | 2 +- 3 files changed, 2 insertions(+), 24 deletions(-) diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index 8f2a069ed2c..600d437de0e 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit 8f2a069ed2c58787e5be2a3ca8c68bc801b8eafa +Subproject commit 600d437de0e8b0e9927c87e76f844a1b385b02e8 diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index b75f8ef45dd..400461e264e 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -45,28 +45,6 @@ Aggregations tdigest variance -Grouped Aggregations --------------------- - -.. 
autosummary:: - :toctree: ../generated/ - - hash_all - hash_any - hash_approximate_median - hash_count - hash_count_distinct - hash_distinct - hash_max - hash_mean - hash_min - hash_min_max - hash_product - hash_stddev - hash_sum - hash_tdigest - hash_variance - Arithmetic Functions -------------------- diff --git a/testing b/testing index d6c7b9d670f..2c29a733ac2 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit d6c7b9d670f3cc3af4a27e043749300b9d27addf +Subproject commit 2c29a733ac2c8492d5df3b74ea5ab1a32f892f60 From bb5e88dd8fb7ff9fa70101adfbeff646338176dd Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Mon, 6 Dec 2021 15:27:24 +0100 Subject: [PATCH 11/11] submodules --- cpp/submodules/parquet-testing | 2 +- testing | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index 600d437de0e..8f2a069ed2c 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit 600d437de0e8b0e9927c87e76f844a1b385b02e8 +Subproject commit 8f2a069ed2c58787e5be2a3ca8c68bc801b8eafa diff --git a/testing b/testing index 2c29a733ac2..d6c7b9d670f 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit 2c29a733ac2c8492d5df3b74ea5ab1a32f892f60 +Subproject commit d6c7b9d670f3cc3af4a27e043749300b9d27addf