diff --git a/docs/source/conf.py b/docs/source/conf.py index 827f7109c88..37e3336e743 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -36,6 +36,7 @@ import sys import warnings from unittest import mock +from docutils.parsers.rst import Directive, directives import pyarrow @@ -463,3 +464,49 @@ def setup(app): # This will also rebuild appropriately when the value changes. app.add_config_value('cuda_enabled', cuda_enabled, 'env') app.add_config_value('flight_enabled', flight_enabled, 'env') + app.add_directive('arrow-computefuncs', ComputeFunctionsTableDirective) + + +class ComputeFunctionsTableDirective(Directive): + """Generate a table of Arrow compute functions. + + .. arrow-computefuncs:: + :kind: hash_aggregate + + The generated table will include function name, + description and option class reference. + + The functions listed in the table can be restricted + with the :kind: option. + """ + has_content = True + option_spec = { + "kind": directives.unchanged + } + + def run(self): + from docutils.statemachine import ViewList + from docutils import nodes + import pyarrow.compute as pc + + result = ViewList() + function_kind = self.options.get('kind', None) + + result.append(".. 
csv-table::", "") + result.append(" :widths: 20, 60, 20", "") + result.append(" ", "") + for fname in pc.list_functions(): + func = pc.get_function(fname) + option_class = "" + if func._doc.options_class: + option_class = f":class:`{func._doc.options_class}`" + if not function_kind or func.kind == function_kind: + result.append( + f' "{fname}", "{func._doc.summary}", "{option_class}"', + "" + ) + + node = nodes.section() + node.document = self.state.document + self.state.nested_parse(result, 0, node) + return node.children diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index 00897a24983..400461e264e 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -45,28 +45,6 @@ Aggregations tdigest variance -Grouped Aggregations -------------------- - -.. autosummary:: - :toctree: ../generated/ - - hash_all - hash_any - hash_approximate_median - hash_count - hash_count_distinct - hash_distinct - hash_max - hash_mean - hash_min - hash_min_max - hash_product - hash_stddev - hash_sum - hash_tdigest - hash_variance - Arithmetic Functions -------------------- @@ -498,3 +476,47 @@ Structural Transforms make_struct replace_with_mask struct_field + +Compute Options +--------------- + +.. 
autosummary:: + :toctree: ../generated/ + + ArraySortOptions + AssumeTimezoneOptions + CastOptions + CountOptions + DayOfWeekOptions + DictionaryEncodeOptions + ElementWiseAggregateOptions + ExtractRegexOptions + FilterOptions + IndexOptions + JoinOptions + MakeStructOptions + MatchSubstringOptions + ModeOptions + NullOptions + PadOptions + PartitionNthOptions + QuantileOptions + ReplaceSliceOptions + ReplaceSubstringOptions + RoundOptions + RoundToMultipleOptions + ScalarAggregateOptions + SelectKOptions + SetLookupOptions + SliceOptions + SortOptions + SplitOptions + SplitPatternOptions + StrftimeOptions + StrptimeOptions + StructFieldOptions + TakeOptions + TDigestOptions + TrimOptions + VarianceOptions + WeekOptions \ No newline at end of file diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index 133520de970..f6fc37a3796 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -23,17 +23,33 @@ Compute Functions ================= Arrow supports logical compute operations over inputs of possibly -varying types. Many compute functions support both array (chunked or not) -and scalar inputs, but some will mandate either. For example, -``sort_indices`` requires its first and only input to be an array. +varying types. -Below are a few simple examples: +The standard compute operations are provided by the :mod:`pyarrow.compute` +module and can be used directly:: >>> import pyarrow as pa >>> import pyarrow.compute as pc >>> a = pa.array([1, 1, 2, 3]) >>> pc.sum(a) + +The grouped aggregation functions raise an exception instead +and need to be used through the :meth:`pyarrow.Table.group_by` capabilities. +See :ref:`py-grouped-aggrs` for more details. + +Standard Compute Functions +========================== + +Many compute functions support both array (chunked or not) +and scalar inputs, but some will mandate either. 
For example, +``sort_indices`` requires its first and only input to be an array. + +Below are a few simple examples:: + + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> a = pa.array([1, 1, 2, 3]) >>> b = pa.array([4, 1, 2, 8]) >>> pc.equal(a, b) @@ -48,7 +64,7 @@ Below are a few simple examples: These functions can do more than just element-by-element operations. -Here is an example of sorting a table: +Here is an example of sorting a table:: >>> import pyarrow as pa >>> import pyarrow.compute as pc @@ -62,8 +78,91 @@ Here is an example of sorting a table: 0 ] - +For a complete list of the compute functions that PyArrow provides +you can refer to :ref:`api.compute` reference. .. seealso:: :ref:`Available compute functions (C++ documentation) `. + +.. _py-grouped-aggrs: + +Grouped Aggregations +==================== + +PyArrow supports grouped aggregations over :class:`pyarrow.Table` through the +:meth:`pyarrow.Table.group_by` method. +The method will return a grouping declaration +to which the hash aggregation functions can be applied:: + + >>> import pyarrow as pa + >>> t = pa.table([ + ... pa.array(["a", "a", "b", "b", "c"]), + ... pa.array([1, 2, 3, 4, 5]), + ... ], names=["keys", "values"]) + >>> t.group_by("keys").aggregate([("values", "sum")]) + pyarrow.Table + values_sum: int64 + keys: string + ---- + values_sum: [[3,7,5]] + keys: [["a","b","c"]] + +The ``"sum"`` aggregation passed to the ``aggregate`` method in the previous +example is the ``hash_sum`` compute function. + +Multiple aggregations can be performed at the same time by providing them +to the ``aggregate`` method:: + + >>> import pyarrow as pa + >>> t = pa.table([ + ... pa.array(["a", "a", "b", "b", "c"]), + ... pa.array([1, 2, 3, 4, 5]), + ... ], names=["keys", "values"]) + >>> t.group_by("keys").aggregate([ + ... ("values", "sum"), + ... ("keys", "count") + ... 
]) + pyarrow.Table + values_sum: int64 + keys_count: int64 + keys: string + ---- + values_sum: [[3,7,5]] + keys_count: [[2,2,1]] + keys: [["a","b","c"]] + +Aggregation options can also be provided for each aggregation function, +for example we can use :class:`CountOptions` to change how we count +null values:: + + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> table_with_nulls = pa.table([ + ... pa.array(["a", "a", "a"]), + ... pa.array([1, None, None]) + ... ], names=["keys", "values"]) + >>> table_with_nulls.group_by(["keys"]).aggregate([ + ... ("values", "count", pc.CountOptions(mode="all")) + ... ]) + pyarrow.Table + values_count: int64 + keys: string + ---- + values_count: [[3]] + keys: [["a"]] + >>> table_with_nulls.group_by(["keys"]).aggregate([ + ... ("values", "count", pc.CountOptions(mode="only_valid")) + ... ]) + pyarrow.Table + values_count: int64 + keys: string + ---- + values_count: [[1]] + keys: [["a"]] + +Following is a list of all supported grouped aggregation functions. +You can use them with or without the ``"hash_"`` prefix. + +.. arrow-computefuncs:: + :kind: hash_aggregate \ No newline at end of file