2 changes: 1 addition & 1 deletion cpp/src/arrow/compute/api_vector.h
@@ -59,7 +59,7 @@ struct ARROW_EXPORT TakeOptions : public FunctionOptions {
};

/// \brief Partitioning options for NthToIndices
struct PartitionNthOptions : public FunctionOptions {
struct ARROW_EXPORT PartitionNthOptions : public FunctionOptions {
explicit PartitionNthOptions(int64_t pivot) : pivot(pivot) {}

/// The index into the equivalent sorted array of the partition pivot element.
6 changes: 4 additions & 2 deletions docs/source/cpp/compute.rst
@@ -31,8 +31,8 @@ The generic Compute API
Functions and function registry
-------------------------------

Functions represent logical compute operations over inputs of possibly
varying types. Internally, a function is implemented by one or several
Functions represent compute operations over inputs of possibly varying
types. Internally, a function is implemented by one or several
"kernels", depending on the concrete input types (for example, a function
adding values from two inputs can have different kernels depending on
whether the inputs are integral or floating-point).
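
(A hedged illustration of kernel dispatch, not part of this diff; results shown
as comments.) Calling the same function with integral and floating-point inputs
selects different kernels and produces correspondingly typed results:

>>> import pyarrow as pa
>>> import pyarrow.compute as pc
>>> pc.add(pa.array([1, 2]), pa.array([3, 4]))           # int64 kernel -> [4, 6]
>>> pc.add(pa.array([1.5, 2.5]), pa.array([3.0, 4.0]))   # double kernel -> [4.5, 6.5]
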
@@ -101,6 +101,8 @@ exact semantics of the function::
:doc:`Compute API reference <api/compute>`


.. _compute-function-list:

Available functions
===================

1 change: 1 addition & 0 deletions docs/source/python/api.rst
@@ -27,6 +27,7 @@ API Reference
api/datatypes
api/arrays
api/memory
api/compute
api/files
api/tables
api/ipc
206 changes: 206 additions & 0 deletions docs/source/python/api/compute.rst
@@ -0,0 +1,206 @@
.. Licensed to the Apache Software Foundation (ASF) under one
.. or more contributor license agreements. See the NOTICE file
.. distributed with this work for additional information
.. regarding copyright ownership. The ASF licenses this file
.. to you under the Apache License, Version 2.0 (the
.. "License"); you may not use this file except in compliance
.. with the License. You may obtain a copy of the License at

.. http://www.apache.org/licenses/LICENSE-2.0

.. Unless required by applicable law or agreed to in writing,
.. software distributed under the License is distributed on an
.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
.. KIND, either express or implied. See the License for the
.. specific language governing permissions and limitations
.. under the License.

.. _api.compute:
.. currentmodule:: pyarrow.compute

Compute Functions
=================

Aggregations
------------

.. autosummary::
:toctree: ../generated/

count
mean
min_max
mode
[Review comment] @pitrou (Member), Sep 15, 2020:

Will this render the docstrings?

[Reply] Contributor (PR author):

I think so. It's what's done on the other pages:
https://github.com/apache/arrow/blob/master/docs/source/python/api/datatypes.rst

I'm having trouble building the docs locally (related to ARROW-10018, I think) so I haven't been able to check.

[Reply] Member:

This won't render the docstrings here, but it will create a separate page for each of those functions and link to that from this table.

(The separate pages might be a bit overkill since the docstrings here are often not that informative yet, but it's indeed how we do it for other functions as well.)

stddev
sum
variance

Arithmetic Functions
--------------------

By default these functions do not detect overflow. Each function is also
available in an overflow-checking variant, suffixed ``_checked``, which
raises an ``ArrowInvalid`` exception when overflow is detected.

.. autosummary::
:toctree: ../generated/

add
add_checked
divide
divide_checked
multiply
multiply_checked
subtract
subtract_checked
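
For illustration (a hedged sketch, not part of this diff; results shown as
comments), the unchecked variant wraps around on integer overflow while the
``_checked`` variant raises:

>>> import pyarrow as pa
>>> import pyarrow.compute as pc
>>> a = pa.array([127], type=pa.int8())
>>> pc.add(a, a)          # wraps around silently -> [-2]
>>> pc.add_checked(a, a)  # raises ArrowInvalid (overflow detected)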

Comparisons
-----------

These functions expect two inputs of the same type. If one of the inputs is ``null``,
they return ``null``.

.. autosummary::
:toctree: ../generated/

equal
greater
greater_equal
less
less_equal
not_equal
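
For illustration (a hedged sketch, not part of this diff; results shown as
comments), a null in either input propagates to the output:

>>> a = pa.array([1, 2, None])
>>> b = pa.array([1, 3, 4])
>>> pc.equal(a, b)   # -> [true, false, null]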

Logical Functions
-----------------

These functions normally emit a null when one of the inputs is null. However, Kleene
logic variants are provided (suffixed ``_kleene``); see the User Guide for details.

.. autosummary::
:toctree: ../generated/

and_
and_kleene
invert
or_
or_kleene
xor
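
For illustration (a hedged sketch, not part of this diff; results shown as
comments), the Kleene variant can produce a definite result even when one
input is null:

>>> a = pa.array([True, True, None])
>>> b = pa.array([False, None, False])
>>> pc.and_(a, b)        # -> [false, null, null]
>>> pc.and_kleene(a, b)  # -> [false, null, false]  (false AND null is false)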

String Predicates
-----------------

In these functions, an empty string emits false in the output. For the ASCII
variants (prefixed ``ascii_``), a string element containing non-ASCII characters
emits false in the output.

The first set of functions emits true if the input contains only
characters of a given class.

.. autosummary::
:toctree: ../generated/

ascii_is_alnum
ascii_is_alpha
ascii_is_decimal
ascii_is_lower
ascii_is_printable
ascii_is_space
ascii_is_upper
utf8_is_alnum
utf8_is_alpha
utf8_is_decimal
utf8_is_digit
utf8_is_lower
utf8_is_numeric
utf8_is_printable
utf8_is_space
utf8_is_upper
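
For illustration (a hedged sketch, not part of this diff; results shown as
comments), the ASCII variant rejects elements containing non-ASCII characters:

>>> s = pa.array(["Voilà", "123", ""])
>>> pc.utf8_is_alpha(s)   # -> [true, false, false]
>>> pc.ascii_is_alpha(s)  # -> [false, false, false]  ("à" is not ASCII)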

The second set of functions also considers the order of characters
in the string element.

.. autosummary::
:toctree: ../generated/

ascii_is_title
utf8_is_title
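
For illustration (a hedged sketch, not part of this diff; results shown as
comments):

>>> t = pa.array(["Hello World", "Hello world"])
>>> pc.utf8_is_title(t)   # -> [true, false]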

The third set of functions examines string elements on
a byte-by-byte basis.

.. autosummary::
:toctree: ../generated/

string_is_ascii

String Transforms
-----------------

.. autosummary::
:toctree: ../generated/

ascii_lower
ascii_upper
utf8_lower
utf8_upper

Containment tests
-----------------

.. autosummary::
:toctree: ../generated/

index_in
is_in
match_substring
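
A hedged sketch, not part of this diff: the ``value_set`` and ``pattern``
keywords mirror the ``SetLookupOptions`` and substring-matching options added
in this PR, but the exact wrapper signatures are assumptions here.

>>> pc.is_in(pa.array([1, 2, 3]), value_set=pa.array([1, 3]))    # -> [true, false, true]
>>> pc.match_substring(pa.array(["abc", "xyz"]), pattern="ab")   # -> [true, false]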

Conversions
-----------

.. autosummary::
:toctree: ../generated/

cast
strptime
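
A hedged sketch, not part of this diff (the ``format``/``unit`` keywords mirror
the ``StrptimeOptions`` added in this PR, but the exact wrapper signature is an
assumption):

>>> pc.cast(pa.array([1, 2]), pa.float64())   # -> [1.0, 2.0]
>>> pc.strptime(pa.array(["2020-09-15"]), format="%Y-%m-%d", unit="s")   # -> timestamp[s] array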

Selections
----------

.. autosummary::
:toctree: ../generated/

filter
take
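
For illustration (a hedged sketch, not part of this diff; results shown as
comments):

>>> a = pa.array([10, 20, 30, 40])
>>> pc.take(a, pa.array([3, 0]))                         # -> [40, 10]
>>> pc.filter(a, pa.array([True, False, True, False]))   # -> [10, 30]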

Associative transforms
----------------------

.. autosummary::
:toctree: ../generated/

dictionary_encode
unique
value_counts

Sorts and partitions
--------------------

.. autosummary::
:toctree: ../generated/

partition_nth_indices
sort_indices
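
For illustration (a hedged sketch, not part of this diff; results shown as
comments):

>>> pc.sort_indices(pa.array([3, 1, 2]))   # -> [1, 2, 0]  (indices that would sort the input)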

Structural Transforms
---------------------

.. autosummary::
:toctree: ../generated/

binary_length
fill_null
is_null
is_valid
list_value_length
list_flatten
list_parent_indices
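
For illustration (a hedged sketch, not part of this diff; results shown as
comments):

>>> pc.is_null(pa.array([1, None, 3]))          # -> [false, true, false]
>>> pc.binary_length(pa.array(["foo", "ab"]))   # -> [3, 2]
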
55 changes: 55 additions & 0 deletions docs/source/python/compute.rst
@@ -0,0 +1,55 @@
.. Licensed to the Apache Software Foundation (ASF) under one
.. or more contributor license agreements. See the NOTICE file
.. distributed with this work for additional information
.. regarding copyright ownership. The ASF licenses this file
.. to you under the Apache License, Version 2.0 (the
.. "License"); you may not use this file except in compliance
.. with the License. You may obtain a copy of the License at

.. http://www.apache.org/licenses/LICENSE-2.0

.. Unless required by applicable law or agreed to in writing,
.. software distributed under the License is distributed on an
.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
.. KIND, either express or implied. See the License for the
.. specific language governing permissions and limitations
.. under the License.

.. currentmodule:: pyarrow.compute
.. _compute:

=================
Compute Functions
=================

Arrow supports logical compute operations over inputs of possibly
varying types. Many compute functions support both array (chunked or not)
and scalar inputs, but some will mandate one or the other. For example,
the ``fill_null`` function requires its second input to be a scalar,
while ``sort_indices`` requires its first and only input to
be an array.

Below are a few simple examples:

>>> import pyarrow as pa
>>> import pyarrow.compute as pc
>>> a = pa.array([1, 1, 2, 3])
>>> pc.sum(a)
<pyarrow.Int64Scalar: 7>
>>> b = pa.array([4, 1, 2, 8])
>>> pc.equal(a, b)
<pyarrow.lib.BooleanArray object at 0x7f686e4eef30>
[
false,
true,
true,
false
]
>>> x, y = pa.scalar(7.8), pa.scalar(9.3)
>>> pc.multiply(x, y)
<pyarrow.DoubleScalar: 72.54>
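
A hedged illustration of the scalar/array requirements mentioned above (not
part of this diff; results shown as comments):

>>> pc.fill_null(pa.array([1, None, 3]), pa.scalar(5))   # second input is a scalar -> [1, 5, 3]
>>> pc.sort_indices(pa.array([3, 1, 2]))                 # single array input -> [1, 2, 0]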


.. seealso::

:ref:`Available compute functions (C++ documentation) <compute-function-list>`.
1 change: 1 addition & 0 deletions docs/source/python/index.rst
@@ -36,6 +36,7 @@ files into Arrow structures.
install
memory
data
compute
ipc
filesystems
filesystems_deprecated
74 changes: 74 additions & 0 deletions python/pyarrow/_compute.pyx
@@ -17,6 +17,8 @@

# cython: language_level = 3

from cython.operator cimport dereference as deref

from pyarrow.lib import frombytes, tobytes, ordered_dict
from pyarrow.lib cimport *
from pyarrow.includes.libarrow cimport *
@@ -593,6 +595,17 @@ cdef class TakeOptions(FunctionOptions):
return &self.take_options


cdef class PartitionNthOptions(FunctionOptions):
cdef:
unique_ptr[CPartitionNthOptions] partition_nth_options

def __cinit__(self, int64_t pivot):
self.partition_nth_options.reset(new CPartitionNthOptions(pivot))

cdef const CFunctionOptions* get_options(self) except NULL:
return self.partition_nth_options.get()


cdef class MinMaxOptions(FunctionOptions):
cdef:
CMinMaxOptions min_max_options
@@ -609,3 +622,64 @@ cdef class MinMaxOptions(FunctionOptions):

cdef const CFunctionOptions* get_options(self) except NULL:
return &self.min_max_options


cdef class SetLookupOptions(FunctionOptions):
cdef:
unique_ptr[CSetLookupOptions] set_lookup_options
unique_ptr[CDatum] valset

def __cinit__(self, *, value_set, c_bool skip_null):
if isinstance(value_set, Array):
self.valset.reset(new CDatum((<Array> value_set).sp_array))
elif isinstance(value_set, ChunkedArray):
self.valset.reset(
new CDatum((<ChunkedArray> value_set).sp_chunked_array)
)
elif isinstance(value_set, Scalar):
self.valset.reset(new CDatum((<Scalar> value_set).unwrap()))
else:
raise ValueError('"{}" is not a valid value_set'.format(value_set))

self.set_lookup_options.reset(
new CSetLookupOptions(deref(self.valset), skip_null)
)

cdef const CFunctionOptions* get_options(self) except NULL:
return self.set_lookup_options.get()


cdef class StrptimeOptions(FunctionOptions):
cdef:
unique_ptr[CStrptimeOptions] strptime_options
TimeUnit time_unit

def __cinit__(self, format, unit):
if unit == 's':
self.time_unit = TimeUnit_SECOND
elif unit == 'ms':
self.time_unit = TimeUnit_MILLI
elif unit == 'us':
self.time_unit = TimeUnit_MICRO
elif unit == 'ns':
self.time_unit = TimeUnit_NANO
else:
raise ValueError('"{}" is not a valid time unit'.format(unit))

self.strptime_options.reset(
new CStrptimeOptions(tobytes(format), self.time_unit)
)

cdef const CFunctionOptions* get_options(self) except NULL:
return self.strptime_options.get()


cdef class VarianceOptions(FunctionOptions):
cdef:
CVarianceOptions variance_options

def __cinit__(self, *, ddof=0):
self.variance_options.ddof = ddof

cdef const CFunctionOptions* get_options(self) except NULL:
return &self.variance_options
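
A hedged sketch of how these option classes might be exercised from Python, not
part of this diff (it assumes the classes are re-exported under
``pyarrow.compute`` and that ``pc.call_function`` is available):

import pyarrow as pa
import pyarrow.compute as pc

# Membership test driven by the SetLookupOptions defined above;
# value_set and skip_null mirror its keyword-only __cinit__ arguments.
opts = pc.SetLookupOptions(value_set=pa.array([1, 3]), skip_null=True)
result = pc.call_function("is_in", [pa.array([1, 2, 3, None])], opts)

# Parse strings into second-resolution timestamps via StrptimeOptions.
ts_opts = pc.StrptimeOptions("%Y-%m-%d", "s")
timestamps = pc.call_function("strptime", [pa.array(["2020-09-15"])], ts_opts)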