diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h
index de36202f019..2c77e8ee155 100644
--- a/cpp/src/arrow/compute/api_vector.h
+++ b/cpp/src/arrow/compute/api_vector.h
@@ -59,7 +59,7 @@ struct ARROW_EXPORT TakeOptions : public FunctionOptions {
 };
 
 /// \brief Partitioning options for NthToIndices
-struct PartitionNthOptions : public FunctionOptions {
+struct ARROW_EXPORT PartitionNthOptions : public FunctionOptions {
   explicit PartitionNthOptions(int64_t pivot) : pivot(pivot) {}
 
   /// The index into the equivalent sorted array of the partition pivot element.
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index 6ef10abf67d..af2f485058c 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -31,8 +31,8 @@ The generic Compute API
 Functions and function registry
 -------------------------------
 
-Functions represent logical compute operations over inputs of possibly
-varying types. Internally, a function is implemented by one or several
+Functions represent compute operations over inputs of possibly varying
+types. Internally, a function is implemented by one or several
 "kernels", depending on the concrete input types (for example, a function
 adding values from two inputs can have different kernels depending on
 whether the inputs are integral or floating-point).
@@ -101,6 +101,8 @@ exact semantics of the function::
 
    :doc:`Compute API reference <api/compute>`
 
+.. _compute-function-list:
+
 Available functions
 ===================
 
diff --git a/docs/source/python/api.rst b/docs/source/python/api.rst
index 5c4d6074d62..12cf4e06802 100644
--- a/docs/source/python/api.rst
+++ b/docs/source/python/api.rst
@@ -27,6 +27,7 @@ API Reference
    api/datatypes
    api/arrays
    api/memory
+   api/compute
    api/files
    api/tables
    api/ipc
diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst
new file mode 100644
index 00000000000..2ec355d66af
--- /dev/null
+++ b/docs/source/python/api/compute.rst
@@ -0,0 +1,206 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. _api.compute:
+.. currentmodule:: pyarrow.compute
+
+Compute Functions
+=================
+
+Aggregations
+------------
+
+.. autosummary::
+   :toctree: ../generated/
+
+   count
+   mean
+   min_max
+   mode
+   stddev
+   sum
+   variance
+
+Arithmetic Functions
+--------------------
+
+By default these functions do not detect overflow. Each function is also
+available in an overflow-checking variant, suffixed ``_checked``, which
+throws an ``ArrowInvalid`` exception when overflow is detected.
+
+.. autosummary::
+   :toctree: ../generated/
+
+   add
+   add_checked
+   divide
+   divide_checked
+   multiply
+   multiply_checked
+   subtract
+   subtract_checked
+
+Comparisons
+-----------
+
+These functions expect two inputs of the same type. If one of the inputs is
+``null``, they return ``null``.
+
+.. autosummary::
+   :toctree: ../generated/
+
+   equal
+   greater
+   greater_equal
+   less
+   less_equal
+   not_equal
+
+Logical Functions
+-----------------
+
+These functions normally emit a null when one of the inputs is null. However,
+Kleene logic variants are provided (suffixed ``_kleene``). See the User Guide
+for details.
+
+.. autosummary::
+   :toctree: ../generated/
+
+   and_
+   and_kleene
+   invert
+   or_
+   or_kleene
+   xor
+
+String Predicates
+-----------------
+
+In these functions an empty string emits false in the output. For ASCII
+variants (prefixed ``ascii_``) a string element with non-ASCII characters
+emits false in the output.
+
+The first set of functions emits true if the input contains only
+characters of a given class.
+
+.. autosummary::
+   :toctree: ../generated/
+
+   ascii_is_alnum
+   ascii_is_alpha
+   ascii_is_decimal
+   ascii_is_lower
+   ascii_is_printable
+   ascii_is_space
+   ascii_is_upper
+   utf8_is_alnum
+   utf8_is_alpha
+   utf8_is_decimal
+   utf8_is_digit
+   utf8_is_lower
+   utf8_is_numeric
+   utf8_is_printable
+   utf8_is_space
+   utf8_is_upper
+
+The second set of functions also considers the order of characters
+in the string element.
+
+.. autosummary::
+   :toctree: ../generated/
+
+   ascii_is_title
+   utf8_is_title
+
+The third set of functions examines string elements on
+a byte-by-byte basis.
+
+.. autosummary::
+   :toctree: ../generated/
+
+   string_is_ascii
+
+String Transforms
+-----------------
+
+.. autosummary::
+   :toctree: ../generated/
+
+   ascii_lower
+   ascii_upper
+   utf8_lower
+   utf8_upper
+
+Containment tests
+-----------------
+
+.. autosummary::
+   :toctree: ../generated/
+
+   index_in
+   is_in
+   match_substring
+
+Conversions
+-----------
+
+.. autosummary::
+   :toctree: ../generated/
+
+   cast
+   strptime
+
+Selections
+----------
+
+.. autosummary::
+   :toctree: ../generated/
+
+   filter
+   take
+
+Associative transforms
+----------------------
+
+.. autosummary::
+   :toctree: ../generated/
+
+   dictionary_encode
+   unique
+   value_counts
+
+Sorts and partitions
+--------------------
+
+.. autosummary::
+   :toctree: ../generated/
+
+   partition_nth_indices
+   sort_indices
+
+Structural Transforms
+---------------------
+
+.. autosummary::
+   :toctree: ../generated/
+
+   binary_length
+   fill_null
+   is_null
+   is_valid
+   list_value_length
+   list_flatten
+   list_parent_indices
diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst
new file mode 100644
index 00000000000..51126d97c82
--- /dev/null
+++ b/docs/source/python/compute.rst
@@ -0,0 +1,55 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+.. currentmodule:: pyarrow.compute
+.. _compute:
+
+=================
+Compute Functions
+=================
+
+Arrow supports logical compute operations over inputs of possibly
+varying types. Many compute functions support both array (chunked or not)
+and scalar inputs, but some will mandate either. For example,
+the ``fill_null`` function requires its second input to be a scalar,
+while ``sort_indices`` requires its first and only input to
+be an array.
+
+Below are a few simple examples:
+
+   >>> import pyarrow as pa
+   >>> import pyarrow.compute as pc
+   >>> a = pa.array([1, 1, 2, 3])
+   >>> pc.sum(a)
+   <pyarrow.Int64Scalar: 7>
+   >>> b = pa.array([4, 1, 2, 8])
+   >>> pc.equal(a, b)
+   <pyarrow.lib.BooleanArray object at 0x...>
+   [
+     false,
+     true,
+     true,
+     false
+   ]
+   >>> x, y = pa.scalar(7.8), pa.scalar(9.3)
+   >>> pc.multiply(x, y)
+   <pyarrow.DoubleScalar: 72.54>
+
+.. seealso::
+
+   :ref:`Available compute functions (C++ documentation) <compute-function-list>`.
diff --git a/docs/source/python/index.rst b/docs/source/python/index.rst
index d4daf4029ac..cc7383044e0 100644
--- a/docs/source/python/index.rst
+++ b/docs/source/python/index.rst
@@ -36,6 +36,7 @@ files into Arrow structures.
    install
    memory
    data
+   compute
    ipc
    filesystems
    filesystems_deprecated
diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx
index 6fbe1581f6a..323b9c43f68 100644
--- a/python/pyarrow/_compute.pyx
+++ b/python/pyarrow/_compute.pyx
@@ -17,6 +17,8 @@
 
 # cython: language_level = 3
 
+from cython.operator cimport dereference as deref
+
 from pyarrow.lib import frombytes, tobytes, ordered_dict
 from pyarrow.lib cimport *
 from pyarrow.includes.libarrow cimport *
@@ -593,6 +595,17 @@ cdef class TakeOptions(FunctionOptions):
         return &self.take_options
 
 
+cdef class PartitionNthOptions(FunctionOptions):
+    cdef:
+        unique_ptr[CPartitionNthOptions] partition_nth_options
+
+    def __cinit__(self, int64_t pivot):
+        self.partition_nth_options.reset(new CPartitionNthOptions(pivot))
+
+    cdef const CFunctionOptions* get_options(self) except NULL:
+        return self.partition_nth_options.get()
+
+
 cdef class MinMaxOptions(FunctionOptions):
     cdef:
         CMinMaxOptions min_max_options
@@ -609,3 +622,64 @@ cdef class MinMaxOptions(FunctionOptions):
 
     cdef const CFunctionOptions* get_options(self) except NULL:
         return &self.min_max_options
+
+
+cdef class SetLookupOptions(FunctionOptions):
+    cdef:
+        unique_ptr[CSetLookupOptions] set_lookup_options
+        unique_ptr[CDatum] valset
+
+    def __cinit__(self, *, value_set, c_bool skip_null):
+        if isinstance(value_set, Array):
+            self.valset.reset(new CDatum((<Array> value_set).sp_array))
+        elif isinstance(value_set, ChunkedArray):
+            self.valset.reset(
+                new CDatum((<ChunkedArray> value_set).sp_chunked_array)
+            )
+        elif isinstance(value_set, Scalar):
+            self.valset.reset(new CDatum((<Scalar> value_set).unwrap()))
+        else:
+            raise ValueError('"{}" is not a valid value_set'.format(value_set))
+
+        self.set_lookup_options.reset(
+            new CSetLookupOptions(deref(self.valset), skip_null)
+        )
+
+    cdef const CFunctionOptions* get_options(self) except NULL:
+        return self.set_lookup_options.get()
+
+
+cdef class StrptimeOptions(FunctionOptions):
+    cdef:
+        unique_ptr[CStrptimeOptions] strptime_options
+        TimeUnit time_unit
+
+    def __cinit__(self, format, unit):
+        if unit == 's':
+            self.time_unit = TimeUnit_SECOND
+        elif unit == 'ms':
+            self.time_unit = TimeUnit_MILLI
+        elif unit == 'us':
+            self.time_unit = TimeUnit_MICRO
+        elif unit == 'ns':
+            self.time_unit = TimeUnit_NANO
+        else:
+            raise ValueError('"{}" is not a valid time unit'.format(unit))
+
+        self.strptime_options.reset(
+            new CStrptimeOptions(tobytes(format), self.time_unit)
+        )
+
+    cdef const CFunctionOptions* get_options(self) except NULL:
+        return self.strptime_options.get()
+
+
+cdef class VarianceOptions(FunctionOptions):
+    cdef:
+        CVarianceOptions variance_options
+
+    def __cinit__(self, *, ddof=0):
+        self.variance_options.ddof = ddof
+
+    cdef const CFunctionOptions* get_options(self) except NULL:
+        return &self.variance_options
diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py
index 2204471b0ee..df6d21505d1 100644
--- a/python/pyarrow/compute.py
+++ b/python/pyarrow/compute.py
@@ -15,7 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 
-
 from pyarrow._compute import (  # noqa
     Function,
     FunctionRegistry,
@@ -31,7 +30,11 @@
     FilterOptions,
     MatchSubstringOptions,
     MinMaxOptions,
+    PartitionNthOptions,
+    SetLookupOptions,
+    StrptimeOptions,
     TakeOptions,
+    VarianceOptions,
     # Functions
     function_registry,
     call_function,
@@ -91,13 +94,18 @@ def _decorate_compute_function(wrapper, exposed_name, func, option_class):
 
 
 _option_classes = {
-    # TODO this is not complete
-    # (export the option class name from C++ metadata?)
+    # TODO: export the option class name from C++ metadata?
     'cast': CastOptions,
     'filter': FilterOptions,
+    'index_in': SetLookupOptions,
+    'is_in': SetLookupOptions,
     'match_substring': MatchSubstringOptions,
     'min_max': MinMaxOptions,
+    'partition_nth_indices': PartitionNthOptions,
+    'stddev': VarianceOptions,
+    'strptime': StrptimeOptions,
     'take': TakeOptions,
+    'variance': VarianceOptions,
 }
 
 
@@ -417,3 +425,7 @@ def fill_null(values, fill_value):
         fill_value = pa.scalar(fill_value.as_py(), type=values.type)
 
     return call_function("fill_null", [values, fill_value])
+
+
+and_ = globals()['and']
+or_ = globals()['or']
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 5d5800eec58..dee022f5ca7 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -1710,6 +1710,14 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil:
             " arrow::compute::TakeOptions"(CFunctionOptions):
         c_bool boundscheck
 
+    cdef cppclass CStrptimeOptions \
+            "arrow::compute::StrptimeOptions"(CFunctionOptions):
+        CStrptimeOptions(c_string format, TimeUnit unit)
+
+    cdef cppclass CVarianceOptions \
+            "arrow::compute::VarianceOptions"(CFunctionOptions):
+        int ddof
+
     enum CMinMaxMode \
             "arrow::compute::MinMaxOptions::Mode":
         CMinMaxMode_SKIP \
@@ -1721,6 +1729,11 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil:
             "arrow::compute::MinMaxOptions"(CFunctionOptions):
         CMinMaxMode null_handling
 
+    cdef cppclass CPartitionNthOptions \
+            "arrow::compute::PartitionNthOptions"(CFunctionOptions):
+        CPartitionNthOptions(int64_t pivot)
+        int64_t pivot
+
     enum DatumType" arrow::Datum::type":
         DatumType_NONE" arrow::Datum::NONE"
         DatumType_SCALAR" arrow::Datum::SCALAR"
@@ -1746,6 +1759,12 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil:
         shared_ptr[CTable] table()
         shared_ptr[CScalar] scalar()
 
+    cdef cppclass CSetLookupOptions \
+            "arrow::compute::SetLookupOptions"(CFunctionOptions):
+        CSetLookupOptions(CDatum value_set, c_bool skip_nulls)
+        CDatum value_set
+        c_bool skip_nulls
+
 
 cdef extern from "arrow/python/api.h" namespace "arrow::py":
     # Requires GIL
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index 8b0859ccf39..dc1b2856287 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
+from datetime import datetime
 from functools import lru_cache
 import pickle
 import pytest
@@ -223,6 +224,13 @@ def test_mode_chunked_array():
     assert pc.mode(arr).as_py() == expected
 
 
+def test_variance():
+    data = [1, 2, 3, 4, 5, 6, 7, 8]
+    assert pc.variance(data).as_py() == 5.25
+    assert pc.variance(data, ddof=0).as_py() == 5.25
+    assert pc.variance(data, ddof=1).as_py() == 6.0
+
+
 def test_match_substring():
     arr = pa.array(["ab", "abc", "ba", None])
     result = pc.match_substring(arr, "ab")
@@ -822,3 +830,31 @@ def test_fill_null_chunked_array(arrow_type):
     result = arr.fill_null(pa.scalar(5, type='int8'))
     assert result.equals(expected)
+
+
+def test_logical():
+    a = pa.array([True, False, False, None])
+    b = pa.array([True, True, False, True])
+
+    assert pc.and_(a, b) == pa.array([True, False, False, None])
+    assert pc.and_kleene(a, b) == pa.array([True, False, False, None])
+
+    assert pc.or_(a, b) == pa.array([True, True, False, None])
+    assert pc.or_kleene(a, b) == pa.array([True, True, False, True])
+
+    assert pc.xor(a, b) == pa.array([False, True, False, None])
+
+    assert pc.invert(a) == pa.array([False, True, True, None])
+
+
+def test_cast():
+    arr = pa.array([2**63 - 1], type='int64')
+
+    with pytest.raises(pa.ArrowInvalid):
+        pc.cast(arr, 'int32')
+
+    assert pc.cast(arr, 'int32', safe=False) == pa.array([-1], type='int32')
+
+    arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)])
+    expected = pa.array([1262304000000, 1420070400000], type='timestamp[ms]')
+    assert pc.cast(arr, 'timestamp[ms]') == expected
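
Usage note: a minimal sketch of how the options classes newly registered in
``_option_classes`` are reached from Python, assuming the generated wrappers
forward keyword arguments to the matching options class in the same way the
new ``variance``/``ddof`` test does; the sample values below are illustrative
only and are not taken from the patch.

   >>> import pyarrow as pa
   >>> import pyarrow.compute as pc
   >>> arr = pa.array([1, 2, 3, 4])
   >>> pc.variance(arr, ddof=1)                                 # VarianceOptions
   >>> pc.partition_nth_indices(arr, pivot=2)                   # PartitionNthOptions
   >>> pc.is_in(arr, value_set=pa.array([2, 4]), skip_null=True)   # SetLookupOptions
   >>> pc.strptime(pa.array(["2020-01-01"]), format="%Y-%m-%d", unit="s")   # StrptimeOptions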