From e5a4c8ac7970dcb4d5b755ed0d6208bbc356e20f Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sat, 5 Sep 2020 00:07:22 -0400 Subject: [PATCH 1/4] add Python bindings for mode kernel --- python/pyarrow/array.pxi | 6 +++ python/pyarrow/compute.py | 27 +++++++++++ python/pyarrow/table.pxi | 11 +++++ python/pyarrow/tests/test_compute.py | 72 ++++++++++++++++++++++++++++ 4 files changed, 116 insertions(+) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 34417da63ff..02780d52cce 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -802,6 +802,12 @@ cdef class Array(_PandasConvertible): """ return _pc().call_function('sum', [self]) + def mode(self): + """ + Compute the mode of valuesd in a numerical array. + """ + return _pc().call_function('mode', [self]) + def unique(self): """ Compute distinct elements in array. diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index cc9847e2dce..f941ba3a94d 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -183,6 +183,33 @@ def sum(array): return call_function('sum', [array]) +def mode(array): + """ + Return the mode (most common value) of a passed numerical + (chunked) array. If there is more than one such value, only + the smallest is returned. + + Parameters + ---------- + array : pyarrow.Array or pyarrow.ChunkedArray + + Returns + ------- + mode : pyarrow.StructScalar + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> arr = pa.array([1, 1, 2, 2, 3, 2, 2, 2]) + >>> pc.mode(arr) + + + """ + return call_function('mode', [array]) + + + def filter(data, mask, null_selection_behavior='drop'): """ Select values (or records) from array- or table-like data given boolean diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index b8205a31c02..8e803373a53 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -333,6 +333,17 @@ cdef class ChunkedArray(_PandasConvertible): """ return _pc().call_function('value_counts', [self]) + def mode(self): + """ + Compute the mode of elements in array. + + Returns + ------- + pyarrow.StructScalar + """ + return _pc().call_function('mode', [self]) + + def slice(self, offset=0, length=None): """ Compute zero-copy slice of this ChunkedArray diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index bdc057d707b..3a8b1b10453 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -16,7 +16,9 @@ # under the License. from functools import lru_cache +import itertools import numpy as np +import pandas as pd import pytest import pyarrow as pa @@ -109,6 +111,76 @@ def test_sum_chunked_array(arrow_type): assert pc.sum(arr).as_py() is None # noqa: E711 +@pytest.mark.parametrize('arrow_type', numerical_arrow_types) +def test_mode_array(arrow_type): + # ARROW-9917 + + arr = pa.array([1, 1, 3, 4, 3, 5], type=arrow_type) + + expected = {'mode': 1, 'count': 2} + assert pc.mode(arr).as_py() == {'mode': 1, 'count': 2} + assert arr.mode().as_py() == {'mode': 1, 'count': 2} + + arr = pa.array([], type=arrow_type) + + expected = {'mode': None, 'count': None} + assert pc.mode(arr).as_py() == expected + assert arr.mode().as_py() == expected + + +@pytest.mark.parametrize('arrow_type', numerical_arrow_types) +def test_mode_chunked_array(arrow_type): + # ARROW-9917 + + expected = {'mode': 1, 'count': 2} + + arr = pa.chunked_array([pa.array([1, 1, 3, 4, 3, 5], type=arrow_type)]) + assert pc.mode(arr).as_py() == expected + assert arr.mode().as_py() == expected + + arr = pa.chunked_array([ + pa.array([1, 1, 3], type=arrow_type), pa.array([4, 3, 5], type=arrow_type) + ]) + assert pc.mode(arr).as_py() == expected + assert arr.mode().as_py() == expected + + arr = pa.chunked_array([ + pa.array([1, 1, 3], type=arrow_type), + pa.array([], type=arrow_type), + pa.array([4, 3, 5], type=arrow_type) + ]) + assert pc.mode(arr).as_py() == expected + assert arr.mode().as_py() == expected + + + expected = {'mode': None, 'count': None} + arr = pa.chunked_array((), type=arrow_type) + assert arr.num_chunks == 0 + assert pc.mode(arr).as_py() == expected + assert arr.mode().as_py() == expected + + +def test_mode_array_with_nan(): + # ARROW-9917 + + arr = pa.array([1, 1, 3, 4, 3, 5, np.nan], type='float') + + expected = {'mode': 1, 'count': 2} + assert pc.mode(arr).as_py() == {'mode': 1, 'count': 2} + assert arr.mode().as_py() == {'mode': 1, 'count': 2} + + + arr = pa.array([1, 1, 3, 4, np.nan, 3, 5, np.nan, np.nan], type='float') + + result = pc.mode(arr).as_py() + assert pd.isna(result['mode']) is True + assert result['count'] == 3 + + result = arr.mode().as_py() + assert pd.isna(result['mode']) is True + assert result['count'] == 3 + + def test_match_substring(): arr = pa.array(["ab", "abc", "ba", None]) result = pc.match_substring(arr, "ab") From 9abf421031aa6f29ea2b347bd2ffed9fd13381e0 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sat, 5 Sep 2020 00:32:17 -0400 Subject: [PATCH 2/4] linting --- python/pyarrow/compute.py | 7 ++-- python/pyarrow/table.pxi | 1 - python/pyarrow/tests/test_compute.py | 52 ++++++++++++++-------------- 3 files changed, 29 insertions(+), 31 deletions(-) diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index f941ba3a94d..c4c1e70d089 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -185,9 +185,9 @@ def sum(array): def mode(array): """ - Return the mode (most common value) of a passed numerical + Return the mode (most common value) of a passed numerical (chunked) array. If there is more than one such value, only - the smallest is returned. + the smallest is returned. Parameters ---------- @@ -206,8 +206,7 @@ def mode(array): """ - return call_function('mode', [array]) - + return call_function("mode", [array]) def filter(data, mask, null_selection_behavior='drop'): diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 8e803373a53..afe72f036d8 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -343,7 +343,6 @@ cdef class ChunkedArray(_PandasConvertible): """ return _pc().call_function('mode', [self]) - def slice(self, offset=0, length=None): """ Compute zero-copy slice of this ChunkedArray diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 3a8b1b10453..520f5ffc7dd 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -16,7 +16,6 @@ # under the License. from functools import lru_cache -import itertools import numpy as np import pandas as pd import pytest @@ -111,49 +110,51 @@ def test_sum_chunked_array(arrow_type): assert pc.sum(arr).as_py() is None # noqa: E711 -@pytest.mark.parametrize('arrow_type', numerical_arrow_types) +@pytest.mark.parametrize("arrow_type", numerical_arrow_types) def test_mode_array(arrow_type): # ARROW-9917 arr = pa.array([1, 1, 3, 4, 3, 5], type=arrow_type) - expected = {'mode': 1, 'count': 2} - assert pc.mode(arr).as_py() == {'mode': 1, 'count': 2} - assert arr.mode().as_py() == {'mode': 1, 'count': 2} + expected = {"mode": 1, "count": 2} + assert pc.mode(arr).as_py() == {"mode": 1, "count": 2} + assert arr.mode().as_py() == {"mode": 1, "count": 2} arr = pa.array([], type=arrow_type) - expected = {'mode': None, 'count': None} + expected = {"mode": None, "count": None} assert pc.mode(arr).as_py() == expected assert arr.mode().as_py() == expected -@pytest.mark.parametrize('arrow_type', numerical_arrow_types) +@pytest.mark.parametrize("arrow_type", numerical_arrow_types) def test_mode_chunked_array(arrow_type): # ARROW-9917 - expected = {'mode': 1, 'count': 2} + expected = {"mode": 1, "count": 2} arr = pa.chunked_array([pa.array([1, 1, 3, 4, 3, 5], type=arrow_type)]) assert pc.mode(arr).as_py() == expected assert arr.mode().as_py() == expected arr = pa.chunked_array([ - pa.array([1, 1, 3], type=arrow_type), pa.array([4, 3, 5], type=arrow_type) + pa.array([1, 1, 3], type=arrow_type), + pa.array([4, 3, 5], type=arrow_type) ]) assert pc.mode(arr).as_py() == expected assert arr.mode().as_py() == expected - arr = pa.chunked_array([ - pa.array([1, 1, 3], type=arrow_type), - pa.array([], type=arrow_type), - pa.array([4, 3, 5], type=arrow_type) - ]) + arr = pa.chunked_array( + [ + pa.array([1, 1, 3], type=arrow_type), + pa.array([], type=arrow_type), + pa.array([4, 3, 5], type=arrow_type), + ] + ) assert pc.mode(arr).as_py() == expected assert arr.mode().as_py() == expected - - expected = {'mode': None, 'count': None} + expected = {"mode": None, "count": None} arr = pa.chunked_array((), type=arrow_type) assert arr.num_chunks == 0 assert pc.mode(arr).as_py() == expected @@ -163,22 +164,21 @@ def test_mode_chunked_array(arrow_type): def test_mode_array_with_nan(): # ARROW-9917 - arr = pa.array([1, 1, 3, 4, 3, 5, np.nan], type='float') - - expected = {'mode': 1, 'count': 2} - assert pc.mode(arr).as_py() == {'mode': 1, 'count': 2} - assert arr.mode().as_py() == {'mode': 1, 'count': 2} + arr = pa.array([1, 1, 3, 4, 3, 5, np.nan], type="float") + expected = {"mode": 1, "count": 2} + assert pc.mode(arr).as_py() == expected + assert arr.mode().as_py() == expected - arr = pa.array([1, 1, 3, 4, np.nan, 3, 5, np.nan, np.nan], type='float') + arr = pa.array([1, 1, 3, 4, np.nan, 3, 5, np.nan, np.nan], type="float") result = pc.mode(arr).as_py() - assert pd.isna(result['mode']) is True - assert result['count'] == 3 + assert pd.isna(result["mode"]) is True + assert result["count"] == 3 result = arr.mode().as_py() - assert pd.isna(result['mode']) is True - assert result['count'] == 3 + assert pd.isna(result["mode"]) is True + assert result["count"] == 3 def test_match_substring(): From 3d38b46999c38723a310444fa1d67d654bbbab70 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sun, 6 Sep 2020 09:59:41 -0400 Subject: [PATCH 3/4] remove pandas dependency --- python/pyarrow/tests/test_compute.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 520f5ffc7dd..e44cba5d0f1 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -17,7 +17,6 @@ from functools import lru_cache import numpy as np -import pandas as pd import pytest import pyarrow as pa @@ -173,11 +172,11 @@ def test_mode_array_with_nan(): arr = pa.array([1, 1, 3, 4, np.nan, 3, 5, np.nan, np.nan], type="float") result = pc.mode(arr).as_py() - assert pd.isna(result["mode"]) is True + assert np.isnan(result["mode"]) assert result["count"] == 3 result = arr.mode().as_py() - assert pd.isna(result["mode"]) is True + assert np.isnan(result["mode"]) assert result["count"] == 3 From 96fe1c7a4b51522db0982d6ff45c53473d381993 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Mon, 7 Sep 2020 14:51:57 -0400 Subject: [PATCH 4/4] feedback --- python/pyarrow/array.pxi | 6 --- python/pyarrow/table.pxi | 10 ----- python/pyarrow/tests/test_compute.py | 58 +++------------------------- 3 files changed, 6 insertions(+), 68 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 02780d52cce..34417da63ff 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -802,12 +802,6 @@ cdef class Array(_PandasConvertible): """ return _pc().call_function('sum', [self]) - def mode(self): - """ - Compute the mode of valuesd in a numerical array. - """ - return _pc().call_function('mode', [self]) - def unique(self): """ Compute distinct elements in array. diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index afe72f036d8..b8205a31c02 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -333,16 +333,6 @@ cdef class ChunkedArray(_PandasConvertible): """ return _pc().call_function('value_counts', [self]) - def mode(self): - """ - Compute the mode of elements in array. - - Returns - ------- - pyarrow.StructScalar - """ - return _pc().call_function('mode', [self]) - def slice(self, offset=0, length=None): """ Compute zero-copy slice of this ChunkedArray diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index e44cba5d0f1..129c7826759 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -109,75 +109,29 @@ def test_sum_chunked_array(arrow_type): assert pc.sum(arr).as_py() is None # noqa: E711 -@pytest.mark.parametrize("arrow_type", numerical_arrow_types) -def test_mode_array(arrow_type): +def test_mode_array(): # ARROW-9917 - arr = pa.array([1, 1, 3, 4, 3, 5], type=arrow_type) - + arr = pa.array([1, 1, 3, 4, 3, 5], type='int64') expected = {"mode": 1, "count": 2} assert pc.mode(arr).as_py() == {"mode": 1, "count": 2} - assert arr.mode().as_py() == {"mode": 1, "count": 2} - - arr = pa.array([], type=arrow_type) + arr = pa.array([], type='int64') expected = {"mode": None, "count": None} assert pc.mode(arr).as_py() == expected - assert arr.mode().as_py() == expected -@pytest.mark.parametrize("arrow_type", numerical_arrow_types) -def test_mode_chunked_array(arrow_type): +def test_mode_chunked_array(): # ARROW-9917 + arr = pa.chunked_array([pa.array([1, 1, 3, 4, 3, 5], type='int64')]) expected = {"mode": 1, "count": 2} - - arr = pa.chunked_array([pa.array([1, 1, 3, 4, 3, 5], type=arrow_type)]) - assert pc.mode(arr).as_py() == expected - assert arr.mode().as_py() == expected - - arr = pa.chunked_array([ - pa.array([1, 1, 3], type=arrow_type), - pa.array([4, 3, 5], type=arrow_type) - ]) - assert pc.mode(arr).as_py() == expected - assert arr.mode().as_py() == expected - - arr = pa.chunked_array( - [ - pa.array([1, 1, 3], type=arrow_type), - pa.array([], type=arrow_type), - pa.array([4, 3, 5], type=arrow_type), - ] - ) assert pc.mode(arr).as_py() == expected - assert arr.mode().as_py() == expected + arr = pa.chunked_array((), type='int64') expected = {"mode": None, "count": None} - arr = pa.chunked_array((), type=arrow_type) assert arr.num_chunks == 0 assert pc.mode(arr).as_py() == expected - assert arr.mode().as_py() == expected - - -def test_mode_array_with_nan(): - # ARROW-9917 - - arr = pa.array([1, 1, 3, 4, 3, 5, np.nan], type="float") - - expected = {"mode": 1, "count": 2} - assert pc.mode(arr).as_py() == expected - assert arr.mode().as_py() == expected - - arr = pa.array([1, 1, 3, 4, np.nan, 3, 5, np.nan, np.nan], type="float") - - result = pc.mode(arr).as_py() - assert np.isnan(result["mode"]) - assert result["count"] == 3 - - result = arr.mode().as_py() - assert np.isnan(result["mode"]) - assert result["count"] == 3 def test_match_substring():