From bda438e22a152b307bb06e6c75b0dd994f3d783f Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Wed, 30 Nov 2022 11:52:31 +0100 Subject: [PATCH 01/15] Add Python sorting helpers --- python/pyarrow/array.pxi | 35 +++++++++++ python/pyarrow/table.pxi | 35 +++++++++++ python/pyarrow/tests/test_array.py | 25 ++++++++ python/pyarrow/tests/test_table.py | 97 ++++++++++++++++++++++++++++++ 4 files changed, 192 insertions(+) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 86d1f0e39cf..79f8739b896 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1399,6 +1399,20 @@ cdef class Array(_PandasConvertible): """ return _pc().index(self, value, start, end, memory_pool=memory_pool) + def sort(self, order="ascending"): + """ + Sort the Array + Parameters + ---------- + order : "ascending" or "descending" + The order of the sorting. + Returns + ------- + result : Array + """ + indices = _pc().sort_indices(self, sort_keys=[("", order)]) + return self.take(indices) + def _to_pandas(self, options, types_mapper=None, **kwargs): return _array_like_to_pandas(self, options, types_mapper=types_mapper) @@ -2743,6 +2757,27 @@ cdef class StructArray(Array): result.validate() return result + def sort(self, order="ascending", fieldname=None): + """ + Sort the StructArray + Parameters + ---------- + order : "ascending" or "descending" + The order of the sorting. + fieldname : str or None, default None + If to sort the array by one of its fields + or by the whole array. + Returns + ------- + result : StructArray + """ + if fieldname is not None: + tosort = self.field(fieldname) + else: + tosort = self + indices = _pc().sort_indices(tosort, sort_keys=[("", order)]) + return self.take(indices) + cdef class ExtensionArray(Array): """ diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 5c58ae61f19..40c3510e505 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -1039,6 +1039,20 @@ cdef class ChunkedArray(_PandasConvertible): """ return _pc().drop_null(self) + def sort(self, order="ascending"): + """ + Sort the ChunkedArray + Parameters + ---------- + order : "ascending" or "descending" + The order of the sorting. + Returns + ------- + result : ChunkedArray + """ + indices = _pc().sort_indices(self, sort_keys=[("", order)]) + return self.take(indices) + def unify_dictionaries(self, MemoryPool memory_pool=None): """ Unify dictionaries across all chunks. @@ -2231,6 +2245,27 @@ cdef class RecordBatch(_PandasConvertible): """ return _pc().drop_null(self) + def sort_by(self, sorting): + """ + Sort the RecordBatch by one or multiple columns. + Parameters + ---------- + sorting : str or list[tuple(name, order)] + Name of the column to use to sort (ascending), or + a list of multiple sorting conditions where + each entry is a tuple with column name + and sorting order ("ascending" or "descending") + Returns + ------- + RecordBatch + A new record batch sorted according to the sort keys. + """ + if isinstance(sorting, str): + sorting = [(sorting, "ascending")] + + indices = _pc().sort_indices(self, sort_keys=sorting) + return self.take(indices) + def to_pydict(self): """ Convert the RecordBatch to a dict or OrderedDict. diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 65604668918..d6e5c65efac 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -3287,3 +3287,28 @@ def test_to_pandas_timezone(): arr = pa.chunked_array([arr]) s = arr.to_pandas() assert s.dt.tz is not None + + +def test_array_sort(): + arr = pa.array([5, 7, 35], type=pa.int64()) + sorted_arr = arr.sort("descending") + assert sorted_arr.to_pylist() == [35, 7, 5] + + arr = pa.chunked_array([[1, 2, 3], [4, 5, 6]]) + sorted_arr = arr.sort("descending") + assert sorted_arr.to_pylist() == [6, 5, 4, 3, 2, 1] + + +def test_struct_array_sort(): + arr = pa.StructArray.from_arrays([ + pa.array([5, 7, 7, 35], type=pa.int64()), + pa.array(["foo", "car", "bar", "foobar"]) + ], names=["a", "b"]) + + sorted_arr = arr.sort("descending", fieldname="a") + assert sorted_arr.to_pylist() == [ + {"a": 35, "b": "foobar"}, + {"a": 7, "b": "car"}, + {"a": 7, "b": "bar"}, + {"a": 5, "b": "foo"}, + ] diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index fad1c0acb24..5c6a5400e36 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -2204,3 +2204,100 @@ def test_table_cast_invalid(): table = pa.table({'a': [None, 1], 'b': [False, True]}) assert table.cast(new_schema).schema == new_schema + + +def test_table_sort(): + tab = pa.Table.from_arrays([ + pa.array([5, 7, 7, 35], type=pa.int64()), + pa.array(["foo", "car", "bar", "foobar"]) + ], names=["a", "b"]) + + sorted_tab = tab.sort_by([("a", "descending")]) + sorted_tab_dict = sorted_tab.to_pydict() + assert sorted_tab_dict["a"] == [35, 7, 7, 5] + assert sorted_tab_dict["b"] == ["foobar", "car", "bar", "foo"] + + sorted_tab = tab.sort_by([("a", "ascending")]) + sorted_tab_dict = sorted_tab.to_pydict() + assert sorted_tab_dict["a"] == [5, 7, 7, 35] + assert sorted_tab_dict["b"] == ["foo", "car", "bar", "foobar"] + + +def test_record_batch_sort(): + rb = pa.RecordBatch.from_arrays([ + pa.array([7, 35, 7, 5], type=pa.int64()), + pa.array([4, 1, 3, 2], type=pa.int64()), + pa.array(["foo", "car", "bar", "foobar"]) + ], names=["a", "b", "c"]) + + sorted_rb = rb.sort_by([("a", "descending"), ("b", "descending")]) + sorted_rb_dict = sorted_rb.to_pydict() + assert sorted_rb_dict["a"] == [35, 7, 7, 5] + assert sorted_rb_dict["b"] == [1, 4, 3, 2] + assert sorted_rb_dict["c"] == ["car", "foo", "bar", "foobar"] + + sorted_rb = rb.sort_by([("a", "ascending"), ("b", "ascending")]) + sorted_rb_dict = sorted_rb.to_pydict() + assert sorted_rb_dict["a"] == [5, 7, 7, 35] + assert sorted_rb_dict["b"] == [2, 3, 4, 1] + assert sorted_rb_dict["c"] == ["foobar", "bar", "foo", "car"] + + # test multi-key record batch sorter (> 8 sort keys) + rb1_names = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"] + rb1 = pa.RecordBatch.from_arrays([ + pa.array([4, 4, 4, 4], type=pa.int64()), + pa.array([4, 4, 4, 4], type=pa.int64()), + pa.array([4, 4, 4, 4], type=pa.int64()), + pa.array([4, 4, 2, 2], type=pa.int64()), + pa.array([4, 4, 2, 2], type=pa.int64()), + pa.array([4, 4, 2, 2], type=pa.int64()), + pa.array([4, 4, 2, 2], type=pa.int64()), + pa.array([4, 4, 2, 2], type=pa.int64()), + pa.array([2, 1, 4, 3], type=pa.int64()), + pa.array(["foo", "car", "bar", "foobar"]) + ], names=rb1_names) + + sort_keys_list = [(name, "ascending") for name in rb1_names] + + sorted_rb1 = rb1.sort_by(sort_keys_list) + sorted_rb1_dict = sorted_rb1.to_pydict() + assert sorted_rb1_dict["a"] == [4, 4, 4, 4] + assert sorted_rb1_dict["b"] == [4, 4, 4, 4] + assert sorted_rb1_dict["c"] == [4, 4, 4, 4] + assert sorted_rb1_dict["d"] == [2, 2, 4, 4] + assert sorted_rb1_dict["e"] == [2, 2, 4, 4] + assert sorted_rb1_dict["f"] == [2, 2, 4, 4] + assert sorted_rb1_dict["g"] == [2, 2, 4, 4] + assert sorted_rb1_dict["h"] == [2, 2, 4, 4] + assert sorted_rb1_dict["i"] == [3, 4, 1, 2] + assert sorted_rb1_dict["j"] == ["foobar", "bar", "car", "foo"] + + # test radix sort with nulls + rb2_names = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"] + rb2 = pa.RecordBatch.from_arrays([ + pa.array([None, None, None, None]), + pa.array([4, 4, 4, 4], type=pa.int64()), + pa.array([4, 4, 4, 4], type=pa.int64()), + pa.array([4, 4, 2, 2], type=pa.int64()), + pa.array([4, 4, 2, 2], type=pa.int64()), + pa.array([4, 4, 2, 2], type=pa.int64()), + pa.array([4, 4, 2, 2], type=pa.int64()), + pa.array([4, 4, 2, 2], type=pa.int64()), + pa.array([2, 1, 4, 3], type=pa.int64()), + pa.array([2, 1, 4, 3], type=pa.int64()), + ], names=rb2_names) + + sort_keys_list = [("a", "ascending"), ("j", "ascending")] + sorted_rb2 = rb2.sort_by(sort_keys_list) + sorted_rb2_dict = sorted_rb2.to_pydict() + + assert sorted_rb2_dict["a"] == [None, None, None, None] + assert sorted_rb2_dict["b"] == [4, 4, 4, 4] + assert sorted_rb2_dict["c"] == [4, 4, 4, 4] + assert sorted_rb2_dict["d"] == [4, 4, 2, 2] + assert sorted_rb2_dict["e"] == [4, 4, 2, 2] + assert sorted_rb2_dict["f"] == [4, 4, 2, 2] + assert sorted_rb2_dict["g"] == [4, 4, 2, 2] + assert sorted_rb2_dict["h"] == [4, 4, 2, 2] + assert sorted_rb2_dict["i"] == [1, 2, 3, 4] + assert sorted_rb2_dict["j"] == [1, 2, 3, 4] From e3603874cd0f1a72594650c8fef4b78edd224dbe Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Wed, 30 Nov 2022 12:32:37 +0100 Subject: [PATCH 02/15] Fix docstrings --- python/pyarrow/array.pxi | 4 ++++ python/pyarrow/table.pxi | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 79f8739b896..8426568d951 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1402,10 +1402,12 @@ cdef class Array(_PandasConvertible): def sort(self, order="ascending"): """ Sort the Array + Parameters ---------- order : "ascending" or "descending" The order of the sorting. + Returns ------- result : Array @@ -2760,6 +2762,7 @@ cdef class StructArray(Array): def sort(self, order="ascending", fieldname=None): """ Sort the StructArray + Parameters ---------- order : "ascending" or "descending" @@ -2767,6 +2770,7 @@ cdef class StructArray(Array): fieldname : str or None, default None If to sort the array by one of its fields or by the whole array. + Returns ------- result : StructArray diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 40c3510e505..a5615816eed 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -1042,10 +1042,12 @@ cdef class ChunkedArray(_PandasConvertible): def sort(self, order="ascending"): """ Sort the ChunkedArray + Parameters ---------- order : "ascending" or "descending" The order of the sorting. + Returns ------- result : ChunkedArray @@ -2248,6 +2250,7 @@ cdef class RecordBatch(_PandasConvertible): def sort_by(self, sorting): """ Sort the RecordBatch by one or multiple columns. + Parameters ---------- sorting : str or list[tuple(name, order)] @@ -2255,6 +2258,7 @@ cdef class RecordBatch(_PandasConvertible): a list of multiple sorting conditions where each entry is a tuple with column name and sorting order ("ascending" or "descending") + Returns ------- RecordBatch From 740e6e08ef5bcb342115f71d36f60f0e9e7889b0 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Thu, 1 Dec 2022 13:39:45 +0100 Subject: [PATCH 03/15] Address feedback --- python/pyarrow/array.pxi | 25 +++++++++++++++++-------- python/pyarrow/table.pxi | 19 +++++++++++++------ 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 8426568d951..53fbe4bef85 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1399,20 +1399,25 @@ cdef class Array(_PandasConvertible): """ return _pc().index(self, value, start, end, memory_pool=memory_pool) - def sort(self, order="ascending"): + def sort(self, order="ascending", options=None): """ Sort the Array Parameters ---------- - order : "ascending" or "descending" - The order of the sorting. + order : str, default "ascending" + Which order to sort values in. + Accepted values are "ascending", "descending". + options : SortOptions, default None + Additional sorting options. + Returns ------- result : Array """ - indices = _pc().sort_indices(self, sort_keys=[("", order)]) + indices = _pc().sort_indices(self, sort_keys=[("", order)], + options=options) return self.take(indices) def _to_pandas(self, options, types_mapper=None, **kwargs): @@ -2759,17 +2764,20 @@ cdef class StructArray(Array): result.validate() return result - def sort(self, order="ascending", fieldname=None): + def sort(self, order="ascending", fieldname=None, options=None): """ Sort the StructArray Parameters ---------- - order : "ascending" or "descending" - The order of the sorting. + order : str, default "ascending" + Which order to sort values in. + Accepted values are "ascending", "descending". fieldname : str or None, default None If to sort the array by one of its fields or by the whole array. + options : SortOptions, default None + Additional sorting options. Returns ------- @@ -2779,7 +2787,8 @@ cdef class StructArray(Array): tosort = self.field(fieldname) else: tosort = self - indices = _pc().sort_indices(tosort, sort_keys=[("", order)]) + indices = _pc().sort_indices(tosort, sort_keys=[("", order)], + options=options) return self.take(indices) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index a5615816eed..3956104e69c 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -1039,20 +1039,24 @@ cdef class ChunkedArray(_PandasConvertible): """ return _pc().drop_null(self) - def sort(self, order="ascending"): + def sort(self, order="ascending", options=None): """ Sort the ChunkedArray Parameters ---------- - order : "ascending" or "descending" - The order of the sorting. + order : str, default "ascending" + Which order to sort values in. + Accepted values are "ascending", "descending". + options : SortOptions, default None + Additional sorting options. Returns ------- result : ChunkedArray """ - indices = _pc().sort_indices(self, sort_keys=[("", order)]) + indices = _pc().sort_indices(self, sort_keys=[("", order)], + options=options) return self.take(indices) def unify_dictionaries(self, MemoryPool memory_pool=None): @@ -4703,7 +4707,7 @@ cdef class Table(_PandasConvertible): """ return TableGroupBy(self, keys) - def sort_by(self, sorting): + def sort_by(self, sorting, options=None): """ Sort the table by one or multiple columns. @@ -4714,6 +4718,8 @@ cdef class Table(_PandasConvertible): a list of multiple sorting conditions where each entry is a tuple with column name and sorting order ("ascending" or "descending") + options : SortOptions, default None + Additional sorting options. Returns ------- @@ -4744,7 +4750,8 @@ cdef class Table(_PandasConvertible): indices = _pc().sort_indices( self, - sort_keys=sorting + sort_keys=sorting, + options=options ) return self.take(indices) From d158174ba58044a5a271bce34c0535f9577111af Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Thu, 1 Dec 2022 14:28:01 +0100 Subject: [PATCH 04/15] SortOptions fixes --- python/pyarrow/array.pxi | 25 ++++++++++++++----------- python/pyarrow/table.pxi | 30 ++++++++++++++++++------------ python/pyarrow/tests/test_array.py | 6 ++++++ 3 files changed, 38 insertions(+), 23 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 53fbe4bef85..2e1ffbe0a87 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1399,7 +1399,7 @@ cdef class Array(_PandasConvertible): """ return _pc().index(self, value, start, end, memory_pool=memory_pool) - def sort(self, order="ascending", options=None): + def sort(self, order="ascending", **options): """ Sort the Array @@ -1408,16 +1408,17 @@ cdef class Array(_PandasConvertible): order : str, default "ascending" Which order to sort values in. Accepted values are "ascending", "descending". - options : SortOptions, default None - Additional sorting options. - + **options : Additional sorting options + As allowed by :class:`SortOptions` Returns ------- result : Array """ - indices = _pc().sort_indices(self, sort_keys=[("", order)], - options=options) + indices = _pc().sort_indices( + self, + options=_pc().SortOptions(sort_keys=[("", order)], **options) + ) return self.take(indices) def _to_pandas(self, options, types_mapper=None, **kwargs): @@ -2764,7 +2765,7 @@ cdef class StructArray(Array): result.validate() return result - def sort(self, order="ascending", fieldname=None, options=None): + def sort(self, order="ascending", fieldname=None, **options): """ Sort the StructArray @@ -2776,8 +2777,8 @@ cdef class StructArray(Array): fieldname : str or None, default None If to sort the array by one of its fields or by the whole array. - options : SortOptions, default None - Additional sorting options. + **options : Additional sorting options + As allowed by :class:`SortOptions` Returns ------- @@ -2787,8 +2788,10 @@ cdef class StructArray(Array): tosort = self.field(fieldname) else: tosort = self - indices = _pc().sort_indices(tosort, sort_keys=[("", order)], - options=options) + indices = _pc().sort_indices( + tosort, + options=_pc().SortOptions(sort_keys=[("", order)], **options) + ) return self.take(indices) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 3956104e69c..5130de3294a 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -1039,7 +1039,7 @@ cdef class ChunkedArray(_PandasConvertible): """ return _pc().drop_null(self) - def sort(self, order="ascending", options=None): + def sort(self, order="ascending", **options): """ Sort the ChunkedArray @@ -1048,15 +1048,17 @@ cdef class ChunkedArray(_PandasConvertible): order : str, default "ascending" Which order to sort values in. Accepted values are "ascending", "descending". - options : SortOptions, default None - Additional sorting options. + **options : Additional sorting options + As allowed by :class:`SortOptions` Returns ------- result : ChunkedArray """ - indices = _pc().sort_indices(self, sort_keys=[("", order)], - options=options) + indices = _pc().sort_indices( + self, + options=_pc().SortOptions(sort_keys=[("", order)], **options) + ) return self.take(indices) def unify_dictionaries(self, MemoryPool memory_pool=None): @@ -2251,7 +2253,7 @@ cdef class RecordBatch(_PandasConvertible): """ return _pc().drop_null(self) - def sort_by(self, sorting): + def sort_by(self, sorting, **options): """ Sort the RecordBatch by one or multiple columns. @@ -2262,6 +2264,8 @@ cdef class RecordBatch(_PandasConvertible): a list of multiple sorting conditions where each entry is a tuple with column name and sorting order ("ascending" or "descending") + **options : Additional sorting options + As allowed by :class:`SortOptions` Returns ------- @@ -2271,7 +2275,10 @@ cdef class RecordBatch(_PandasConvertible): if isinstance(sorting, str): sorting = [(sorting, "ascending")] - indices = _pc().sort_indices(self, sort_keys=sorting) + indices = _pc().sort_indices( + self, + options=_pc().SortOptions(sort_keys=sorting, **options) + ) return self.take(indices) def to_pydict(self): @@ -4707,7 +4714,7 @@ cdef class Table(_PandasConvertible): """ return TableGroupBy(self, keys) - def sort_by(self, sorting, options=None): + def sort_by(self, sorting, **options): """ Sort the table by one or multiple columns. @@ -4718,8 +4725,8 @@ cdef class Table(_PandasConvertible): a list of multiple sorting conditions where each entry is a tuple with column name and sorting order ("ascending" or "descending") - options : SortOptions, default None - Additional sorting options. + **options : Additional sorting options + As allowed by :class:`SortOptions` Returns ------- @@ -4750,8 +4757,7 @@ cdef class Table(_PandasConvertible): indices = _pc().sort_indices( self, - sort_keys=sorting, - options=options + options=_pc().SortOptions(sort_keys=sorting, **options) ) return self.take(indices) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index d6e5c65efac..0654ffcdd51 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -3298,6 +3298,12 @@ def test_array_sort(): sorted_arr = arr.sort("descending") assert sorted_arr.to_pylist() == [6, 5, 4, 3, 2, 1] + arr = pa.array([5, 7, 35, None], type=pa.int64()) + sorted_arr = arr.sort("descending", null_placement="at_end") + assert sorted_arr.to_pylist() == [35, 7, 5, None] + sorted_arr = arr.sort("descending", null_placement="at_start") + assert sorted_arr.to_pylist() == [None, 35, 7, 5] + def test_struct_array_sort(): arr = pa.StructArray.from_arrays([ From 4cbd391330b4abd6dcc5ed0cbee71c4302f44e91 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Thu, 1 Dec 2022 14:41:04 +0100 Subject: [PATCH 05/15] Adhere to https://numpydoc.readthedocs.io/en/latest/format.html#parameters for keyword args --- python/pyarrow/array.pxi | 6 ++++-- python/pyarrow/table.pxi | 9 ++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 2e1ffbe0a87..cbdb02f5bef 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1408,7 +1408,8 @@ cdef class Array(_PandasConvertible): order : str, default "ascending" Which order to sort values in. Accepted values are "ascending", "descending". - **options : Additional sorting options + **options : dict, optional + Additional sorting options As allowed by :class:`SortOptions` Returns @@ -2777,7 +2778,8 @@ cdef class StructArray(Array): fieldname : str or None, default None If to sort the array by one of its fields or by the whole array. - **options : Additional sorting options + **options : dict, optional + Additional sorting options As allowed by :class:`SortOptions` Returns diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 5130de3294a..43906d9d234 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -1048,7 +1048,8 @@ cdef class ChunkedArray(_PandasConvertible): order : str, default "ascending" Which order to sort values in. Accepted values are "ascending", "descending". - **options : Additional sorting options + **options : dict, optional + Additional sorting options As allowed by :class:`SortOptions` Returns @@ -2264,7 +2265,8 @@ cdef class RecordBatch(_PandasConvertible): a list of multiple sorting conditions where each entry is a tuple with column name and sorting order ("ascending" or "descending") - **options : Additional sorting options + **options : dict, optional + Additional sorting options As allowed by :class:`SortOptions` Returns @@ -4725,7 +4727,8 @@ cdef class Table(_PandasConvertible): a list of multiple sorting conditions where each entry is a tuple with column name and sorting order ("ascending" or "descending") - **options : Additional sorting options + **options : dict, optional + Additional sorting options As allowed by :class:`SortOptions` Returns From b919fa6ceaa92e6d91764b20f4c1eb44f692f4a1 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Fri, 2 Dec 2022 14:31:29 +0100 Subject: [PATCH 06/15] Switch to kwargs convention --- python/pyarrow/array.pxi | 4 ++-- python/pyarrow/table.pxi | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index cbdb02f5bef..3a05c213a9d 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1408,7 +1408,7 @@ cdef class Array(_PandasConvertible): order : str, default "ascending" Which order to sort values in. Accepted values are "ascending", "descending". - **options : dict, optional + **kwargs : dict, optional Additional sorting options As allowed by :class:`SortOptions` @@ -2778,7 +2778,7 @@ cdef class StructArray(Array): fieldname : str or None, default None If to sort the array by one of its fields or by the whole array. - **options : dict, optional + **kwargs : dict, optional Additional sorting options As allowed by :class:`SortOptions` diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 43906d9d234..7aa6ee9f0cf 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -1048,7 +1048,7 @@ cdef class ChunkedArray(_PandasConvertible): order : str, default "ascending" Which order to sort values in. Accepted values are "ascending", "descending". - **options : dict, optional + **kwargs : dict, optional Additional sorting options As allowed by :class:`SortOptions` @@ -2265,7 +2265,7 @@ cdef class RecordBatch(_PandasConvertible): a list of multiple sorting conditions where each entry is a tuple with column name and sorting order ("ascending" or "descending") - **options : dict, optional + **kwargs : dict, optional Additional sorting options As allowed by :class:`SortOptions` @@ -4727,7 +4727,7 @@ cdef class Table(_PandasConvertible): a list of multiple sorting conditions where each entry is a tuple with column name and sorting order ("ascending" or "descending") - **options : dict, optional + **kwargs : dict, optional Additional sorting options As allowed by :class:`SortOptions` From b89be31cc06313aa66db6c3f8ab0dc2a66b1dd99 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Fri, 2 Dec 2022 14:32:30 +0100 Subject: [PATCH 07/15] from fieldname=X to by=X --- python/pyarrow/array.pxi | 4 ++-- python/pyarrow/tests/test_array.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 3a05c213a9d..2547a001c4c 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2766,7 +2766,7 @@ cdef class StructArray(Array): result.validate() return result - def sort(self, order="ascending", fieldname=None, **options): + def sort(self, order="ascending", by=None, **options): """ Sort the StructArray @@ -2775,7 +2775,7 @@ cdef class StructArray(Array): order : str, default "ascending" Which order to sort values in. Accepted values are "ascending", "descending". - fieldname : str or None, default None + by : str or None, default None If to sort the array by one of its fields or by the whole array. **kwargs : dict, optional diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 0654ffcdd51..292baf75380 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -3311,7 +3311,7 @@ def test_struct_array_sort(): pa.array(["foo", "car", "bar", "foobar"]) ], names=["a", "b"]) - sorted_arr = arr.sort("descending", fieldname="a") + sorted_arr = arr.sort("descending", by="a") assert sorted_arr.to_pylist() == [ {"a": 35, "b": "foobar"}, {"a": 7, "b": "car"}, From 33ce6caeb100894bf7b5a6f87effe37f92958aa3 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Fri, 2 Dec 2022 14:35:25 +0100 Subject: [PATCH 08/15] Consolidate tests --- python/pyarrow/tests/test_table.py | 34 ++++++++++++++---------------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index 5c6a5400e36..22a4e8732a1 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -2025,23 +2025,6 @@ def sorted_by_keys(d): } -def test_table_sort_by(): - table = pa.table([ - pa.array([3, 1, 4, 2, 5]), - pa.array(["b", "a", "b", "a", "c"]), - ], names=["values", "keys"]) - - assert table.sort_by("values").to_pydict() == { - "keys": ["a", "a", "b", "b", "c"], - "values": [1, 2, 3, 4, 5] - } - - assert table.sort_by([("values", "descending")]).to_pydict() == { - "keys": ["c", "b", "b", "a", "a"], - "values": [5, 4, 3, 2, 1] - } - - def test_table_to_recordbatchreader(): table = pa.Table.from_pydict({'x': [1, 2, 3]}) reader = table.to_reader() @@ -2206,7 +2189,22 @@ def test_table_cast_invalid(): assert table.cast(new_schema).schema == new_schema -def test_table_sort(): +def test_table_sort_by(): + table = pa.table([ + pa.array([3, 1, 4, 2, 5]), + pa.array(["b", "a", "b", "a", "c"]), + ], names=["values", "keys"]) + + assert table.sort_by("values").to_pydict() == { + "keys": ["a", "a", "b", "b", "c"], + "values": [1, 2, 3, 4, 5] + } + + assert table.sort_by([("values", "descending")]).to_pydict() == { + "keys": ["c", "b", "b", "a", "a"], + "values": [5, 4, 3, 2, 1] + } + tab = pa.Table.from_arrays([ pa.array([5, 7, 7, 35], type=pa.int64()), pa.array(["foo", "car", "bar", "foobar"]) From 699dce17814d155c88a360019473f2acc0d9ef27 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Fri, 2 Dec 2022 14:59:34 +0100 Subject: [PATCH 09/15] rename fieldname to by --- python/pyarrow/array.pxi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 2547a001c4c..3a0ed4c406b 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2786,8 +2786,8 @@ cdef class StructArray(Array): ------- result : StructArray """ - if fieldname is not None: - tosort = self.field(fieldname) + if by is not None: + tosort = self.field(by) else: tosort = self indices = _pc().sort_indices( From 0082b8884ed248f7e10cbab86dea70a4470b7836 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Fri, 2 Dec 2022 15:51:53 +0100 Subject: [PATCH 10/15] Rename options to kwargs --- python/pyarrow/array.pxi | 8 ++++---- python/pyarrow/table.pxi | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 3a0ed4c406b..4193869cbc1 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1399,7 +1399,7 @@ cdef class Array(_PandasConvertible): """ return _pc().index(self, value, start, end, memory_pool=memory_pool) - def sort(self, order="ascending", **options): + def sort(self, order="ascending", **kwargs): """ Sort the Array @@ -1418,7 +1418,7 @@ cdef class Array(_PandasConvertible): """ indices = _pc().sort_indices( self, - options=_pc().SortOptions(sort_keys=[("", order)], **options) + options=_pc().SortOptions(sort_keys=[("", order)], **kwargs) ) return self.take(indices) @@ -2766,7 +2766,7 @@ cdef class StructArray(Array): result.validate() return result - def sort(self, order="ascending", by=None, **options): + def sort(self, order="ascending", by=None, **kwargs): """ Sort the StructArray @@ -2792,7 +2792,7 @@ cdef class StructArray(Array): tosort = self indices = _pc().sort_indices( tosort, - options=_pc().SortOptions(sort_keys=[("", order)], **options) + options=_pc().SortOptions(sort_keys=[("", order)], **kwargs) ) return self.take(indices) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 7aa6ee9f0cf..de5dfc97200 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -1039,7 +1039,7 @@ cdef class ChunkedArray(_PandasConvertible): """ return _pc().drop_null(self) - def sort(self, order="ascending", **options): + def sort(self, order="ascending", **kwargs): """ Sort the ChunkedArray @@ -1058,7 +1058,7 @@ cdef class ChunkedArray(_PandasConvertible): """ indices = _pc().sort_indices( self, - options=_pc().SortOptions(sort_keys=[("", order)], **options) + options=_pc().SortOptions(sort_keys=[("", order)], **kwargs) ) return self.take(indices) @@ -2254,7 +2254,7 @@ cdef class RecordBatch(_PandasConvertible): """ return _pc().drop_null(self) - def sort_by(self, sorting, **options): + def sort_by(self, sorting, **kwargs): """ Sort the RecordBatch by one or multiple columns. @@ -2279,7 +2279,7 @@ cdef class RecordBatch(_PandasConvertible): indices = _pc().sort_indices( self, - options=_pc().SortOptions(sort_keys=sorting, **options) + options=_pc().SortOptions(sort_keys=sorting, **kwargs) ) return self.take(indices) @@ -4716,7 +4716,7 @@ cdef class Table(_PandasConvertible): """ return TableGroupBy(self, keys) - def sort_by(self, sorting, **options): + def sort_by(self, sorting, **kwargs): """ Sort the table by one or multiple columns. @@ -4760,7 +4760,7 @@ cdef class Table(_PandasConvertible): indices = _pc().sort_indices( self, - options=_pc().SortOptions(sort_keys=sorting, **options) + options=_pc().SortOptions(sort_keys=sorting, **kwargs) ) return self.take(indices) From 83a0a1f2af5ba924e3b482230a21d9899265ddef Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Wed, 14 Dec 2022 14:25:39 +0100 Subject: [PATCH 11/15] Fix sort StructArray for nulls --- python/pyarrow/array.pxi | 38 +++++++++++++++++++++++++++- python/pyarrow/includes/libarrow.pxd | 1 + python/pyarrow/tests/test_array.py | 21 +++++++++++++++ 3 files changed, 59 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 4193869cbc1..cbb7cb3b861 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2664,6 +2664,42 @@ cdef class StructArray(Array): return pyarrow_wrap_array(child) + def flattened_field(self, index, MemoryPool memory_pool=None): + """ + Retrieves the child array belonging to field, + accounting for the parent array null bitmap. + + Parameters + ---------- + index : Union[int, str] + Index / position or name of the field. + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool. + + Returns + ------- + result : Array + """ + cdef: + CStructArray* arr = self.ap + shared_ptr[CArray] child + CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) + + if isinstance(index, (bytes, str)): + int_index = self.type.get_field_index(index) + if int_index < 0: + raise KeyError(index) + child = GetResultValue(arr.GetFlattenedField(int_index, pool)) + elif isinstance(index, int): + child = GetResultValue(arr.GetFlattenedField( + _normalize_index(index, self.ap.num_fields()), + pool + )) + else: + raise TypeError('Expected integer or string index') + + return pyarrow_wrap_array(child) + def flatten(self, MemoryPool memory_pool=None): """ Return one individual array for each field in the struct. @@ -2787,7 +2823,7 @@ cdef class StructArray(Array): result : StructArray """ if by is not None: - tosort = self.field(by) + tosort = self.flattened_field(by) else: tosort = self indices = _pc().sort_indices( diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 9cea340a309..8baf1a7f80a 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -766,6 +766,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CArray] field(int pos) shared_ptr[CArray] GetFieldByName(const c_string& name) const + CResult[shared_ptr[CArray]] GetFlattenedField(int index, CMemoryPool* pool) const CResult[vector[shared_ptr[CArray]]] Flatten(CMemoryPool* pool) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 292baf75380..d3887f705e7 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -3318,3 +3318,24 @@ def test_struct_array_sort(): {"a": 7, "b": "bar"}, {"a": 5, "b": "foo"}, ] + + arr_with_nulls = pa.StructArray.from_arrays([ + pa.array([5, 7, 7, 35], type=pa.int64()), + pa.array(["foo", "car", "bar", "foobar"]) + ], names=["a", "b"], mask=pa.array([False, False, True, False])) + + sorted_arr = arr_with_nulls.sort("descending", by="a", null_placement="at_start") + assert sorted_arr.to_pylist() == [ + None, + {"a": 35, "b": "foobar"}, + {"a": 7, "b": "car"}, + {"a": 5, "b": "foo"}, + ] + + sorted_arr = arr_with_nulls.sort("descending", by="a", null_placement="at_end") + assert sorted_arr.to_pylist() == [ + {"a": 35, "b": "foobar"}, + {"a": 7, "b": "car"}, + {"a": 5, "b": "foo"}, + None + ] From 4e27cb2a5fec20b4ed86c183e44ae53ee4df7a51 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Wed, 14 Dec 2022 14:25:53 +0100 Subject: [PATCH 12/15] Remove unecessary complex test --- python/pyarrow/tests/test_table.py | 62 +----------------------------- 1 file changed, 1 insertion(+), 61 deletions(-) diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index 22a4e8732a1..76a6f9657cb 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -2238,64 +2238,4 @@ def test_record_batch_sort(): sorted_rb_dict = sorted_rb.to_pydict() assert sorted_rb_dict["a"] == [5, 7, 7, 35] assert sorted_rb_dict["b"] == [2, 3, 4, 1] - assert sorted_rb_dict["c"] == ["foobar", "bar", "foo", "car"] - - # test multi-key record batch sorter (> 8 sort keys) - rb1_names = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"] - rb1 = pa.RecordBatch.from_arrays([ - pa.array([4, 4, 4, 4], type=pa.int64()), - pa.array([4, 4, 4, 4], type=pa.int64()), - pa.array([4, 4, 4, 4], type=pa.int64()), - pa.array([4, 4, 2, 2], type=pa.int64()), - pa.array([4, 4, 2, 2], type=pa.int64()), - pa.array([4, 4, 2, 2], type=pa.int64()), - pa.array([4, 4, 2, 2], type=pa.int64()), - pa.array([4, 4, 2, 2], type=pa.int64()), - pa.array([2, 1, 4, 3], type=pa.int64()), - pa.array(["foo", "car", "bar", "foobar"]) - ], names=rb1_names) - - sort_keys_list = [(name, "ascending") for name in rb1_names] - - sorted_rb1 = rb1.sort_by(sort_keys_list) - sorted_rb1_dict = sorted_rb1.to_pydict() - assert sorted_rb1_dict["a"] == [4, 4, 4, 4] - assert sorted_rb1_dict["b"] == [4, 4, 4, 4] - assert sorted_rb1_dict["c"] == [4, 4, 4, 4] - assert sorted_rb1_dict["d"] == [2, 2, 4, 4] - assert sorted_rb1_dict["e"] == [2, 2, 4, 4] - assert sorted_rb1_dict["f"] == [2, 2, 4, 4] - assert sorted_rb1_dict["g"] == [2, 2, 4, 4] - assert sorted_rb1_dict["h"] == [2, 2, 4, 4] - assert sorted_rb1_dict["i"] == [3, 4, 1, 2] - assert sorted_rb1_dict["j"] == ["foobar", "bar", "car", "foo"] - - # test radix sort with nulls - rb2_names = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"] - rb2 = pa.RecordBatch.from_arrays([ - pa.array([None, None, None, None]), - pa.array([4, 4, 4, 4], type=pa.int64()), - pa.array([4, 4, 4, 4], type=pa.int64()), - pa.array([4, 4, 2, 2], type=pa.int64()), - pa.array([4, 4, 2, 2], type=pa.int64()), - pa.array([4, 4, 2, 2], type=pa.int64()), - pa.array([4, 4, 2, 2], type=pa.int64()), - pa.array([4, 4, 2, 2], type=pa.int64()), - pa.array([2, 1, 4, 3], type=pa.int64()), - pa.array([2, 1, 4, 3], type=pa.int64()), - ], names=rb2_names) - - sort_keys_list = [("a", "ascending"), ("j", "ascending")] - sorted_rb2 = rb2.sort_by(sort_keys_list) - sorted_rb2_dict = sorted_rb2.to_pydict() - - assert sorted_rb2_dict["a"] == [None, None, None, None] - assert sorted_rb2_dict["b"] == [4, 4, 4, 4] - assert sorted_rb2_dict["c"] == [4, 4, 4, 4] - assert sorted_rb2_dict["d"] == [4, 4, 2, 2] - assert sorted_rb2_dict["e"] == [4, 4, 2, 2] - assert sorted_rb2_dict["f"] == [4, 4, 2, 2] - assert sorted_rb2_dict["g"] == [4, 4, 2, 2] - assert sorted_rb2_dict["h"] == [4, 4, 2, 2] - assert sorted_rb2_dict["i"] == [1, 2, 3, 4] - assert sorted_rb2_dict["j"] == [1, 2, 3, 4] + assert sorted_rb_dict["c"] == ["foobar", "bar", "foo", "car"] \ No newline at end of file From 383629c5d148a1c12c656786debf7009a7da2aec Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Wed, 14 Dec 2022 14:27:04 +0100 Subject: [PATCH 13/15] lint --- python/pyarrow/tests/test_array.py | 6 ++++-- python/pyarrow/tests/test_table.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index d3887f705e7..9d6b6274506 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -3324,7 +3324,8 @@ def test_struct_array_sort(): pa.array(["foo", "car", "bar", "foobar"]) ], names=["a", "b"], mask=pa.array([False, False, True, False])) - sorted_arr = arr_with_nulls.sort("descending", by="a", null_placement="at_start") + sorted_arr = arr_with_nulls.sort( + "descending", by="a", null_placement="at_start") assert sorted_arr.to_pylist() == [ None, {"a": 35, "b": "foobar"}, @@ -3332,7 +3333,8 @@ def test_struct_array_sort(): {"a": 5, "b": "foo"}, ] - sorted_arr = arr_with_nulls.sort("descending", by="a", null_placement="at_end") + sorted_arr = arr_with_nulls.sort( + "descending", by="a", null_placement="at_end") assert sorted_arr.to_pylist() == [ {"a": 35, "b": "foobar"}, {"a": 7, "b": "car"}, diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index 76a6f9657cb..04e2dacc481 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -2238,4 +2238,4 @@ def test_record_batch_sort(): sorted_rb_dict = sorted_rb.to_pydict() assert sorted_rb_dict["a"] == [5, 7, 7, 35] assert sorted_rb_dict["b"] == [2, 3, 4, 1] - assert sorted_rb_dict["c"] == ["foobar", "bar", "foo", "car"] \ No newline at end of file + assert sorted_rb_dict["c"] == ["foobar", "bar", "foo", "car"] From 3883edbfba9dbf53cab3bcbc2ed27fd3aca25e2e Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Wed, 14 Dec 2022 14:31:40 +0100 Subject: [PATCH 14/15] Add test for StructArray.flattened_field --- python/pyarrow/tests/test_array.py | 35 ++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 9d6b6274506..63547b2efba 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2916,6 +2916,41 @@ def test_struct_array_field(): a.field(invalid_name) +def test_struct_array_flattened_field(): + ty = pa.struct([pa.field('x', pa.int16()), + pa.field('y', pa.float32())]) + a = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty, + mask=pa.array([False, True, False])) + + x0 = a.flattened_field(0) + y0 = a.flattened_field(1) + x1 = a.flattened_field(-2) + y1 = a.flattened_field(-1) + x2 = a.flattened_field('x') + y2 = a.flattened_field('y') + + assert isinstance(x0, pa.lib.Int16Array) + assert isinstance(y1, pa.lib.FloatArray) + assert x0.equals(pa.array([1, None, 5], type=pa.int16())) + assert y0.equals(pa.array([2.5, None, 6.5], type=pa.float32())) + assert x0.equals(x1) + assert x0.equals(x2) + assert y0.equals(y1) + assert y0.equals(y2) + + for invalid_index in [None, pa.int16()]: + with pytest.raises(TypeError): + a.flattened_field(invalid_index) + + for invalid_index in [3, -3]: + with pytest.raises(IndexError): + a.flattened_field(invalid_index) + + for invalid_name in ['z', '']: + with pytest.raises(KeyError): + a.flattened_field(invalid_name) + + def test_empty_cast(): types = [ pa.null(), From 06340ba77e5f5fe069fb7e25c7f80e3cfd32f50d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 15 Dec 2022 12:30:33 +0100 Subject: [PATCH 15/15] flattened_field -> _flattened_field --- python/pyarrow/array.pxi | 15 ++++++--------- python/pyarrow/table.pxi | 6 +++--- python/pyarrow/tests/test_array.py | 18 +++++++++--------- 3 files changed, 18 insertions(+), 21 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index cbb7cb3b861..5772592ead1 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1409,7 +1409,7 @@ cdef class Array(_PandasConvertible): Which order to sort values in. Accepted values are "ascending", "descending". **kwargs : dict, optional - Additional sorting options + Additional sorting options. As allowed by :class:`SortOptions` Returns @@ -2664,7 +2664,7 @@ cdef class StructArray(Array): return pyarrow_wrap_array(child) - def flattened_field(self, index, MemoryPool memory_pool=None): + def _flattened_field(self, index, MemoryPool memory_pool=None): """ Retrieves the child array belonging to field, accounting for the parent array null bitmap. @@ -2689,15 +2689,12 @@ cdef class StructArray(Array): int_index = self.type.get_field_index(index) if int_index < 0: raise KeyError(index) - child = GetResultValue(arr.GetFlattenedField(int_index, pool)) elif isinstance(index, int): - child = GetResultValue(arr.GetFlattenedField( - _normalize_index(index, self.ap.num_fields()), - pool - )) + int_index = _normalize_index(index, self.ap.num_fields()) else: raise TypeError('Expected integer or string index') + child = GetResultValue(arr.GetFlattenedField(int_index, pool)) return pyarrow_wrap_array(child) def flatten(self, MemoryPool memory_pool=None): @@ -2815,7 +2812,7 @@ cdef class StructArray(Array): If to sort the array by one of its fields or by the whole array. **kwargs : dict, optional - Additional sorting options + Additional sorting options. As allowed by :class:`SortOptions` Returns @@ -2823,7 +2820,7 @@ cdef class StructArray(Array): result : StructArray """ if by is not None: - tosort = self.flattened_field(by) + tosort = self._flattened_field(by) else: tosort = self indices = _pc().sort_indices( diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index de5dfc97200..5fba3cbfb14 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -1049,7 +1049,7 @@ cdef class ChunkedArray(_PandasConvertible): Which order to sort values in. Accepted values are "ascending", "descending". **kwargs : dict, optional - Additional sorting options + Additional sorting options. As allowed by :class:`SortOptions` Returns @@ -2266,7 +2266,7 @@ cdef class RecordBatch(_PandasConvertible): each entry is a tuple with column name and sorting order ("ascending" or "descending") **kwargs : dict, optional - Additional sorting options + Additional sorting options. As allowed by :class:`SortOptions` Returns @@ -4728,7 +4728,7 @@ cdef class Table(_PandasConvertible): each entry is a tuple with column name and sorting order ("ascending" or "descending") **kwargs : dict, optional - Additional sorting options + Additional sorting options. As allowed by :class:`SortOptions` Returns diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 63547b2efba..b00e72e1bbc 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2922,12 +2922,12 @@ def test_struct_array_flattened_field(): a = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty, mask=pa.array([False, True, False])) - x0 = a.flattened_field(0) - y0 = a.flattened_field(1) - x1 = a.flattened_field(-2) - y1 = a.flattened_field(-1) - x2 = a.flattened_field('x') - y2 = a.flattened_field('y') + x0 = a._flattened_field(0) + y0 = a._flattened_field(1) + x1 = a._flattened_field(-2) + y1 = a._flattened_field(-1) + x2 = a._flattened_field('x') + y2 = a._flattened_field('y') assert isinstance(x0, pa.lib.Int16Array) assert isinstance(y1, pa.lib.FloatArray) @@ -2940,15 +2940,15 @@ def test_struct_array_flattened_field(): for invalid_index in [None, pa.int16()]: with pytest.raises(TypeError): - a.flattened_field(invalid_index) + a._flattened_field(invalid_index) for invalid_index in [3, -3]: with pytest.raises(IndexError): - a.flattened_field(invalid_index) + a._flattened_field(invalid_index) for invalid_name in ['z', '']: with pytest.raises(KeyError): - a.flattened_field(invalid_name) + a._flattened_field(invalid_name) def test_empty_cast():