diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 86d1f0e39cf..5772592ead1 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -1399,6 +1399,29 @@ cdef class Array(_PandasConvertible):
         """
         return _pc().index(self, value, start, end, memory_pool=memory_pool)
 
+    def sort(self, order="ascending", **kwargs):
+        """
+        Sort the Array.
+
+        Parameters
+        ----------
+        order : str, default "ascending"
+            Which order to sort values in.
+            Accepted values are "ascending", "descending".
+        **kwargs : dict, optional
+            Additional sorting options,
+            as allowed by :class:`SortOptions`.
+
+        Returns
+        -------
+        result : Array
+        """
+        indices = _pc().sort_indices(
+            self,
+            options=_pc().SortOptions(sort_keys=[("", order)], **kwargs)
+        )
+        return self.take(indices)
+
     def _to_pandas(self, options, types_mapper=None, **kwargs):
         return _array_like_to_pandas(self, options, types_mapper=types_mapper)
 
@@ -2641,6 +2664,39 @@ cdef class StructArray(Array):
 
         return pyarrow_wrap_array(child)
 
+    def _flattened_field(self, index, MemoryPool memory_pool=None):
+        """
+        Retrieve the child array belonging to the given field,
+        taking the parent array's null bitmap into account.
+
+        Parameters
+        ----------
+        index : Union[int, str]
+            Index / position or name of the field.
+        memory_pool : MemoryPool, default None
+            For memory allocations, if required, otherwise use default pool.
+
+        Returns
+        -------
+        result : Array
+        """
+        cdef:
+            CStructArray* arr = self.ap
+            shared_ptr[CArray] child
+            CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
+
+        if isinstance(index, (bytes, str)):
+            int_index = self.type.get_field_index(index)
+            if int_index < 0:
+                raise KeyError(index)
+        elif isinstance(index, int):
+            int_index = _normalize_index(index, self.ap.num_fields())
+        else:
+            raise TypeError('Expected integer or string index')
+
+        child = GetResultValue(arr.GetFlattenedField(int_index, pool))
+        return pyarrow_wrap_array(child)
+
     def flatten(self, MemoryPool memory_pool=None):
         """
         Return one individual array for each field in the struct.
@@ -2743,6 +2799,36 @@ cdef class StructArray(Array):
         result.validate()
         return result
 
+    def sort(self, order="ascending", by=None, **kwargs):
+        """
+        Sort the StructArray.
+
+        Parameters
+        ----------
+        order : str, default "ascending"
+            Which order to sort values in.
+            Accepted values are "ascending", "descending".
+        by : str or None, default None
+            Whether to sort the array by one of its fields
+            or by the whole array.
+        **kwargs : dict, optional
+            Additional sorting options,
+            as allowed by :class:`SortOptions`.
+
+        Returns
+        -------
+        result : StructArray
+        """
+        if by is not None:
+            tosort = self._flattened_field(by)
+        else:
+            tosort = self
+        indices = _pc().sort_indices(
+            tosort,
+            options=_pc().SortOptions(sort_keys=[("", order)], **kwargs)
+        )
+        return self.take(indices)
+
 
 cdef class ExtensionArray(Array):
     """
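Usage sketch (not part of the diff) for the array.pxi additions above. The sample values and the results in the comments are illustrative and assume SortOptions' default null_placement of "at_end".

import pyarrow as pa

arr = pa.array([3, 1, None, 2])
arr.sort()              # ascending by default -> [1, 2, 3, null]
arr.sort("descending")  # -> [3, 2, 1, null]
arr.sort("descending", null_placement="at_start")  # kwargs forwarded to SortOptions

struct = pa.StructArray.from_arrays(
    [pa.array([2, 1]), pa.array(["b", "a"])], names=["x", "y"])
struct.sort(by="x")     # sort the rows by the flattened field "x"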
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 9cea340a309..8baf1a7f80a 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -766,6 +766,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         shared_ptr[CArray] field(int pos)
         shared_ptr[CArray] GetFieldByName(const c_string& name) const
 
+        CResult[shared_ptr[CArray]] GetFlattenedField(int index, CMemoryPool* pool) const
 
         CResult[vector[shared_ptr[CArray]]] Flatten(CMemoryPool* pool)
 
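A small sketch (not part of the diff) of what the GetFlattenedField binding above enables through StructArray._flattened_field, compared with the existing .field(); the values are illustrative.

import pyarrow as pa

ty = pa.struct([pa.field('x', pa.int16()), pa.field('y', pa.float32())])
a = pa.array([(1, 2.5), (3, 4.5)], type=ty, mask=pa.array([False, True]))

a.field('x')              # raw child values: [1, 3]
a._flattened_field('x')   # parent null bitmap applied: [1, null]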
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 5c58ae61f19..5fba3cbfb14 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -1039,6 +1039,29 @@ cdef class ChunkedArray(_PandasConvertible):
         """
         return _pc().drop_null(self)
 
+    def sort(self, order="ascending", **kwargs):
+        """
+        Sort the ChunkedArray.
+
+        Parameters
+        ----------
+        order : str, default "ascending"
+            Which order to sort values in.
+            Accepted values are "ascending", "descending".
+        **kwargs : dict, optional
+            Additional sorting options,
+            as allowed by :class:`SortOptions`.
+
+        Returns
+        -------
+        result : ChunkedArray
+        """
+        indices = _pc().sort_indices(
+            self,
+            options=_pc().SortOptions(sort_keys=[("", order)], **kwargs)
+        )
+        return self.take(indices)
+
     def unify_dictionaries(self, MemoryPool memory_pool=None):
         """
         Unify dictionaries across all chunks.
@@ -2231,6 +2254,35 @@ cdef class RecordBatch(_PandasConvertible):
         """
         return _pc().drop_null(self)
 
+    def sort_by(self, sorting, **kwargs):
+        """
+        Sort the RecordBatch by one or multiple columns.
+
+        Parameters
+        ----------
+        sorting : str or list[tuple(name, order)]
+            Name of the column to use to sort (ascending), or
+            a list of multiple sorting conditions where
+            each entry is a tuple with column name
+            and sorting order ("ascending" or "descending")
+        **kwargs : dict, optional
+            Additional sorting options,
+            as allowed by :class:`SortOptions`.
+
+        Returns
+        -------
+        RecordBatch
+            A new record batch sorted according to the sort keys.
+        """
+        if isinstance(sorting, str):
+            sorting = [(sorting, "ascending")]
+
+        indices = _pc().sort_indices(
+            self,
+            options=_pc().SortOptions(sort_keys=sorting, **kwargs)
+        )
+        return self.take(indices)
+
     def to_pydict(self):
         """
         Convert the RecordBatch to a dict or OrderedDict.
@@ -4664,7 +4716,7 @@ cdef class Table(_PandasConvertible):
         """
         return TableGroupBy(self, keys)
 
-    def sort_by(self, sorting):
+    def sort_by(self, sorting, **kwargs):
         """
         Sort the table by one or multiple columns.
 
@@ -4675,6 +4727,9 @@ cdef class Table(_PandasConvertible):
             a list of multiple sorting conditions where
             each entry is a tuple with column name
             and sorting order ("ascending" or "descending")
+        **kwargs : dict, optional
+            Additional sorting options,
+            as allowed by :class:`SortOptions`.
 
         Returns
         -------
@@ -4705,7 +4760,7 @@ cdef class Table(_PandasConvertible):
 
         indices = _pc().sort_indices(
             self,
-            sort_keys=sorting
+            options=_pc().SortOptions(sort_keys=sorting, **kwargs)
         )
         return self.take(indices)
 
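Usage sketch (not part of the diff) for the table.pxi additions. Extra keyword arguments such as null_placement are not new parameters of sort_by itself; they are simply forwarded to SortOptions.

import pyarrow as pa

batch = pa.RecordBatch.from_arrays(
    [pa.array([3, 1, 2]), pa.array(["x", "y", "z"])], names=["a", "b"])
batch.sort_by("a")                    # single column name, ascending
batch.sort_by([("a", "descending")])  # explicit (name, order) sort keys

table = pa.table({"a": [2, None, 1]})
table.sort_by([("a", "ascending")], null_placement="at_start")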
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 65604668918..b00e72e1bbc 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -2916,6 +2916,41 @@ def test_struct_array_field():
             a.field(invalid_name)
 
+
+def test_struct_array_flattened_field():
+    ty = pa.struct([pa.field('x', pa.int16()),
+                    pa.field('y', pa.float32())])
+    a = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty,
+                 mask=pa.array([False, True, False]))
+
+    x0 = a._flattened_field(0)
+    y0 = a._flattened_field(1)
+    x1 = a._flattened_field(-2)
+    y1 = a._flattened_field(-1)
+    x2 = a._flattened_field('x')
+    y2 = a._flattened_field('y')
+
+    assert isinstance(x0, pa.lib.Int16Array)
+    assert isinstance(y1, pa.lib.FloatArray)
+    assert x0.equals(pa.array([1, None, 5], type=pa.int16()))
+    assert y0.equals(pa.array([2.5, None, 6.5], type=pa.float32()))
+    assert x0.equals(x1)
+    assert x0.equals(x2)
+    assert y0.equals(y1)
+    assert y0.equals(y2)
+
+    for invalid_index in [None, pa.int16()]:
+        with pytest.raises(TypeError):
+            a._flattened_field(invalid_index)
+
+    for invalid_index in [3, -3]:
+        with pytest.raises(IndexError):
+            a._flattened_field(invalid_index)
+
+    for invalid_name in ['z', '']:
+        with pytest.raises(KeyError):
+            a._flattened_field(invalid_name)
+
 
 def test_empty_cast():
     types = [
         pa.null(),
@@ -3287,3 +3322,57 @@ def test_to_pandas_timezone():
     arr = pa.chunked_array([arr])
     s = arr.to_pandas()
     assert s.dt.tz is not None
+
+
+def test_array_sort():
+    arr = pa.array([5, 7, 35], type=pa.int64())
+    sorted_arr = arr.sort("descending")
+    assert sorted_arr.to_pylist() == [35, 7, 5]
+
+    arr = pa.chunked_array([[1, 2, 3], [4, 5, 6]])
+    sorted_arr = arr.sort("descending")
+    assert sorted_arr.to_pylist() == [6, 5, 4, 3, 2, 1]
+
+    arr = pa.array([5, 7, 35, None], type=pa.int64())
+    sorted_arr = arr.sort("descending", null_placement="at_end")
+    assert sorted_arr.to_pylist() == [35, 7, 5, None]
+    sorted_arr = arr.sort("descending", null_placement="at_start")
+    assert sorted_arr.to_pylist() == [None, 35, 7, 5]
+
+
+def test_struct_array_sort():
+    arr = pa.StructArray.from_arrays([
+        pa.array([5, 7, 7, 35], type=pa.int64()),
+        pa.array(["foo", "car", "bar", "foobar"])
+    ], names=["a", "b"])
+
+    sorted_arr = arr.sort("descending", by="a")
+    assert sorted_arr.to_pylist() == [
+        {"a": 35, "b": "foobar"},
+        {"a": 7, "b": "car"},
+        {"a": 7, "b": "bar"},
+        {"a": 5, "b": "foo"},
+    ]
+
+    arr_with_nulls = pa.StructArray.from_arrays([
+        pa.array([5, 7, 7, 35], type=pa.int64()),
+        pa.array(["foo", "car", "bar", "foobar"])
+    ], names=["a", "b"], mask=pa.array([False, False, True, False]))
+
+    sorted_arr = arr_with_nulls.sort(
+        "descending", by="a", null_placement="at_start")
+    assert sorted_arr.to_pylist() == [
+        None,
+        {"a": 35, "b": "foobar"},
+        {"a": 7, "b": "car"},
+        {"a": 5, "b": "foo"},
+    ]
+
+    sorted_arr = arr_with_nulls.sort(
+        "descending", by="a", null_placement="at_end")
+    assert sorted_arr.to_pylist() == [
+        {"a": 35, "b": "foobar"},
+        {"a": 7, "b": "car"},
+        {"a": 5, "b": "foo"},
+        None
+    ]
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py
index fad1c0acb24..04e2dacc481 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -2025,23 +2025,6 @@ def sorted_by_keys(d):
     }
 
 
-def test_table_sort_by():
-    table = pa.table([
-        pa.array([3, 1, 4, 2, 5]),
-        pa.array(["b", "a", "b", "a", "c"]),
-    ], names=["values", "keys"])
-
-    assert table.sort_by("values").to_pydict() == {
-        "keys": ["a", "a", "b", "b", "c"],
-        "values": [1, 2, 3, 4, 5]
-    }
-
-    assert table.sort_by([("values", "descending")]).to_pydict() == {
-        "keys": ["c", "b", "b", "a", "a"],
-        "values": [5, 4, 3, 2, 1]
-    }
-
-
 def test_table_to_recordbatchreader():
     table = pa.Table.from_pydict({'x': [1, 2, 3]})
     reader = table.to_reader()
@@ -2204,3 +2187,55 @@ def test_table_cast_invalid():
     table = pa.table({'a': [None, 1], 'b': [False, True]})
 
     assert table.cast(new_schema).schema == new_schema
+
+
+def test_table_sort_by():
+    table = pa.table([
+        pa.array([3, 1, 4, 2, 5]),
+        pa.array(["b", "a", "b", "a", "c"]),
+    ], names=["values", "keys"])
+
+    assert table.sort_by("values").to_pydict() == {
+        "keys": ["a", "a", "b", "b", "c"],
+        "values": [1, 2, 3, 4, 5]
+    }
+
+    assert table.sort_by([("values", "descending")]).to_pydict() == {
+        "keys": ["c", "b", "b", "a", "a"],
+        "values": [5, 4, 3, 2, 1]
+    }
+
+    tab = pa.Table.from_arrays([
+        pa.array([5, 7, 7, 35], type=pa.int64()),
+        pa.array(["foo", "car", "bar", "foobar"])
+    ], names=["a", "b"])
+
+    sorted_tab = tab.sort_by([("a", "descending")])
+    sorted_tab_dict = sorted_tab.to_pydict()
+    assert sorted_tab_dict["a"] == [35, 7, 7, 5]
+    assert sorted_tab_dict["b"] == ["foobar", "car", "bar", "foo"]
+
+    sorted_tab = tab.sort_by([("a", "ascending")])
+    sorted_tab_dict = sorted_tab.to_pydict()
+    assert sorted_tab_dict["a"] == [5, 7, 7, 35]
+    assert sorted_tab_dict["b"] == ["foo", "car", "bar", "foobar"]
+
+
+def test_record_batch_sort():
+    rb = pa.RecordBatch.from_arrays([
+        pa.array([7, 35, 7, 5], type=pa.int64()),
+        pa.array([4, 1, 3, 2], type=pa.int64()),
+        pa.array(["foo", "car", "bar", "foobar"])
+    ], names=["a", "b", "c"])
+
+    sorted_rb = rb.sort_by([("a", "descending"), ("b", "descending")])
+    sorted_rb_dict = sorted_rb.to_pydict()
+    assert sorted_rb_dict["a"] == [35, 7, 7, 5]
+    assert sorted_rb_dict["b"] == [1, 4, 3, 2]
+    assert sorted_rb_dict["c"] == ["car", "foo", "bar", "foobar"]
+
+    sorted_rb = rb.sort_by([("a", "ascending"), ("b", "ascending")])
+    sorted_rb_dict = sorted_rb.to_pydict()
+    assert sorted_rb_dict["a"] == [5, 7, 7, 35]
+    assert sorted_rb_dict["b"] == [2, 3, 4, 1]
+    assert sorted_rb_dict["c"] == ["foobar", "bar", "foo", "car"]
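One last sketch (not part of the diff): the kwargs forwarding also applies to ChunkedArray.sort, which the tests above exercise only without options; the results in the comments are illustrative.

import pyarrow as pa

chunked = pa.chunked_array([[3, None], [1, 2]])
chunked.sort()                                        # -> [1, 2, 3, null]
chunked.sort("ascending", null_placement="at_start")  # -> [null, 1, 2, 3]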