86 changes: 86 additions & 0 deletions python/pyarrow/array.pxi
@@ -1399,6 +1399,29 @@ cdef class Array(_PandasConvertible):
"""
return _pc().index(self, value, start, end, memory_pool=memory_pool)

def sort(self, order="ascending", **kwargs):
"""
Sort the Array

Parameters
----------
order : str, default "ascending"
Which order to sort values in.
Accepted values are "ascending", "descending".
**kwargs : dict, optional
Additional sorting options,
as allowed by :class:`SortOptions`.

Returns
-------
result : Array
"""
indices = _pc().sort_indices(
self,
options=_pc().SortOptions(sort_keys=[("", order)], **kwargs)
)
return self.take(indices)
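
# A minimal usage sketch for the new Array.sort (assuming a pyarrow build
# that includes this patch; the values are illustrative):
import pyarrow as pa

arr = pa.array([3, 1, 2])
# Internally this computes sort indices via pyarrow.compute.sort_indices
# and applies them with take().
assert arr.sort("descending").to_pylist() == [3, 2, 1]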

def _to_pandas(self, options, types_mapper=None, **kwargs):
return _array_like_to_pandas(self, options, types_mapper=types_mapper)

@@ -2641,6 +2664,39 @@ cdef class StructArray(Array):

return pyarrow_wrap_array(child)

def _flattened_field(self, index, MemoryPool memory_pool=None):
"""
Retrieve the child array belonging to the given field,
taking the parent array's null bitmap into account.

Parameters
----------
index : Union[int, str]
Index / position or name of the field.
memory_pool : MemoryPool, default None
For memory allocations, if required, otherwise use default pool.

Returns
-------
result : Array
"""
cdef:
CStructArray* arr = <CStructArray*> self.ap
shared_ptr[CArray] child
CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)

if isinstance(index, (bytes, str)):
int_index = self.type.get_field_index(index)
if int_index < 0:
raise KeyError(index)
elif isinstance(index, int):
int_index = _normalize_index(index, self.ap.num_fields())
else:
raise TypeError('Expected integer or string index')

child = GetResultValue(arr.GetFlattenedField(int_index, pool))
return pyarrow_wrap_array(child)
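
# A sketch of the behaviour, mirroring the test added below
# (illustrative values; the key point is that parent nulls mask the child):
import pyarrow as pa

ty = pa.struct([pa.field('x', pa.int16()), pa.field('y', pa.float32())])
a = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty,
             mask=pa.array([False, True, False]))
# The second struct is null, so the flattened child is null there too.
assert a._flattened_field('x').to_pylist() == [1, None, 5]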

def flatten(self, MemoryPool memory_pool=None):
"""
Return one individual array for each field in the struct.
@@ -2743,6 +2799,36 @@ cdef class StructArray(Array):
result.validate()
return result

def sort(self, order="ascending", by=None, **kwargs):
"""
Sort the StructArray

Parameters
----------
order : str, default "ascending"
Which order to sort values in.
Accepted values are "ascending", "descending".
by : str or None, default None
Name of the struct field to sort the array by.
If None, sort by the struct values as a whole.
**kwargs : dict, optional
Additional sorting options,
as allowed by :class:`SortOptions`.

Returns
-------
result : StructArray
"""
if by is not None:
tosort = self._flattened_field(by)
else:
tosort = self
indices = _pc().sort_indices(
tosort,
options=_pc().SortOptions(sort_keys=[("", order)], **kwargs)
)
return self.take(indices)
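
# A short sketch of sorting a StructArray by one of its fields
# (illustrative data; assumes this patch is applied):
import pyarrow as pa

arr = pa.StructArray.from_arrays(
    [pa.array([2, 1]), pa.array(["b", "a"])], names=["a", "b"])
assert arr.sort(by="a").to_pylist() == [
    {"a": 1, "b": "a"},
    {"a": 2, "b": "b"},
]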


cdef class ExtensionArray(Array):
"""
1 change: 1 addition & 0 deletions python/pyarrow/includes/libarrow.pxd
@@ -766,6 +766,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:

shared_ptr[CArray] field(int pos)
shared_ptr[CArray] GetFieldByName(const c_string& name) const
CResult[shared_ptr[CArray]] GetFlattenedField(int index, CMemoryPool* pool) const

CResult[vector[shared_ptr[CArray]]] Flatten(CMemoryPool* pool)

59 changes: 57 additions & 2 deletions python/pyarrow/table.pxi
@@ -1039,6 +1039,29 @@ cdef class ChunkedArray(_PandasConvertible):
"""
return _pc().drop_null(self)

def sort(self, order="ascending", **kwargs):
"""
Sort the ChunkedArray

Parameters
----------
order : str, default "ascending"
Which order to sort values in.
Accepted values are "ascending", "descending".
**kwargs : dict, optional
Additional sorting options,
as allowed by :class:`SortOptions`.

Returns
-------
result : ChunkedArray
"""
indices = _pc().sort_indices(
self,
options=_pc().SortOptions(sort_keys=[("", order)], **kwargs)
)
return self.take(indices)
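
# A minimal sketch for ChunkedArray.sort (illustrative chunks;
# the result is a new ChunkedArray):
import pyarrow as pa

ca = pa.chunked_array([[3, 1], [2]])
assert ca.sort().to_pylist() == [1, 2, 3]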

def unify_dictionaries(self, MemoryPool memory_pool=None):
"""
Unify dictionaries across all chunks.
@@ -2231,6 +2254,35 @@ cdef class RecordBatch(_PandasConvertible):
"""
return _pc().drop_null(self)

def sort_by(self, sorting, **kwargs):
"""
Sort the RecordBatch by one or multiple columns.

Parameters
----------
sorting : str or list[tuple(name, order)]
Name of the column to use to sort (ascending), or
a list of multiple sorting conditions where
each entry is a tuple with column name
and sorting order ("ascending" or "descending")
**kwargs : dict, optional
Additional sorting options,
as allowed by :class:`SortOptions`.

Returns
-------
RecordBatch
A new record batch sorted according to the sort keys.
"""
if isinstance(sorting, str):
sorting = [(sorting, "ascending")]

indices = _pc().sort_indices(
self,
options=_pc().SortOptions(sort_keys=sorting, **kwargs)
)
return self.take(indices)
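
# A sketch of the RecordBatch counterpart (illustrative columns;
# a plain column name sorts ascending):
import pyarrow as pa

batch = pa.record_batch(
    [pa.array([2, 1, 3]), pa.array(["b", "a", "c"])], names=["x", "y"])
assert batch.sort_by("x").to_pydict() == {
    "x": [1, 2, 3], "y": ["a", "b", "c"]}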

def to_pydict(self):
"""
Convert the RecordBatch to a dict or OrderedDict.
@@ -4664,7 +4716,7 @@ cdef class Table(_PandasConvertible):
"""
return TableGroupBy(self, keys)

def sort_by(self, sorting):
def sort_by(self, sorting, **kwargs):
"""
Sort the table by one or multiple columns.

@@ -4675,6 +4727,9 @@
a list of multiple sorting conditions where
each entry is a tuple with column name
and sorting order ("ascending" or "descending")
**kwargs : dict, optional
Additional sorting options,
as allowed by :class:`SortOptions`.

Returns
-------
@@ -4705,7 +4760,7 @@ cdef class Table(_PandasConvertible):

indices = _pc().sort_indices(
self,
sort_keys=sorting
options=_pc().SortOptions(sort_keys=sorting, **kwargs)
)
return self.take(indices)
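
# With the **kwargs pass-through, SortOptions settings such as
# null_placement can now be forwarded (a sketch, assuming this patch):
import pyarrow as pa

tab = pa.table({"a": [2, None, 1]})
assert tab.sort_by("a", null_placement="at_start").to_pydict() == {
    "a": [None, 1, 2]}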

89 changes: 89 additions & 0 deletions python/pyarrow/tests/test_array.py
@@ -2916,6 +2916,41 @@ def test_struct_array_field():
a.field(invalid_name)


def test_struct_array_flattened_field():
ty = pa.struct([pa.field('x', pa.int16()),
pa.field('y', pa.float32())])
a = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty,
mask=pa.array([False, True, False]))

x0 = a._flattened_field(0)
y0 = a._flattened_field(1)
x1 = a._flattened_field(-2)
y1 = a._flattened_field(-1)
x2 = a._flattened_field('x')
y2 = a._flattened_field('y')

assert isinstance(x0, pa.lib.Int16Array)
assert isinstance(y1, pa.lib.FloatArray)
assert x0.equals(pa.array([1, None, 5], type=pa.int16()))
assert y0.equals(pa.array([2.5, None, 6.5], type=pa.float32()))
assert x0.equals(x1)
assert x0.equals(x2)
assert y0.equals(y1)
assert y0.equals(y2)

for invalid_index in [None, pa.int16()]:
with pytest.raises(TypeError):
a._flattened_field(invalid_index)

for invalid_index in [3, -3]:
with pytest.raises(IndexError):
a._flattened_field(invalid_index)

for invalid_name in ['z', '']:
with pytest.raises(KeyError):
a._flattened_field(invalid_name)


def test_empty_cast():
types = [
pa.null(),
@@ -3287,3 +3322,57 @@ def test_to_pandas_timezone():
arr = pa.chunked_array([arr])
s = arr.to_pandas()
assert s.dt.tz is not None


def test_array_sort():
arr = pa.array([5, 7, 35], type=pa.int64())
sorted_arr = arr.sort("descending")
assert sorted_arr.to_pylist() == [35, 7, 5]

arr = pa.chunked_array([[1, 2, 3], [4, 5, 6]])
sorted_arr = arr.sort("descending")
assert sorted_arr.to_pylist() == [6, 5, 4, 3, 2, 1]

arr = pa.array([5, 7, 35, None], type=pa.int64())
sorted_arr = arr.sort("descending", null_placement="at_end")
assert sorted_arr.to_pylist() == [35, 7, 5, None]
sorted_arr = arr.sort("descending", null_placement="at_start")
assert sorted_arr.to_pylist() == [None, 35, 7, 5]


def test_struct_array_sort():
arr = pa.StructArray.from_arrays([
pa.array([5, 7, 7, 35], type=pa.int64()),
pa.array(["foo", "car", "bar", "foobar"])
], names=["a", "b"])

sorted_arr = arr.sort("descending", by="a")
assert sorted_arr.to_pylist() == [
{"a": 35, "b": "foobar"},
{"a": 7, "b": "car"},
{"a": 7, "b": "bar"},
{"a": 5, "b": "foo"},
]

arr_with_nulls = pa.StructArray.from_arrays([
pa.array([5, 7, 7, 35], type=pa.int64()),
pa.array(["foo", "car", "bar", "foobar"])
], names=["a", "b"], mask=pa.array([False, False, True, False]))

sorted_arr = arr_with_nulls.sort(
"descending", by="a", null_placement="at_start")
assert sorted_arr.to_pylist() == [
None,
{"a": 35, "b": "foobar"},
{"a": 7, "b": "car"},
{"a": 5, "b": "foo"},
]

sorted_arr = arr_with_nulls.sort(
"descending", by="a", null_placement="at_end")
assert sorted_arr.to_pylist() == [
{"a": 35, "b": "foobar"},
{"a": 7, "b": "car"},
{"a": 5, "b": "foo"},
None
]
69 changes: 52 additions & 17 deletions python/pyarrow/tests/test_table.py
@@ -2025,23 +2025,6 @@ def sorted_by_keys(d):
}


def test_table_sort_by():
table = pa.table([
pa.array([3, 1, 4, 2, 5]),
pa.array(["b", "a", "b", "a", "c"]),
], names=["values", "keys"])

assert table.sort_by("values").to_pydict() == {
"keys": ["a", "a", "b", "b", "c"],
"values": [1, 2, 3, 4, 5]
}

assert table.sort_by([("values", "descending")]).to_pydict() == {
"keys": ["c", "b", "b", "a", "a"],
"values": [5, 4, 3, 2, 1]
}


def test_table_to_recordbatchreader():
table = pa.Table.from_pydict({'x': [1, 2, 3]})
reader = table.to_reader()
@@ -2204,3 +2187,55 @@ def test_table_cast_invalid():

table = pa.table({'a': [None, 1], 'b': [False, True]})
assert table.cast(new_schema).schema == new_schema


def test_table_sort_by():
table = pa.table([
pa.array([3, 1, 4, 2, 5]),
pa.array(["b", "a", "b", "a", "c"]),
], names=["values", "keys"])

assert table.sort_by("values").to_pydict() == {
"keys": ["a", "a", "b", "b", "c"],
"values": [1, 2, 3, 4, 5]
}

assert table.sort_by([("values", "descending")]).to_pydict() == {
"keys": ["c", "b", "b", "a", "a"],
"values": [5, 4, 3, 2, 1]
}

tab = pa.Table.from_arrays([
pa.array([5, 7, 7, 35], type=pa.int64()),
pa.array(["foo", "car", "bar", "foobar"])
], names=["a", "b"])

sorted_tab = tab.sort_by([("a", "descending")])
sorted_tab_dict = sorted_tab.to_pydict()
assert sorted_tab_dict["a"] == [35, 7, 7, 5]
assert sorted_tab_dict["b"] == ["foobar", "car", "bar", "foo"]

sorted_tab = tab.sort_by([("a", "ascending")])
sorted_tab_dict = sorted_tab.to_pydict()
assert sorted_tab_dict["a"] == [5, 7, 7, 35]
assert sorted_tab_dict["b"] == ["foo", "car", "bar", "foobar"]


def test_record_batch_sort():
rb = pa.RecordBatch.from_arrays([
pa.array([7, 35, 7, 5], type=pa.int64()),
pa.array([4, 1, 3, 2], type=pa.int64()),
pa.array(["foo", "car", "bar", "foobar"])
], names=["a", "b", "c"])

sorted_rb = rb.sort_by([("a", "descending"), ("b", "descending")])
sorted_rb_dict = sorted_rb.to_pydict()
assert sorted_rb_dict["a"] == [35, 7, 7, 5]
assert sorted_rb_dict["b"] == [1, 4, 3, 2]
assert sorted_rb_dict["c"] == ["car", "foo", "bar", "foobar"]

sorted_rb = rb.sort_by([("a", "ascending"), ("b", "ascending")])
sorted_rb_dict = sorted_rb.to_pydict()
assert sorted_rb_dict["a"] == [5, 7, 7, 35]
assert sorted_rb_dict["b"] == [2, 3, 4, 1]
assert sorted_rb_dict["c"] == ["foobar", "bar", "foo", "car"]