86 changes: 86 additions & 0 deletions python/pyarrow/array.pxi
@@ -1399,6 +1399,29 @@ cdef class Array(_PandasConvertible):
"""
return _pc().index(self, value, start, end, memory_pool=memory_pool)

def sort(self, order="ascending", **kwargs):
"""
Sort the Array

Parameters
----------
order : str, default "ascending"
Which order to sort values in.
Accepted values are "ascending", "descending".
**kwargs : dict, optional
Additional sorting options,
as allowed by :class:`SortOptions`.

Returns
-------
result : Array
"""
indices = _pc().sort_indices(
self,
options=_pc().SortOptions(sort_keys=[("", order)], **kwargs)
)
return self.take(indices)
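
# A minimal usage sketch for the new Array.sort (assuming a pyarrow build
# that includes this patch; the values are illustrative):
import pyarrow as pa

arr = pa.array([3, 1, 2])
# Internally this computes sort indices via pyarrow.compute.sort_indices
# and applies them with take().
assert arr.sort("descending").to_pylist() == [3, 2, 1]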

def _to_pandas(self, options, types_mapper=None, **kwargs):
return _array_like_to_pandas(self, options, types_mapper=types_mapper)

@@ -2641,6 +2664,39 @@ cdef class StructArray(Array):

return pyarrow_wrap_array(child)

def _flattened_field(self, index, MemoryPool memory_pool=None):
"""
Retrieve the child array belonging to the given field,
taking the parent array's null bitmap into account.

Parameters
----------
index : Union[int, str]
Index / position or name of the field.
memory_pool : MemoryPool, default None
For memory allocations, if required, otherwise use default pool.

Returns
-------
result : Array
"""
cdef:
CStructArray* arr = <CStructArray*> self.ap
shared_ptr[CArray] child
CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)

if isinstance(index, (bytes, str)):
int_index = self.type.get_field_index(index)
if int_index < 0:
raise KeyError(index)
elif isinstance(index, int):
int_index = _normalize_index(index, self.ap.num_fields())
else:
raise TypeError('Expected integer or string index')

child = GetResultValue(arr.GetFlattenedField(int_index, pool))
return pyarrow_wrap_array(child)
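
# A sketch of the behaviour, mirroring the test added below
# (illustrative values; the key point is that parent nulls mask the child):
import pyarrow as pa

ty = pa.struct([pa.field('x', pa.int16()), pa.field('y', pa.float32())])
a = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty,
             mask=pa.array([False, True, False]))
# The second struct is null, so the flattened child is null there too.
assert a._flattened_field('x').to_pylist() == [1, None, 5]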

def flatten(self, MemoryPool memory_pool=None):
"""
Return one individual array for each field in the struct.
@@ -2743,6 +2799,36 @@ cdef class StructArray(Array):
result.validate()
return result

def sort(self, order="ascending", by=None, **kwargs):
"""
Sort the StructArray

Parameters
----------
order : str, default "ascending"
Which order to sort values in.
Accepted values are "ascending", "descending".
by : str or None, default None
Name of the struct field to sort the array by.
If None, sort by the struct values as a whole.
**kwargs : dict, optional
Additional sorting options,
as allowed by :class:`SortOptions`.

Returns
-------
result : StructArray
"""
if by is not None:
tosort = self._flattened_field(by)
else:
tosort = self
indices = _pc().sort_indices(
tosort,
options=_pc().SortOptions(sort_keys=[("", order)], **kwargs)
)
return self.take(indices)
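
# A short sketch of sorting a StructArray by one of its fields
# (illustrative data; assumes this patch is applied):
import pyarrow as pa

arr = pa.StructArray.from_arrays(
    [pa.array([2, 1]), pa.array(["b", "a"])], names=["a", "b"])
assert arr.sort(by="a").to_pylist() == [
    {"a": 1, "b": "a"},
    {"a": 2, "b": "b"},
]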


cdef class ExtensionArray(Array):
"""
1 change: 1 addition & 0 deletions python/pyarrow/includes/libarrow.pxd
@@ -766,6 +766,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:

shared_ptr[CArray] field(int pos)
shared_ptr[CArray] GetFieldByName(const c_string& name) const
CResult[shared_ptr[CArray]] GetFlattenedField(int index, CMemoryPool* pool) const

CResult[vector[shared_ptr[CArray]]] Flatten(CMemoryPool* pool)

59 changes: 57 additions & 2 deletions python/pyarrow/table.pxi
@@ -1039,6 +1039,29 @@ cdef class ChunkedArray(_PandasConvertible):
"""
return _pc().drop_null(self)

def sort(self, order="ascending", **kwargs):
"""
Sort the ChunkedArray

Parameters
----------
order : str, default "ascending"
Which order to sort values in.
Accepted values are "ascending", "descending".
**kwargs : dict, optional
Additional sorting options,
as allowed by :class:`SortOptions`.

Returns
-------
result : ChunkedArray
"""
indices = _pc().sort_indices(
self,
options=_pc().SortOptions(sort_keys=[("", order)], **kwargs)
)
return self.take(indices)
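
# A minimal sketch for ChunkedArray.sort (illustrative chunks;
# the result is a new ChunkedArray):
import pyarrow as pa

ca = pa.chunked_array([[3, 1], [2]])
assert ca.sort().to_pylist() == [1, 2, 3]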

def unify_dictionaries(self, MemoryPool memory_pool=None):
"""
Unify dictionaries across all chunks.
@@ -2231,6 +2254,35 @@ cdef class RecordBatch(_PandasConvertible):
"""
return _pc().drop_null(self)

def sort_by(self, sorting, **kwargs):
"""
Sort the RecordBatch by one or multiple columns.

Parameters
----------
sorting : str or list[tuple(name, order)]
Name of the column to use to sort (ascending), or
a list of multiple sorting conditions where
each entry is a tuple with column name
and sorting order ("ascending" or "descending")
**kwargs : dict, optional
Additional sorting options,
as allowed by :class:`SortOptions`.

Returns
-------
RecordBatch
A new record batch sorted according to the sort keys.
"""
if isinstance(sorting, str):
sorting = [(sorting, "ascending")]

indices = _pc().sort_indices(
self,
options=_pc().SortOptions(sort_keys=sorting, **kwargs)
)
return self.take(indices)
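
# A sketch of the RecordBatch counterpart (illustrative columns;
# a plain column name sorts ascending):
import pyarrow as pa

batch = pa.record_batch(
    [pa.array([2, 1, 3]), pa.array(["b", "a", "c"])], names=["x", "y"])
assert batch.sort_by("x").to_pydict() == {
    "x": [1, 2, 3], "y": ["a", "b", "c"]}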

def to_pydict(self):
"""
Convert the RecordBatch to a dict or OrderedDict.
@@ -4664,7 +4716,7 @@ cdef class Table(_PandasConvertible):
"""
return TableGroupBy(self, keys)

def sort_by(self, sorting):
def sort_by(self, sorting, **kwargs):
"""
Sort the table by one or multiple columns.

@@ -4675,6 +4727,9 @@
a list of multiple sorting conditions where
each entry is a tuple with column name
and sorting order ("ascending" or "descending")
**kwargs : dict, optional
Additional sorting options,
as allowed by :class:`SortOptions`.

Returns
-------
@@ -4705,7 +4760,7 @@ cdef class Table(_PandasConvertible):

indices = _pc().sort_indices(
self,
sort_keys=sorting
options=_pc().SortOptions(sort_keys=sorting, **kwargs)
)
return self.take(indices)
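
# With the **kwargs pass-through, SortOptions settings such as
# null_placement can now be forwarded (a sketch, assuming this patch):
import pyarrow as pa

tab = pa.table({"a": [2, None, 1]})
assert tab.sort_by("a", null_placement="at_start").to_pydict() == {
    "a": [None, 1, 2]}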

89 changes: 89 additions & 0 deletions python/pyarrow/tests/test_array.py
@@ -2916,6 +2916,41 @@ def test_struct_array_field():
a.field(invalid_name)


def test_struct_array_flattened_field():
ty = pa.struct([pa.field('x', pa.int16()),
pa.field('y', pa.float32())])
a = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty,
mask=pa.array([False, True, False]))

x0 = a._flattened_field(0)
y0 = a._flattened_field(1)
x1 = a._flattened_field(-2)
y1 = a._flattened_field(-1)
x2 = a._flattened_field('x')
y2 = a._flattened_field('y')

assert isinstance(x0, pa.lib.Int16Array)
assert isinstance(y1, pa.lib.FloatArray)
assert x0.equals(pa.array([1, None, 5], type=pa.int16()))
assert y0.equals(pa.array([2.5, None, 6.5], type=pa.float32()))
assert x0.equals(x1)
assert x0.equals(x2)
assert y0.equals(y1)
assert y0.equals(y2)

for invalid_index in [None, pa.int16()]:
with pytest.raises(TypeError):
a._flattened_field(invalid_index)

for invalid_index in [3, -3]:
with pytest.raises(IndexError):
a._flattened_field(invalid_index)

for invalid_name in ['z', '']:
with pytest.raises(KeyError):
a._flattened_field(invalid_name)


def test_empty_cast():
types = [
pa.null(),
@@ -3287,3 +3322,57 @@ def test_to_pandas_timezone():
arr = pa.chunked_array([arr])
s = arr.to_pandas()
assert s.dt.tz is not None


def test_array_sort():
arr = pa.array([5, 7, 35], type=pa.int64())
sorted_arr = arr.sort("descending")
assert sorted_arr.to_pylist() == [35, 7, 5]

arr = pa.chunked_array([[1, 2, 3], [4, 5, 6]])
sorted_arr = arr.sort("descending")
assert sorted_arr.to_pylist() == [6, 5, 4, 3, 2, 1]

arr = pa.array([5, 7, 35, None], type=pa.int64())
sorted_arr = arr.sort("descending", null_placement="at_end")
assert sorted_arr.to_pylist() == [35, 7, 5, None]
sorted_arr = arr.sort("descending", null_placement="at_start")
assert sorted_arr.to_pylist() == [None, 35, 7, 5]


def test_struct_array_sort():
arr = pa.StructArray.from_arrays([
pa.array([5, 7, 7, 35], type=pa.int64()),
pa.array(["foo", "car", "bar", "foobar"])
], names=["a", "b"])

sorted_arr = arr.sort("descending", by="a")
assert sorted_arr.to_pylist() == [
{"a": 35, "b": "foobar"},
{"a": 7, "b": "car"},
{"a": 7, "b": "bar"},
{"a": 5, "b": "foo"},
]

arr_with_nulls = pa.StructArray.from_arrays([
pa.array([5, 7, 7, 35], type=pa.int64()),
pa.array(["foo", "car", "bar", "foobar"])
], names=["a", "b"], mask=pa.array([False, False, True, False]))

sorted_arr = arr_with_nulls.sort(
"descending", by="a", null_placement="at_start")
assert sorted_arr.to_pylist() == [
None,
{"a": 35, "b": "foobar"},
{"a": 7, "b": "car"},
{"a": 5, "b": "foo"},
]

sorted_arr = arr_with_nulls.sort(
"descending", by="a", null_placement="at_end")
assert sorted_arr.to_pylist() == [
{"a": 35, "b": "foobar"},
{"a": 7, "b": "car"},
{"a": 5, "b": "foo"},
None
]
69 changes: 52 additions & 17 deletions python/pyarrow/tests/test_table.py
@@ -2025,23 +2025,6 @@ def sorted_by_keys(d):
}


def test_table_sort_by():
table = pa.table([
pa.array([3, 1, 4, 2, 5]),
pa.array(["b", "a", "b", "a", "c"]),
], names=["values", "keys"])

assert table.sort_by("values").to_pydict() == {
"keys": ["a", "a", "b", "b", "c"],
"values": [1, 2, 3, 4, 5]
}

assert table.sort_by([("values", "descending")]).to_pydict() == {
"keys": ["c", "b", "b", "a", "a"],
"values": [5, 4, 3, 2, 1]
}


def test_table_to_recordbatchreader():
table = pa.Table.from_pydict({'x': [1, 2, 3]})
reader = table.to_reader()
@@ -2204,3 +2187,55 @@ def test_table_cast_invalid():

table = pa.table({'a': [None, 1], 'b': [False, True]})
assert table.cast(new_schema).schema == new_schema


def test_table_sort_by():
table = pa.table([
pa.array([3, 1, 4, 2, 5]),
pa.array(["b", "a", "b", "a", "c"]),
], names=["values", "keys"])

assert table.sort_by("values").to_pydict() == {
"keys": ["a", "a", "b", "b", "c"],
"values": [1, 2, 3, 4, 5]
}

assert table.sort_by([("values", "descending")]).to_pydict() == {
"keys": ["c", "b", "b", "a", "a"],
"values": [5, 4, 3, 2, 1]
}

tab = pa.Table.from_arrays([
pa.array([5, 7, 7, 35], type=pa.int64()),
pa.array(["foo", "car", "bar", "foobar"])
], names=["a", "b"])

sorted_tab = tab.sort_by([("a", "descending")])
sorted_tab_dict = sorted_tab.to_pydict()
assert sorted_tab_dict["a"] == [35, 7, 7, 5]
assert sorted_tab_dict["b"] == ["foobar", "car", "bar", "foo"]

sorted_tab = tab.sort_by([("a", "ascending")])
sorted_tab_dict = sorted_tab.to_pydict()
assert sorted_tab_dict["a"] == [5, 7, 7, 35]
assert sorted_tab_dict["b"] == ["foo", "car", "bar", "foobar"]


def test_record_batch_sort():
rb = pa.RecordBatch.from_arrays([
pa.array([7, 35, 7, 5], type=pa.int64()),
pa.array([4, 1, 3, 2], type=pa.int64()),
pa.array(["foo", "car", "bar", "foobar"])
], names=["a", "b", "c"])

sorted_rb = rb.sort_by([("a", "descending"), ("b", "descending")])
sorted_rb_dict = sorted_rb.to_pydict()
assert sorted_rb_dict["a"] == [35, 7, 7, 5]
assert sorted_rb_dict["b"] == [1, 4, 3, 2]
assert sorted_rb_dict["c"] == ["car", "foo", "bar", "foobar"]

sorted_rb = rb.sort_by([("a", "ascending"), ("b", "ascending")])
sorted_rb_dict = sorted_rb.to_pydict()
assert sorted_rb_dict["a"] == [5, 7, 7, 35]
assert sorted_rb_dict["b"] == [2, 3, 4, 1]
assert sorted_rb_dict["c"] == ["foobar", "bar", "foo", "car"]