Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 18 additions & 9 deletions cpp/src/arrow/python/python_to_arrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -582,27 +582,30 @@ class StringConverter
// ----------------------------------------------------------------------
// Convert lists (NumPy arrays containing lists or ndarrays as values)

class ListConverter : public TypedConverter<ListType, ListConverter> {
template <typename TypeClass>
class ListConverter : public TypedConverter<TypeClass, ListConverter<TypeClass>> {
public:
using BuilderType = typename TypeTraits<TypeClass>::BuilderType;

explicit ListConverter(bool from_pandas, bool strict_conversions)
: from_pandas_(from_pandas), strict_conversions_(strict_conversions) {}

Status Init(ArrayBuilder* builder) {
builder_ = builder;
typed_builder_ = checked_cast<ListBuilder*>(builder);
this->builder_ = builder;
this->typed_builder_ = checked_cast<BuilderType*>(builder);

value_type_ = checked_cast<const ListType&>(*builder->type()).value_type();
value_type_ = checked_cast<const TypeClass&>(*builder->type()).value_type();
RETURN_NOT_OK(
GetConverter(value_type_, from_pandas_, strict_conversions_, &value_converter_));
return value_converter_->Init(typed_builder_->value_builder());
return value_converter_->Init(this->typed_builder_->value_builder());
}

template <int NUMPY_TYPE, typename Type>
Status AppendNdarrayTypedItem(PyArrayObject* arr);
Status AppendNdarrayItem(PyObject* arr);

Status AppendItem(PyObject* obj) {
RETURN_NOT_OK(typed_builder_->Append());
RETURN_NOT_OK(this->typed_builder_->Append());
if (PyArray_Check(obj)) {
return AppendNdarrayItem(obj);
}
Expand All @@ -625,8 +628,9 @@ class ListConverter : public TypedConverter<ListType, ListConverter> {
bool strict_conversions_;
};

template <typename TypeClass>
template <int NUMPY_TYPE, typename Type>
Status ListConverter::AppendNdarrayTypedItem(PyArrayObject* arr) {
Status ListConverter<TypeClass>::AppendNdarrayTypedItem(PyArrayObject* arr) {
using traits = internal::npy_traits<NUMPY_TYPE>;
using T = typename traits::value_type;
using ValueBuilderType = typename TypeTraits<Type>::BuilderType;
Expand Down Expand Up @@ -673,7 +677,8 @@ Status ListConverter::AppendNdarrayTypedItem(PyArrayObject* arr) {
return value_converter_->AppendMultiple(obj, value_length); \
}

Status ListConverter::AppendNdarrayItem(PyObject* obj) {
template <typename TypeClass>
Status ListConverter<TypeClass>::AppendNdarrayItem(PyObject* obj) {
PyArrayObject* arr = reinterpret_cast<PyArrayObject*>(obj);

if (PyArray_NDIM(arr) != 1) {
Expand Down Expand Up @@ -914,7 +919,11 @@ Status GetConverter(const std::shared_ptr<DataType>& type, bool from_pandas,
}
case Type::LIST:
*out = std::unique_ptr<SeqConverter>(
new ListConverter(from_pandas, strict_conversions));
new ListConverter<ListType>(from_pandas, strict_conversions));
break;
case Type::LARGE_LIST:
*out = std::unique_ptr<SeqConverter>(
new ListConverter<LargeListType>(from_pandas, strict_conversions));
break;
case Type::STRUCT:
*out = std::unique_ptr<SeqConverter>(
Expand Down
2 changes: 2 additions & 0 deletions docs/source/python/api/arrays.rst
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ may expose data type-specific methods or properties.
Decimal128Array
DictionaryArray
ListArray
LargeListArray
StructArray
UnionArray

Expand Down Expand Up @@ -109,5 +110,6 @@ any of those classes directly.
DecimalValue
DictionaryValue
ListValue
LargeListValue
StructValue
UnionValue
2 changes: 2 additions & 0 deletions docs/source/python/api/datatypes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ These should be used to create Arrow data types and schemas.
large_utf8
decimal128
list_
large_list
struct
dictionary
field
Expand Down Expand Up @@ -117,6 +118,7 @@ represents a given data type (such as ``int32``) or general category
is_float64
is_decimal
is_list
is_large_list
is_struct
is_union
is_nested
Expand Down
10 changes: 6 additions & 4 deletions python/pyarrow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,10 @@ def parse_git(root, **kwargs):
binary, string, utf8,
large_binary, large_string, large_utf8,
decimal128,
list_, struct, union, dictionary, field,
list_, large_list, struct, union, dictionary, field,
type_for_alias,
DataType, DictionaryType, ListType, StructType,
DataType, DictionaryType, StructType,
ListType, LargeListType,
UnionType, TimestampType, Time32Type, Time64Type,
FixedSizeBinaryType, Decimal128Type,
BaseExtensionType, ExtensionType,
Expand All @@ -77,7 +78,7 @@ def parse_git(root, **kwargs):
Int16Array, UInt16Array,
Int32Array, UInt32Array,
Int64Array, UInt64Array,
ListArray, UnionArray,
ListArray, LargeListArray, UnionArray,
BinaryArray, StringArray,
LargeBinaryArray, LargeStringArray,
FixedSizeBinaryArray,
Expand All @@ -89,7 +90,8 @@ def parse_git(root, **kwargs):
BooleanValue,
Int8Value, Int16Value, Int32Value, Int64Value,
UInt8Value, UInt16Value, UInt32Value, UInt64Value,
HalfFloatValue, FloatValue, DoubleValue, ListValue,
HalfFloatValue, FloatValue, DoubleValue,
ListValue, LargeListValue,
BinaryValue, StringValue,
LargeBinaryValue, LargeStringValue,
FixedSizeBinaryValue,
Expand Down
47 changes: 47 additions & 0 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -1070,6 +1070,52 @@ cdef class ListArray(Array):
return pyarrow_wrap_array(arr.values())


cdef class LargeListArray(Array):
    """
    Concrete class for Arrow arrays of a large list data type
    (like ListArray, but 64-bit offsets).
    """

    @staticmethod
    def from_arrays(offsets, values, MemoryPool pool=None):
        """
        Construct LargeListArray from arrays of int64 offsets and values

        Parameters
        ----------
        offsets : Array (int64 type)
            Passed through ``asarray(offsets, type='int64')`` before use.
        values : Array (any type)
            Passed through ``asarray(values)`` before use.
        pool : MemoryPool, optional
            Unboxed with ``maybe_unbox_memory_pool`` and handed to the
            C++ ``FromArrays`` call.

        Returns
        -------
        list_array : LargeListArray
        """
        cdef:
            Array _offsets, _values
            shared_ptr[CArray] out
        cdef CMemoryPool* cpool = maybe_unbox_memory_pool(pool)

        _offsets = asarray(offsets, type='int64')
        _values = asarray(values)

        # The C++ construction runs with the GIL released.
        # NOTE(review): check_status presumably raises on a non-OK status;
        # confirm against its definition.
        with nogil:
            check_status(CLargeListArray.FromArrays(_offsets.ap[0],
                                                    _values.ap[0],
                                                    cpool, &out))
        return pyarrow_wrap_array(out)

    def flatten(self):
        """
        Unnest this LargeListArray by one level

        Returns
        -------
        result : Array
            The wrapped result of the underlying C++ array's ``values()``.
        """
        cdef CLargeListArray* arr = <CLargeListArray*> self.ap
        return pyarrow_wrap_array(arr.values())


cdef class UnionArray(Array):
"""
Concrete class for Arrow arrays of a Union data type.
Expand Down Expand Up @@ -1511,6 +1557,7 @@ cdef dict _array_classes = {
_Type_FLOAT: FloatArray,
_Type_DOUBLE: DoubleArray,
_Type_LIST: ListArray,
_Type_LARGE_LIST: LargeListArray,
_Type_UNION: UnionArray,
_Type_BINARY: BinaryArray,
_Type_STRING: StringArray,
Expand Down
18 changes: 18 additions & 0 deletions python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
_Type_FIXED_SIZE_BINARY" arrow::Type::FIXED_SIZE_BINARY"

_Type_LIST" arrow::Type::LIST"
_Type_LARGE_LIST" arrow::Type::LARGE_LIST"
_Type_STRUCT" arrow::Type::STRUCT"
_Type_UNION" arrow::Type::UNION"
_Type_DICTIONARY" arrow::Type::DICTIONARY"
Expand Down Expand Up @@ -252,6 +253,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
shared_ptr[CDataType] value_type()
shared_ptr[CField] value_field()

cdef cppclass CLargeListType" arrow::LargeListType"(CDataType):
    # Cython declaration of the C++ class arrow::LargeListType.
    # It can be constructed from either a value data type or a value field.
    CLargeListType(const shared_ptr[CDataType]& value_type)
    CLargeListType(const shared_ptr[CField]& field)
    shared_ptr[CDataType] value_type()
    shared_ptr[CField] value_field()

cdef cppclass CStringType" arrow::StringType"(CDataType):
pass

Expand Down Expand Up @@ -419,6 +426,17 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
shared_ptr[CArray] values()
shared_ptr[CDataType] value_type()

cdef cppclass CLargeListArray" arrow::LargeListArray"(CArray):
    # Cython declaration of the C++ class arrow::LargeListArray
    # (a list array whose offsets are 64-bit).
    @staticmethod
    CStatus FromArrays(const CArray& offsets, const CArray& values,
                       CMemoryPool* pool, shared_ptr[CArray]* out)

    const int64_t* raw_value_offsets()
    # The element index is declared as int64_t rather than C int so that
    # callers holding a 64-bit index do not have it narrowed before the
    # C++ call (the C++ methods take int64_t; see the Arrow C++ API docs).
    int64_t value_offset(int64_t i)
    int64_t value_length(int64_t i)
    shared_ptr[CArray] values()
    shared_ptr[CDataType] value_type()

cdef cppclass CUnionArray" arrow::UnionArray"(CArray):
@staticmethod
CStatus MakeSparse(const CArray& type_ids,
Expand Down
20 changes: 20 additions & 0 deletions python/pyarrow/lib.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,11 @@ cdef class ListType(DataType):
const CListType* list_type


cdef class LargeListType(DataType):
    cdef:
        # Typed pointer to the C++ arrow::LargeListType behind this object.
        # NOTE(review): where this pointer is assigned is not visible here;
        # presumably during type initialization -- confirm.
        const CLargeListType* list_type


cdef class StructType(DataType):
cdef:
const CStructType* struct_type
Expand Down Expand Up @@ -184,6 +189,17 @@ cdef class ListValue(ArrayValue):
cdef int64_t length(self)


cdef class LargeListValue(ArrayValue):
    # C-level declarations for the scalar wrapping one element of a
    # large list array.
    cdef readonly:
        # Data type of the list's values; readable from Python.
        DataType value_type

    cdef:
        # Raw pointer to the underlying C++ large list array.
        CLargeListArray* ap

    # Return the i-th value of this list element.
    cdef getitem(self, int64_t i)
    # Return the number of values in this list element.
    cdef int64_t length(self)


cdef class StructValue(ArrayValue):
cdef:
CStructArray* ap
Expand Down Expand Up @@ -336,6 +352,10 @@ cdef class ListArray(Array):
pass


cdef class LargeListArray(Array):
    # Declaration only: no extra C-level attributes beyond those of Array.
    pass


cdef class UnionArray(Array):
pass

Expand Down
1 change: 1 addition & 0 deletions python/pyarrow/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ Type_LARGE_BINARY = _Type_LARGE_BINARY
Type_LARGE_STRING = _Type_LARGE_STRING
Type_FIXED_SIZE_BINARY = _Type_FIXED_SIZE_BINARY
Type_LIST = _Type_LIST
Type_LARGE_LIST = _Type_LARGE_LIST
Type_STRUCT = _Type_STRUCT
Type_UNION = _Type_UNION
Type_DICTIONARY = _Type_DICTIONARY
Expand Down
2 changes: 2 additions & 0 deletions python/pyarrow/public-api.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ cdef api object pyarrow_wrap_data_type(
out = DictionaryType.__new__(DictionaryType)
elif type.get().id() == _Type_LIST:
out = ListType.__new__(ListType)
elif type.get().id() == _Type_LARGE_LIST:
out = LargeListType.__new__(LargeListType)
elif type.get().id() == _Type_STRUCT:
out = StructType.__new__(StructType)
elif type.get().id() == _Type_UNION:
Expand Down
52 changes: 52 additions & 0 deletions python/pyarrow/scalar.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -586,6 +586,57 @@ cdef class ListValue(ArrayValue):
return result


cdef class LargeListValue(ArrayValue):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't it possible to have this class subclass ListValue to reduce code duplication? (getitem, iter, and as_py look the same.)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem is that the C++ types are different (e.g. ListArray vs. LargeListArray), and those need to be compile-time constants for Cython.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Code like this makes me wish for some kind of macro system in Cython.

"""
Concrete class for large list array elements.
"""

def __len__(self):
    """
    Return the number of values.

    Delegates to the C-level ``length()`` method.
    """
    return self.length()

def __getitem__(self, i):
    """
    Return the value at the given index.

    The index is first passed through ``_normalize_index`` together with
    this element's length, then looked up with ``getitem``.
    """
    return self.getitem(_normalize_index(i, self.length()))

def __iter__(self):
    """
    Iterate over this element's values.

    Yields each value in order, from index 0 to ``len(self) - 1``.
    """
    # The generator finishes by falling off the end of the loop. An explicit
    # ``raise StopIteration`` inside a generator body is turned into a
    # RuntimeError under PEP 479 semantics, so none is used here.
    for i in range(len(self)):
        yield self.getitem(i)

cdef void _set_array(self, const shared_ptr[CArray]& sp_array):
    # Bind this value to the given array: keep the shared pointer, cache a
    # typed raw pointer to it, and cache the wrapped value data type.
    self.sp_array = sp_array
    # ap is taken from the shared pointer stored just above.
    self.ap = <CLargeListArray*> sp_array.get()
    self.value_type = pyarrow_wrap_data_type(self.ap.value_type())

cdef getitem(self, int64_t i):
    # Return the i-th value of this list element as a boxed scalar.
    # The position in the child values array is this element's start
    # offset plus i. No range check on i is done here.
    cdef int64_t j = self.ap.value_offset(self.index) + i
    return box_scalar(self.value_type, self.ap.values(), j)

cdef int64_t length(self):
    # Number of values in this list element, as reported by the underlying
    # array for this element's index.
    return self.ap.value_length(self.index)

def as_py(self):
    """
    Convert this element to a Python list.

    Every contained value is converted with its own ``as_py()``.
    """
    cdef:
        int64_t pos = 0
        int64_t count = len(self)
        list items = []

    # Walk the values in order, converting each one.
    while pos < count:
        items.append(self.getitem(pos).as_py())
        pos += 1

    return items


cdef class UnionValue(ArrayValue):
"""
Concrete class for union array elements.
Expand Down Expand Up @@ -729,6 +780,7 @@ cdef dict _array_value_classes = {
_Type_FLOAT: FloatValue,
_Type_DOUBLE: DoubleValue,
_Type_LIST: ListValue,
_Type_LARGE_LIST: LargeListValue,
_Type_UNION: UnionValue,
_Type_BINARY: BinaryValue,
_Type_STRING: StringValue,
Expand Down
12 changes: 9 additions & 3 deletions python/pyarrow/tests/strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,10 @@ def fields(type_strategy=primitive_types):


def list_types(item_strategy=primitive_types):
return st.builds(pa.list_, item_strategy)
return (
st.builds(pa.list_, item_strategy) |
st.builds(pa.large_list, item_strategy)
)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kszucs Does this look right?



def struct_types(item_strategy=primitive_types):
Expand Down Expand Up @@ -159,11 +162,14 @@ def arrays(draw, type, size=None):

shape = (size,)

if pa.types.is_list(type):
if pa.types.is_list(type) or pa.types.is_large_list(type):
offsets = draw(npst.arrays(np.uint8(), shape=shape)).cumsum() // 20
offsets = np.insert(offsets, 0, 0, axis=0) # prepend with zero
values = draw(arrays(type.value_type, size=int(offsets.sum())))
return pa.ListArray.from_arrays(offsets, values)
array_type = (
pa.LargeListArray if pa.types.is_large_list(type)
else pa.ListArray)
return array_type.from_arrays(offsets, values)

if pa.types.is_struct(type):
h.assume(len(type) > 0)
Expand Down
Loading