From 4266ea2c6614e2c2da9f02a7f5dfd0344a16d400 Mon Sep 17 00:00:00 2001
From: Antoine Pitrou
Date: Thu, 1 Aug 2019 14:31:21 +0200
Subject: [PATCH] ARROW-6084: [Python] Support LargeList

---
 cpp/src/arrow/python/python_to_arrow.cc      | 27 +++++---
 docs/source/python/api/arrays.rst            |  2 +
 docs/source/python/api/datatypes.rst         |  2 +
 python/pyarrow/__init__.py                   | 10 +--
 python/pyarrow/array.pxi                     | 47 +++++++++++++
 python/pyarrow/includes/libarrow.pxd         | 18 +++++
 python/pyarrow/lib.pxd                       | 20 ++++++
 python/pyarrow/lib.pyx                       |  1 +
 python/pyarrow/public-api.pxi                |  2 +
 python/pyarrow/scalar.pxi                    | 52 +++++++++++++++
 python/pyarrow/tests/strategies.py           | 12 +++-
 python/pyarrow/tests/test_array.py           | 70 +++++++++++++++++---
 python/pyarrow/tests/test_compute.py         |  1 +
 python/pyarrow/tests/test_convert_builtin.py | 12 ++++
 python/pyarrow/tests/test_scalars.py         | 23 +++++++
 python/pyarrow/tests/test_types.py           | 21 +++++-
 python/pyarrow/types.pxi                     | 55 +++++++++++++++
 python/pyarrow/types.py                      | 10 ++-
 18 files changed, 356 insertions(+), 29 deletions(-)

diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc
index 424e3097b96..a990aecace4 100644
--- a/cpp/src/arrow/python/python_to_arrow.cc
+++ b/cpp/src/arrow/python/python_to_arrow.cc
@@ -582,19 +582,22 @@ class StringConverter
 // ----------------------------------------------------------------------
 // Convert lists (NumPy arrays containing lists or ndarrays as values)

-class ListConverter : public TypedConverter<ListType, ListConverter> {
+template <typename TypeClass>
+class ListConverter : public TypedConverter<TypeClass, ListConverter<TypeClass>> {
  public:
+  using BuilderType = typename TypeTraits<TypeClass>::BuilderType;
+
   explicit ListConverter(bool from_pandas, bool strict_conversions)
       : from_pandas_(from_pandas), strict_conversions_(strict_conversions) {}

   Status Init(ArrayBuilder* builder) {
-    builder_ = builder;
-    typed_builder_ = checked_cast<ListBuilder*>(builder);
+    this->builder_ = builder;
+    this->typed_builder_ = checked_cast<BuilderType*>(builder);

-    value_type_ = checked_cast<const ListType&>(*builder->type()).value_type();
+    value_type_ = checked_cast<const TypeClass&>(*builder->type()).value_type();
     RETURN_NOT_OK(
         GetConverter(value_type_, from_pandas_, strict_conversions_, &value_converter_));
-    return value_converter_->Init(typed_builder_->value_builder());
+    return value_converter_->Init(this->typed_builder_->value_builder());
   }

   template <int NUMPY_TYPE, typename Type>
@@ -602,7 +605,7 @@ class ListConverter : public TypedConverter<ListType, ListConverter> {
   Status AppendNdarrayItem(PyObject* arr);

   Status AppendItem(PyObject* obj) {
-    RETURN_NOT_OK(typed_builder_->Append());
+    RETURN_NOT_OK(this->typed_builder_->Append());
     if (PyArray_Check(obj)) {
       return AppendNdarrayItem(obj);
     }
@@ -625,8 +628,9 @@ class ListConverter : public TypedConverter<ListType, ListConverter> {
   bool strict_conversions_;
 };

+template <typename TypeClass>
 template <int NUMPY_TYPE, typename Type>
-Status ListConverter::AppendNdarrayTypedItem(PyArrayObject* arr) {
+Status ListConverter<TypeClass>::AppendNdarrayTypedItem(PyArrayObject* arr) {
   using traits = internal::npy_traits<NUMPY_TYPE>;
   using T = typename traits::value_type;
   using ValueBuilderType = typename TypeTraits<Type>::BuilderType;
@@ -673,7 +677,8 @@ Status ListConverter::AppendNdarrayTypedItem(PyArrayObject* arr) {
     return value_converter_->AppendMultiple(obj, value_length); \
   }

-Status ListConverter::AppendNdarrayItem(PyObject* obj) {
+template <typename TypeClass>
+Status ListConverter<TypeClass>::AppendNdarrayItem(PyObject* obj) {
   PyArrayObject* arr = reinterpret_cast<PyArrayObject*>(obj);

   if (PyArray_NDIM(arr) != 1) {
@@ -914,7 +919,11 @@ Status GetConverter(const std::shared_ptr<DataType>& type, bool from_pandas,
     }
     case Type::LIST:
       *out = std::unique_ptr<SeqConverter>(
-          new ListConverter(from_pandas, strict_conversions));
+          new ListConverter<ListType>(from_pandas, strict_conversions));
+      break;
+    case Type::LARGE_LIST:
+      *out = std::unique_ptr<SeqConverter>(
+          new ListConverter<LargeListType>(from_pandas, strict_conversions));
       break;
     case Type::STRUCT:
       *out = std::unique_ptr<SeqConverter>(
diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst
index e10b5afef43..ca7d1e70d8c 100644
--- a/docs/source/python/api/arrays.rst
+++ b/docs/source/python/api/arrays.rst
@@ -67,6 +67,7 @@ may expose data type-specific methods or properties.
    Decimal128Array
    DictionaryArray
    ListArray
+   LargeListArray
    StructArray
    UnionArray

@@ -109,5 +110,6 @@ any of those classes directly.
    DecimalValue
    DictionaryValue
    ListValue
+   LargeListValue
    StructValue
    UnionValue
diff --git a/docs/source/python/api/datatypes.rst b/docs/source/python/api/datatypes.rst
index 327bcf63ac9..6502ae227ec 100644
--- a/docs/source/python/api/datatypes.rst
+++ b/docs/source/python/api/datatypes.rst
@@ -55,6 +55,7 @@ These should be used to create Arrow data types and schemas.
    large_utf8
    decimal128
    list_
+   large_list
    struct
    dictionary
    field
@@ -117,6 +118,7 @@ represents a given data type (such as ``int32``) or general category
    is_float64
    is_decimal
    is_list
+   is_large_list
    is_struct
    is_union
    is_nested
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 51afa0f1c48..4e7707adcb3 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -55,9 +55,10 @@ def parse_git(root, **kwargs):
                          binary, string, utf8,
                          large_binary, large_string, large_utf8,
                          decimal128,
-                         list_, struct, union, dictionary, field,
+                         list_, large_list, struct, union, dictionary, field,
                          type_for_alias,
-                         DataType, DictionaryType, ListType, StructType,
+                         DataType, DictionaryType, StructType,
+                         ListType, LargeListType,
                          UnionType, TimestampType, Time32Type, Time64Type,
                          FixedSizeBinaryType, Decimal128Type,
                          BaseExtensionType, ExtensionType,
@@ -77,7 +78,7 @@ def parse_git(root, **kwargs):
                          Int16Array, UInt16Array,
                          Int32Array, UInt32Array,
                          Int64Array, UInt64Array,
-                         ListArray, UnionArray,
+                         ListArray, LargeListArray, UnionArray,
                          BinaryArray, StringArray,
                          LargeBinaryArray, LargeStringArray,
                          FixedSizeBinaryArray,
@@ -89,7 +90,8 @@ def parse_git(root, **kwargs):
                          BooleanValue,
                          Int8Value, Int16Value, Int32Value, Int64Value,
                          UInt8Value, UInt16Value, UInt32Value, UInt64Value,
-                         HalfFloatValue, FloatValue, DoubleValue, ListValue,
+                         HalfFloatValue, FloatValue, DoubleValue,
+                         ListValue, LargeListValue,
                          BinaryValue, StringValue,
                          LargeBinaryValue, LargeStringValue,
                          FixedSizeBinaryValue,
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index ecb8ff5ca33..b93cf10a096 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -1070,6 +1070,52 @@ cdef class ListArray(Array):
         return pyarrow_wrap_array(arr.values())


+cdef class LargeListArray(Array):
+    """
+    Concrete class for Arrow arrays of a large list data type
+    (like ListArray, but with 64-bit offsets).
+    """
+
+    @staticmethod
+    def from_arrays(offsets, values, MemoryPool pool=None):
+        """
+        Construct LargeListArray from arrays of int64 offsets and values
+
+        Parameters
+        ----------
+        offsets : Array (int64 type)
+        values : Array (any type)
+
+        Returns
+        -------
+        list_array : LargeListArray
+        """
+        cdef:
+            Array _offsets, _values
+            shared_ptr[CArray] out
+        cdef CMemoryPool* cpool = maybe_unbox_memory_pool(pool)
+
+        _offsets = asarray(offsets, type='int64')
+        _values = asarray(values)
+
+        with nogil:
+            check_status(CLargeListArray.FromArrays(_offsets.ap[0],
+                                                    _values.ap[0],
+                                                    cpool, &out))
+        return pyarrow_wrap_array(out)
+
+    def flatten(self):
+        """
+        Unnest this LargeListArray by one level
+
+        Returns
+        -------
+        result : Array
+        """
+        cdef CLargeListArray* arr = <CLargeListArray*> self.ap
+        return pyarrow_wrap_array(arr.values())
+
+
 cdef class UnionArray(Array):
     """
     Concrete class for Arrow arrays of a Union data type.
@@ -1511,6 +1557,7 @@ cdef dict _array_classes = {
     _Type_FLOAT: FloatArray,
     _Type_DOUBLE: DoubleArray,
     _Type_LIST: ListArray,
+    _Type_LARGE_LIST: LargeListArray,
     _Type_UNION: UnionArray,
     _Type_BINARY: BinaryArray,
     _Type_STRING: StringArray,
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 4dc642726ba..ad0fa09a546 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -70,6 +70,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         _Type_FIXED_SIZE_BINARY" arrow::Type::FIXED_SIZE_BINARY"

         _Type_LIST" arrow::Type::LIST"
+        _Type_LARGE_LIST" arrow::Type::LARGE_LIST"
         _Type_STRUCT" arrow::Type::STRUCT"
         _Type_UNION" arrow::Type::UNION"
         _Type_DICTIONARY" arrow::Type::DICTIONARY"
@@ -252,6 +253,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         shared_ptr[CDataType] value_type()
         shared_ptr[CField] value_field()

+    cdef cppclass CLargeListType" arrow::LargeListType"(CDataType):
+        CLargeListType(const shared_ptr[CDataType]& value_type)
+        CLargeListType(const shared_ptr[CField]& field)
+        shared_ptr[CDataType] value_type()
+        shared_ptr[CField] value_field()
+
     cdef cppclass CStringType" arrow::StringType"(CDataType):
         pass

@@ -419,6 +426,17 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         shared_ptr[CArray] values()
         shared_ptr[CDataType] value_type()

+    cdef cppclass CLargeListArray" arrow::LargeListArray"(CArray):
+        @staticmethod
+        CStatus FromArrays(const CArray& offsets, const CArray& values,
+                           CMemoryPool* pool, shared_ptr[CArray]* out)
+
+        const int64_t* raw_value_offsets()
+        int64_t value_offset(int i)
+        int64_t value_length(int i)
+        shared_ptr[CArray] values()
+        shared_ptr[CDataType] value_type()
+
     cdef cppclass CUnionArray" arrow::UnionArray"(CArray):
         @staticmethod
         CStatus MakeSparse(const CArray& type_ids,
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index 09314630f2e..b59f3f82b46 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -64,6 +64,11 @@ cdef class ListType(DataType):
         const CListType* list_type


+cdef class LargeListType(DataType):
+    cdef:
+        const CLargeListType* list_type
+
+
 cdef class StructType(DataType):
     cdef:
         const CStructType* struct_type
@@ -184,6 +189,17 @@ cdef class ListValue(ArrayValue):
     cdef int64_t length(self)


+cdef class LargeListValue(ArrayValue):
+    cdef readonly:
+        DataType value_type
+
+    cdef:
+        CLargeListArray* ap
+
+    cdef getitem(self, int64_t i)
+    cdef int64_t length(self)
+
+
 cdef class StructValue(ArrayValue):
     cdef:
         CStructArray* ap
@@ -336,6 +352,10 @@ cdef class ListArray(Array):
     pass


+cdef class LargeListArray(Array):
+    pass
+
+
 cdef class UnionArray(Array):
     pass

diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx
index 0b33f3906c4..8baaab320ce 100644
--- a/python/pyarrow/lib.pyx
+++ b/python/pyarrow/lib.pyx
@@ -91,6 +91,7 @@ Type_LARGE_BINARY = _Type_LARGE_BINARY
 Type_LARGE_STRING = _Type_LARGE_STRING
 Type_FIXED_SIZE_BINARY = _Type_FIXED_SIZE_BINARY
 Type_LIST = _Type_LIST
+Type_LARGE_LIST = _Type_LARGE_LIST
 Type_STRUCT = _Type_STRUCT
 Type_UNION = _Type_UNION
 Type_DICTIONARY = _Type_DICTIONARY
diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi
index f6ef2c955cd..bb5bdcad447 100644
--- a/python/pyarrow/public-api.pxi
+++ b/python/pyarrow/public-api.pxi
@@ -79,6 +79,8 @@ cdef api object pyarrow_wrap_data_type(
         out = DictionaryType.__new__(DictionaryType)
     elif type.get().id() == _Type_LIST:
         out = ListType.__new__(ListType)
+    elif type.get().id() == _Type_LARGE_LIST:
+        out = LargeListType.__new__(LargeListType)
     elif type.get().id() == _Type_STRUCT:
         out = StructType.__new__(StructType)
     elif type.get().id() == _Type_UNION:
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index 0ead3e57552..aa100ca5351 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -586,6 +586,57 @@ cdef class ListValue(ArrayValue):
         return result


+cdef class LargeListValue(ArrayValue):
+    """
+    Concrete class for large list array elements.
+    """
+
+    def __len__(self):
+        """
+        Return the number of values.
+        """
+        return self.length()
+
+    def __getitem__(self, i):
+        """
+        Return the value at the given index.
+        """
+        return self.getitem(_normalize_index(i, self.length()))
+
+    def __iter__(self):
+        """
+        Iterate over this element's values.
+        """
+        for i in range(len(self)):
+            yield self.getitem(i)
+        raise StopIteration
+
+    cdef void _set_array(self, const shared_ptr[CArray]& sp_array):
+        self.sp_array = sp_array
+        self.ap = <CLargeListArray*> sp_array.get()
+        self.value_type = pyarrow_wrap_data_type(self.ap.value_type())
+
+    cdef getitem(self, int64_t i):
+        cdef int64_t j = self.ap.value_offset(self.index) + i
+        return box_scalar(self.value_type, self.ap.values(), j)
+
+    cdef int64_t length(self):
+        return self.ap.value_length(self.index)
+
+    def as_py(self):
+        """
+        Return this value as a Python list.
+        """
+        cdef:
+            int64_t j
+            list result = []
+
+        for j in range(len(self)):
+            result.append(self.getitem(j).as_py())
+
+        return result
+
+
 cdef class UnionValue(ArrayValue):
     """
     Concrete class for union array elements.
@@ -729,6 +780,7 @@ cdef dict _array_value_classes = { _Type_FLOAT: FloatValue, _Type_DOUBLE: DoubleValue, _Type_LIST: ListValue, + _Type_LARGE_LIST: LargeListValue, _Type_UNION: UnionValue, _Type_BINARY: BinaryValue, _Type_STRING: StringValue, diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py index 498d7382b44..4bbe86de840 100644 --- a/python/pyarrow/tests/strategies.py +++ b/python/pyarrow/tests/strategies.py @@ -104,7 +104,10 @@ def fields(type_strategy=primitive_types): def list_types(item_strategy=primitive_types): - return st.builds(pa.list_, item_strategy) + return ( + st.builds(pa.list_, item_strategy) | + st.builds(pa.large_list, item_strategy) + ) def struct_types(item_strategy=primitive_types): @@ -159,11 +162,14 @@ def arrays(draw, type, size=None): shape = (size,) - if pa.types.is_list(type): + if pa.types.is_list(type) or pa.types.is_large_list(type): offsets = draw(npst.arrays(np.uint8(), shape=shape)).cumsum() // 20 offsets = np.insert(offsets, 0, 0, axis=0) # prepend with zero values = draw(arrays(type.value_type, size=int(offsets.sum()))) - return pa.ListArray.from_arrays(offsets, values) + array_type = ( + pa.LargeListArray if pa.types.is_large_list(type) + else pa.ListArray) + return array_type.from_arrays(offsets, values) if pa.types.is_struct(type): h.assume(len(type) > 0) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index c5915449987..03db7e94655 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -346,9 +346,11 @@ def test_string_binary_from_buffers(): assert copied.null_count == 0 -def test_list_from_buffers(): - ty = pa.list_(pa.int16()) +@pytest.mark.parametrize('list_type_factory', [pa.list_, pa.large_list]) +def test_list_from_buffers(list_type_factory): + ty = list_type_factory(pa.int16()) array = pa.array([[0, 1, 2], None, [], [3, 4, 5]], type=ty) + assert array.type == ty buffers = array.buffers() @@ -486,31 +488,36 @@ def test_dictionary_from_arrays_boundscheck(): pa.DictionaryArray.from_arrays(indices2, dictionary, safe=False) -def test_list_from_arrays(): +@pytest.mark.parametrize(('list_array_type', 'list_type_factory'), + [(pa.ListArray, pa.list_), + (pa.LargeListArray, pa.large_list)]) +def test_list_from_arrays(list_array_type, list_type_factory): offsets_arr = np.array([0, 2, 5, 8], dtype='i4') offsets = pa.array(offsets_arr, type='int32') pyvalues = [b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h'] values = pa.array(pyvalues, type='binary') - result = pa.ListArray.from_arrays(offsets, values) - expected = pa.array([pyvalues[:2], pyvalues[2:5], pyvalues[5:8]]) + result = list_array_type.from_arrays(offsets, values) + expected = pa.array([pyvalues[:2], pyvalues[2:5], pyvalues[5:8]], + type=list_type_factory(pa.binary())) assert result.equals(expected) # With nulls offsets = [0, None, 2, 6] + values = [b'a', b'b', b'c', b'd', b'e', b'f'] - values = ['a', 'b', 'c', 'd', 'e', 'f'] - - result = pa.ListArray.from_arrays(offsets, values) - expected = pa.array([values[:2], None, values[2:]]) + result = list_array_type.from_arrays(offsets, values) + expected = pa.array([values[:2], None, values[2:]], + type=list_type_factory(pa.binary())) assert result.equals(expected) # Another edge case offsets2 = [0, 2, None, 6] - result = pa.ListArray.from_arrays(offsets2, values) - expected = pa.array([values[:2], values[2:], None]) + result = list_array_type.from_arrays(offsets2, values) + expected = pa.array([values[:2], values[2:], None], + 
type=list_type_factory(pa.binary())) assert result.equals(expected) @@ -767,6 +774,7 @@ def test_cast_from_null(): pa.binary(), pa.binary(10), pa.list_(pa.int16()), + pa.large_list(pa.uint8()), pa.decimal128(19, 4), pa.timestamp('us'), pa.timestamp('us', tz='UTC'), @@ -925,6 +933,7 @@ def test_cast_identities(ty, values): (['a', None, 'b'], pa.string()), ([], None), ([[1, 2], [3]], pa.list_(pa.int64())), + ([[4, 5], [6]], pa.large_list(pa.int16())), ([['a'], None, ['b', 'c']], pa.list_(pa.string())), ([(1, 'a'), (2, 'c'), None], pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())])) @@ -1307,6 +1316,45 @@ def test_list_array_flatten(): assert arr2.flatten().flatten().equals(arr0) +def test_large_list_array_flatten(): + typ2 = pa.large_list( + pa.large_list( + pa.int16() + ) + ) + arr2 = pa.array([ + None, + [ + [1, None, 2], + None, + [3, 4] + ], + [], + [ + [], + [5, 6], + None + ], + [ + [7, 8] + ] + ], type=typ2) + + typ1 = pa.large_list(pa.int16()) + assert typ1 == typ2.value_type + arr1 = pa.array([ + [1, None, 2], + None, + [3, 4], + [], + [5, 6], + None, + [7, 8] + ], type=typ1) + + assert arr2.flatten().equals(arr1) + + def test_struct_array_flatten(): ty = pa.struct([pa.field('x', pa.int16()), pa.field('y', pa.float32())]) diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 37da62c4815..2520bbd44b3 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -53,6 +53,7 @@ def test_sum(arrow_type): ('binary', [b'a', b'b', b'c', b'ddd', b'ee']), (pa.binary(3), [b'abc', b'bcd', b'cde', b'def', b'efg']), (pa.list_(pa.int8()), [[1, 2], [3, 4], [5, 6], None, [9, 16]]), + (pa.large_list(pa.int16()), [[1], [2, 3, 4], [5, 6], None, [9, 16]]), (pa.struct([('a', pa.int8()), ('b', pa.int8())]), [ {'a': 1, 'b': 2}, None, {'a': 3, 'b': 4}, None, {'a': 5, 'b': 6}]), ]) diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index f39706e3cbc..dec5993f441 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -205,11 +205,23 @@ def test_nested_lists(seq): assert arr.to_pylist() == data +@parametrize_with_iterable_types +def test_nested_large_lists(seq): + data = [[], [1, 2], None] + arr = pa.array(seq(data), type=pa.large_list(pa.int16())) + assert len(arr) == 3 + assert arr.null_count == 1 + assert arr.type == pa.large_list(pa.int16()) + assert arr.to_pylist() == data + + @parametrize_with_iterable_types def test_list_with_non_list(seq): # List types don't accept non-sequences with pytest.raises(TypeError): pa.array(seq([[], [1, 2], 3]), type=pa.list_(pa.int64())) + with pytest.raises(TypeError): + pa.array(seq([[], [1, 2], 3]), type=pa.large_list(pa.int64())) @parametrize_with_iterable_types diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index ca7a10e0046..b319c6fb190 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -195,6 +195,29 @@ def test_list(self): v = arr[3] assert len(v) == 0 + def test_large_list(self): + arr = pa.array([[123, None], None, [456], []], + type=pa.large_list(pa.int16())) + + v = arr[0] + assert len(v) == 2 + assert isinstance(v, pa.LargeListValue) + assert repr(v) == "[123, None]" + assert v.as_py() == [123, None] + assert v[0].as_py() == 123 + assert v[1] is pa.NA + assert v[-1] == v[1] + assert v[-2] == v[0] + with pytest.raises(IndexError): + v[-3] + with pytest.raises(IndexError): + 
v[2]
+
+        assert arr[1] is pa.NA
+
+        v = arr[3]
+        assert len(v) == 0
+
     @pytest.mark.pandas
     def test_timestamp(self):
         import pandas as pd
diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py
index de532e6cb98..fb1437dcfab 100644
--- a/python/pyarrow/tests/test_types.py
+++ b/python/pyarrow/tests/test_types.py
@@ -52,6 +52,7 @@ def get_many_types():
         pa.large_string(),
         pa.large_binary(),
         pa.list_(pa.int32()),
+        pa.large_list(pa.uint16()),
         pa.struct([pa.field('a', pa.int32()),
                    pa.field('b', pa.int8()),
                    pa.field('c', pa.string())]),
@@ -110,7 +111,14 @@ def test_is_decimal():


 def test_is_list():
-    assert types.is_list(pa.list_(pa.int32()))
+    a = pa.list_(pa.int32())
+    b = pa.large_list(pa.int32())
+
+    assert types.is_list(a)
+    assert not types.is_large_list(a)
+    assert types.is_large_list(b)
+    assert not types.is_list(b)
+
     assert not types.is_list(pa.int32())


@@ -129,6 +137,7 @@ def test_is_nested_or_struct():

     assert types.is_nested(struct_ex)
     assert types.is_nested(pa.list_(pa.int32()))
+    assert types.is_nested(pa.large_list(pa.int32()))

     assert not types.is_nested(pa.int32())

@@ -237,12 +246,22 @@ def test_time64_units():

 def test_list_type():
     ty = pa.list_(pa.int64())
+    assert isinstance(ty, pa.ListType)
     assert ty.value_type == pa.int64()

     with pytest.raises(TypeError):
         pa.list_(None)


+def test_large_list_type():
+    ty = pa.large_list(pa.utf8())
+    assert isinstance(ty, pa.LargeListType)
+    assert ty.value_type == pa.utf8()
+
+    with pytest.raises(TypeError):
+        pa.large_list(None)
+
+
 def test_struct_type():
     fields = [
         # Duplicate field name on purpose
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 0db15d583f3..03a4b3d87fa 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -259,6 +259,27 @@ cdef class ListType(DataType):
         return pyarrow_wrap_data_type(self.list_type.value_type())


+cdef class LargeListType(DataType):
+    """
+    Concrete class for large list data types
+    (like ListType, but with 64-bit offsets).
+    """
+
+    cdef void init(self, const shared_ptr[CDataType]& type) except *:
+        DataType.init(self, type)
+        self.list_type = <const CLargeListType*> type.get()
+
+    def __reduce__(self):
+        return large_list, (self.value_type,)
+
+    @property
+    def value_type(self):
+        """
+        The data type of large list values.
+        """
+        return pyarrow_wrap_data_type(self.list_type.value_type())
+
+
 cdef class StructType(DataType):
     """
     Concrete class for struct data types.
@@ -1589,6 +1610,40 @@ cpdef ListType list_(value_type):
     return out


+cpdef LargeListType large_list(value_type):
+    """
+    Create LargeListType instance from child data type or field
+
+    This data type may not be supported by all Arrow implementations.
+    Unless you need to represent data larger than 2**31 elements, you should
+    prefer list_().
+ + Parameters + ---------- + value_type : DataType or Field + + Returns + ------- + list_type : DataType + """ + cdef: + DataType data_type + Field _field + shared_ptr[CDataType] list_type + LargeListType out = LargeListType.__new__(LargeListType) + + if isinstance(value_type, DataType): + _field = field('item', value_type) + elif isinstance(value_type, Field): + _field = value_type + else: + raise TypeError('List requires DataType or Field') + + list_type.reset(new CLargeListType(_field.sp_field)) + out.init(list_type) + return out + + cpdef DictionaryType dictionary(index_type, value_type, bint ordered=False): """ Dictionary (categorical, or simply encoded) type diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py index dc314e8a043..d4dffb746c7 100644 --- a/python/pyarrow/types.py +++ b/python/pyarrow/types.py @@ -35,7 +35,8 @@ _DATE_TYPES = {lib.Type_DATE32, lib.Type_DATE64} _TIME_TYPES = {lib.Type_TIME32, lib.Type_TIME64} _TEMPORAL_TYPES = {lib.Type_TIMESTAMP} | _TIME_TYPES | _DATE_TYPES -_NESTED_TYPES = {lib.Type_LIST, lib.Type_STRUCT, lib.Type_UNION, lib.Type_MAP} +_NESTED_TYPES = {lib.Type_LIST, lib.Type_LARGE_LIST, lib.Type_STRUCT, + lib.Type_UNION, lib.Type_MAP} def is_null(t): @@ -164,6 +165,13 @@ def is_list(t): return t.id == lib.Type_LIST +def is_large_list(t): + """ + Return True if value is an instance of a large list type + """ + return t.id == lib.Type_LARGE_LIST + + def is_struct(t): """ Return True if value is an instance of a struct type
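
Usage sketch (not part of the patch itself): a minimal example of the Python-level API added above, assuming a pyarrow build that includes this change; the values shown are illustrative only.

    import pyarrow as pa

    # pa.large_list() mirrors pa.list_() but uses 64-bit list offsets.
    ty = pa.large_list(pa.int16())
    arr = pa.array([[1, None, 2], None, []], type=ty)
    assert pa.types.is_large_list(arr.type)
    assert arr.to_pylist() == [[1, None, 2], None, []]

    # A LargeListArray can also be built from int64 offsets plus a values
    # array, and unnested by one level again with flatten().
    offsets = pa.array([0, 3, 3, 5], type='int64')
    values = pa.array([1, 2, 3, 4, 5], type=pa.int16())
    arr2 = pa.LargeListArray.from_arrays(offsets, values)
    assert arr2.to_pylist() == [[1, 2, 3], [], [4, 5]]
    assert arr2.flatten().equals(values)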