diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst index b858862dcff..e6f6c3dbbd3 100644 --- a/docs/source/python/api/arrays.rst +++ b/docs/source/python/api/arrays.rst @@ -77,6 +77,8 @@ may expose data type-specific methods or properties. ListArray FixedSizeListArray LargeListArray + ListViewArray + LargeListViewArray MapArray RunEndEncodedArray StructArray @@ -135,6 +137,8 @@ classes may expose data type-specific methods or properties. RunEndEncodedScalar ListScalar LargeListScalar + ListViewScalar + LargeListViewScalar MapScalar StructScalar UnionScalar diff --git a/docs/source/python/api/datatypes.rst b/docs/source/python/api/datatypes.rst index 642c243b21a..62bf4b77235 100644 --- a/docs/source/python/api/datatypes.rst +++ b/docs/source/python/api/datatypes.rst @@ -60,6 +60,8 @@ These should be used to create Arrow data types and schemas. decimal128 list_ large_list + list_view + large_list_view map_ struct dictionary @@ -149,6 +151,8 @@ represents a given data type (such as ``int32``) or general category is_list is_large_list is_fixed_size_list + is_list_view + is_large_list_view is_struct is_union is_nested diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 4dbd1258d3c..2ee97ddb662 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -166,7 +166,8 @@ def print_entry(label, value): binary, string, utf8, binary_view, string_view, large_binary, large_string, large_utf8, decimal128, decimal256, - list_, large_list, map_, struct, + list_, large_list, list_view, large_list_view, + map_, struct, union, sparse_union, dense_union, dictionary, run_end_encoded, @@ -174,8 +175,9 @@ def print_entry(label, value): field, type_for_alias, DataType, DictionaryType, StructType, - ListType, LargeListType, MapType, FixedSizeListType, - UnionType, SparseUnionType, DenseUnionType, + ListType, LargeListType, FixedSizeListType, + ListViewType, LargeListViewType, + MapType, UnionType, SparseUnionType, 
DenseUnionType, TimestampType, Time32Type, Time64Type, DurationType, FixedSizeBinaryType, Decimal128Type, Decimal256Type, BaseExtensionType, ExtensionType, @@ -201,8 +203,9 @@ def print_entry(label, value): Int32Array, UInt32Array, Int64Array, UInt64Array, HalfFloatArray, FloatArray, DoubleArray, - ListArray, LargeListArray, MapArray, - FixedSizeListArray, UnionArray, + ListArray, LargeListArray, FixedSizeListArray, + ListViewArray, LargeListViewArray, + MapArray, UnionArray, BinaryArray, StringArray, LargeBinaryArray, LargeStringArray, BinaryViewArray, StringViewArray, @@ -220,6 +223,7 @@ def print_entry(label, value): HalfFloatScalar, FloatScalar, DoubleScalar, Decimal128Scalar, Decimal256Scalar, ListScalar, LargeListScalar, FixedSizeListScalar, + ListViewScalar, LargeListViewScalar, Date32Scalar, Date64Scalar, Time32Scalar, Time64Scalar, TimestampScalar, DurationScalar, diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 1029f3a6298..3a319ab4544 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2460,6 +2460,578 @@ cdef class LargeListArray(BaseListArray): return pyarrow_wrap_array(( self.ap).offsets()) +cdef class ListViewArray(Array): + """ + Concrete class for Arrow arrays of a list view data type. + """ + + @staticmethod + def from_arrays(offsets, sizes, values, DataType type=None, MemoryPool pool=None, mask=None): + """ + Construct ListViewArray from arrays of int32 offsets, sizes, and values. + + Parameters + ---------- + offsets : Array (int32 type) + sizes : Array (int32 type) + values : Array (any type) + type : DataType, optional + If not specified, a default ListViewType with the values' type is + used. + pool : MemoryPool, optional + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False).
+ + Returns + ------- + list_view_array : ListViewArray + + Examples + -------- + >>> import pyarrow as pa + >>> values = pa.array([1, 2, 3, 4]) + >>> offsets = pa.array([0, 1, 2]) + >>> sizes = pa.array([2, 2, 2]) + >>> pa.ListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + [ + 2, + 3 + ], + [ + 3, + 4 + ] + ] + >>> # use a null mask to represent null values + >>> mask = pa.array([False, True, False]) + >>> pa.ListViewArray.from_arrays(offsets, sizes, values, mask=mask) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + >>> # null values can be defined in either offsets or sizes arrays + >>> # WARNING: this will result in a copy of the offsets or sizes arrays + >>> offsets = pa.array([0, None, 2]) + >>> pa.ListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + """ + cdef: + Array _offsets, _sizes, _values + shared_ptr[CArray] out + shared_ptr[CBuffer] c_mask + CMemoryPool* cpool = maybe_unbox_memory_pool(pool) + + _offsets = asarray(offsets, type='int32') + _sizes = asarray(sizes, type='int32') + _values = asarray(values) + + c_mask = c_mask_inverted_from_obj(mask, pool) + + if type is not None: + with nogil: + out = GetResultValue( + CListViewArray.FromArraysAndType( + type.sp_type, _offsets.ap[0], _sizes.ap[0], _values.ap[0], cpool, c_mask)) + else: + with nogil: + out = GetResultValue( + CListViewArray.FromArrays( + _offsets.ap[0], _sizes.ap[0], _values.ap[0], cpool, c_mask)) + cdef Array result = pyarrow_wrap_array(out) + result.validate() + return result + + @property + def values(self): + """ + Return the underlying array of values which backs the ListViewArray + ignoring the array's offset and sizes. + + The values array may be out of order and/or contain additional values + that are not found in the logical representation of the array. The only + guarantee is that each non-null value in the ListView Array is contiguous. 
+ + Compare with :meth:`flatten`, which returns only the non-null + values taking into consideration the array's order and offset. + + Returns + ------- + values : Array + + Examples + -------- + The values include null elements from sub-lists: + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 1, + 2 + ], + [], + [ + 2, + null, + 3, + 4 + ] + ] + >>> array.values + + [ + 1, + 2, + null, + 3, + 4 + ] + """ + cdef CListViewArray* arr = self.ap + return pyarrow_wrap_array(arr.values()) + + @property + def offsets(self): + """ + Return the list offsets as an int32 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `ListViewArray.from_arrays` and get back the same + list array if the original one has nulls. + + Returns + ------- + offsets : Int32Array + + Examples + -------- + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array.offsets + + [ + 0, + 0, + 1 + ] + """ + return pyarrow_wrap_array(( self.ap).offsets()) + + @property + def sizes(self): + """ + Return the list sizes as an int32 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `ListViewArray.from_arrays` and get back the same + list array if the original one has nulls. + + Returns + ------- + sizes : Int32Array + + Examples + -------- + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array.sizes + + [ + 2, + 0, + 4 + ] + """ + return pyarrow_wrap_array(( self.ap).sizes()) + + def flatten(self, memory_pool=None): + """ + Unnest this ListViewArray by one level. 
+ + The returned Array is logically a concatenation of all the sub-lists + in this Array. + + Note that this method is different from ``self.values`` in that + it takes care of the slicing offset as well as null elements backed + by non-empty sub-lists. + + Parameters + ---------- + memory_pool : MemoryPool, optional + + Returns + ------- + result : Array + + Examples + -------- + + >>> import pyarrow as pa + >>> values = [1, 2, 3, 4] + >>> offsets = [2, 1, 0] + >>> sizes = [2, 2, 2] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 3, + 4 + ], + [ + 2, + 3 + ], + [ + 1, + 2 + ] + ] + >>> array.flatten() + + [ + 3, + 4, + 2, + 3, + 1, + 2 + ] + """ + cdef CMemoryPool* cpool = maybe_unbox_memory_pool(memory_pool) + with nogil: + out = GetResultValue(( self.ap).Flatten(cpool)) + cdef Array result = pyarrow_wrap_array(out) + result.validate() + return result + + +cdef class LargeListViewArray(Array): + """ + Concrete class for Arrow arrays of a large list view data type. + + Identical to ListViewArray, but with 64-bit offsets. + """ + @staticmethod + def from_arrays(offsets, sizes, values, DataType type=None, MemoryPool pool=None, mask=None): + """ + Construct LargeListViewArray from arrays of int64 offsets, sizes, and values. + + Parameters + ---------- + offsets : Array (int64 type) + sizes : Array (int64 type) + values : Array (any type) + type : DataType, optional + If not specified, a default LargeListViewType with the values' + type is used. + pool : MemoryPool, optional + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False).
+ + Returns + ------- + list_view_array : LargeListViewArray + + Examples + -------- + >>> import pyarrow as pa + >>> values = pa.array([1, 2, 3, 4]) + >>> offsets = pa.array([0, 1, 2]) + >>> sizes = pa.array([2, 2, 2]) + >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + [ + 2, + 3 + ], + [ + 3, + 4 + ] + ] + >>> # use a null mask to represent null values + >>> mask = pa.array([False, True, False]) + >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values, mask=mask) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + >>> # null values can be defined in either offsets or sizes arrays + >>> # WARNING: this will result in a copy of the offsets or sizes arrays + >>> offsets = pa.array([0, None, 2]) + >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + """ + cdef: + Array _offsets, _sizes, _values + shared_ptr[CArray] out + shared_ptr[CBuffer] c_mask + CMemoryPool* cpool = maybe_unbox_memory_pool(pool) + + _offsets = asarray(offsets, type='int64') + _sizes = asarray(sizes, type='int64') + _values = asarray(values) + + c_mask = c_mask_inverted_from_obj(mask, pool) + + if type is not None: + with nogil: + out = GetResultValue( + CLargeListViewArray.FromArraysAndType( + type.sp_type, _offsets.ap[0], _sizes.ap[0], _values.ap[0], cpool, c_mask)) + else: + with nogil: + out = GetResultValue( + CLargeListViewArray.FromArrays( + _offsets.ap[0], _sizes.ap[0], _values.ap[0], cpool, c_mask)) + cdef Array result = pyarrow_wrap_array(out) + result.validate() + return result + + @property + def values(self): + """ + Return the underlying array of values which backs the + LargeListViewArray ignoring the array's offset and sizes. + + The values array may be out of order and/or contain additional values + that are not found in the logical representation of the array. The only + guarantee is that each non-null value in the ListView Array is contiguous.
+ + Compare with :meth:`flatten`, which returns only the non-null + values taking into consideration the array's order and offset. + + Returns + ------- + values : Array + + See Also + -------- + LargeListViewArray.flatten : ... + + Examples + -------- + + The values include null elements from sub-lists: + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 1, + 2 + ], + [], + [ + 2, + null, + 3, + 4 + ] + ] + >>> array.values + + [ + 1, + 2, + null, + 3, + 4 + ] + """ + cdef CLargeListViewArray* arr = self.ap + return pyarrow_wrap_array(arr.values()) + + @property + def offsets(self): + """ + Return the list view offsets as an int64 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `LargeListViewArray.from_arrays` and get back the + same list array if the original one has nulls. + + Returns + ------- + offsets : Int64Array + + Examples + -------- + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array.offsets + + [ + 0, + 0, + 1 + ] + """ + return pyarrow_wrap_array(( self.ap).offsets()) + + @property + def sizes(self): + """ + Return the list view sizes as an int64 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `LargeListViewArray.from_arrays` and get back the + same list array if the original one has nulls.
+ + Returns + ------- + sizes : Int64Array + + Examples + -------- + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array.sizes + + [ + 2, + 0, + 4 + ] + """ + return pyarrow_wrap_array(( self.ap).sizes()) + + def flatten(self, memory_pool=None): + """ + Unnest this LargeListViewArray by one level. + + The returned Array is logically a concatenation of all the sub-lists + in this Array. + + Note that this method is different from ``self.values`` in that + it takes care of the slicing offset as well as null elements backed + by non-empty sub-lists. + + Parameters + ---------- + memory_pool : MemoryPool, optional + + Returns + ------- + result : Array + + Examples + -------- + + >>> import pyarrow as pa + >>> values = [1, 2, 3, 4] + >>> offsets = [2, 1, 0] + >>> sizes = [2, 2, 2] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 3, + 4 + ], + [ + 2, + 3 + ], + [ + 1, + 2 + ] + ] + >>> array.flatten() + + [ + 3, + 4, + 2, + 3, + 1, + 2 + ] + """ + cdef CMemoryPool* cpool = maybe_unbox_memory_pool(memory_pool) + with nogil: + out = GetResultValue(( self.ap).Flatten(cpool)) + cdef Array result = pyarrow_wrap_array(out) + result.validate() + return result + + cdef class MapArray(ListArray): """ Concrete class for Arrow arrays of a map data type. 
@@ -3673,6 +4245,8 @@ cdef dict _array_classes = { _Type_DOUBLE: DoubleArray, _Type_LIST: ListArray, _Type_LARGE_LIST: LargeListArray, + _Type_LIST_VIEW: ListViewArray, + _Type_LARGE_LIST_VIEW: LargeListViewArray, _Type_MAP: MapArray, _Type_FIXED_SIZE_LIST: FixedSizeListArray, _Type_SPARSE_UNION: UnionArray, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index d92f09da779..f0597972cd2 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -132,6 +132,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: _Type_LIST" arrow::Type::LIST" _Type_LARGE_LIST" arrow::Type::LARGE_LIST" _Type_FIXED_SIZE_LIST" arrow::Type::FIXED_SIZE_LIST" + _Type_LIST_VIEW" arrow::Type::LIST_VIEW" + _Type_LARGE_LIST_VIEW" arrow::Type::LARGE_LIST_VIEW" _Type_STRUCT" arrow::Type::STRUCT" _Type_SPARSE_UNION" arrow::Type::SPARSE_UNION" _Type_DENSE_UNION" arrow::Type::DENSE_UNION" @@ -366,6 +368,18 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CDataType] value_type() shared_ptr[CField] value_field() + cdef cppclass CListViewType" arrow::ListViewType"(CDataType): + CListViewType(const shared_ptr[CDataType]& value_type) + CListViewType(const shared_ptr[CField]& field) + shared_ptr[CDataType] value_type() + shared_ptr[CField] value_field() + + cdef cppclass CLargeListViewType" arrow::LargeListViewType"(CDataType): + CLargeListViewType(const shared_ptr[CDataType]& value_type) + CLargeListViewType(const shared_ptr[CField]& field) + shared_ptr[CDataType] value_type() + shared_ptr[CField] value_field() + cdef cppclass CMapType" arrow::MapType"(CDataType): CMapType(const shared_ptr[CField]& key_field, const shared_ptr[CField]& item_field, c_bool keys_sorted) @@ -485,6 +499,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CDataType] run_end_type, shared_ptr[CDataType] value_type) + cdef shared_ptr[CDataType] CMakeListViewType" arrow::list_view"( + shared_ptr[CField] 
value_type) + + cdef shared_ptr[CDataType] CMakeLargeListViewType" arrow::large_list_view"( + shared_ptr[CField] value_type) + cdef cppclass CSchema" arrow::Schema": CSchema(const vector[shared_ptr[CField]]& fields) CSchema(const vector[shared_ptr[CField]]& fields, @@ -690,6 +710,70 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CArray] values() shared_ptr[CDataType] value_type() + cdef cppclass CListViewArray" arrow::ListViewArray"(CArray): + @staticmethod + CResult[shared_ptr[CArray]] FromArrays( + const CArray& offsets, + const CArray& sizes, + const CArray& values, + CMemoryPool* pool, + shared_ptr[CBuffer] null_bitmap, + ) + + @staticmethod + CResult[shared_ptr[CArray]] FromArraysAndType" FromArrays"( + shared_ptr[CDataType], + const CArray& offsets, + const CArray& sizes, + const CArray& values, + CMemoryPool* pool, + shared_ptr[CBuffer] null_bitmap, + ) + + CResult[shared_ptr[CArray]] Flatten( + CMemoryPool* pool + ) + + const int32_t* raw_value_offsets() + const int32_t* raw_value_sizes() + int32_t value_offset(int i) + int32_t value_length(int i) + shared_ptr[CArray] values() + shared_ptr[CArray] offsets() + shared_ptr[CArray] sizes() + shared_ptr[CDataType] value_type() + + cdef cppclass CLargeListViewArray" arrow::LargeListViewArray"(CArray): + @staticmethod + CResult[shared_ptr[CArray]] FromArrays( + const CArray& offsets, + const CArray& sizes, + const CArray& values, + CMemoryPool* pool, + shared_ptr[CBuffer] null_bitmap, + ) + + @staticmethod + CResult[shared_ptr[CArray]] FromArraysAndType" FromArrays"( + shared_ptr[CDataType], + const CArray& offsets, + const CArray& sizes, + const CArray& values, + CMemoryPool* pool, + shared_ptr[CBuffer] null_bitmap, + ) + + CResult[shared_ptr[CArray]] Flatten( + CMemoryPool* pool + ) + + int64_t value_offset(int i) + int64_t value_length(int i) + shared_ptr[CArray] values() + shared_ptr[CArray] offsets() + shared_ptr[CArray] sizes() + shared_ptr[CDataType] value_type() + cdef cppclass 
CMapArray" arrow::MapArray"(CArray): @staticmethod CResult[shared_ptr[CArray]] FromArrays( @@ -1150,6 +1234,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CListScalar" arrow::ListScalar"(CBaseListScalar): pass + cdef cppclass CListViewScalar" arrow::ListViewScalar"(CBaseListScalar): + pass + + cdef cppclass CLargeListViewScalar" arrow::LargeListViewScalar"(CBaseListScalar): + pass + cdef cppclass CMapScalar" arrow::MapScalar"(CListScalar): pass diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index c1104864066..48350212c20 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -120,6 +120,16 @@ cdef class LargeListType(DataType): const CLargeListType* list_type +cdef class ListViewType(DataType): + cdef: + const CListViewType* list_view_type + + +cdef class LargeListViewType(DataType): + cdef: + const CLargeListViewType* list_view_type + + cdef class MapType(DataType): cdef: const CMapType* map_type @@ -425,6 +435,14 @@ cdef class LargeListArray(BaseListArray): pass +cdef class ListViewArray(Array): + pass + + +cdef class LargeListViewArray(Array): + pass + + cdef class MapArray(ListArray): pass diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index b0368b67f79..3245e50f0fe 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -110,6 +110,8 @@ Type_BINARY_VIEW = _Type_BINARY_VIEW Type_STRING_VIEW = _Type_STRING_VIEW Type_LIST = _Type_LIST Type_LARGE_LIST = _Type_LARGE_LIST +Type_LIST_VIEW = _Type_LIST_VIEW +Type_LARGE_LIST_VIEW = _Type_LARGE_LIST_VIEW Type_MAP = _Type_MAP Type_FIXED_SIZE_LIST = _Type_FIXED_SIZE_LIST Type_STRUCT = _Type_STRUCT diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 72e16f2cec3..966273b4bea 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -87,6 +87,10 @@ cdef api object pyarrow_wrap_data_type( out = ListType.__new__(ListType) elif type.get().id() == _Type_LARGE_LIST: out = 
LargeListType.__new__(LargeListType) + elif type.get().id() == _Type_LIST_VIEW: + out = ListViewType.__new__(ListViewType) + elif type.get().id() == _Type_LARGE_LIST_VIEW: + out = LargeListViewType.__new__(LargeListViewType) elif type.get().id() == _Type_MAP: out = MapType.__new__(MapType) elif type.get().id() == _Type_FIXED_SIZE_LIST: diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 2772acf8186..cb080ba53d5 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -720,6 +720,14 @@ cdef class LargeListScalar(ListScalar): pass +cdef class ListViewScalar(ListScalar): + pass + + +cdef class LargeListViewScalar(ListScalar): + pass + + cdef class StructScalar(Scalar, collections.abc.Mapping): """ Concrete class for struct scalars. @@ -1066,6 +1074,8 @@ cdef dict _scalar_classes = { _Type_LIST: ListScalar, _Type_LARGE_LIST: LargeListScalar, _Type_FIXED_SIZE_LIST: FixedSizeListScalar, + _Type_LIST_VIEW: ListViewScalar, + _Type_LARGE_LIST_VIEW: LargeListViewScalar, _Type_STRUCT: StructScalar, _Type_MAP: MapScalar, _Type_DICTIONARY: DictionaryScalar, diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index f851d4e0b6c..bd9ae214b04 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -3573,3 +3573,74 @@ def test_run_end_encoded_from_buffers(): with pytest.raises(ValueError): pa.RunEndEncodedArray.from_buffers(ree_type, length, buffers, 1, offset, children) + + +@pytest.mark.parametrize(('list_array_type'), + [pa.ListViewArray, pa.LargeListViewArray]) +def test_list_view_from_arrays(list_array_type): + # test in order offsets, similar to ListArray representation + values = [1, 2, 3, 4, 5, 6, None, 7] + offsets = [0, 2, 4, 6] + sizes = [2, 2, 2, 2] + array = list_array_type.from_arrays(offsets, sizes, values) + + assert array.to_pylist() == [[1, 2], [3, 4], [5, 6], [None, 7]] + assert array.values.to_pylist() == values + assert array.offsets.to_pylist() == offsets 
+ assert array.sizes.to_pylist() == sizes + + # test out of order offsets with overlapping values + values = [1, 2, 3, 4] + offsets = [2, 1, 0] + sizes = [2, 2, 2] + array = list_array_type.from_arrays(offsets, sizes, values) + + assert array.to_pylist() == [[3, 4], [2, 3], [1, 2]] + assert array.values.to_pylist() == values + assert array.offsets.to_pylist() == offsets + assert array.sizes.to_pylist() == sizes + + # test null offsets and empty list values + values = [] + offsets = [0, None] + sizes = [0, 0] + array = list_array_type.from_arrays(offsets, sizes, values) + + assert array.to_pylist() == [[], None] + assert array.values.to_pylist() == values + assert array.offsets.to_pylist() == [0, 0] + assert array.sizes.to_pylist() == sizes + + # test null sizes and empty list values + values = [] + offsets = [0, 0] + sizes = [None, 0] + array = list_array_type.from_arrays(offsets, sizes, values) + + assert array.to_pylist() == [None, []] + assert array.values.to_pylist() == values + assert array.offsets.to_pylist() == offsets + assert array.sizes.to_pylist() == [0, 0] + + # test null bitmask + values = [1, 2] + offsets = [0, 0, 1] + sizes = [1, 0, 1] + mask = pa.array([False, True, False]) + array = list_array_type.from_arrays(offsets, sizes, values, mask=mask) + + assert array.to_pylist() == [[1], None, [2]] + assert array.values.to_pylist() == values + assert array.offsets.to_pylist() == offsets + assert array.sizes.to_pylist() == sizes + + +@pytest.mark.parametrize(('list_array_type'), + [pa.ListViewArray, pa.LargeListViewArray]) +def test_list_view_flatten(list_array_type): + values = [1, 2, 3, 4] + offsets = [3, 2, 1, 0] + sizes = [1, 1, 1, 1] + array = list_array_type.from_arrays(offsets, sizes, values) + + assert array.flatten().to_pylist() == [4, 3, 2, 1] diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py index 8cec8783280..39dac4eb81d 100644 --- a/python/pyarrow/tests/test_misc.py +++ b/python/pyarrow/tests/test_misc.py @@ 
-154,6 +154,8 @@ def test_set_timezone_db_path_non_windows(): pa.ListType, pa.LargeListType, pa.FixedSizeListType, + pa.ListViewType, + pa.LargeListViewType, pa.UnionType, pa.SparseUnionType, pa.DenseUnionType, @@ -227,6 +229,8 @@ def test_set_timezone_db_path_non_windows(): pa.StringViewScalar, pa.ListScalar, pa.LargeListScalar, + pa.ListViewScalar, + pa.LargeListViewScalar, pa.MapScalar, pa.FixedSizeListScalar, pa.UnionScalar, diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index eed5f045be9..074fb757e26 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -57,6 +57,9 @@ ([1, 2, 3], None, pa.ListScalar), ([1, 2, 3, 4], pa.large_list(pa.int8()), pa.LargeListScalar), ([1, 2, 3, 4, 5], pa.list_(pa.int8(), 5), pa.FixedSizeListScalar), + # TODO GH-39855 + # ([1, 2, 3], pa.list_view(pa.int8()), pa.ListViewScalar), + # ([1, 2, 3, 4], pa.large_list_view(pa.int8()), pa.LargeListViewScalar), (datetime.date.today(), None, pa.Date32Scalar), (datetime.date.today(), pa.date64(), pa.Date64Scalar), (datetime.datetime.now(), None, pa.TimestampScalar), @@ -537,7 +540,10 @@ def test_fixed_size_binary(): @pytest.mark.parametrize(('ty', 'klass'), [ (pa.list_(pa.string()), pa.ListScalar), - (pa.large_list(pa.string()), pa.LargeListScalar) + (pa.large_list(pa.string()), pa.LargeListScalar), + # TODO GH-39855 + # (pa.list_view(pa.string()), pa.ListViewScalar), + # (pa.large_list_view(pa.string()), pa.LargeListViewScalar) ]) def test_list(ty, klass): v = ['foo', None] diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index a5ab3128dc8..0add5786088 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -66,6 +66,8 @@ def get_many_types(): pa.list_(pa.int32()), pa.list_(pa.int32(), 2), pa.large_list(pa.uint16()), + pa.list_view(pa.int32()), + pa.large_list_view(pa.uint16()), pa.map_(pa.string(), pa.int32()), pa.map_(pa.field('key', 
pa.int32(), nullable=False), pa.field('value', pa.int32())), @@ -169,6 +171,18 @@ def test_is_list(): assert not types.is_list(pa.int32()) +def test_is_list_view(): + a = pa.list_view(pa.int32()) + b = pa.large_list_view(pa.int32()) + + assert types.is_list_view(a) + assert not types.is_large_list_view(a) + assert not types.is_list(a) + assert types.is_large_list_view(b) + assert not types.is_list_view(b) + assert not types.is_large_list(b) + + def test_is_map(): m = pa.map_(pa.utf8(), pa.int32()) @@ -573,6 +587,41 @@ def test_large_list_type(): pa.large_list(None) +def test_list_view_type(): + ty = pa.list_view(pa.int64()) + assert isinstance(ty, pa.ListViewType) + assert ty.value_type == pa.int64() + assert ty.value_field == pa.field("item", pa.int64(), nullable=True) + + # nullability matters in comparison + ty_non_nullable = pa.list_view(pa.field("item", pa.int64(), nullable=False)) + assert ty != ty_non_nullable + + # field names don't matter by default + ty_named = pa.list_view(pa.field("element", pa.int64())) + assert ty == ty_named + assert not ty.equals(ty_named, check_metadata=True) + + # metadata doesn't matter by default + ty_metadata = pa.list_view( + pa.field("item", pa.int64(), metadata={"hello": "world"})) + assert ty == ty_metadata + assert not ty.equals(ty_metadata, check_metadata=True) + + with pytest.raises(TypeError): + pa.list_view(None) + + +def test_large_list_view_type(): + ty = pa.large_list_view(pa.utf8()) + assert isinstance(ty, pa.LargeListViewType) + assert ty.value_type == pa.utf8() + assert ty.value_field == pa.field("item", pa.utf8(), nullable=True) + + with pytest.raises(TypeError): + pa.large_list_view(None) + + def test_map_type(): ty = pa.map_(pa.utf8(), pa.int32()) assert isinstance(ty, pa.MapType) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index ce3736b5af8..6c023f1ce44 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -557,6 +557,101 @@ cdef class LargeListType(DataType): return 
pyarrow_wrap_data_type(self.list_type.value_type()) +cdef class ListViewType(DataType): + """ + Concrete class for list view data types. + + Examples + -------- + Create an instance of ListViewType: + + >>> import pyarrow as pa + >>> pa.list_view(pa.string()) + ListViewType(list_view) + """ + + cdef void init(self, const shared_ptr[CDataType]& type) except *: + DataType.init(self, type) + self.list_view_type = type.get() + + def __reduce__(self): + return list_view, (self.value_field,) + + @property + def value_field(self): + """ + The field for list view values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_view(pa.string()).value_field + pyarrow.Field + """ + return pyarrow_wrap_field(self.list_view_type.value_field()) + + @property + def value_type(self): + """ + The data type of list view values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_view(pa.string()).value_type + DataType(string) + """ + return pyarrow_wrap_data_type(self.list_view_type.value_type()) + + +cdef class LargeListViewType(DataType): + """ + Concrete class for large list view data types + (like ListViewType, but with 64-bit offsets). + + Examples + -------- + Create an instance of LargeListViewType: + + >>> import pyarrow as pa + >>> pa.large_list_view(pa.string()) + LargeListViewType(large_list_view) + """ + + cdef void init(self, const shared_ptr[CDataType]& type) except *: + DataType.init(self, type) + self.list_view_type = type.get() + + def __reduce__(self): + return large_list_view, (self.value_field,) + + @property + def value_field(self): + """ + The field for large list view values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.large_list_view(pa.string()).value_field + pyarrow.Field + """ + return pyarrow_wrap_field(self.list_view_type.value_field()) + + @property + def value_type(self): + """ + The data type of large list view values. 
+ + Examples + -------- + >>> import pyarrow as pa + >>> pa.large_list_view(pa.string()).value_type + DataType(string) + """ + return pyarrow_wrap_data_type(self.list_view_type.value_type()) + + cdef class MapType(DataType): """ Concrete class for map data types. @@ -4528,6 +4623,82 @@ cpdef LargeListType large_list(value_type): return out +cpdef ListViewType list_view(value_type): + """ + Create ListViewType instance from child data type or field. + + This data type may not be supported by all Arrow implementations + because it is an alternative to the ListType. + + Parameters + ---------- + value_type : DataType or Field + + Returns + ------- + list_view_type : DataType + + Examples + -------- + Create an instance of ListViewType: + + >>> import pyarrow as pa + >>> pa.list_view(pa.string()) + ListViewType(list_view) + """ + cdef: + Field _field + shared_ptr[CDataType] list_view_type + + if isinstance(value_type, DataType): + _field = field('item', value_type) + elif isinstance(value_type, Field): + _field = value_type + else: + raise TypeError('ListView requires DataType or Field') + + list_view_type = CMakeListViewType(_field.sp_field) + return pyarrow_wrap_data_type(list_view_type) + + +cpdef LargeListViewType large_list_view(value_type): + """ + Create LargeListViewType instance from child data type or field. + + This data type may not be supported by all Arrow implementations + because it is an alternative to the ListType. 
+ + Parameters + ---------- + value_type : DataType or Field + + Returns + ------- + list_view_type : DataType + + Examples + -------- + Create an instance of LargeListViewType: + + >>> import pyarrow as pa + >>> pa.large_list_view(pa.int8()) + LargeListViewType(large_list_view) + """ + cdef: + Field _field + shared_ptr[CDataType] list_view_type + + if isinstance(value_type, DataType): + _field = field('item', value_type) + elif isinstance(value_type, Field): + _field = value_type + else: + raise TypeError('LargeListView requires DataType or Field') + + list_view_type = CMakeLargeListViewType(_field.sp_field) + return pyarrow_wrap_data_type(list_view_type) + + cpdef MapType map_(key_type, item_type, keys_sorted=False): """ Create MapType instance from key and item data types or fields. diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py index 32398dac9c5..0f68ca9fe57 100644 --- a/python/pyarrow/types.py +++ b/python/pyarrow/types.py @@ -151,6 +151,16 @@ def is_fixed_size_list(t): return t.id == lib.Type_FIXED_SIZE_LIST +@doc(is_null, datatype="list view") +def is_list_view(t): + return t.id == lib.Type_LIST_VIEW + + +@doc(is_null, datatype="large list view") +def is_large_list_view(t): + return t.id == lib.Type_LARGE_LIST_VIEW + + @doc(is_null, datatype="struct") def is_struct(t): return t.id == lib.Type_STRUCT