From 13e0e40420e8ea68c0bc5558c4fe000560e31999 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Fri, 26 Jan 2024 17:56:06 -0500 Subject: [PATCH 01/18] GH-39812: [Python] Add ListView and LargeListView --- docs/source/python/api/arrays.rst | 4 ++++ docs/source/python/api/datatypes.rst | 4 ++++ python/pyarrow/__init__.py | 13 ++++++++----- python/pyarrow/array.pxi | 18 ++++++++++++++++++ python/pyarrow/includes/libarrow.pxd | 20 ++++++++++++++++++++ python/pyarrow/lib.pxd | 12 ++++++++++++ python/pyarrow/lib.pyx | 2 ++ python/pyarrow/public-api.pxi | 4 ++++ python/pyarrow/scalar.pxi | 12 ++++++++++++ python/pyarrow/types.pxi | 16 ++++++++++++++++ python/pyarrow/types.py | 10 ++++++++++ 11 files changed, 110 insertions(+), 5 deletions(-) diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst index b858862dcff..e6f6c3dbbd3 100644 --- a/docs/source/python/api/arrays.rst +++ b/docs/source/python/api/arrays.rst @@ -77,6 +77,8 @@ may expose data type-specific methods or properties. ListArray FixedSizeListArray LargeListArray + ListViewArray + LargeListViewArray MapArray RunEndEncodedArray StructArray @@ -135,6 +137,8 @@ classes may expose data type-specific methods or properties. RunEndEncodedScalar ListScalar LargeListScalar + ListViewScalar + LargeListViewScalar MapScalar StructScalar UnionScalar diff --git a/docs/source/python/api/datatypes.rst b/docs/source/python/api/datatypes.rst index 642c243b21a..62bf4b77235 100644 --- a/docs/source/python/api/datatypes.rst +++ b/docs/source/python/api/datatypes.rst @@ -60,6 +60,8 @@ These should be used to create Arrow data types and schemas. decimal128 list_ large_list + list_view + large_list_view map_ struct dictionary @@ -149,6 +151,8 @@ represents a given data type (such as ``int32``) or general category is_list is_large_list is_fixed_size_list + is_list_view + is_large_list_view is_struct is_union is_nested diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 4dbd1258d3c..1c5a3c033e6 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -166,7 +166,7 @@ def print_entry(label, value): binary, string, utf8, binary_view, string_view, large_binary, large_string, large_utf8, decimal128, decimal256, - list_, large_list, map_, struct, + list_, large_list, list_view, large_list_view, map_, struct, union, sparse_union, dense_union, dictionary, run_end_encoded, @@ -174,8 +174,9 @@ def print_entry(label, value): field, type_for_alias, DataType, DictionaryType, StructType, - ListType, LargeListType, MapType, FixedSizeListType, - UnionType, SparseUnionType, DenseUnionType, + ListType, LargeListType, FixedSizeListType, + ListViewType, LargeListViewType, + MapType, UnionType, SparseUnionType, DenseUnionType, TimestampType, Time32Type, Time64Type, DurationType, FixedSizeBinaryType, Decimal128Type, Decimal256Type, BaseExtensionType, ExtensionType, @@ -201,8 +202,9 @@ def print_entry(label, value): Int32Array, UInt32Array, Int64Array, UInt64Array, HalfFloatArray, FloatArray, DoubleArray, - ListArray, LargeListArray, MapArray, - FixedSizeListArray, UnionArray, + ListArray, LargeListArray, FixedSizeListArray, + ListViewArray, LargeListViewArray, + MapArray, UnionArray, BinaryArray, StringArray, LargeBinaryArray, LargeStringArray, BinaryViewArray, StringViewArray, @@ -220,6 +222,7 @@ def print_entry(label, value): HalfFloatScalar, FloatScalar, DoubleScalar, Decimal128Scalar, Decimal256Scalar, ListScalar, LargeListScalar, FixedSizeListScalar, + ListViewScalar, LargeListViewScalar, Date32Scalar, Date64Scalar, Time32Scalar, Time64Scalar, TimestampScalar, DurationScalar, diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 1029f3a6298..71f0f2bfdd1 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2460,6 +2460,24 @@ cdef class LargeListArray(BaseListArray): return pyarrow_wrap_array(( self.ap).offsets()) +cdef class BaseListViewArray(Array): + pass + + +cdef class ListViewArray(BaseListViewArray): + """ + Concrete class for Arrow arrays of a list view data type. + """ + + +cdef class LargeListViewArray(BaseListViewArray): + """ + Concrete class for Arrow arrays of a large list view data type. + + Identical to ListViewArray, but with 64-bit offsets. + """ + + cdef class MapArray(ListArray): """ Concrete class for Arrow arrays of a map data type. diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index d92f09da779..b01d2a12c0e 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -132,6 +132,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: _Type_LIST" arrow::Type::LIST" _Type_LARGE_LIST" arrow::Type::LARGE_LIST" _Type_FIXED_SIZE_LIST" arrow::Type::FIXED_SIZE_LIST" + _Type_LIST_VIEW" arrow::Type::LIST_VIEW" + _Type_LARGE_LIST_VIEW" arrow::Type::LARGE_LIST_VIEW" _Type_STRUCT" arrow::Type::STRUCT" _Type_SPARSE_UNION" arrow::Type::SPARSE_UNION" _Type_DENSE_UNION" arrow::Type::DENSE_UNION" @@ -366,6 +368,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CDataType] value_type() shared_ptr[CField] value_field() + cdef cppclass CListViewType" arrow::ListViewType"(CDataType): + pass + + cdef cppclass CLargeListViewType" arrow::LargeListViewType"(CDataType): + pass + cdef cppclass CMapType" arrow::MapType"(CDataType): CMapType(const shared_ptr[CField]& key_field, const shared_ptr[CField]& item_field, c_bool keys_sorted) @@ -690,6 +698,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CArray] values() shared_ptr[CDataType] value_type() + cdef cppclass CListViewArray" arrow::ListViewArray"(CArray): + pass + + cdef cppclass CLargeListViewArray" arrow::LargeListViewArray"(CArray): + pass + cdef cppclass CMapArray" arrow::MapArray"(CArray): @staticmethod CResult[shared_ptr[CArray]] FromArrays( @@ -1150,6 +1164,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CListScalar" arrow::ListScalar"(CBaseListScalar): pass + cdef cppclass CListViewScalar" arrow::ListViewScalar"(CBaseListScalar): + pass + + cdef cppclass CLargeListViewScalar" arrow::LargeListViewScalar"(CBaseListScalar): + pass + cdef cppclass CMapScalar" arrow::MapScalar"(CListScalar): pass diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index c1104864066..6929a758f7e 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -425,6 +425,18 @@ cdef class LargeListArray(BaseListArray): pass +cdef class BaseListViewArray(Array): + pass + + +cdef class ListViewArray(BaseListViewArray): + pass + + +cdef class LargeListViewArray(BaseListViewArray): + pass + + cdef class MapArray(ListArray): pass diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index b0368b67f79..3245e50f0fe 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -110,6 +110,8 @@ Type_BINARY_VIEW = _Type_BINARY_VIEW Type_STRING_VIEW = _Type_STRING_VIEW Type_LIST = _Type_LIST Type_LARGE_LIST = _Type_LARGE_LIST +Type_LIST_VIEW = _Type_LIST_VIEW +Type_LARGE_LIST_VIEW = _Type_LARGE_LIST_VIEW Type_MAP = _Type_MAP Type_FIXED_SIZE_LIST = _Type_FIXED_SIZE_LIST Type_STRUCT = _Type_STRUCT diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 72e16f2cec3..966273b4bea 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -87,6 +87,10 @@ cdef api object pyarrow_wrap_data_type( out = ListType.__new__(ListType) elif type.get().id() == _Type_LARGE_LIST: out = LargeListType.__new__(LargeListType) + elif type.get().id() == _Type_LIST_VIEW: + out = ListViewType.__new__(ListViewType) + elif type.get().id() == _Type_LARGE_LIST_VIEW: + out = LargeListViewType.__new__(LargeListViewType) elif type.get().id() == _Type_MAP: out = MapType.__new__(MapType) elif type.get().id() == _Type_FIXED_SIZE_LIST: diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 2772acf8186..cf8750c272f 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -720,6 +720,16 @@ cdef class LargeListScalar(ListScalar): pass +cdef class ListViewScalar(Scalar): + """ + Concrete class for list view scalars. + """ + + +cdef class LargeListViewScalar(ListViewScalar): + pass + + cdef class StructScalar(Scalar, collections.abc.Mapping): """ Concrete class for struct scalars. @@ -1066,6 +1076,8 @@ cdef dict _scalar_classes = { _Type_LIST: ListScalar, _Type_LARGE_LIST: LargeListScalar, _Type_FIXED_SIZE_LIST: FixedSizeListScalar, + _Type_LIST_VIEW: ListViewScalar, + _Type_LARGE_LIST_VIEW: LargeListViewScalar, _Type_STRUCT: StructScalar, _Type_MAP: MapScalar, _Type_DICTIONARY: DictionaryScalar, diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index ce3736b5af8..15c76531b7a 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -557,6 +557,14 @@ cdef class LargeListType(DataType): return pyarrow_wrap_data_type(self.list_type.value_type()) +cdef class ListViewType(DataType): + pass + + +cdef class LargeListViewType(DataType): + pass + + cdef class MapType(DataType): """ Concrete class for map data types. @@ -4528,6 +4536,14 @@ cpdef LargeListType large_list(value_type): return out +cpdef ListViewType list_view(): + pass + + +cpdef LargeListViewType large_list_view(): + pass + + cpdef MapType map_(key_type, item_type, keys_sorted=False): """ Create MapType instance from key and item data types or fields. diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py index 32398dac9c5..0f68ca9fe57 100644 --- a/python/pyarrow/types.py +++ b/python/pyarrow/types.py @@ -151,6 +151,16 @@ def is_fixed_size_list(t): return t.id == lib.Type_FIXED_SIZE_LIST +@doc(is_null, datatype="list view") +def is_list_view(t): + return t.id == lib.Type_LIST_VIEW + + +@doc(is_null, datatype="large list view") +def is_large_list_view(t): + return t.id == lib.Type_LARGE_LIST_VIEW + + @doc(is_null, datatype="struct") def is_struct(t): return t.id == lib.Type_STRUCT From f8fa96b9f1d8ddc7ff35be71a6e910e88fc26726 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Mon, 29 Jan 2024 15:00:34 -0500 Subject: [PATCH 02/18] Add ListViewType and LargeListViewType --- python/pyarrow/includes/libarrow.pxd | 16 ++- python/pyarrow/lib.pxd | 10 ++ python/pyarrow/tests/test_types.py | 49 ++++++++ python/pyarrow/types.pxi | 161 ++++++++++++++++++++++++++- 4 files changed, 228 insertions(+), 8 deletions(-) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index b01d2a12c0e..dff8cffeafe 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -369,10 +369,16 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CField] value_field() cdef cppclass CListViewType" arrow::ListViewType"(CDataType): - pass + CListViewType(const shared_ptr[CDataType]& value_type) + CListViewType(const shared_ptr[CField]& field) + shared_ptr[CDataType] value_type() + shared_ptr[CField] value_field() cdef cppclass CLargeListViewType" arrow::LargeListViewType"(CDataType): - pass + CLargeListViewType(const shared_ptr[CDataType]& value_type) + CLargeListViewType(const shared_ptr[CField]& field) + shared_ptr[CDataType] value_type() + shared_ptr[CField] value_field() cdef cppclass CMapType" arrow::MapType"(CDataType): CMapType(const shared_ptr[CField]& key_field, @@ -493,6 +499,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CDataType] run_end_type, shared_ptr[CDataType] value_type) + cdef shared_ptr[CDataType] CMakeListViewType" arrow::list_view"( + shared_ptr[CField] value_type) + + cdef shared_ptr[CDataType] CMakeLargeListViewType" arrow::large_list_view"( + shared_ptr[CField] value_type) + cdef cppclass CSchema" arrow::Schema": CSchema(const vector[shared_ptr[CField]]& fields) CSchema(const vector[shared_ptr[CField]]& fields, diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 6929a758f7e..63f75747db3 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -120,6 +120,16 @@ cdef class LargeListType(DataType): const CLargeListType* list_type +cdef class ListViewType(DataType): + cdef: + const CListViewType* list_view_type + + +cdef class LargeListViewType(DataType): + cdef: + const CLargeListViewType* list_view_type + + cdef class MapType(DataType): cdef: const CMapType* map_type diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index a5ab3128dc8..75204a6d6be 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -66,6 +66,8 @@ def get_many_types(): pa.list_(pa.int32()), pa.list_(pa.int32(), 2), pa.large_list(pa.uint16()), + pa.list_view(pa.int32()), + pa.large_list_view(pa.uint16()), pa.map_(pa.string(), pa.int32()), pa.map_(pa.field('key', pa.int32(), nullable=False), pa.field('value', pa.int32())), @@ -169,6 +171,18 @@ def test_is_list(): assert not types.is_list(pa.int32()) +def test_is_list_view(): + a = pa.list_view(pa.int32()) + b = pa.large_list_view(pa.int32()) + + assert types.is_list_view(a) + assert not types.is_large_list_view(a) + assert not types.is_list(a) + assert types.is_large_list_view(b) + assert not types.is_list_view(b) + assert not types.is_large_list(b) + + def test_is_map(): m = pa.map_(pa.utf8(), pa.int32()) @@ -573,6 +587,41 @@ def test_large_list_type(): pa.large_list(None) +def test_list_view_type(): + ty = pa.list_view(pa.int64()) + assert isinstance(ty, pa.ListViewType) + assert ty.value_type == pa.int64() + assert ty.value_field == pa.field("item", pa.int64(), nullable=True) + + # nullability matters in comparison + ty_non_nullable = pa.list_view(pa.field("item", pa.int64(), nullable=False)) + assert ty != ty_non_nullable + + # field names don't matter by default + ty_named = pa.list_view(pa.field("element", pa.int64())) + assert ty == ty_named + assert not ty.equals(ty_named, check_metadata=True) + + # metadata doesn't matter by default + ty_metadata = pa.list_view( + pa.field("item", pa.int64(), metadata={"hello": "world"})) + assert ty == ty_metadata + assert not ty.equals(ty_metadata, check_metadata=True) + + with pytest.raises(TypeError): + pa.list_view(None) + + +def test_large_list_type(): + ty = pa.large_list_view(pa.utf8()) + assert isinstance(ty, pa.LargeListViewType) + assert ty.value_type == pa.utf8() + assert ty.value_field == pa.field("item", pa.utf8(), nullable=True) + + with pytest.raises(TypeError): + pa.large_list_view(None) + + def test_map_type(): ty = pa.map_(pa.utf8(), pa.int32()) assert isinstance(ty, pa.MapType) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 15c76531b7a..95ac7e84bd1 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -558,11 +558,98 @@ cdef class LargeListType(DataType): cdef class ListViewType(DataType): - pass + """ + Concrete class for list view data types. + + Examples + -------- + Create an instance of ListViewType: + + >>> import pyarrow as pa + >>> pa.list_view(pa.string()) + ListViewType(list_view) + """ + + cdef void init(self, const shared_ptr[CDataType]& type) except *: + DataType.init(self, type) + self.list_view_type = type.get() + + def __reduce__(self): + return list_view, (self.value_field,) + + @property + def value_field(self): + """ + The field for list view values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_view(pa.string()).value_field + pyarrow.Field + """ + return pyarrow_wrap_field(self.list_view_type.value_field()) + + @property + def value_type(self): + """ + The data type of list view values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_view(pa.string()).value_type + DataType(string) + """ + return pyarrow_wrap_data_type(self.list_view_type.value_type()) cdef class LargeListViewType(DataType): - pass + """ + Concrete class for large list view data types + (like ListViewType, but with 64-bit offsets). + + Examples + -------- + Create an instance of LargeListViewType: + + >>> import pyarrow as pa + >>> pa.large_list_view(pa.string()) + LargeListViewType(large_list_view) + """ + + cdef void init(self, const shared_ptr[CDataType]& type) except *: + DataType.init(self, type) + self.list_view_type = type.get() + + def __reduce__(self): + return large_list_view, (self.value_field,) + + @property + def value_field(self): + """ + The field for large list view values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.large_list_view(pa.string()).value_field + pyarrow.Field + """ + return pyarrow_wrap_field(self.list_view_type.value_field()) + + @property + def value_type(self): + """ + The data type of large list view values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.large_list_view(pa.string()).value_type + DataType(string) + """ + return pyarrow_wrap_data_type(self.list_view_type.value_type()) cdef class MapType(DataType): @@ -4536,12 +4623,74 @@ cpdef LargeListType large_list(value_type): return out -cpdef ListViewType list_view(): - pass +cpdef ListViewType list_view(value_type): + """ + Create ListViewType instance from child data type or field. + + Parameters + ---------- + value_type : DataType or Field + + Returns + ------- + list_view_type : DataType + + Examples + -------- + Create an instance of ListViewType: + + TODO + """ + cdef: + Field _field + shared_ptr[CDataType] list_view_type + if isinstance(value_type, DataType): + _field = field('item', value_type) + elif isinstance(value_type, Field): + _field = value_type + else: + raise TypeError('ListView requires DataType or Field') + + list_view_type = CMakeListViewType(_field.sp_field) + return pyarrow_wrap_data_type(list_view_type) + + +cpdef LargeListViewType large_list_view(value_type): + """ + Create LargeListViewType instance from child data type or field. + + This data type may not be supported by all Arrow implementations. + Unless you need to represent data larger than 2**31 elements, you should + prefer list_view(). + + Parameters + ---------- + value_type : DataType or Field + + Returns + ------- + list_view_type : DataType + + Examples + -------- + Create an instance of LargeListViewType: + + TODO + """ + cdef: + Field _field + shared_ptr[CDataType] list_view_type + + if isinstance(value_type, DataType): + _field = field('item', value_type) + elif isinstance(value_type, Field): + _field = value_type + else: + raise TypeError('LargeListView requires DataType or Field') -cpdef LargeListViewType large_list_view(): - pass + list_view_type = CMakeLargeListViewType(_field.sp_field) + return pyarrow_wrap_data_type(list_view_type) cpdef MapType map_(key_type, item_type, keys_sorted=False): From 03949631c76d181eafa5761dec313dc60bdfb2c1 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Tue, 30 Jan 2024 10:59:45 -0500 Subject: [PATCH 03/18] Implement ListViewScalar and LargeListViewScalar --- python/pyarrow/__init__.py | 3 ++- python/pyarrow/scalar.pxi | 8 +++----- python/pyarrow/src/arrow/python/python_to_arrow.cc | 4 +++- python/pyarrow/tests/test_misc.py | 4 ++++ python/pyarrow/tests/test_scalars.py | 9 ++++++++- python/pyarrow/types.pxi | 8 ++++++-- 6 files changed, 26 insertions(+), 10 deletions(-) diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 1c5a3c033e6..2ee97ddb662 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -166,7 +166,8 @@ def print_entry(label, value): binary, string, utf8, binary_view, string_view, large_binary, large_string, large_utf8, decimal128, decimal256, - list_, large_list, list_view, large_list_view, map_, struct, + list_, large_list, list_view, large_list_view, + map_, struct, union, sparse_union, dense_union, dictionary, run_end_encoded, diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index cf8750c272f..cb080ba53d5 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -720,13 +720,11 @@ cdef class LargeListScalar(ListScalar): pass -cdef class ListViewScalar(Scalar): - """ - Concrete class for list view scalars. - """ +cdef class ListViewScalar(ListScalar): + pass -cdef class LargeListViewScalar(ListViewScalar): +cdef class LargeListViewScalar(ListScalar): pass diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index 3c4d59d6594..45567935e7b 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -20,6 +20,7 @@ #include +#include #include #include #include @@ -1227,9 +1228,10 @@ Result> ConvertPySequence(PyObject* obj, PyObject* options.strict = true; } DCHECK_GE(size, 0); - + std::cout << "HELLO1" << std::endl; ARROW_ASSIGN_OR_RAISE(auto converter, (MakeConverter( options.type, options, pool))); + std::cout << "HELLO2" << std::endl; if (converter->may_overflow()) { // The converter hierarchy contains binary- or list-like builders which can overflow // depending on the input values. Wrap the converter with a chunker which detects diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py index 8cec8783280..39dac4eb81d 100644 --- a/python/pyarrow/tests/test_misc.py +++ b/python/pyarrow/tests/test_misc.py @@ -154,6 +154,8 @@ def test_set_timezone_db_path_non_windows(): pa.ListType, pa.LargeListType, pa.FixedSizeListType, + pa.ListViewType, + pa.LargeListViewType, pa.UnionType, pa.SparseUnionType, pa.DenseUnionType, @@ -227,6 +229,8 @@ def test_set_timezone_db_path_non_windows(): pa.StringViewScalar, pa.ListScalar, pa.LargeListScalar, + pa.ListViewScalar, + pa.LargeListViewScalar, pa.MapScalar, pa.FixedSizeListScalar, pa.UnionScalar, diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index eed5f045be9..8217d9c22c3 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -57,6 +57,9 @@ ([1, 2, 3], None, pa.ListScalar), ([1, 2, 3, 4], pa.large_list(pa.int8()), pa.LargeListScalar), ([1, 2, 3, 4, 5], pa.list_(pa.int8(), 5), pa.FixedSizeListScalar), + # TODO GH-39855 + # ([1, 2, 3], pa.list_view(pa.int8()), pa.ListViewScalar), + # ([1, 2, 3, 4], pa.large_list_view(pa.int8()), pa.LargeListViewScalar), (datetime.date.today(), None, pa.Date32Scalar), (datetime.date.today(), pa.date64(), pa.Date64Scalar), (datetime.datetime.now(), None, pa.TimestampScalar), @@ -537,9 +540,13 @@ def test_fixed_size_binary(): @pytest.mark.parametrize(('ty', 'klass'), [ (pa.list_(pa.string()), pa.ListScalar), - (pa.large_list(pa.string()), pa.LargeListScalar) + (pa.large_list(pa.string()), pa.LargeListScalar), + # TODO GH-39855 + # (pa.list_view(pa.string()), pa.ListViewScalar), + # (pa.large_list_view(pa.string()), pa.LargeListViewScalar) ]) def test_list(ty, klass): + breakpoint() v = ['foo', None] s = pa.scalar(v, type=ty) assert s.type == ty diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 95ac7e84bd1..67f8c897ead 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -4639,7 +4639,9 @@ cpdef ListViewType list_view(value_type): -------- Create an instance of ListViewType: - TODO + >>> import pyarrow as pa + >>> pa.list_view(pa.string()) + ListViewType(list) """ cdef: Field _field @@ -4676,7 +4678,9 @@ cpdef LargeListViewType large_list_view(value_type): -------- Create an instance of LargeListViewType: - TODO + >>> import pyarrow as pa + >>> pa.large_list_view(pa.int8()) + LargeListViewType(large_list) """ cdef: Field _field From c03abe290c469f7b39df9715b3a9e888229bb958 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Tue, 30 Jan 2024 11:01:26 -0500 Subject: [PATCH 04/18] Remove debug statement --- python/pyarrow/src/arrow/python/python_to_arrow.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index 45567935e7b..1b49ef26253 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -1228,7 +1228,7 @@ Result> ConvertPySequence(PyObject* obj, PyObject* options.strict = true; } DCHECK_GE(size, 0); - std::cout << "HELLO1" << std::endl; + ARROW_ASSIGN_OR_RAISE(auto converter, (MakeConverter( options.type, options, pool))); std::cout << "HELLO2" << std::endl; From bd81932771e099e8afdf9a454c6a1c7cda4eb0d1 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Tue, 30 Jan 2024 11:02:37 -0500 Subject: [PATCH 05/18] Remove debug include --- python/pyarrow/src/arrow/python/python_to_arrow.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index 1b49ef26253..ae4b0bf5594 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -20,7 +20,6 @@ #include -#include #include #include #include From 6248f6b447cf8891c8058ee6b32f26b3c26de5a8 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Tue, 30 Jan 2024 11:47:50 -0500 Subject: [PATCH 06/18] Remove debug statement --- python/pyarrow/src/arrow/python/python_to_arrow.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index ae4b0bf5594..fee9c73863b 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -1227,10 +1227,8 @@ Result> ConvertPySequence(PyObject* obj, PyObject* options.strict = true; } DCHECK_GE(size, 0); - ARROW_ASSIGN_OR_RAISE(auto converter, (MakeConverter( options.type, options, pool))); - std::cout << "HELLO2" << std::endl; if (converter->may_overflow()) { // The converter hierarchy contains binary- or list-like builders which can overflow // depending on the input values. Wrap the converter with a chunker which detects From d1f6ab1153efb922434f55265a14a4632198ffa2 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Tue, 30 Jan 2024 13:43:54 -0500 Subject: [PATCH 07/18] Remove breakpoint --- python/pyarrow/tests/test_scalars.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index 8217d9c22c3..074fb757e26 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -546,7 +546,6 @@ def test_fixed_size_binary(): # (pa.large_list_view(pa.string()), pa.LargeListViewScalar) ]) def test_list(ty, klass): - breakpoint() v = ['foo', None] s = pa.scalar(v, type=ty) assert s.type == ty From bfe5670bddcb63ac0aeea586cff21fd02f18fb21 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Tue, 30 Jan 2024 15:48:28 -0500 Subject: [PATCH 08/18] Fix doctests --- python/pyarrow/types.pxi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 67f8c897ead..dccf5364b17 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -4641,7 +4641,7 @@ cpdef ListViewType list_view(value_type): >>> import pyarrow as pa >>> pa.list_view(pa.string()) - ListViewType(list) + ListViewType(list_view) """ cdef: Field _field @@ -4680,7 +4680,7 @@ cpdef LargeListViewType large_list_view(value_type): >>> import pyarrow as pa >>> pa.large_list_view(pa.int8()) - LargeListViewType(large_list) + LargeListViewType(large_list_view) """ cdef: Field _field From ce1e3da64665c0f1aa7039f53338af7a97248185 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Wed, 31 Jan 2024 17:24:21 -0500 Subject: [PATCH 09/18] Add ListViewArray and LargeListViewArray --- python/pyarrow/array.pxi | 385 ++++++++++++++++++++++++++- python/pyarrow/includes/libarrow.pxd | 54 +++- python/pyarrow/lib.pxd | 8 +- python/pyarrow/tests/test_array.py | 26 ++ python/pyarrow/tests/test_types.py | 2 +- 5 files changed, 460 insertions(+), 15 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 71f0f2bfdd1..89d26c763f3 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2460,22 +2460,393 @@ cdef class LargeListArray(BaseListArray): return pyarrow_wrap_array(( self.ap).offsets()) -cdef class BaseListViewArray(Array): - pass - - -cdef class ListViewArray(BaseListViewArray): +cdef class ListViewArray(Array): """ Concrete class for Arrow arrays of a list view data type. """ + @staticmethod + def from_arrays(offsets, sizes, values, DataType type=None, MemoryPool pool=None, mask=None): + """ + Construct ListViewArray from arrays of int32 offsets, sizes, and values. + + Parameters + ---------- + offsets : Array (int32 type) + sizes : Array (int32 type) + values : Array (any type) + type : DataType, optional + If not specified, a default ListType with the values' type is + used. + pool : MemoryPool, optional + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). + + Returns + ------- + list_view_array : ListViewArray + + Examples + -------- + >>> import pyarrow as pa + >>> values = pa.array([1, 2, 3, 4]) + >>> offsets = pa.array([0, 1, 2]) + >>> sizes = pa.array([2, 2, 2]) + >>> pa.ListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + [ + 2, + 3 + ], + [ + 3, + 4 + ] + ] + >>> # nulls in the offsets array become null lists + >>> offsets = pa.array([0, None, 2]) + >>> sizes = pa.array([2, 0, 2]) + >>> pa.ListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + """ + cdef: + Array _offsets, _sizes, _values + shared_ptr[CArray] out + shared_ptr[CBuffer] c_mask + cdef CMemoryPool* cpool = maybe_unbox_memory_pool(pool) + + _offsets = asarray(offsets, type='int32') + _sizes = asarray(sizes, type='int32') + _values = asarray(values) -cdef class LargeListViewArray(BaseListViewArray): + c_mask = c_mask_inverted_from_obj(mask, pool) + + if type is not None: + with nogil: + out = GetResultValue( + CListViewArray.FromArraysAndType( + type.sp_type, _offsets.ap[0], _sizes.ap[0], _values.ap[0], cpool, c_mask)) + else: + with nogil: + out = GetResultValue( + CListViewArray.FromArrays( + _offsets.ap[0], _sizes.ap[0], _values.ap[0], cpool, c_mask)) + cdef Array result = pyarrow_wrap_array(out) + result.validate() + return result + + @property + def values(self): + """ + Return the underlying array of values which backs the ListViewArray + ignoring the array's offset and sizes. + + If any of the list elements are null, but are backed by a + non-empty sub-list, those elements will be included in the + output. + + Returns + ------- + values : Array + + Examples + -------- + The values include null elements from sub-lists: + + >>> import pyarrow as pa + >>> values = [[1, 2], None, [3, 4, None, 6]] + >>> offsets = [0, None, 2] + >>> sizes = [2, 0, 4] + >>> array = pa.ListViewArray(offsets, sizes, values) + >>> array.values + + [ + 1, + 2, + 3, + 4, + null, + 6 + ] + """ + cdef CListViewArray* arr = self.ap + return pyarrow_wrap_array(arr.values()) + + @property + def offsets(self): + """ + Return the list offsets as an int32 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `ListViewArray.from_arrays` and get back the same + list array if the original one has nulls. + + Returns + ------- + offsets : Int32Array + + Examples + -------- + >>> import pyarrow as pa + >>> values = [[1, 2], None, [3, 4, None, 6]] + >>> offsets = [0, None, 2] + >>> sizes = [2, 0, 4] + >>> array = pa.ListViewArray(offsets, sizes, values) + >>> array.offsets + + [ + 0, + 0, + 2 + ] + """ + return pyarrow_wrap_array(( self.ap).offsets()) + + @property + def sizes(self): + """ + Return the sizes offsets as an int32 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `ListViewArray.from_arrays` and get back the same + list array if the original one has nulls. + + Returns + ------- + sizes : Int32Array + + Examples + -------- + >>> import pyarrow as pa + >>> values = [[1, 2], None, [3, 4, None, 6]] + >>> offsets = [0, None, 2] + >>> sizes = [2, 0, 4] + >>> array = pa.ListViewArray(offsets, sizes, values) + >>> array.offsets + + [ + 2, + 0, + 4 + ] + """ + return pyarrow_wrap_array(( self.ap).sizes()) + +cdef class LargeListViewArray(Array): """ Concrete class for Arrow arrays of a large list view data type. Identical to ListViewArray, but with 64-bit offsets. """ + @staticmethod + def from_arrays(offsets, sizes, values, DataType type=None, MemoryPool pool=None, mask=None): + """ + Construct LargeListViewArray from arrays of int64 offsets and values. + + Parameters + ---------- + offsets : Array (int64 type) + sizes : Array (int64 type) + values : Array (any type) + type : DataType, optional + If not specified, a default ListType with the values' type is + used. + pool : MemoryPool, optional + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). + + Returns + ------- + list_view_array : LargeListViewArray + + Examples + -------- + >>> import pyarrow as pa + >>> values = pa.array([1, 2, 3, 4]) + >>> offsets = pa.array([0, 1, 2]) + >>> sizes = pa.array([2, 2, 2]) + >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + [ + 2, + 3 + ], + [ + 3, + 4 + ] + ] + >>> # nulls in the offsets array become null lists + >>> offsets = pa.array([0, None, 2]) + >>> sizes = pa.array([2, 0, 2]) + >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + """ + cdef: + Array _offsets, _sizes, _values + shared_ptr[CArray] out + shared_ptr[CBuffer] c_mask + + cdef CMemoryPool* cpool = maybe_unbox_memory_pool(pool) + + _offsets = asarray(offsets, type='int64') + _sizes = asarray(sizes, type='int64') + _values = asarray(values) + + c_mask = c_mask_inverted_from_obj(mask, pool) + + if type is not None: + with nogil: + out = GetResultValue( + CLargeListViewArray.FromArraysAndType( + type.sp_type, _offsets.ap[0], _sizes.ap[0], _values.ap[0], cpool, c_mask)) + else: + with nogil: + out = GetResultValue( + CLargeListViewArray.FromArrays( + _offsets.ap[0], _sizes.ap[0], _values.ap[0], cpool, c_mask)) + cdef Array result = pyarrow_wrap_array(out) + result.validate() + return result + + @property + def values(self): + """ + Return the underlying array of values which backs the LargeListArray + ignoring the array's offset. + + If any of the list elements are null, but are backed by a + non-empty sub-list, those elements will be included in the + output. + + Compare with :meth:`flatten`, which returns only the non-null + values taking into consideration the array's offset. + + Returns + ------- + values : Array + + See Also + -------- + LargeListArray.flatten : ... + + Examples + -------- + + The values include null elements from sub-lists: + + >>> import pyarrow as pa + >>> values = [[1, 2], None, [3, 4, None, 6]] + >>> offsets = [0, None, 2] + >>> sizes = [2, 0, 4] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array.values + + [ + 1, + 2, + 3, + 4, + null, + 6 + ] + """ + cdef CLargeListViewArray* arr = self.ap + return pyarrow_wrap_array(arr.values()) + + @property + def offsets(self): + """ + Return the list view offsets as an int64 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `LargeListViewArray.from_arrays` and get back the + same list array if the original one has nulls. + + Returns + ------- + offsets : Int64Array + + Examples + -------- + + >>> import pyarrow as pa + >>> values = [[1, 2], None, [3, 4, None, 6]] + >>> offsets = [0, None, 2] + >>> sizes = [2, 0, 4] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array.offsets + + [ + 0, + 0, + 2 + ] + """ + return pyarrow_wrap_array(( self.ap).offsets()) + + @property + def sizes(self): + """ + Return the list view sizes as an int64 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `LargeListViewArray.from_arrays` and get back the + same list array if the original one has nulls. + + Returns + ------- + sizes : Int64Array + + Examples + -------- + + >>> import pyarrow as pa + >>> values = [[1, 2], None, [3, 4, None, 6]] + >>> offsets = [0, None, 2] + >>> sizes = [2, 0, 4] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array.offsets + + [ + 2, + 0, + 4 + ] + """ + return pyarrow_wrap_array(( self.ap).sizes()) cdef class MapArray(ListArray): @@ -3691,6 +4062,8 @@ cdef dict _array_classes = { _Type_DOUBLE: DoubleArray, _Type_LIST: ListArray, _Type_LARGE_LIST: LargeListArray, + _Type_LIST_VIEW: ListViewArray, + _Type_LARGE_LIST_VIEW: LargeListViewArray, _Type_MAP: MapArray, _Type_FIXED_SIZE_LIST: FixedSizeListArray, _Type_SPARSE_UNION: UnionArray, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index dff8cffeafe..3b1258e2f3a 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -711,10 +711,60 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CDataType] value_type() cdef cppclass CListViewArray" arrow::ListViewArray"(CArray): - pass + @staticmethod + CResult[shared_ptr[CArray]] FromArrays( + const CArray& offsets, + const CArray& sizes, + const CArray& values, + CMemoryPool* pool, + shared_ptr[CBuffer] null_bitmap, + ) + + @staticmethod + CResult[shared_ptr[CArray]] FromArraysAndType" FromArrays"( + shared_ptr[CDataType], + const CArray& offsets, + const CArray& sizes, + const CArray& values, + CMemoryPool* pool, + shared_ptr[CBuffer] null_bitmap, + ) + + const int32_t* raw_value_offsets() + const int32_t* raw_value_sizes() + int32_t value_offset(int i) + int32_t value_length(int i) + shared_ptr[CArray] values() + shared_ptr[CArray] offsets() + shared_ptr[CArray] sizes() + shared_ptr[CDataType] value_type() cdef cppclass CLargeListViewArray" arrow::LargeListViewArray"(CArray): - pass + @staticmethod + CResult[shared_ptr[CArray]] FromArrays( + const CArray& offsets, + const CArray& sizes, + const CArray& values, + CMemoryPool* pool, + shared_ptr[CBuffer] null_bitmap, + ) + + @staticmethod + CResult[shared_ptr[CArray]] FromArraysAndType" FromArrays"( + shared_ptr[CDataType], + const CArray& offsets, + const CArray& sizes, + const CArray& values, + CMemoryPool* pool, + shared_ptr[CBuffer] null_bitmap, + ) + + int64_t value_offset(int i) + int64_t value_length(int i) + shared_ptr[CArray] values() + shared_ptr[CArray] offsets() + shared_ptr[CArray] sizes() + shared_ptr[CDataType] value_type() cdef cppclass CMapArray" arrow::MapArray"(CArray): @staticmethod diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 63f75747db3..48350212c20 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -435,15 +435,11 @@ cdef class LargeListArray(BaseListArray): pass -cdef class BaseListViewArray(Array): +cdef class ListViewArray(Array): pass -cdef class ListViewArray(BaseListViewArray): - pass - - -cdef class LargeListViewArray(BaseListViewArray): +cdef class LargeListViewArray(Array): pass diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index f851d4e0b6c..83e39394225 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -3573,3 +3573,29 @@ def test_run_end_encoded_from_buffers(): with pytest.raises(ValueError): pa.RunEndEncodedArray.from_buffers(ree_type, length, buffers, 1, offset, children) + + +@pytest.mark.parametrize(('list_array_type'), + [pa.ListViewArray, pa.LargeListViewArray]) +def test_list_view_from_arrays(list_array_type): + values = [[1, 2], [3, 4, 5], [6, None, 7], [8]] + offsets = [0, 0, 1, 2, 3] + sizes = [2, 0, 3, 3, 1] + array = list_array_type.from_arrays(offsets, sizes, values) + + assert array.values.to_pylist() == values + assert array.values.values.to_pylist() == [1, 2, 3, 4, 5, 6, None, 7, 8] + assert array.offsets.to_pylist() == offsets + assert array.sizes.to_pylist() == sizes + + # test out of order offsets with overlapping values + values = [1, 2, 3, 4] + offsets = [2, 1, 0] + sizes = [2, 2, 2] + array = list_array_type.from_arrays(offsets, sizes, values) + + assert array.to_pylist() == [[3, 4], [2, 3], [1, 2]] + assert array.values.to_pylist() == values + assert array.offsets.to_pylist() == offsets + assert array.sizes.to_pylist() == sizes + diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index 75204a6d6be..0add5786088 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -612,7 +612,7 @@ def test_list_view_type(): pa.list_view(None) -def test_large_list_type(): +def test_large_list_view_type(): ty = pa.large_list_view(pa.utf8()) assert isinstance(ty, pa.LargeListViewType) assert ty.value_type == pa.utf8() From e2bd20258cc1d9e8b5100b16b0af93c9ce9b7999 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Wed, 31 Jan 2024 17:32:46 -0500 Subject: [PATCH 10/18] Fix linter and remove unnecessary whitespace edit --- python/pyarrow/src/arrow/python/python_to_arrow.cc | 1 + python/pyarrow/tests/test_array.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index fee9c73863b..3c4d59d6594 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -1227,6 +1227,7 @@ Result> ConvertPySequence(PyObject* obj, PyObject* options.strict = true; } DCHECK_GE(size, 0); + ARROW_ASSIGN_OR_RAISE(auto converter, (MakeConverter( options.type, options, pool))); if (converter->may_overflow()) { diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 83e39394225..d85fe253277 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -3598,4 +3598,3 @@ def test_list_view_from_arrays(list_array_type): assert array.values.to_pylist() == values assert array.offsets.to_pylist() == offsets assert array.sizes.to_pylist() == sizes - From 0bdf95ab1e887c5ef7ab82d7eea26d918550d6ea Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Wed, 31 Jan 2024 22:10:35 -0500 Subject: [PATCH 11/18] Fix docstrings --- python/pyarrow/array.pxi | 62 +++++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 27 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 89d26c763f3..cc443c6e0fb 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2570,19 +2570,23 @@ cdef class ListViewArray(Array): The values include null elements from sub-lists: >>> import pyarrow as pa - >>> values = [[1, 2], None, [3, 4, None, 6]] + >>> values = [[1, 2], [3, 4, None, 6]] >>> offsets = [0, None, 2] >>> sizes = [2, 0, 4] - >>> array = pa.ListViewArray(offsets, sizes, values) + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) >>> array.values - + [ - 1, - 2, - 3, - 4, - null, - 6 + [ + 1, + 2 + ], + [ + 3, + 4, + null, + 6 + ] ] """ cdef CListViewArray* arr = self.ap @@ -2604,10 +2608,10 @@ cdef class ListViewArray(Array): Examples -------- >>> import pyarrow as pa - >>> values = [[1, 2], None, [3, 4, None, 6]] + >>> values = [[1, 2], [3, 4, None, 6]] >>> offsets = [0, None, 2] >>> sizes = [2, 0, 4] - >>> array = pa.ListViewArray(offsets, sizes, values) + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) >>> array.offsets [ @@ -2634,16 +2638,16 @@ cdef class ListViewArray(Array): Examples -------- >>> import pyarrow as pa - >>> values = [[1, 2], None, [3, 4, None, 6]] + >>> values = [[1, 2], [3, 4, None, 6]] >>> offsets = [0, None, 2] >>> sizes = [2, 0, 4] - >>> array = pa.ListViewArray(offsets, sizes, values) + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) >>> array.offsets [ - 2, 0, - 4 + 0, + 2 ] """ return pyarrow_wrap_array(( self.ap).sizes()) @@ -2768,19 +2772,23 @@ cdef class LargeListViewArray(Array): The values include null elements from sub-lists: >>> import pyarrow as pa - >>> values = [[1, 2], None, [3, 4, None, 6]] + >>> values = [[1, 2], [3, 4, None, 6]] >>> offsets = [0, None, 2] >>> sizes = [2, 0, 4] >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) >>> array.values - + [ - 1, - 2, - 3, - 4, - null, - 6 + [ + 1, + 2 + ], + [ + 3, + 4, + null, + 6 + ] ] """ cdef CLargeListViewArray* arr = self.ap @@ -2803,7 +2811,7 @@ cdef class LargeListViewArray(Array): -------- >>> import pyarrow as pa - >>> values = [[1, 2], None, [3, 4, None, 6]] + >>> values = [[1, 2], [3, 4, None, 6]] >>> offsets = [0, None, 2] >>> sizes = [2, 0, 4] >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) @@ -2834,16 +2842,16 @@ cdef class LargeListViewArray(Array): -------- >>> import pyarrow as pa - >>> values = [[1, 2], None, [3, 4, None, 6]] + >>> values = [[1, 2], [3, 4, None, 6]] >>> offsets = [0, None, 2] >>> sizes = [2, 0, 4] >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) >>> array.offsets [ - 2, 0, - 4 + 0, + 2 ] """ return pyarrow_wrap_array(( self.ap).sizes()) From 3e2bba338bf6786e80bdaa7d91c1ba2fff49c86b Mon Sep 17 00:00:00 2001 From: Dane Pitkin <48041712+danepitkin@users.noreply.github.com> Date: Thu, 1 Feb 2024 10:36:39 -0500 Subject: [PATCH 12/18] Update python/pyarrow/array.pxi Co-authored-by: Joris Van den Bossche --- python/pyarrow/array.pxi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index cc443c6e0fb..8273cd10b3d 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2625,7 +2625,7 @@ cdef class ListViewArray(Array): @property def sizes(self): """ - Return the sizes offsets as an int32 array. + Return the list sizes as an int32 array. The returned array will not have a validity bitmap, so you cannot expect to pass it to `ListViewArray.from_arrays` and get back the same From 79c06984645446380b652cbc1a572024720024b5 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Thu, 1 Feb 2024 12:41:26 -0500 Subject: [PATCH 13/18] Update docstrings, update comments, fix test cases --- python/pyarrow/array.pxi | 127 ++++++++++++++++------------- python/pyarrow/tests/test_array.py | 43 +++++++++- python/pyarrow/types.pxi | 8 +- 3 files changed, 116 insertions(+), 62 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 8273cd10b3d..ba669bb88e8 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2508,9 +2508,24 @@ cdef class ListViewArray(Array): 4 ] ] - >>> # nulls in the offsets array become null lists + >>> # use a null mask to represent null values + >>> mask = pa.array([False, True, False]) + >>> pa.ListViewArray.from_arrays(offsets, sizes, values, mask=mask) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + >>> # null values can be defined in either offsets or sizes arrays + >>> # WARNING: this will result in a copy of the offsets or sizes arrays >>> offsets = pa.array([0, None, 2]) - >>> sizes = pa.array([2, 0, 2]) >>> pa.ListViewArray.from_arrays(offsets, sizes, values) [ @@ -2557,9 +2572,9 @@ cdef class ListViewArray(Array): Return the underlying array of values which backs the ListViewArray ignoring the array's offset and sizes. - If any of the list elements are null, but are backed by a - non-empty sub-list, those elements will be included in the - output. + The values array may be out of order and/or contain additional values + that are not found in the logical representation of the array. The only + guarantee is that each non-null value in the ListView Array is contiguous. Returns ------- @@ -2570,23 +2585,18 @@ cdef class ListViewArray(Array): The values include null elements from sub-lists: >>> import pyarrow as pa - >>> values = [[1, 2], [3, 4, None, 6]] - >>> offsets = [0, None, 2] + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] >>> sizes = [2, 0, 4] >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) >>> array.values - + [ - [ - 1, - 2 - ], - [ - 3, - 4, - null, - 6 - ] + 1, + 2, + null, + 3, + 4 ] """ cdef CListViewArray* arr = self.ap @@ -2608,8 +2618,8 @@ cdef class ListViewArray(Array): Examples -------- >>> import pyarrow as pa - >>> values = [[1, 2], [3, 4, None, 6]] - >>> offsets = [0, None, 2] + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] >>> sizes = [2, 0, 4] >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) >>> array.offsets @@ -2617,7 +2627,7 @@ cdef class ListViewArray(Array): [ 0, 0, - 2 + 1 ] """ return pyarrow_wrap_array(( self.ap).offsets()) @@ -2638,16 +2648,16 @@ cdef class ListViewArray(Array): Examples -------- >>> import pyarrow as pa - >>> values = [[1, 2], [3, 4, None, 6]] - >>> offsets = [0, None, 2] + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] >>> sizes = [2, 0, 4] >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) - >>> array.offsets + >>> array.sizes [ + 2, 0, - 0, - 2 + 4 ] """ return pyarrow_wrap_array(( self.ap).sizes()) @@ -2701,9 +2711,24 @@ cdef class LargeListViewArray(Array): 4 ] ] - >>> # nulls in the offsets array become null lists + >>> # use a null mask to represent null values + >>> mask = pa.array([False, True, False]) + >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values, mask=mask) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + >>> # null values can be defined in either offsets or sizes arrays + >>> # WARNING: this will result in a copy of the offsets or sizes arrays >>> offsets = pa.array([0, None, 2]) - >>> sizes = pa.array([2, 0, 2]) >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values) [ @@ -2751,12 +2776,9 @@ cdef class LargeListViewArray(Array): Return the underlying array of values which backs the LargeListArray ignoring the array's offset. - If any of the list elements are null, but are backed by a - non-empty sub-list, those elements will be included in the - output. - - Compare with :meth:`flatten`, which returns only the non-null - values taking into consideration the array's offset. + The values array may be out of order and/or contain additional values + that are not found in the logical representation of the array. The only + guarantee is that each non-null value in the ListView Array is contiguous. Returns ------- @@ -2772,23 +2794,18 @@ cdef class LargeListViewArray(Array): The values include null elements from sub-lists: >>> import pyarrow as pa - >>> values = [[1, 2], [3, 4, None, 6]] - >>> offsets = [0, None, 2] + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] >>> sizes = [2, 0, 4] >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) >>> array.values - + [ - [ - 1, - 2 - ], - [ - 3, - 4, - null, - 6 - ] + 1, + 2, + null, + 3, + 4 ] """ cdef CLargeListViewArray* arr = self.ap @@ -2811,8 +2828,8 @@ cdef class LargeListViewArray(Array): -------- >>> import pyarrow as pa - >>> values = [[1, 2], [3, 4, None, 6]] - >>> offsets = [0, None, 2] + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] >>> sizes = [2, 0, 4] >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) >>> array.offsets @@ -2820,7 +2837,7 @@ cdef class LargeListViewArray(Array): [ 0, 0, - 2 + 1 ] """ return pyarrow_wrap_array(( self.ap).offsets()) @@ -2842,16 +2859,16 @@ cdef class LargeListViewArray(Array): -------- >>> import pyarrow as pa - >>> values = [[1, 2], [3, 4, None, 6]] - >>> offsets = [0, None, 2] + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] >>> sizes = [2, 0, 4] >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) - >>> array.offsets + >>> array.sizes [ + 2, 0, - 0, - 2 + 4 ] """ return pyarrow_wrap_array(( self.ap).sizes()) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index d85fe253277..380bdf7b91f 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -3578,13 +3578,14 @@ def test_run_end_encoded_from_buffers(): @pytest.mark.parametrize(('list_array_type'), [pa.ListViewArray, pa.LargeListViewArray]) def test_list_view_from_arrays(list_array_type): - values = [[1, 2], [3, 4, 5], [6, None, 7], [8]] - offsets = [0, 0, 1, 2, 3] - sizes = [2, 0, 3, 3, 1] + # test in order offsets, similar to ListArray representation + values = [1, 2, 3, 4, 5, 6, None, 7] + offsets = [0, 2, 4, 6] + sizes = [2, 2, 2, 2] array = list_array_type.from_arrays(offsets, sizes, values) + assert array.to_pylist() == [[1, 2], [3, 4], [5, 6], [None, 7]] assert array.values.to_pylist() == values - assert array.values.values.to_pylist() == [1, 2, 3, 4, 5, 6, None, 7, 8] assert array.offsets.to_pylist() == offsets assert array.sizes.to_pylist() == sizes @@ -3598,3 +3599,37 @@ def test_list_view_from_arrays(list_array_type): assert array.values.to_pylist() == values assert array.offsets.to_pylist() == offsets assert array.sizes.to_pylist() == sizes + + # test null offsets and empty list values + values = [] + offsets = [0, None] + sizes = [0, 0] + array = list_array_type.from_arrays(offsets, sizes, values) + + assert array.to_pylist() == [[], None] + assert array.values.to_pylist() == values + assert array.offsets.to_pylist() == [0, 0] + assert array.sizes.to_pylist() == sizes + + # test null sizes and empty list values + values = [] + offsets = [0, 0] + sizes = [None, 0] + array = list_array_type.from_arrays(offsets, sizes, values) + + assert array.to_pylist() == [None, []] + assert array.values.to_pylist() == values + assert array.offsets.to_pylist() == offsets + assert array.sizes.to_pylist() == [0, 0] + + # test null bitmask + values = [1, 2] + offsets = [0, 0, 1] + sizes = [1, 0, 1] + mask = pa.array([False, True, False]) + array = list_array_type.from_arrays(offsets, sizes, values, mask=mask) + + assert array.to_pylist() == [[1], None, [2]] + assert array.values.to_pylist() == values + assert array.offsets.to_pylist() == offsets + assert array.sizes.to_pylist() == sizes diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index dccf5364b17..6c023f1ce44 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -4627,6 +4627,9 @@ cpdef ListViewType list_view(value_type): """ Create ListViewType instance from child data type or field. + This data type may not be supported by all Arrow implementations + because it is an alternative to the ListType. + Parameters ---------- value_type : DataType or Field @@ -4662,9 +4665,8 @@ cpdef LargeListViewType large_list_view(value_type): """ Create LargeListViewType instance from child data type or field. - This data type may not be supported by all Arrow implementations. - Unless you need to represent data larger than 2**31 elements, you should - prefer list_view(). + This data type may not be supported by all Arrow implementations + because it is an alternative to the ListType. Parameters ---------- From 81311234e88ceae66a6df8e2bd61830349a7dbba Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Tue, 6 Feb 2024 11:51:57 -0500 Subject: [PATCH 14/18] Add flatten() api --- python/pyarrow/array.pxi | 94 +++++++++++++++++++++++++++- python/pyarrow/includes/libarrow.pxd | 8 +++ python/pyarrow/tests/test_array.py | 12 ++++ 3 files changed, 111 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index ba669bb88e8..ad65c7760b9 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2544,7 +2544,7 @@ cdef class ListViewArray(Array): Array _offsets, _sizes, _values shared_ptr[CArray] out shared_ptr[CBuffer] c_mask - cdef CMemoryPool* cpool = maybe_unbox_memory_pool(pool) + CMemoryPool* cpool = maybe_unbox_memory_pool(pool) _offsets = asarray(offsets, type='int32') _sizes = asarray(sizes, type='int32') @@ -2576,6 +2576,9 @@ cdef class ListViewArray(Array): that are not found in the logical representation of the array. The only guarantee is that each non-null value in the ListView Array is contiguous. + Compare with :meth:`flatten`, which returns only the non-null + values taking into consideration the array's offset. + Returns ------- values : Array @@ -2662,6 +2665,48 @@ cdef class ListViewArray(Array): """ return pyarrow_wrap_array(( self.ap).sizes()) + def flatten(self, MemoryPool pool=None): + """ + Unnest this ListViewArray by one level. + + The returned Array is logically a concatenation of all the sub-lists + in this Array. + + Note that this method is different from ``self.values`` in that + it takes care of the slicing offset as well as null elements backed + by non-empty sub-lists. + + Returns + ------- + result : Array + + Examples + -------- + + >>> import pyarrow as pa + >>> values = [1, 2, 3, 4] + >>> offsets = [2, 1, 0] + >>> sizes = [2, 2, 2] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array.flatten() + + [ + 3, + 4, + 2, + 3, + 1, + 2 + ] + """ + cdef CMemoryPool* cpool = maybe_unbox_memory_pool(pool) + with nogil: + out = GetResultValue(( self.ap).Flatten(cpool)) + cdef Array result = pyarrow_wrap_array(out) + result.validate() + return result + + cdef class LargeListViewArray(Array): """ Concrete class for Arrow arrays of a large list view data type. @@ -2747,8 +2792,7 @@ cdef class LargeListViewArray(Array): Array _offsets, _sizes, _values shared_ptr[CArray] out shared_ptr[CBuffer] c_mask - - cdef CMemoryPool* cpool = maybe_unbox_memory_pool(pool) + CMemoryPool* cpool = maybe_unbox_memory_pool(pool) _offsets = asarray(offsets, type='int64') _sizes = asarray(sizes, type='int64') @@ -2780,6 +2824,9 @@ cdef class LargeListViewArray(Array): that are not found in the logical representation of the array. The only guarantee is that each non-null value in the ListView Array is contiguous. + Compare with :meth:`flatten`, which returns only the non-null + values taking into consideration the array's offset. + Returns ------- values : Array @@ -2873,6 +2920,47 @@ cdef class LargeListViewArray(Array): """ return pyarrow_wrap_array(( self.ap).sizes()) + def flatten(self, MemoryPool pool=None): + """ + Unnest this LargeListViewArray by one level. + + The returned Array is logically a concatenation of all the sub-lists + in this Array. + + Note that this method is different from ``self.values`` in that + it takes care of the slicing offset as well as null elements backed + by non-empty sub-lists. + + Returns + ------- + result : Array + + Examples + -------- + + >>> import pyarrow as pa + >>> values = [1, 2, 3, 4] + >>> offsets = [2, 1, 0] + >>> sizes = [2, 2, 2] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array.flatten() + + [ + 3, + 4, + 2, + 3, + 1, + 2 + ] + """ + cdef CMemoryPool* cpool = maybe_unbox_memory_pool(pool) + with nogil: + out = GetResultValue(( self.ap).Flatten(cpool)) + cdef Array result = pyarrow_wrap_array(out) + result.validate() + return result + cdef class MapArray(ListArray): """ diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 3b1258e2f3a..f0597972cd2 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -730,6 +730,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CBuffer] null_bitmap, ) + CResult[shared_ptr[CArray]] Flatten( + CMemoryPool* pool + ) + const int32_t* raw_value_offsets() const int32_t* raw_value_sizes() int32_t value_offset(int i) @@ -759,6 +763,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CBuffer] null_bitmap, ) + CResult[shared_ptr[CArray]] Flatten( + CMemoryPool* pool + ) + int64_t value_offset(int i) int64_t value_length(int i) shared_ptr[CArray] values() diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 380bdf7b91f..403d36369ee 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -3633,3 +3633,15 @@ def test_list_view_from_arrays(list_array_type): assert array.values.to_pylist() == values assert array.offsets.to_pylist() == offsets assert array.sizes.to_pylist() == sizes + + +@pytest.mark.parametrize(('list_array_type'), + [pa.ListViewArray, pa.LargeListViewArray]) +def test_list_view_flatten(list_array_type): + values = [1, 2, 3, 4] + offsets = [3, 2, 1, 0] + sizes = [1, 1, 1, 1] + array = list_array_type.from_arrays(offsets, sizes, values) + + assert array.flatten().to_pylist() == [4, 3, 2, 1] + From 5e87ec19efb0341e4c4b1124279e7da217930f09 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Tue, 6 Feb 2024 12:18:36 -0500 Subject: [PATCH 15/18] Fix linter --- python/pyarrow/tests/test_array.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 403d36369ee..bd9ae214b04 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -3644,4 +3644,3 @@ def test_list_view_flatten(list_array_type): array = list_array_type.from_arrays(offsets, sizes, values) assert array.flatten().to_pylist() == [4, 3, 2, 1] - From 2990362f01e122e4853f392e46cdf4ed1c6d7e65 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Tue, 6 Feb 2024 13:28:08 -0500 Subject: [PATCH 16/18] Fix docstrings --- python/pyarrow/array.pxi | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index ad65c7760b9..e3a2937f192 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2614,6 +2614,10 @@ cdef class ListViewArray(Array): expect to pass it to `ListViewArray.from_arrays` and get back the same list array if the original one has nulls. + Parameters + ---------- + pool : MemoryPool, optional + Returns ------- offsets : Int32Array @@ -2931,6 +2935,10 @@ cdef class LargeListViewArray(Array): it takes care of the slicing offset as well as null elements backed by non-empty sub-lists. + Parameters + ---------- + pool : MemoryPool, optional + Returns ------- result : Array From 6a53242e46bac9785942724dc63daeb60296fe8e Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Tue, 6 Feb 2024 16:22:54 -0500 Subject: [PATCH 17/18] Fix docstrings properly.. --- python/pyarrow/array.pxi | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index e3a2937f192..29e173f5631 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2614,10 +2614,6 @@ cdef class ListViewArray(Array): expect to pass it to `ListViewArray.from_arrays` and get back the same list array if the original one has nulls. - Parameters - ---------- - pool : MemoryPool, optional - Returns ------- offsets : Int32Array @@ -2669,7 +2665,7 @@ cdef class ListViewArray(Array): """ return pyarrow_wrap_array(( self.ap).sizes()) - def flatten(self, MemoryPool pool=None): + def flatten(self, pool=None): """ Unnest this ListViewArray by one level. @@ -2680,6 +2676,10 @@ cdef class ListViewArray(Array): it takes care of the slicing offset as well as null elements backed by non-empty sub-lists. + Parameters + ---------- + pool : MemoryPool, optional + Returns ------- result : Array @@ -2924,7 +2924,7 @@ cdef class LargeListViewArray(Array): """ return pyarrow_wrap_array(( self.ap).sizes()) - def flatten(self, MemoryPool pool=None): + def flatten(self, pool=None): """ Unnest this LargeListViewArray by one level. From 9bf1673c1ae8483e878dd370cef432fec4acf0e5 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Wed, 7 Feb 2024 11:21:22 -0500 Subject: [PATCH 18/18] Improve docstring examples --- python/pyarrow/array.pxi | 78 +++++++++++++++++++++++++++++++++++----- 1 file changed, 70 insertions(+), 8 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 29e173f5631..3a319ab4544 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2577,7 +2577,7 @@ cdef class ListViewArray(Array): guarantee is that each non-null value in the ListView Array is contiguous. Compare with :meth:`flatten`, which returns only the non-null - values taking into consideration the array's offset. + values taking into consideration the array's order and offset. Returns ------- @@ -2592,6 +2592,21 @@ cdef class ListViewArray(Array): >>> offsets = [0, 0, 1] >>> sizes = [2, 0, 4] >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 1, + 2 + ], + [], + [ + 2, + null, + 3, + 4 + ] + ] >>> array.values [ @@ -2665,7 +2680,7 @@ cdef class ListViewArray(Array): """ return pyarrow_wrap_array(( self.ap).sizes()) - def flatten(self, pool=None): + def flatten(self, memory_pool=None): """ Unnest this ListViewArray by one level. @@ -2678,7 +2693,7 @@ cdef class ListViewArray(Array): Parameters ---------- - pool : MemoryPool, optional + memory_pool : MemoryPool, optional Returns ------- @@ -2692,6 +2707,22 @@ cdef class ListViewArray(Array): >>> offsets = [2, 1, 0] >>> sizes = [2, 2, 2] >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 3, + 4 + ], + [ + 2, + 3 + ], + [ + 1, + 2 + ] + ] >>> array.flatten() [ @@ -2703,7 +2734,7 @@ cdef class ListViewArray(Array): 2 ] """ - cdef CMemoryPool* cpool = maybe_unbox_memory_pool(pool) + cdef CMemoryPool* cpool = maybe_unbox_memory_pool(memory_pool) with nogil: out = GetResultValue(( self.ap).Flatten(cpool)) cdef Array result = pyarrow_wrap_array(out) @@ -2829,7 +2860,7 @@ cdef class LargeListViewArray(Array): guarantee is that each non-null value in the ListView Array is contiguous. Compare with :meth:`flatten`, which returns only the non-null - values taking into consideration the array's offset. + values taking into consideration the array's order and offset. Returns ------- @@ -2849,6 +2880,21 @@ cdef class LargeListViewArray(Array): >>> offsets = [0, 0, 1] >>> sizes = [2, 0, 4] >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 1, + 2 + ], + [], + [ + 2, + null, + 3, + 4 + ] + ] >>> array.values [ @@ -2924,7 +2970,7 @@ cdef class LargeListViewArray(Array): """ return pyarrow_wrap_array(( self.ap).sizes()) - def flatten(self, pool=None): + def flatten(self, memory_pool=None): """ Unnest this LargeListViewArray by one level. @@ -2937,7 +2983,7 @@ cdef class LargeListViewArray(Array): Parameters ---------- - pool : MemoryPool, optional + memory_pool : MemoryPool, optional Returns ------- @@ -2951,6 +2997,22 @@ cdef class LargeListViewArray(Array): >>> offsets = [2, 1, 0] >>> sizes = [2, 2, 2] >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 3, + 4 + ], + [ + 2, + 3 + ], + [ + 1, + 2 + ] + ] >>> array.flatten() [ @@ -2962,7 +3024,7 @@ cdef class LargeListViewArray(Array): 2 ] """ - cdef CMemoryPool* cpool = maybe_unbox_memory_pool(pool) + cdef CMemoryPool* cpool = maybe_unbox_memory_pool(memory_pool) with nogil: out = GetResultValue(( self.ap).Flatten(cpool)) cdef Array result = pyarrow_wrap_array(out)