From fa61d91d4861f25084bd517549cad18abf205caf Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Thu, 22 Aug 2024 21:24:06 +0000 Subject: [PATCH 01/18] GH-43728:[Python] ChunkedArray fails gracefully on non-cpu devices --- cpp/src/arrow/chunked_array.cc | 12 ++++++++++++ cpp/src/arrow/chunked_array.h | 11 +++++++++++ python/pyarrow/includes/libarrow.pxd | 2 ++ python/pyarrow/lib.pxd | 1 + python/pyarrow/table.pxi | 22 ++++++++++++++++++++++ 5 files changed, 48 insertions(+) diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc index dd6aa51534f..a30ece2f5e4 100644 --- a/cpp/src/arrow/chunked_array.cc +++ b/cpp/src/arrow/chunked_array.cc @@ -49,6 +49,7 @@ ChunkedArray::ChunkedArray(ArrayVector chunks, std::shared_ptr type) type_(std::move(type)), length_(0), null_count_(0), + device_type_(DeviceAllocationType::kCPU), chunk_resolver_{chunks_} { if (type_ == nullptr) { ARROW_CHECK_GT(chunks_.size(), 0) @@ -56,9 +57,14 @@ ChunkedArray::ChunkedArray(ArrayVector chunks, std::shared_ptr type) type_ = chunks_[0]->type(); } + if (chunks_.size() > 0) { + device_type_ = chunks[0]->device_type(); + } + for (const auto& chunk : chunks_) { length_ += chunk->length(); null_count_ += chunk->null_count(); + DCHECK_EQ(device_type_, chunk->device_type()); } } @@ -106,6 +112,9 @@ bool ChunkedArray::Equals(const ChunkedArray& other, const EqualOptions& opts) c if (null_count_ != other.null_count()) { return false; } + if (device_type_ != other.device_type()) { + return false; + } // We cannot toggle check_metadata here yet, so we don't check it if (!type_->Equals(*other.type_, /*check_metadata=*/false)) { return false; @@ -161,6 +170,9 @@ bool ChunkedArray::ApproxEquals(const ChunkedArray& other, if (null_count_ != other.null_count()) { return false; } + if (device_type_ != other.device_type()) { + return false; + } // We cannot toggle check_metadata here yet, so we don't check it if (!type_->Equals(*other.type_, /*check_metadata=*/false)) { return false; diff 
--git a/cpp/src/arrow/chunked_array.h b/cpp/src/arrow/chunked_array.h index c65b6cb6e22..8c9647271c5 100644 --- a/cpp/src/arrow/chunked_array.h +++ b/cpp/src/arrow/chunked_array.h @@ -26,6 +26,7 @@ #include "arrow/chunk_resolver.h" #include "arrow/compare.h" #include "arrow/device_allocation_type_set.h" +#include "arrow/device.h" #include "arrow/result.h" #include "arrow/status.h" #include "arrow/type_fwd.h" @@ -190,11 +191,21 @@ class ARROW_EXPORT ChunkedArray { /// \return Status Status ValidateFull() const; + /// \brief Return the device_type that this chunked array's data is allocated + /// on. + /// + /// This just delegates to calling device_type on the underlying ArrayData + /// object which backs this Array. + /// + /// \return DeviceAllocationType + DeviceAllocationType device_type() const { return device_type_; } + protected: ArrayVector chunks_; std::shared_ptr type_; int64_t length_; int64_t null_count_; + DeviceAllocationType device_type_; private: template diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index c2346750a19..8d6455c0c44 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -983,6 +983,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: CResult[vector[shared_ptr[CChunkedArray]]] Flatten(CMemoryPool* pool) + CDeviceAllocationType device_type() + CStatus Validate() const CStatus ValidateFull() const diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index ad05ea31c91..ba989f12111 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -513,6 +513,7 @@ cdef class ChunkedArray(_PandasConvertible): cdef void init(self, const shared_ptr[CChunkedArray]& chunked_array) cdef getitem(self, int64_t i) + cdef void _assert_cpu(self) except * cdef class _Tabular(_PandasConvertible): diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 9bb86236659..2012edaa5c1 100644 --- a/python/pyarrow/table.pxi +++ 
b/python/pyarrow/table.pxi @@ -1410,6 +1410,28 @@ cdef class ChunkedArray(_PandasConvertible): self.init(c_chunked_array) return self + @property + def device_type(self): + """ + The device type where the chunks in the ChunkedArray reside. + + Returns + ------- + DeviceAllocationType + """ + return _wrap_device_allocation_type(self.sp_chunked_array.get().device_type()) + + @property + def is_cpu(self): + """ + Whether the ChunkedArrays's chunks are CPU-accessible. + """ + return self.device_type == DeviceAllocationType.CPU + + cdef void _assert_cpu(self) except *: + if self.sp_chunked_array.get().device_type() != CDeviceAllocationType_kCPU: + raise NotImplementedError("Implemented only for data on CPU device") + def chunked_array(arrays, type=None): """ From ebdc7ae52d54faa28df6c3bb756e544dcbe29d69 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Thu, 22 Aug 2024 21:29:36 +0000 Subject: [PATCH 02/18] Update C++ docstring --- cpp/src/arrow/chunked_array.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/arrow/chunked_array.h b/cpp/src/arrow/chunked_array.h index 8c9647271c5..7393bf7a42a 100644 --- a/cpp/src/arrow/chunked_array.h +++ b/cpp/src/arrow/chunked_array.h @@ -194,8 +194,7 @@ class ARROW_EXPORT ChunkedArray { /// \brief Return the device_type that this chunked array's data is allocated /// on. /// - /// This just delegates to calling device_type on the underlying ArrayData - /// object which backs this Array. + /// This just delegates to calling device_type on the underlying chunks. /// /// \return DeviceAllocationType DeviceAllocationType device_type() const { return device_type_; } From 414532f72408ca1070cbe7ac18d69f3ee4f4a286 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Fri, 23 Aug 2024 14:41:22 +0000 Subject: [PATCH 03/18] Revert "Update C++ docstring" This reverts commit d91cfabbcc374b3fd30e263284a2168c7c7cbf71. 
--- cpp/src/arrow/chunked_array.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/chunked_array.h b/cpp/src/arrow/chunked_array.h index 7393bf7a42a..8c9647271c5 100644 --- a/cpp/src/arrow/chunked_array.h +++ b/cpp/src/arrow/chunked_array.h @@ -194,7 +194,8 @@ class ARROW_EXPORT ChunkedArray { /// \brief Return the device_type that this chunked array's data is allocated /// on. /// - /// This just delegates to calling device_type on the underlying chunks. + /// This just delegates to calling device_type on the underlying ArrayData + /// object which backs this Array. /// /// \return DeviceAllocationType DeviceAllocationType device_type() const { return device_type_; } From f680af0bdc4f32cc0ea56b4b20ec8ab400827e59 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Fri, 23 Aug 2024 14:41:38 +0000 Subject: [PATCH 04/18] Revert "GH-43728:[Python] ChunkedArray fails gracefully on non-cpu devices" This reverts commit 1fcdb1f790f9d34b4d63e33f8a162b0346bc2ab5. --- cpp/src/arrow/chunked_array.cc | 12 ------------ cpp/src/arrow/chunked_array.h | 10 ---------- python/pyarrow/includes/libarrow.pxd | 2 -- python/pyarrow/lib.pxd | 1 - python/pyarrow/table.pxi | 22 ---------------------- 5 files changed, 47 deletions(-) diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc index a30ece2f5e4..dd6aa51534f 100644 --- a/cpp/src/arrow/chunked_array.cc +++ b/cpp/src/arrow/chunked_array.cc @@ -49,7 +49,6 @@ ChunkedArray::ChunkedArray(ArrayVector chunks, std::shared_ptr type) type_(std::move(type)), length_(0), null_count_(0), - device_type_(DeviceAllocationType::kCPU), chunk_resolver_{chunks_} { if (type_ == nullptr) { ARROW_CHECK_GT(chunks_.size(), 0) @@ -57,14 +56,9 @@ ChunkedArray::ChunkedArray(ArrayVector chunks, std::shared_ptr type) type_ = chunks_[0]->type(); } - if (chunks_.size() > 0) { - device_type_ = chunks[0]->device_type(); - } - for (const auto& chunk : chunks_) { length_ += chunk->length(); null_count_ += 
chunk->null_count(); - DCHECK_EQ(device_type_, chunk->device_type()); } } @@ -112,9 +106,6 @@ bool ChunkedArray::Equals(const ChunkedArray& other, const EqualOptions& opts) c if (null_count_ != other.null_count()) { return false; } - if (device_type_ != other.device_type()) { - return false; - } // We cannot toggle check_metadata here yet, so we don't check it if (!type_->Equals(*other.type_, /*check_metadata=*/false)) { return false; @@ -170,9 +161,6 @@ bool ChunkedArray::ApproxEquals(const ChunkedArray& other, if (null_count_ != other.null_count()) { return false; } - if (device_type_ != other.device_type()) { - return false; - } // We cannot toggle check_metadata here yet, so we don't check it if (!type_->Equals(*other.type_, /*check_metadata=*/false)) { return false; diff --git a/cpp/src/arrow/chunked_array.h b/cpp/src/arrow/chunked_array.h index 8c9647271c5..a561b9b3ce2 100644 --- a/cpp/src/arrow/chunked_array.h +++ b/cpp/src/arrow/chunked_array.h @@ -191,21 +191,11 @@ class ARROW_EXPORT ChunkedArray { /// \return Status Status ValidateFull() const; - /// \brief Return the device_type that this chunked array's data is allocated - /// on. - /// - /// This just delegates to calling device_type on the underlying ArrayData - /// object which backs this Array. 
- /// - /// \return DeviceAllocationType - DeviceAllocationType device_type() const { return device_type_; } - protected: ArrayVector chunks_; std::shared_ptr type_; int64_t length_; int64_t null_count_; - DeviceAllocationType device_type_; private: template diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 8d6455c0c44..c2346750a19 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -983,8 +983,6 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: CResult[vector[shared_ptr[CChunkedArray]]] Flatten(CMemoryPool* pool) - CDeviceAllocationType device_type() - CStatus Validate() const CStatus ValidateFull() const diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index ba989f12111..ad05ea31c91 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -513,7 +513,6 @@ cdef class ChunkedArray(_PandasConvertible): cdef void init(self, const shared_ptr[CChunkedArray]& chunked_array) cdef getitem(self, int64_t i) - cdef void _assert_cpu(self) except * cdef class _Tabular(_PandasConvertible): diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 2012edaa5c1..9bb86236659 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -1410,28 +1410,6 @@ cdef class ChunkedArray(_PandasConvertible): self.init(c_chunked_array) return self - @property - def device_type(self): - """ - The device type where the chunks in the ChunkedArray reside. - - Returns - ------- - DeviceAllocationType - """ - return _wrap_device_allocation_type(self.sp_chunked_array.get().device_type()) - - @property - def is_cpu(self): - """ - Whether the ChunkedArrays's chunks are CPU-accessible. 
- """ - return self.device_type == DeviceAllocationType.CPU - - cdef void _assert_cpu(self) except *: - if self.sp_chunked_array.get().device_type() != CDeviceAllocationType_kCPU: - raise NotImplementedError("Implemented only for data on CPU device") - def chunked_array(arrays, type=None): """ From 7ef98d5d61f9448a40ec1d1775c43efa1f00c948 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Fri, 23 Aug 2024 15:09:28 +0000 Subject: [PATCH 05/18] Implement ChunkedArray::IsCpu() instead --- cpp/src/arrow/chunked_array.cc | 10 ++++++++++ cpp/src/arrow/chunked_array.h | 3 +++ python/pyarrow/includes/libarrow.pxd | 2 ++ python/pyarrow/table.pxi | 9 +++++++++ python/pyarrow/tests/test_table.py | 15 +++++++++++++++ 5 files changed, 39 insertions(+) diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc index dd6aa51534f..a159d588e20 100644 --- a/cpp/src/arrow/chunked_array.cc +++ b/cpp/src/arrow/chunked_array.cc @@ -28,6 +28,7 @@ #include "arrow/array/util.h" #include "arrow/array/validate.h" #include "arrow/device_allocation_type_set.h" +#include "arrow/device.h" #include "arrow/pretty_print.h" #include "arrow/status.h" #include "arrow/type.h" @@ -299,6 +300,15 @@ Status ChunkedArray::ValidateFull() const { return ValidateChunks(chunks_, /*full_validation=*/true); } +bool ChunkedArray::IsCpu() const { + for (const auto& chunk : chunks_) { + if (chunk->device_type() != DeviceAllocationType::kCPU) { + return false; + } + } + return true; +} + namespace internal { bool MultipleChunkIterator::Next(std::shared_ptr* next_left, diff --git a/cpp/src/arrow/chunked_array.h b/cpp/src/arrow/chunked_array.h index a561b9b3ce2..f3c002c7720 100644 --- a/cpp/src/arrow/chunked_array.h +++ b/cpp/src/arrow/chunked_array.h @@ -191,6 +191,9 @@ class ARROW_EXPORT ChunkedArray { /// \return Status Status ValidateFull() const; + /// \brief Determine if all chunks are located on the CPU + bool IsCpu() const; + protected: ArrayVector chunks_; std::shared_ptr type_; diff --git 
a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index c2346750a19..70d67ee759e 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -983,6 +983,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: CResult[vector[shared_ptr[CChunkedArray]]] Flatten(CMemoryPool* pool) + c_bool IsCpu() const + CStatus Validate() const CStatus ValidateFull() const diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 9bb86236659..04bdfbbf822 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -1410,6 +1410,15 @@ cdef class ChunkedArray(_PandasConvertible): self.init(c_chunked_array) return self + def is_cpu(self): + """ + Whether all chunks in the ChunkedArray are CPU-accessible. + """ + cdef c_bool result + with nogil: + result = self.chunked_array.IsCpu() + return result + def chunked_array(arrays, type=None): """ diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index 57765985505..29cb35882ec 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -3399,6 +3399,16 @@ def cuda_arrays(cuda_context, cpu_arrays): return [arr.copy_to(cuda_context.memory_manager) for arr in cpu_arrays] +@pytest.fixture +def cuda_chunked_arrays(cuda_arrays): + return pa.chunked_array(cuda_arrays) + + +@pytest.fixture +def cpu_chunked_arrays(cpu_arrays): + return pa.chunked_array(cpu_arrays) + + @pytest.fixture def cpu_recordbatch(cpu_arrays, schema): return pa.record_batch(cpu_arrays, schema=schema) @@ -3409,6 +3419,11 @@ def cuda_recordbatch(cuda_context, cpu_recordbatch): return cpu_recordbatch.copy_to(cuda_context.memory_manager) +def test_chunked_array_non_cpu(cuda_context, cpu_chunked_arrays, cuda_chunked_arrays): + assert cpu_chunked_arrays.is_cpu() is True + assert cuda_chunked_arrays.is_cpu() is False + + def verify_cuda_recordbatch(batch, expected_schema): batch.validate() assert batch.device_type == 
pa.DeviceAllocationType.CUDA From 6e9c6af9f262674b6b68fe050cbb3445f4f9e84f Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Fri, 23 Aug 2024 21:05:23 +0000 Subject: [PATCH 06/18] Add tests --- python/pyarrow/table.pxi | 33 +++++- python/pyarrow/tests/test_table.py | 156 +++++++++++++++++++++++++++-- 2 files changed, 179 insertions(+), 10 deletions(-) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 04bdfbbf822..52bedfaed54 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -69,6 +69,7 @@ cdef class ChunkedArray(_PandasConvertible): self.chunked_array = chunked_array.get() def __reduce__(self): + self._assert_cpu() return chunked_array, (self.chunks, self.type) @property @@ -198,6 +199,7 @@ cdef class ChunkedArray(_PandasConvertible): ArrowInvalid """ if full: + self._assert_cpu() with nogil: check_status(self.sp_chunked_array.get().ValidateFull()) else: @@ -299,13 +301,14 @@ cdef class ChunkedArray(_PandasConvertible): ------- value : Scalar (index) or ChunkedArray (slice) """ - + self._assert_cpu() if isinstance(key, slice): return _normalize_slice(self, key) return self.getitem(_normalize_index(key, self.chunked_array.length())) cdef getitem(self, int64_t i): + self._assert_cpu() return Scalar.wrap(GetResultValue(self.chunked_array.GetScalar(i))) def is_null(self, *, nan_is_null=False): @@ -338,6 +341,7 @@ cdef class ChunkedArray(_PandasConvertible): ] ] """ + self._assert_cpu() options = _pc().NullOptions(nan_is_null=nan_is_null) return _pc().call_function('is_null', [self], options) @@ -363,6 +367,7 @@ cdef class ChunkedArray(_PandasConvertible): ] ] """ + self._assert_cpu() return _pc().is_nan(self) def is_valid(self): @@ -388,6 +393,7 @@ cdef class ChunkedArray(_PandasConvertible): ] ] """ + self._assert_cpu() return _pc().is_valid(self) def __eq__(self, other): @@ -430,6 +436,7 @@ cdef class ChunkedArray(_PandasConvertible): ] ] """ + self._assert_cpu() return _pc().fill_null(self, fill_value) def equals(self, 
ChunkedArray other): @@ -458,6 +465,7 @@ cdef class ChunkedArray(_PandasConvertible): >>> n_legs.equals(animals) False """ + self._assert_cpu() if other is None: return False @@ -472,6 +480,7 @@ cdef class ChunkedArray(_PandasConvertible): return result def _to_pandas(self, options, types_mapper=None, **kwargs): + self._assert_cpu() return _array_like_to_pandas(self, options, types_mapper=types_mapper) def to_numpy(self, zero_copy_only=False): @@ -495,6 +504,7 @@ cdef class ChunkedArray(_PandasConvertible): >>> n_legs.to_numpy() array([ 2, 2, 4, 4, 5, 100]) """ + self._assert_cpu() if np is None: raise ImportError( "Cannot return a numpy.ndarray if NumPy is not present") @@ -529,6 +539,7 @@ cdef class ChunkedArray(_PandasConvertible): return values def __array__(self, dtype=None, copy=None): + self._assert_cpu() if copy is False: raise ValueError( "Unable to avoid a copy while creating a numpy array as requested " @@ -574,6 +585,7 @@ cdef class ChunkedArray(_PandasConvertible): >>> n_legs_seconds.type DurationType(duration[s]) """ + self._assert_cpu() return _pc().cast(self, target_type, safe=safe, options=options) def dictionary_encode(self, null_encoding='mask'): @@ -636,6 +648,7 @@ cdef class ChunkedArray(_PandasConvertible): ] ] """ + self._assert_cpu() options = _pc().DictionaryEncodeOptions(null_encoding) return _pc().call_function('dictionary_encode', [self], options) @@ -700,6 +713,7 @@ cdef class ChunkedArray(_PandasConvertible): >>> n_legs.type DataType(int64) """ + self._assert_cpu() cdef: vector[shared_ptr[CChunkedArray]] flattened CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) @@ -751,6 +765,7 @@ cdef class ChunkedArray(_PandasConvertible): 100 ] """ + self._assert_cpu() if self.num_chunks == 0: return array([], type=self.type) else: @@ -791,6 +806,7 @@ cdef class ChunkedArray(_PandasConvertible): 100 ] """ + self._assert_cpu() return _pc().call_function('unique', [self]) def value_counts(self): @@ -837,6 +853,7 @@ cdef class 
ChunkedArray(_PandasConvertible): 1 ] """ + self._assert_cpu() return _pc().call_function('value_counts', [self]) def slice(self, offset=0, length=None): @@ -884,6 +901,7 @@ cdef class ChunkedArray(_PandasConvertible): ] ] """ + self._assert_cpu() cdef shared_ptr[CChunkedArray] result if offset < 0: @@ -959,6 +977,7 @@ cdef class ChunkedArray(_PandasConvertible): ] ] """ + self._assert_cpu() return _pc().filter(self, mask, null_selection_behavior) def index(self, value, start=None, end=None, *, memory_pool=None): @@ -1006,6 +1025,7 @@ cdef class ChunkedArray(_PandasConvertible): >>> n_legs.index(4, start=3) """ + self._assert_cpu() return _pc().index(self, value, start, end, memory_pool=memory_pool) def take(self, object indices): @@ -1052,6 +1072,7 @@ cdef class ChunkedArray(_PandasConvertible): ] ] """ + self._assert_cpu() return _pc().take(self, indices) def drop_null(self): @@ -1091,6 +1112,7 @@ cdef class ChunkedArray(_PandasConvertible): ] ] """ + self._assert_cpu() return _pc().drop_null(self) def sort(self, order="ascending", **kwargs): @@ -1110,6 +1132,7 @@ cdef class ChunkedArray(_PandasConvertible): ------- result : ChunkedArray """ + self._assert_cpu() indices = _pc().sort_indices( self, options=_pc().SortOptions(sort_keys=[("", order)], **kwargs) @@ -1209,6 +1232,7 @@ cdef class ChunkedArray(_PandasConvertible): ] ] """ + self._assert_cpu() cdef: CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) shared_ptr[CChunkedArray] c_result @@ -1333,6 +1357,7 @@ cdef class ChunkedArray(_PandasConvertible): >>> n_legs.to_pylist() [2, 2, 4, 4, None, 100] """ + self._assert_cpu() result = [] for i in range(self.num_chunks): result += self.chunk(i).to_pylist() @@ -1354,6 +1379,7 @@ cdef class ChunkedArray(_PandasConvertible): PyCapsule A capsule containing a C ArrowArrayStream struct. 
""" + self._assert_cpu() cdef: ChunkedArray chunked ArrowArrayStream* c_stream = NULL @@ -1419,6 +1445,11 @@ cdef class ChunkedArray(_PandasConvertible): result = self.chunked_array.IsCpu() return result + def _assert_cpu(self): + if not self.is_cpu(): + raise NotImplementedError("Implemented only for data on CPU device") + + def chunked_array(arrays, type=None): """ diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index 29cb35882ec..a7d9a5e081e 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -3390,8 +3390,8 @@ def schema(): @pytest.fixture def cpu_arrays(): - return [pa.array([1, 2, 3, 4, 5], pa.int16()), - pa.array([-10, -5, 0, 1, 10], pa.int32())] + return [pa.array([1, 2, 3, 4, 5], pa.int32()), + pa.array([-10, -5, 0, None, 10], pa.int32())] @pytest.fixture @@ -3400,13 +3400,24 @@ def cuda_arrays(cuda_context, cpu_arrays): @pytest.fixture -def cuda_chunked_arrays(cuda_arrays): - return pa.chunked_array(cuda_arrays) +def cpu_chunked_array(cpu_arrays): + chunked_array = pa.chunked_array(cpu_arrays) + assert chunked_array.is_cpu() is True + return chunked_array @pytest.fixture -def cpu_chunked_arrays(cpu_arrays): - return pa.chunked_array(cpu_arrays) +def cuda_chunked_array(cuda_arrays): + chunked_array = pa.chunked_array(cuda_arrays) + assert chunked_array.is_cpu() is False + return chunked_array + + +@pytest.fixture +def cpu_and_cuda_chunked_array(cpu_arrays, cuda_arrays): + chunked_array = pa.chunked_array(cpu_arrays + cuda_arrays) + assert chunked_array.is_cpu() is False + return chunked_array @pytest.fixture @@ -3419,9 +3430,136 @@ def cuda_recordbatch(cuda_context, cpu_recordbatch): return cpu_recordbatch.copy_to(cuda_context.memory_manager) -def test_chunked_array_non_cpu(cuda_context, cpu_chunked_arrays, cuda_chunked_arrays): - assert cpu_chunked_arrays.is_cpu() is True - assert cuda_chunked_arrays.is_cpu() is False +def test_chunked_array_non_cpu(cuda_context, 
cpu_chunked_array, cuda_chunked_array, + cpu_and_cuda_chunked_array): + # type test + assert cuda_chunked_array.type == pa.int32() + + # length() test + assert cuda_chunked_array.length() == 10 + + # str(), repr() test + assert str(cuda_chunked_array) in repr(cuda_chunked_array) + + # validate() test + cuda_chunked_array.validate() + with pytest.raises(NotImplementedError): + cuda_chunked_array.validate(full=True) + + # null_count test + assert cuda_chunked_array.null_count == cpu_chunked_array.null_count + + # nbytes() test + assert cuda_chunked_array.nbytes == cpu_chunked_array.nbytes + assert (cpu_and_cuda_chunked_array.nbytes == (cpu_chunked_array.nbytes + + cuda_chunked_array.nbytes)) + + # get_total_buffer_size() test + assert (cpu_and_cuda_chunked_array.get_total_buffer_size() == + (cuda_chunked_array.get_total_buffer_size() + + cpu_chunked_array.get_total_buffer_size())) + + # getitem() test + with pytest.raises(NotImplementedError): + cuda_chunked_array[0] + + # is_null() test + with pytest.raises(NotImplementedError): + cuda_chunked_array.is_null() + + # is_nan() test + with pytest.raises(NotImplementedError): + cuda_chunked_array.is_nan() + + # is_valid() test + with pytest.raises(NotImplementedError): + cuda_chunked_array.is_valid() + + # fill_null() test + with pytest.raises(NotImplementedError): + cuda_chunked_array.fill_null(0) + + # equals() test + with pytest.raises(NotImplementedError): + cuda_chunked_array == cuda_chunked_array + + # to_pandas() test + with pytest.raises(NotImplementedError): + cuda_chunked_array.to_pandas() + + # to_numpy() test + with pytest.raises(NotImplementedError): + cuda_chunked_array.to_numpy() + + # __array__() test + with pytest.raises(NotImplementedError): + cuda_chunked_array.__array__() + + # cast() test + with pytest.raises(NotImplementedError): + cuda_chunked_array.cast() + + # dictionary_encode() test + with pytest.raises(NotImplementedError): + cuda_chunked_array.dictionary_encode() + + # flatten() test + with 
pytest.raises(NotImplementedError): + cuda_chunked_array.flatten() + + # combine_chunks() test + with pytest.raises(NotImplementedError): + cuda_chunked_array.combine_chunks() + + # unique() test + with pytest.raises(NotImplementedError): + cuda_chunked_array.unique() + + # value_counts() test + with pytest.raises(NotImplementedError): + cuda_chunked_array.value_counts() + + # filter() test + with pytest.raises(NotImplementedError): + cuda_chunked_array.filter([True, False, True, False, True]) + + # index() test + with pytest.raises(NotImplementedError): + cuda_chunked_array.index(5) + + # slice() test + with pytest.raises(NotImplementedError): + cuda_chunked_array.slice(2, 2) + + # take() test + with pytest.raises(NotImplementedError): + cuda_chunked_array.take([1]) + + # drop_null() test + with pytest.raises(NotImplementedError): + cuda_chunked_array.drop_null() + + # sort() test + with pytest.raises(NotImplementedError): + cuda_chunked_array.sort() + + # unify_dictionaries() test + with pytest.raises(NotImplementedError): + cuda_chunked_array.unify_dictionaries() + + # num_chunks test + assert cuda_chunked_array.num_chunks == cpu_chunked_array.num_chunks + + # chunks test + assert len(cuda_chunked_array.chunks) == len(cpu_chunked_array.chunks) + + # to_pylist() test + with pytest.raises(NotImplementedError): + cuda_chunked_array.to_pylist() + + # __arrow_c_stream__() test + with pytest.raises(NotImplementedError): + cuda_chunked_array.__arrow_c_stream__() def verify_cuda_recordbatch(batch, expected_schema): From ca9f52ccb01c52410fad7562a1276b807d611ad8 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Fri, 23 Aug 2024 21:10:25 +0000 Subject: [PATCH 07/18] Cleanup tests --- python/pyarrow/tests/test_table.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index a7d9a5e081e..f4b94facf8b 100644 --- a/python/pyarrow/tests/test_table.py +++ 
b/python/pyarrow/tests/test_table.py @@ -3433,12 +3433,15 @@ def cuda_recordbatch(cuda_context, cpu_recordbatch): def test_chunked_array_non_cpu(cuda_context, cpu_chunked_array, cuda_chunked_array, cpu_and_cuda_chunked_array): # type test - assert cuda_chunked_array.type == pa.int32() + assert cuda_chunked_array.type == cpu_chunked_array.type # length() test - assert cuda_chunked_array.length() == 10 + assert cuda_chunked_array.length() == cpu_chunked_array.length() - # str(), repr() test + # str() test + assert str(cuda_chunked_array) == str(cpu_chunked_array) + + # repr() test assert str(cuda_chunked_array) in repr(cuda_chunked_array) # validate() test From 68954edb0b647c98c1ff830b1467c07108806962 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Fri, 23 Aug 2024 21:14:06 +0000 Subject: [PATCH 08/18] Lint --- python/pyarrow/table.pxi | 1 - python/pyarrow/tests/test_table.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 52bedfaed54..58d1fa825a2 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -1450,7 +1450,6 @@ cdef class ChunkedArray(_PandasConvertible): raise NotImplementedError("Implemented only for data on CPU device") - def chunked_array(arrays, type=None): """ Construct chunked array from list of array-like objects diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index f4b94facf8b..3fc1ae26f60 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -3459,7 +3459,7 @@ def test_chunked_array_non_cpu(cuda_context, cpu_chunked_array, cuda_chunked_arr # get_total_buffer_size() test assert (cpu_and_cuda_chunked_array.get_total_buffer_size() == - (cuda_chunked_array.get_total_buffer_size() + + (cuda_chunked_array.get_total_buffer_size() + cpu_chunked_array.get_total_buffer_size())) # getitem() test From 72acfca30da5ee94084069657dc69f77e57cb147 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: 
Mon, 26 Aug 2024 15:43:58 +0000 Subject: [PATCH 09/18] Fix nbytes test --- python/pyarrow/table.pxi | 1 + python/pyarrow/tests/test_table.py | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 58d1fa825a2..ab65c5ce2fd 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -247,6 +247,7 @@ cdef class ChunkedArray(_PandasConvertible): >>> n_legs.nbytes 49 """ + self._assert_cpu() cdef: CResult[int64_t] c_res_buffer diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index 3fc1ae26f60..db2a66dfe65 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -3453,9 +3453,8 @@ def test_chunked_array_non_cpu(cuda_context, cpu_chunked_array, cuda_chunked_arr assert cuda_chunked_array.null_count == cpu_chunked_array.null_count # nbytes() test - assert cuda_chunked_array.nbytes == cpu_chunked_array.nbytes - assert (cpu_and_cuda_chunked_array.nbytes == (cpu_chunked_array.nbytes + - cuda_chunked_array.nbytes)) + with pytest.raises(NotImplementedError): + cuda_chunked_array.nbytes # get_total_buffer_size() test assert (cpu_and_cuda_chunked_array.get_total_buffer_size() == From 19b023907f511771039de346e73386bd60703c94 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Mon, 26 Aug 2024 16:05:59 +0000 Subject: [PATCH 10/18] Fix get_total_buffer_size() test --- python/pyarrow/table.pxi | 1 + python/pyarrow/tests/test_table.py | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index ab65c5ce2fd..38c4f4fcd29 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -274,6 +274,7 @@ cdef class ChunkedArray(_PandasConvertible): >>> n_legs.get_total_buffer_size() 49 """ + self._assert_cpu() cdef: int64_t total_buffer_size diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index db2a66dfe65..61ebb41acce 100644 
--- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -3457,9 +3457,8 @@ def test_chunked_array_non_cpu(cuda_context, cpu_chunked_array, cuda_chunked_arr cuda_chunked_array.nbytes # get_total_buffer_size() test - assert (cpu_and_cuda_chunked_array.get_total_buffer_size() == - (cuda_chunked_array.get_total_buffer_size() + - cpu_chunked_array.get_total_buffer_size())) + with pytest.raises(NotImplementedError): + cuda_chunked_array.get_total_buffer_size() # getitem() test with pytest.raises(NotImplementedError): From 0caa9848c187aea42cb191cca50f4b149ccb7987 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Thu, 29 Aug 2024 21:04:29 +0000 Subject: [PATCH 11/18] Rebase --- cpp/src/arrow/chunked_array.cc | 9 --------- cpp/src/arrow/chunked_array.h | 4 ---- python/pyarrow/includes/libarrow.pxd | 2 +- python/pyarrow/table.pxi | 5 +++-- python/pyarrow/tests/test_table.py | 6 +++--- 5 files changed, 7 insertions(+), 19 deletions(-) diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc index a159d588e20..78695e778af 100644 --- a/cpp/src/arrow/chunked_array.cc +++ b/cpp/src/arrow/chunked_array.cc @@ -300,15 +300,6 @@ Status ChunkedArray::ValidateFull() const { return ValidateChunks(chunks_, /*full_validation=*/true); } -bool ChunkedArray::IsCpu() const { - for (const auto& chunk : chunks_) { - if (chunk->device_type() != DeviceAllocationType::kCPU) { - return false; - } - } - return true; -} - namespace internal { bool MultipleChunkIterator::Next(std::shared_ptr* next_left, diff --git a/cpp/src/arrow/chunked_array.h b/cpp/src/arrow/chunked_array.h index f3c002c7720..c65b6cb6e22 100644 --- a/cpp/src/arrow/chunked_array.h +++ b/cpp/src/arrow/chunked_array.h @@ -26,7 +26,6 @@ #include "arrow/chunk_resolver.h" #include "arrow/compare.h" #include "arrow/device_allocation_type_set.h" -#include "arrow/device.h" #include "arrow/result.h" #include "arrow/status.h" #include "arrow/type_fwd.h" @@ -191,9 +190,6 @@ class ARROW_EXPORT 
ChunkedArray { /// \return Status Status ValidateFull() const; - /// \brief Determine if all chunks are located on the CPU - bool IsCpu() const; - protected: ArrayVector chunks_; std::shared_ptr type_; diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 70d67ee759e..8e6922a912a 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -983,7 +983,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: CResult[vector[shared_ptr[CChunkedArray]]] Flatten(CMemoryPool* pool) - c_bool IsCpu() const + c_bool is_cpu() const CStatus Validate() const CStatus ValidateFull() const diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 38c4f4fcd29..076beb8afed 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -1438,17 +1438,18 @@ cdef class ChunkedArray(_PandasConvertible): self.init(c_chunked_array) return self + @property def is_cpu(self): """ Whether all chunks in the ChunkedArray are CPU-accessible. 
""" cdef c_bool result with nogil: - result = self.chunked_array.IsCpu() + result = self.chunked_array.is_cpu() return result def _assert_cpu(self): - if not self.is_cpu(): + if not self.is_cpu: raise NotImplementedError("Implemented only for data on CPU device") diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index 61ebb41acce..acd23209125 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -3402,21 +3402,21 @@ def cuda_arrays(cuda_context, cpu_arrays): @pytest.fixture def cpu_chunked_array(cpu_arrays): chunked_array = pa.chunked_array(cpu_arrays) - assert chunked_array.is_cpu() is True + assert chunked_array.is_cpu is True return chunked_array @pytest.fixture def cuda_chunked_array(cuda_arrays): chunked_array = pa.chunked_array(cuda_arrays) - assert chunked_array.is_cpu() is False + assert chunked_array.is_cpu is False return chunked_array @pytest.fixture def cpu_and_cuda_chunked_array(cpu_arrays, cuda_arrays): chunked_array = pa.chunked_array(cpu_arrays + cuda_arrays) - assert chunked_array.is_cpu() is False + assert chunked_array.is_cpu is False return chunked_array From cca6c7730a5d0acc1979d35ba792b4fa96c4d4eb Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Thu, 29 Aug 2024 21:25:05 +0000 Subject: [PATCH 12/18] Delete unused header --- cpp/src/arrow/chunked_array.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc index 78695e778af..dd6aa51534f 100644 --- a/cpp/src/arrow/chunked_array.cc +++ b/cpp/src/arrow/chunked_array.cc @@ -28,7 +28,6 @@ #include "arrow/array/util.h" #include "arrow/array/validate.h" #include "arrow/device_allocation_type_set.h" -#include "arrow/device.h" #include "arrow/pretty_print.h" #include "arrow/status.h" #include "arrow/type.h" From dc585d24585c1b2d5483d032fe642f55fa6d3d10 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Thu, 29 Aug 2024 21:38:07 +0000 Subject: [PATCH 13/18] Update tests --- 
python/pyarrow/table.pxi | 2 +- python/pyarrow/tests/test_table.py | 14 +++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 076beb8afed..9fe846cc8e8 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -222,6 +222,7 @@ cdef class ChunkedArray(_PandasConvertible): >>> n_legs.null_count 1 """ + self._assert_cpu() return self.chunked_array.null_count() @property @@ -903,7 +904,6 @@ cdef class ChunkedArray(_PandasConvertible): ] ] """ - self._assert_cpu() cdef shared_ptr[CChunkedArray] result if offset < 0: diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index acd23209125..3c975f7e0fb 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -3450,7 +3450,8 @@ def test_chunked_array_non_cpu(cuda_context, cpu_chunked_array, cuda_chunked_arr cuda_chunked_array.validate(full=True) # null_count test - assert cuda_chunked_array.null_count == cpu_chunked_array.null_count + with pytest.raises(NotImplementedError): + cuda_chunked_array.null_count # nbytes() test with pytest.raises(NotImplementedError): @@ -3529,8 +3530,7 @@ def test_chunked_array_non_cpu(cuda_context, cpu_chunked_array, cuda_chunked_arr cuda_chunked_array.index(5) # slice() test - with pytest.raises(NotImplementedError): - cuda_chunked_array.slice(2, 2) + cuda_chunked_array.slice(2, 2) # take() test with pytest.raises(NotImplementedError): @@ -3554,6 +3554,10 @@ def test_chunked_array_non_cpu(cuda_context, cpu_chunked_array, cuda_chunked_arr # chunks test assert len(cuda_chunked_array.chunks) == len(cpu_chunked_array.chunks) + # chunk() test + chunk = cuda_chunked_array.chunk(0) + assert chunk.device_type == pa.DeviceAllocationType.CUDA + # to_pylist() test with pytest.raises(NotImplementedError): cuda_chunked_array.to_pylist() @@ -3562,6 +3566,10 @@ def test_chunked_array_non_cpu(cuda_context, cpu_chunked_array, cuda_chunked_arr with 
pytest.raises(NotImplementedError): cuda_chunked_array.__arrow_c_stream__() + # __reduce__() test + with pytest.raises(NotImplementedError): + cuda_chunked_array.__reduce__() + def verify_cuda_recordbatch(batch, expected_schema): batch.validate() From 7c413a0693589dae6a2fa86f6297e7c62e10b58f Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Tue, 3 Sep 2024 19:30:55 +0000 Subject: [PATCH 14/18] Cache is_cpu property --- python/pyarrow/lib.pxd | 2 ++ python/pyarrow/table.pxi | 9 +++++---- python/pyarrow/tests/test_table.py | 7 +++++++ 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index ad05ea31c91..1caf58e20e6 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -506,6 +506,8 @@ cdef class ChunkedArray(_PandasConvertible): cdef: shared_ptr[CChunkedArray] sp_chunked_array CChunkedArray* chunked_array + c_bool _is_cpu + c_bool _init_is_cpu cdef readonly: # To allow Table to propagate metadata to pandas.Series diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 9fe846cc8e8..d10f714eaa2 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -59,6 +59,7 @@ cdef class ChunkedArray(_PandasConvertible): def __cinit__(self): self.chunked_array = NULL + self._init_is_cpu = False def __init__(self): raise TypeError("Do not call ChunkedArray's constructor directly, use " @@ -1443,10 +1444,10 @@ cdef class ChunkedArray(_PandasConvertible): """ Whether all chunks in the ChunkedArray are CPU-accessible. 
""" - cdef c_bool result - with nogil: - result = self.chunked_array.is_cpu() - return result + if self._init_is_cpu == False: + self._is_cpu = self.chunked_array.is_cpu() + self._init_is_cpu = True + return self._is_cpu def _assert_cpu(self): if not self.is_cpu: diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index 3c975f7e0fb..6c1360cb0a1 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -3377,6 +3377,13 @@ def test_invalid_non_join_column(): assert exp_error_msg in str(excinfo.value) +def test_is_cpu(): + chunked_array = pa.chunked_array([1, 2, 3]) + assert chunked_array._init_is_cpu is False + assert chunked_array.is_cpu() is True + assert chunked_array._init_is_cpu is True + + @pytest.fixture def cuda_context(): cuda = pytest.importorskip("pyarrow.cuda") From b422e2b514bdd72016063c443a7aa2404361cd52 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Tue, 3 Sep 2024 19:53:49 +0000 Subject: [PATCH 15/18] Fix broken test --- python/pyarrow/tests/test_table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index 6c1360cb0a1..8afc4778af0 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -3378,7 +3378,7 @@ def test_invalid_non_join_column(): def test_is_cpu(): - chunked_array = pa.chunked_array([1, 2, 3]) + chunked_array = pa.chunked_array([]) assert chunked_array._init_is_cpu is False assert chunked_array.is_cpu() is True assert chunked_array._init_is_cpu is True From 6d12a4fb485f4985c875855dc8e7c1fd8c6b27d0 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Tue, 3 Sep 2024 20:53:23 +0000 Subject: [PATCH 16/18] Delete bad test --- python/pyarrow/tests/test_table.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index 8afc4778af0..3c975f7e0fb 100644 --- 
a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -3377,13 +3377,6 @@ def test_invalid_non_join_column(): assert exp_error_msg in str(excinfo.value) -def test_is_cpu(): - chunked_array = pa.chunked_array([]) - assert chunked_array._init_is_cpu is False - assert chunked_array.is_cpu() is True - assert chunked_array._init_is_cpu is True - - @pytest.fixture def cuda_context(): cuda = pytest.importorskip("pyarrow.cuda") From 5c67cd8c9bbf0d293782f08d201d70b1723e4a50 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Wed, 4 Sep 2024 13:46:13 +0000 Subject: [PATCH 17/18] Cleanup is_cpu() --- python/pyarrow/table.pxi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index d10f714eaa2..3b0df981e01 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -1444,7 +1444,7 @@ cdef class ChunkedArray(_PandasConvertible): """ Whether all chunks in the ChunkedArray are CPU-accessible. """ - if self._init_is_cpu == False: + if not self._init_is_cpu: self._is_cpu = self.chunked_array.is_cpu() self._init_is_cpu = True return self._is_cpu def _assert_cpu(self): if not self.is_cpu: From a1d857ad6a8dbdb7eee1964972e2ad2945a9e89d Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Wed, 4 Sep 2024 14:25:06 +0000 Subject: [PATCH 18/18] Fix tests after rebase --- python/pyarrow/tests/test_table.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index 3c975f7e0fb..c3f805b4b32 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -3385,13 +3385,13 @@ def cuda_context(): @pytest.fixture def schema(): - return pa.schema([pa.field('c0', pa.int16()), pa.field('c1', pa.int32())]) + return pa.schema([pa.field('c0', pa.int32()), pa.field('c1', pa.int32())]) @pytest.fixture -def 
cpu_arrays(schema): + return [pa.array([1, 2, 3, 4, 5], schema.field(0).type), + pa.array([-10, -5, 0, None, 10], schema.field(1).type)] @pytest.fixture @@ -3642,8 +3642,8 @@ def test_recordbatch_non_cpu(cuda_context, cpu_recordbatch, cuda_recordbatch, cuda_recordbatch.sort_by('c0') # field() test - assert cuda_recordbatch.field(0) == pa.field('c0', pa.int16()) - assert cuda_recordbatch.field(1) == pa.field('c1', pa.int32()) + assert cuda_recordbatch.field(0) == schema.field(0) + assert cuda_recordbatch.field(1) == schema.field(1) # equals() test new_batch = cpu_recordbatch.copy_to(cuda_context.memory_manager) @@ -3713,7 +3713,8 @@ def test_recordbatch_non_cpu(cuda_context, cpu_recordbatch, cuda_recordbatch, # rename_columns() test new_batch = cuda_recordbatch.rename_columns(['col0', 'col1']) expected_schema = pa.schema( - [pa.field('col0', pa.int16()), pa.field('col1', pa.int32())]) + [pa.field('col0', schema.field(0).type), + pa.field('col1', schema.field(1).type)]) verify_cuda_recordbatch(new_batch, expected_schema=expected_schema) # validate() test