From 59d1b6be82e857c6c50bf513e99b7b37ed71a4aa Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Thu, 14 Aug 2025 15:44:11 -0400 Subject: [PATCH 1/3] Improve #449: Improve StridedMemoryView creation time Two changes: 1. Refactor the versioned/non-versioned paths to reduce the number of branches. 2. Create shape and strides tuples using Python/C API --- .../cuda/core/experimental/_memoryview.pyx | 36 ++++++++++--------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index abe27b8abe..b80d3b5451 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -184,48 +184,52 @@ cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None): stream=int(stream_ptr) if stream_ptr else None) cdef void* data = NULL + cdef DLTensor* dl_tensor + cdef DLManagedTensorVersioned* dlm_tensor_ver + cdef DLManagedTensor* dlm_tensor + cdef const char *used_name if cpython.PyCapsule_IsValid( capsule, DLPACK_VERSIONED_TENSOR_UNUSED_NAME): data = cpython.PyCapsule_GetPointer( capsule, DLPACK_VERSIONED_TENSOR_UNUSED_NAME) versioned = True + dlm_tensor_ver = data + dl_tensor = &dlm_tensor_ver.dl_tensor + is_readonly = bool((dlm_tensor_ver.flags & DLPACK_FLAG_BITMASK_READ_ONLY) != 0) + used_name = DLPACK_VERSIONED_TENSOR_USED_NAME elif cpython.PyCapsule_IsValid( capsule, DLPACK_TENSOR_UNUSED_NAME): data = cpython.PyCapsule_GetPointer( capsule, DLPACK_TENSOR_UNUSED_NAME) versioned = False - else: - assert False - - cdef DLManagedTensor* dlm_tensor - cdef DLManagedTensorVersioned* dlm_tensor_ver - cdef DLTensor* dl_tensor - if versioned: - dlm_tensor_ver = data - dl_tensor = &dlm_tensor_ver.dl_tensor - is_readonly = bool((dlm_tensor_ver.flags & DLPACK_FLAG_BITMASK_READ_ONLY) != 0) - else: dlm_tensor = data dl_tensor = &dlm_tensor.dl_tensor is_readonly = False + used_name = DLPACK_TENSOR_USED_NAME + else: + assert False cdef StridedMemoryView buf = StridedMemoryView() if view is None else view buf.ptr = (dl_tensor.data) - buf.shape = tuple(int(dl_tensor.shape[i]) for i in range(dl_tensor.ndim)) + + # Construct shape and strides tuples using the Python/C API for speed + buf.shape = cpython.PyTuple_New(dl_tensor.ndim) + for i in range(dl_tensor.ndim): + cpython.PyTuple_SET_ITEM(buf.shape, i, cpython.PyLong_FromLong(dl_tensor.shape[i])) if dl_tensor.strides: - buf.strides = tuple( - int(dl_tensor.strides[i]) for i in range(dl_tensor.ndim)) + buf.strides = cpython.PyTuple_New(dl_tensor.ndim) + for i in range(dl_tensor.ndim): + cpython.PyTuple_SET_ITEM(buf.strides, i, cpython.PyLong_FromLong(dl_tensor.strides[i])) else: # C-order buf.strides = None + buf.dtype = dtype_dlpack_to_numpy(&dl_tensor.dtype) buf.device_id = device_id buf.is_device_accessible = is_device_accessible buf.readonly = is_readonly buf.exporting_obj = obj - cdef const char* used_name = ( - DLPACK_VERSIONED_TENSOR_USED_NAME if versioned else DLPACK_TENSOR_USED_NAME) cpython.PyCapsule_SetName(capsule, used_name) return buf From 8a05be3e9a0fa18afd161295b043250121cd6b2d Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Fri, 15 Aug 2025 11:31:38 -0400 Subject: [PATCH 2/3] Add carray_int64_t_to_tuple function --- cuda_core/cuda/core/experimental/_memoryview.pyx | 9 +++------ .../cuda/core/experimental/_utils/cuda_utils.pxd | 12 ++++++++++++ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index b80d3b5451..0b63faeda4 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -12,6 +12,7 @@ from typing import Any, Optional import numpy from cuda.core.experimental._utils.cuda_utils import handle_return, driver +from cuda.core.experimental._utils cimport cuda_utils # TODO(leofang): support NumPy structured dtypes @@ -213,13 +214,9 @@ cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None): buf.ptr = (dl_tensor.data) # Construct shape and strides tuples using the Python/C API for speed - buf.shape = cpython.PyTuple_New(dl_tensor.ndim) - for i in range(dl_tensor.ndim): - cpython.PyTuple_SET_ITEM(buf.shape, i, cpython.PyLong_FromLong(dl_tensor.shape[i])) + buf.shape = cuda_utils.carray_int64_t_to_tuple(dl_tensor.shape, dl_tensor.ndim) if dl_tensor.strides: - buf.strides = cpython.PyTuple_New(dl_tensor.ndim) - for i in range(dl_tensor.ndim): - cpython.PyTuple_SET_ITEM(buf.strides, i, cpython.PyLong_FromLong(dl_tensor.strides[i])) + buf.strides = cuda_utils.carray_int64_t_to_tuple(dl_tensor.strides, dl_tensor.ndim) else: # C-order buf.strides = None diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd index b082fb8bbf..987a13df62 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd @@ -2,7 +2,19 @@ # # SPDX-License-Identifier: Apache-2.0 + +cimport cpython +cimport libc.stdint + + cpdef int _check_driver_error(error) except?-1 cpdef int _check_runtime_error(error) except?-1 cpdef int _check_nvrtc_error(error) except?-1 cpdef check_or_create_options(type cls, options, str options_description=*, bint keep_none=*) + + +cdef inline tuple carray_int64_t_to_tuple(libc.stdint.int64_t *ptr, int length): + result = cpython.PyTuple_New(length) + for i in range(length): + cpython.PyTuple_SET_ITEM(result, i, cpython.PyLong_FromLongLong(ptr[i])) + return result From df71f240d06fe75992dc05bd883587d93d7ae740 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Fri, 15 Aug 2025 11:34:39 -0400 Subject: [PATCH 3/3] Move comment --- cuda_core/cuda/core/experimental/_memoryview.pyx | 1 - cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index 0b63faeda4..31482229c4 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -213,7 +213,6 @@ cdef StridedMemoryView view_as_dlpack(obj, stream_ptr, view=None): cdef StridedMemoryView buf = StridedMemoryView() if view is None else view buf.ptr = (dl_tensor.data) - # Construct shape and strides tuples using the Python/C API for speed buf.shape = cuda_utils.carray_int64_t_to_tuple(dl_tensor.shape, dl_tensor.ndim) if dl_tensor.strides: buf.strides = cuda_utils.carray_int64_t_to_tuple(dl_tensor.strides, dl_tensor.ndim) diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd index 987a13df62..601736c475 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd @@ -14,6 +14,7 @@ cpdef check_or_create_options(type cls, options, str options_description=*, bint cdef inline tuple carray_int64_t_to_tuple(libc.stdint.int64_t *ptr, int length): + # Construct shape and strides tuples using the Python/C API for speed result = cpython.PyTuple_New(length) for i in range(length): cpython.PyTuple_SET_ITEM(result, i, cpython.PyLong_FromLongLong(ptr[i]))