diff --git a/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in index 7fc86b565e..6925ff6359 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in +++ b/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in @@ -490,10 +490,8 @@ cdef bint __cuPythonInit = False ctypedef CUresult (*__cuGetProcAddress_v2_T)(const char*, void**, int, cuuint64_t, CUdriverProcAddressQueryResult*) except?CUDA_ERROR_NOT_FOUND nogil cdef __cuGetProcAddress_v2_T _F_cuGetProcAddress_v2 = NULL -cdef int cuPythonInit() except -1 nogil: +cdef int _cuPythonInit() except -1 nogil: global __cuPythonInit - if __cuPythonInit: - return 0 cdef bint usePTDS cdef char libPath[260] @@ -8883,6 +8881,14 @@ cdef int cuPythonInit() except -1 nogil: __cuPythonInit = True return 0 +# Create a very small function to check whether we are init'ed, so the C +# compiler can inline it. +cdef inline int cuPythonInit() except -1 nogil: + if __cuPythonInit: + return 0 + + return _cuPythonInit() + {{if 'cuGetErrorString' in found_functions}} cdef CUresult _cuGetErrorString(CUresult error, const char** pStr) except ?CUDA_ERROR_NOT_FOUND nogil: diff --git a/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in index 16068f6410..44ec26ffba 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in +++ b/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx.in @@ -40,10 +40,8 @@ cdef bint __cuPythonInit = False {{if 'nvrtcGetPCHHeapSizeRequired' in found_functions}}cdef void *__nvrtcGetPCHHeapSizeRequired = NULL{{endif}} {{if 'nvrtcSetFlowCallback' in found_functions}}cdef void *__nvrtcSetFlowCallback = NULL{{endif}} -cdef int cuPythonInit() except -1 nogil: +cdef int _cuPythonInit() except -1 nogil: global __cuPythonInit - if __cuPythonInit: - return 0 with gil, __symbol_lock: {{if 'Windows' == platform.system()}} @@ -324,6 +322,14 @@ cdef int cuPythonInit() except -1 nogil: __cuPythonInit = True 
return 0 +# Create a very small function to check whether we are init'ed, so the C +# compiler can inline it. +cdef inline int cuPythonInit() except -1 nogil: + if __cuPythonInit: + return 0 + + return _cuPythonInit() + {{if 'nvrtcGetErrorString' in found_functions}} cdef const char* _nvrtcGetErrorString(nvrtcResult result) except ?NULL nogil: diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in index a89dae196b..c82189fa42 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in +++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in @@ -10,16 +10,22 @@ cimport cython cdef bint __cudaPythonInit = False cdef bint __usePTDS = False -cdef int cudaPythonInit() except -1 nogil: +cdef int _cudaPythonInit() except -1 nogil: global __cudaPythonInit global __usePTDS - if __cudaPythonInit: - return __usePTDS with gil: __usePTDS = os.getenv('CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM', default=False) __cudaPythonInit = True return __usePTDS +# Create a very small function to check whether we are init'ed, so the C +# compiler can inline it. +cdef inline int cudaPythonInit() except -1 nogil: + if __cudaPythonInit: + return __usePTDS + + return _cudaPythonInit() + {{if 'cudaDeviceReset' in found_functions}} cdef cudaError_t _cudaDeviceReset() except ?cudaErrorCallRequiresNewerDriver nogil: diff --git a/cuda_bindings/docs/source/release/13.X.Y-notes.rst b/cuda_bindings/docs/source/release/13.X.Y-notes.rst index 1ac5e42ccb..2b38504f75 100644 --- a/cuda_bindings/docs/source/release/13.X.Y-notes.rst +++ b/cuda_bindings/docs/source/release/13.X.Y-notes.rst @@ -14,6 +14,8 @@ Highlights * Automatic CUDA library path detection based on ``CUDA_HOME``, eliminating the need to manually set ``LIBRARY_PATH`` environment variables for installation. +* The Python overhead of calling functions in CUDA bindings in ``driver``, ``runtime`` and ``nvrtc`` has been reduced by approximately 30%. 
+ Known issues ------------