From 67005ba4c1f0c1441a24aef01d1624e44475f2a5 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rwgkio@gmail.com>
Date: Fri, 7 Feb 2025 19:11:56 -0800
Subject: [PATCH 1/9] Add nvvm bindings (#421)

* Add nvvm to setup.py

* Add test_nvvm.py

* test_nvvm.py version(), ir_version()

* Snapshot of generated files.

* Add in `nvvm.create_program()`

* Add in `nvvm.destroy_program()`

* Add in `nvvm.compile_program()`

* Add in add_module_to_program()

* Add in verify_program()

* Add in lazy_add_module_to_program()

* Add in get_compiled_result_size(), get_program_log_size()

* Add in get_compiled_result(), get_program_log()

* Change Copyright dates to 2025

* Use cybind results "automatically generated across versions from 12.0.1 to 12.8.0."

* update to use NVKS runners

* Add tests/run_simple.py

* update fetch_ctk to find nvvm shared lib

* fix wheel rel path

* add nvcc wheel to [all]

* Fix cybind bindings for add_module_to_program(), lazy_add_module_to_program()

* Add test_with_minimal_nnvm_ir()

* Remove tests/run_simple.py

* Update cuda_bindings/cuda/bindings/_internal/nvvm_windows.pyx

Co-authored-by: Leo Fang <leo80042@gmail.com>

* Update cuda_bindings/cuda/bindings/_internal/nvvm_windows.pyx

Co-authored-by: Leo Fang <leo80042@gmail.com>

* Remove stray `f` (it is now a plain string, not an f-string anymore)

* Add bootstrap_local_dev.sh script.

* Fix nvvm.compile_program() failure for CUDA version 12.0

The original datalayout lacked explicit alignment and size definitions for i1, i8, i16, f32, f64, v64, and v128.

The missing types are crucial for LLVM-based compilation in CUDA 12.0.

Later CUDA versions are more forgiving, but 12.0 enforces a stricter layout. The stricter layout should resolve the issue for CUDA 12.0
without breaking compatibility with later versions.

* Add test_verify_program_with_minimal_nnvm_ir() and rename some tests for clarity.

* Complete test coverage.

* Introduce noregex() to reduce backslash clutter.

* Use a contextmanager to replace repeated try-finally.

* Rename noregex to match_exact

* Introduce get_program_log() helper.

* Improve nvvm_program() Context Manager

* Remove redundant "utf-8"

* Also test with NVVM Bitcode (using a new pytest fixture).

* Introduce compile_or_verify fixture.

* Remove bootstrap_local_dev.sh, to be moved to a separate PR.

* Update from codegen after config fix.

* Update from codegen after config fix.

* Update from codegen after adding CTK 11.x nvvm.h headers. Functional NO-OP.

* Fix get_nvvm_dso_version_suffix() to match actual version numbers:

./11.0.3_450.51.06/cuda_nvcc/nvvm/lib64/libnvvm.so.3.3.0
./11.1.1_455.32.00/cuda_nvcc/nvvm/lib64/libnvvm.so.3.3.0
./11.2.2_460.32.03/cuda_nvcc/nvvm/lib64/libnvvm.so.4.0.0
./11.3.1_465.19.01/cuda_nvcc/nvvm/lib64/libnvvm.so.4.0.0
./11.4.4_470.82.01/cuda_nvcc/nvvm/lib64/libnvvm.so.4.0.0
./11.5.1_495.29.05/cuda_nvcc/nvvm/lib64/libnvvm.so.4.0.0
./11.6.2_510.47.03/cuda_nvcc/nvvm/lib64/libnvvm.so.4.0.0
./11.7.1_515.65.01/cuda_nvcc/nvvm/lib64/libnvvm.so.4.0.0
./11.8.0_520.61.05/cuda_nvcc/nvvm/lib64/libnvvm.so.4.0.0
./12.0.1_525.85.12/cuda_nvcc/nvvm/lib64/libnvvm.so.4.0.0
./12.1.1_530.30.02/cuda_nvcc/nvvm/lib64/libnvvm.so.4.0.0
./12.2.2_535.104.05/cuda_nvcc/nvvm/lib64/libnvvm.so.4.0.0
./12.3.2_545.23.08/cuda_nvcc/nvvm/lib64/libnvvm.so.4.0.0
./12.4.1_550.54.15/cuda_nvcc/nvvm/lib64/libnvvm.so.4.0.0
./12.5.1_555.42.06/cuda_nvcc/nvvm/lib64/libnvvm.so.4.0.0
./12.6.2_560.35.03/cuda_nvcc/nvvm/lib64/libnvvm.so.4.0.0
./12.8.0_570.86.10/cuda_nvcc/nvvm/lib64/libnvvm.so.4.0.0

For completeness, since the nvjitlink code is touched in this commit, these are the libnvJitLink version numbers:

./12.0.1_525.85.12/libnvjitlink/targets/x86_64-linux/lib/libnvJitLink.so.12.0.140
./12.1.1_530.30.02/libnvjitlink/targets/x86_64-linux/lib/libnvJitLink.so.12.1.105
./12.2.2_535.104.05/libnvjitlink/targets/x86_64-linux/lib/libnvJitLink.so.12.2.140
./12.3.2_545.23.08/libnvjitlink/targets/x86_64-linux/lib/libnvJitLink.so.12.3.101
./12.4.1_550.54.15/libnvjitlink/targets/x86_64-linux/lib/libnvJitLink.so.12.4.127
./12.5.1_555.42.06/libnvjitlink/targets/x86_64-linux/lib/libnvJitLink.so.12.5.82
./12.6.2_560.35.03/libnvjitlink/targets/x86_64-linux/lib/libnvJitLink.so.12.6.77
./12.8.0_570.86.10/libnvjitlink/targets/x86_64-linux/lib/libnvJitLink.so.12.8.61

* find_libnvvm_so_via_proc_self_maps() Proof Of Concept

* Revert "find_libnvvm_so_via_proc_self_maps() Proof Of Concept"

This reverts commit b45bac231df6b2b482acfd28bf05c1c64111c8e3.

* Add another rpath for finding libnvvm.so

---------

(cherry picked from commit 2981bfd875a0576283fb54130d7b52f29071531c)
---
 .github/actions/fetch_ctk/action.yml          |   2 +-
 .../cuda/bindings/_internal/nvvm.pxd          |  25 ++
 .../cuda/bindings/_internal/nvvm_linux.pyx    | 360 +++++++++++++++++
 .../cuda/bindings/_internal/nvvm_windows.pyx  | 373 ++++++++++++++++++
 .../cuda/bindings/_internal/utils.pxd         | 169 ++++++++
 .../cuda/bindings/_internal/utils.pyx         | 137 +++++++
 cuda_bindings/cuda/bindings/cynvvm.pxd        |  46 +++
 cuda_bindings/cuda/bindings/cynvvm.pyx        |  59 +++
 cuda_bindings/cuda/bindings/nvvm.pxd          |  40 ++
 cuda_bindings/cuda/bindings/nvvm.pyx          | 284 +++++++++++++
 cuda_bindings/pyproject.toml                  |   3 +-
 cuda_bindings/setup.py                        |  13 +-
 cuda_bindings/tests/test_nvvm.py              | 193 +++++++++
 13 files changed, 1701 insertions(+), 3 deletions(-)
 create mode 100644 cuda_bindings/cuda/bindings/_internal/nvvm.pxd
 create mode 100644 cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx
 create mode 100644 cuda_bindings/cuda/bindings/_internal/nvvm_windows.pyx
 create mode 100644 cuda_bindings/cuda/bindings/_internal/utils.pxd
 create mode 100644 cuda_bindings/cuda/bindings/_internal/utils.pyx
 create mode 100644 cuda_bindings/cuda/bindings/cynvvm.pxd
 create mode 100644 cuda_bindings/cuda/bindings/cynvvm.pyx
 create mode 100644 cuda_bindings/cuda/bindings/nvvm.pxd
 create mode 100644 cuda_bindings/cuda/bindings/nvvm.pyx
 create mode 100644 cuda_bindings/tests/test_nvvm.py

diff --git a/.github/actions/fetch_ctk/action.yml b/.github/actions/fetch_ctk/action.yml
index 18750c2d8b..798e443d47 100644
--- a/.github/actions/fetch_ctk/action.yml
+++ b/.github/actions/fetch_ctk/action.yml
@@ -148,4 +148,4 @@ runs:
         echo "CUDA_PATH=${CUDA_PATH}" >> $GITHUB_ENV
         echo "CUDA_HOME=${CUDA_PATH}" >> $GITHUB_ENV
         echo "${CUDA_PATH}/bin" >> $GITHUB_PATH
-        echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${CUDA_PATH}/lib" >> $GITHUB_ENV
+        echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${CUDA_PATH}/lib:${CUDA_PATH}/nvvm/lib64" >> $GITHUB_ENV
diff --git a/cuda_bindings/cuda/bindings/_internal/nvvm.pxd b/cuda_bindings/cuda/bindings/_internal/nvvm.pxd
new file mode 100644
index 0000000000..0feebf2514
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_internal/nvvm.pxd
@@ -0,0 +1,25 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+#
+# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly.
+
+from ..cynvvm cimport *
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cdef nvvmResult _nvvmVersion(int* major, int* minor) except* nogil
+cdef nvvmResult _nvvmIRVersion(int* majorIR, int* minorIR, int* majorDbg, int* minorDbg) except* nogil
+cdef nvvmResult _nvvmCreateProgram(nvvmProgram* prog) except* nogil
+cdef nvvmResult _nvvmDestroyProgram(nvvmProgram* prog) except* nogil
+cdef nvvmResult _nvvmAddModuleToProgram(nvvmProgram prog, const char* buffer, size_t size, const char* name) except* nogil
+cdef nvvmResult _nvvmLazyAddModuleToProgram(nvvmProgram prog, const char* buffer, size_t size, const char* name) except* nogil
+cdef nvvmResult _nvvmCompileProgram(nvvmProgram prog, int numOptions, const char** options) except* nogil
+cdef nvvmResult _nvvmVerifyProgram(nvvmProgram prog, int numOptions, const char** options) except* nogil
+cdef nvvmResult _nvvmGetCompiledResultSize(nvvmProgram prog, size_t* bufferSizeRet) except* nogil
+cdef nvvmResult _nvvmGetCompiledResult(nvvmProgram prog, char* buffer) except* nogil
+cdef nvvmResult _nvvmGetProgramLogSize(nvvmProgram prog, size_t* bufferSizeRet) except* nogil
+cdef nvvmResult _nvvmGetProgramLog(nvvmProgram prog, char* buffer) except* nogil
diff --git a/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx
new file mode 100644
index 0000000000..e21218772d
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_internal/nvvm_linux.pyx
@@ -0,0 +1,360 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+#
+# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly.
+
+from libc.stdint cimport intptr_t
+
+from .utils cimport get_nvvm_dso_version_suffix
+
+from .utils import FunctionNotFoundError, NotSupportedError
+
+###############################################################################
+# Extern
+###############################################################################
+
+cdef extern from "<dlfcn.h>" nogil:
+    void* dlopen(const char*, int)
+    char* dlerror()
+    void* dlsym(void*, const char*)
+    int dlclose(void*)
+
+    enum:
+        RTLD_LAZY
+        RTLD_NOW
+        RTLD_GLOBAL
+        RTLD_LOCAL
+
+    const void* RTLD_DEFAULT 'RTLD_DEFAULT'
+
+
+###############################################################################
+# Wrapper init
+###############################################################################
+
+cdef bint __py_nvvm_init = False
+cdef void* __cuDriverGetVersion = NULL
+
+cdef void* __nvvmVersion = NULL
+cdef void* __nvvmIRVersion = NULL
+cdef void* __nvvmCreateProgram = NULL
+cdef void* __nvvmDestroyProgram = NULL
+cdef void* __nvvmAddModuleToProgram = NULL
+cdef void* __nvvmLazyAddModuleToProgram = NULL
+cdef void* __nvvmCompileProgram = NULL
+cdef void* __nvvmVerifyProgram = NULL
+cdef void* __nvvmGetCompiledResultSize = NULL
+cdef void* __nvvmGetCompiledResult = NULL
+cdef void* __nvvmGetProgramLogSize = NULL
+cdef void* __nvvmGetProgramLog = NULL
+
+
+cdef void* load_library(const int driver_ver) except* with gil:  # dlopen() libnvvm, trying each version suffix in turn
+    cdef void* handle
+    for suffix in get_nvvm_dso_version_suffix(driver_ver):
+        so_name = "libnvvm.so" + (f".{suffix}" if suffix else suffix)  # empty suffix -> unversioned "libnvvm.so"
+        handle = dlopen(so_name.encode(), RTLD_NOW | RTLD_GLOBAL)
+        if handle != NULL:
+            break  # first candidate that loads wins
+    else:  # loop ended without a successful dlopen()
+        err_msg = dlerror()  # NOTE(review): assumes at least one dlopen() ran so dlerror() is non-NULL -- confirm the suffix list is never empty
+        raise RuntimeError(f'Failed to dlopen libnvvm ({err_msg.decode()})')
+    return handle
+
+
+cdef int _check_or_init_nvvm() except -1 nogil:  # one-time lazy lookup of all nvvm entry points; returns 0 or raises
+    global __py_nvvm_init
+    if __py_nvvm_init:  # already initialized by an earlier call
+        return 0
+
+    # Load the CUDA driver and query its version (passed to load_library() below)
+    cdef void* handle = NULL
+    handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL)
+    if handle == NULL:
+        with gil:
+            err_msg = dlerror()
+            raise NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})')
+    global __cuDriverGetVersion
+    if __cuDriverGetVersion == NULL:
+        __cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion")
+    if __cuDriverGetVersion == NULL:  # dlsym() did not find cuDriverGetVersion in libcuda
+        with gil:
+            raise RuntimeError('something went wrong')
+    cdef int err, driver_ver
+    err = (<int (*)(int*) nogil>__cuDriverGetVersion)(&driver_ver)
+    if err != 0:  # cuDriverGetVersion returned a non-zero status
+        with gil:
+            raise RuntimeError('something went wrong')
+    # dlclose(handle) is left disabled, so the libcuda handle stays open
+    handle = NULL  # reused below as the lazily opened libnvvm handle
+
+    # Load functions: try RTLD_DEFAULT first; on a miss, open libnvvm once via load_library() and look there
+    global __nvvmVersion
+    __nvvmVersion = dlsym(RTLD_DEFAULT, 'nvvmVersion')
+    if __nvvmVersion == NULL:
+        if handle == NULL:  # libnvvm not opened yet; later misses reuse the same handle
+            handle = load_library(driver_ver)
+        __nvvmVersion = dlsym(handle, 'nvvmVersion')  # may still be NULL; not treated as an error here
+
+    global __nvvmIRVersion
+    __nvvmIRVersion = dlsym(RTLD_DEFAULT, 'nvvmIRVersion')
+    if __nvvmIRVersion == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvvmIRVersion = dlsym(handle, 'nvvmIRVersion')
+
+    global __nvvmCreateProgram
+    __nvvmCreateProgram = dlsym(RTLD_DEFAULT, 'nvvmCreateProgram')
+    if __nvvmCreateProgram == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvvmCreateProgram = dlsym(handle, 'nvvmCreateProgram')
+
+    global __nvvmDestroyProgram
+    __nvvmDestroyProgram = dlsym(RTLD_DEFAULT, 'nvvmDestroyProgram')
+    if __nvvmDestroyProgram == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvvmDestroyProgram = dlsym(handle, 'nvvmDestroyProgram')
+
+    global __nvvmAddModuleToProgram
+    __nvvmAddModuleToProgram = dlsym(RTLD_DEFAULT, 'nvvmAddModuleToProgram')
+    if __nvvmAddModuleToProgram == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvvmAddModuleToProgram = dlsym(handle, 'nvvmAddModuleToProgram')
+
+    global __nvvmLazyAddModuleToProgram
+    __nvvmLazyAddModuleToProgram = dlsym(RTLD_DEFAULT, 'nvvmLazyAddModuleToProgram')
+    if __nvvmLazyAddModuleToProgram == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvvmLazyAddModuleToProgram = dlsym(handle, 'nvvmLazyAddModuleToProgram')
+
+    global __nvvmCompileProgram
+    __nvvmCompileProgram = dlsym(RTLD_DEFAULT, 'nvvmCompileProgram')
+    if __nvvmCompileProgram == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvvmCompileProgram = dlsym(handle, 'nvvmCompileProgram')
+
+    global __nvvmVerifyProgram
+    __nvvmVerifyProgram = dlsym(RTLD_DEFAULT, 'nvvmVerifyProgram')
+    if __nvvmVerifyProgram == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvvmVerifyProgram = dlsym(handle, 'nvvmVerifyProgram')
+
+    global __nvvmGetCompiledResultSize
+    __nvvmGetCompiledResultSize = dlsym(RTLD_DEFAULT, 'nvvmGetCompiledResultSize')
+    if __nvvmGetCompiledResultSize == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvvmGetCompiledResultSize = dlsym(handle, 'nvvmGetCompiledResultSize')
+
+    global __nvvmGetCompiledResult
+    __nvvmGetCompiledResult = dlsym(RTLD_DEFAULT, 'nvvmGetCompiledResult')
+    if __nvvmGetCompiledResult == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvvmGetCompiledResult = dlsym(handle, 'nvvmGetCompiledResult')
+
+    global __nvvmGetProgramLogSize
+    __nvvmGetProgramLogSize = dlsym(RTLD_DEFAULT, 'nvvmGetProgramLogSize')
+    if __nvvmGetProgramLogSize == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvvmGetProgramLogSize = dlsym(handle, 'nvvmGetProgramLogSize')
+
+    global __nvvmGetProgramLog
+    __nvvmGetProgramLog = dlsym(RTLD_DEFAULT, 'nvvmGetProgramLog')
+    if __nvvmGetProgramLog == NULL:
+        if handle == NULL:
+            handle = load_library(driver_ver)
+        __nvvmGetProgramLog = dlsym(handle, 'nvvmGetProgramLog')
+
+    __py_nvvm_init = True  # set even if some pointers stayed NULL; each wrapper checks its own pointer
+    return 0
+
+
+cdef dict func_ptrs = None
+
+
+cpdef dict _inspect_function_pointers():  # return {"__nvvmXxx": address as int} for every wrapped symbol
+    global func_ptrs
+    if func_ptrs is not None:  # built by an earlier call; reuse it
+        return func_ptrs
+
+    _check_or_init_nvvm()  # make sure the pointers below have been looked up
+    cdef dict data = {}
+
+    global __nvvmVersion
+    data["__nvvmVersion"] = <intptr_t>__nvvmVersion  # pointer stored as an integer (a NULL pointer becomes 0)
+
+    global __nvvmIRVersion
+    data["__nvvmIRVersion"] = <intptr_t>__nvvmIRVersion
+
+    global __nvvmCreateProgram
+    data["__nvvmCreateProgram"] = <intptr_t>__nvvmCreateProgram
+
+    global __nvvmDestroyProgram
+    data["__nvvmDestroyProgram"] = <intptr_t>__nvvmDestroyProgram
+
+    global __nvvmAddModuleToProgram
+    data["__nvvmAddModuleToProgram"] = <intptr_t>__nvvmAddModuleToProgram
+
+    global __nvvmLazyAddModuleToProgram
+    data["__nvvmLazyAddModuleToProgram"] = <intptr_t>__nvvmLazyAddModuleToProgram
+
+    global __nvvmCompileProgram
+    data["__nvvmCompileProgram"] = <intptr_t>__nvvmCompileProgram
+
+    global __nvvmVerifyProgram
+    data["__nvvmVerifyProgram"] = <intptr_t>__nvvmVerifyProgram
+
+    global __nvvmGetCompiledResultSize
+    data["__nvvmGetCompiledResultSize"] = <intptr_t>__nvvmGetCompiledResultSize
+
+    global __nvvmGetCompiledResult
+    data["__nvvmGetCompiledResult"] = <intptr_t>__nvvmGetCompiledResult
+
+    global __nvvmGetProgramLogSize
+    data["__nvvmGetProgramLogSize"] = <intptr_t>__nvvmGetProgramLogSize
+
+    global __nvvmGetProgramLog
+    data["__nvvmGetProgramLog"] = <intptr_t>__nvvmGetProgramLog
+
+    func_ptrs = data  # cache for subsequent calls
+    return data
+
+
+cpdef _inspect_function_pointer(str name):  # address (as int) stored under one key such as "__nvvmVersion"
+    global func_ptrs
+    if func_ptrs is None:  # table not built yet
+        func_ptrs = _inspect_function_pointers()
+    return func_ptrs[name]  # plain dict lookup: an unknown name raises KeyError
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cdef nvvmResult _nvvmVersion(int* major, int* minor) except* nogil:
+    global __nvvmVersion
+    _check_or_init_nvvm()  # lazy one-time symbol lookup; returns immediately once initialized
+    if __nvvmVersion == NULL:  # the symbol was not found during initialization
+        with gil:
+            raise FunctionNotFoundError("function nvvmVersion is not found")
+    return (<nvvmResult (*)(int*, int*) nogil>__nvvmVersion)(  # call through the looked-up pointer
+        major, minor)
+
+
+cdef nvvmResult _nvvmIRVersion(int* majorIR, int* minorIR, int* majorDbg, int* minorDbg) except* nogil:
+    global __nvvmIRVersion
+    _check_or_init_nvvm()  # lazy one-time symbol lookup; returns immediately once initialized
+    if __nvvmIRVersion == NULL:  # the symbol was not found during initialization
+        with gil:
+            raise FunctionNotFoundError("function nvvmIRVersion is not found")
+    return (<nvvmResult (*)(int*, int*, int*, int*) nogil>__nvvmIRVersion)(  # call through the looked-up pointer
+        majorIR, minorIR, majorDbg, minorDbg)
+
+
+cdef nvvmResult _nvvmCreateProgram(nvvmProgram* prog) except* nogil:
+    global __nvvmCreateProgram
+    _check_or_init_nvvm()  # lazy one-time symbol lookup; returns immediately once initialized
+    if __nvvmCreateProgram == NULL:  # the symbol was not found during initialization
+        with gil:
+            raise FunctionNotFoundError("function nvvmCreateProgram is not found")
+    return (<nvvmResult (*)(nvvmProgram*) nogil>__nvvmCreateProgram)(  # call through the looked-up pointer
+        prog)
+
+
+cdef nvvmResult _nvvmDestroyProgram(nvvmProgram* prog) except* nogil:
+    global __nvvmDestroyProgram
+    _check_or_init_nvvm()  # lazy one-time symbol lookup; returns immediately once initialized
+    if __nvvmDestroyProgram == NULL:  # the symbol was not found during initialization
+        with gil:
+            raise FunctionNotFoundError("function nvvmDestroyProgram is not found")
+    return (<nvvmResult (*)(nvvmProgram*) nogil>__nvvmDestroyProgram)(  # call through the looked-up pointer
+        prog)
+
+
+cdef nvvmResult _nvvmAddModuleToProgram(nvvmProgram prog, const char* buffer, size_t size, const char* name) except* nogil:
+    global __nvvmAddModuleToProgram
+    _check_or_init_nvvm()  # lazy one-time symbol lookup; returns immediately once initialized
+    if __nvvmAddModuleToProgram == NULL:  # the symbol was not found during initialization
+        with gil:
+            raise FunctionNotFoundError("function nvvmAddModuleToProgram is not found")
+    return (<nvvmResult (*)(nvvmProgram, const char*, size_t, const char*) nogil>__nvvmAddModuleToProgram)(  # call through the looked-up pointer
+        prog, buffer, size, name)
+
+
+cdef nvvmResult _nvvmLazyAddModuleToProgram(nvvmProgram prog, const char* buffer, size_t size, const char* name) except* nogil:
+    global __nvvmLazyAddModuleToProgram
+    _check_or_init_nvvm()  # lazy one-time symbol lookup; returns immediately once initialized
+    if __nvvmLazyAddModuleToProgram == NULL:  # the symbol was not found during initialization
+        with gil:
+            raise FunctionNotFoundError("function nvvmLazyAddModuleToProgram is not found")
+    return (<nvvmResult (*)(nvvmProgram, const char*, size_t, const char*) nogil>__nvvmLazyAddModuleToProgram)(  # call through the looked-up pointer
+        prog, buffer, size, name)
+
+
+cdef nvvmResult _nvvmCompileProgram(nvvmProgram prog, int numOptions, const char** options) except* nogil:
+    global __nvvmCompileProgram
+    _check_or_init_nvvm()  # lazy one-time symbol lookup; returns immediately once initialized
+    if __nvvmCompileProgram == NULL:  # the symbol was not found during initialization
+        with gil:
+            raise FunctionNotFoundError("function nvvmCompileProgram is not found")
+    return (<nvvmResult (*)(nvvmProgram, int, const char**) nogil>__nvvmCompileProgram)(  # call through the looked-up pointer
+        prog, numOptions, options)
+
+
+cdef nvvmResult _nvvmVerifyProgram(nvvmProgram prog, int numOptions, const char** options) except* nogil:
+    global __nvvmVerifyProgram
+    _check_or_init_nvvm()  # lazy one-time symbol lookup; returns immediately once initialized
+    if __nvvmVerifyProgram == NULL:  # the symbol was not found during initialization
+        with gil:
+            raise FunctionNotFoundError("function nvvmVerifyProgram is not found")
+    return (<nvvmResult (*)(nvvmProgram, int, const char**) nogil>__nvvmVerifyProgram)(  # call through the looked-up pointer
+        prog, numOptions, options)
+
+
+cdef nvvmResult _nvvmGetCompiledResultSize(nvvmProgram prog, size_t* bufferSizeRet) except* nogil:
+    global __nvvmGetCompiledResultSize
+    _check_or_init_nvvm()  # lazy one-time symbol lookup; returns immediately once initialized
+    if __nvvmGetCompiledResultSize == NULL:  # the symbol was not found during initialization
+        with gil:
+            raise FunctionNotFoundError("function nvvmGetCompiledResultSize is not found")
+    return (<nvvmResult (*)(nvvmProgram, size_t*) nogil>__nvvmGetCompiledResultSize)(  # call through the looked-up pointer
+        prog, bufferSizeRet)
+
+
+cdef nvvmResult _nvvmGetCompiledResult(nvvmProgram prog, char* buffer) except* nogil:
+    global __nvvmGetCompiledResult
+    _check_or_init_nvvm()  # lazy one-time symbol lookup; returns immediately once initialized
+    if __nvvmGetCompiledResult == NULL:  # the symbol was not found during initialization
+        with gil:
+            raise FunctionNotFoundError("function nvvmGetCompiledResult is not found")
+    return (<nvvmResult (*)(nvvmProgram, char*) nogil>__nvvmGetCompiledResult)(  # call through the looked-up pointer
+        prog, buffer)
+
+
+cdef nvvmResult _nvvmGetProgramLogSize(nvvmProgram prog, size_t* bufferSizeRet) except* nogil:
+    global __nvvmGetProgramLogSize
+    _check_or_init_nvvm()  # lazy one-time symbol lookup; returns immediately once initialized
+    if __nvvmGetProgramLogSize == NULL:  # the symbol was not found during initialization
+        with gil:
+            raise FunctionNotFoundError("function nvvmGetProgramLogSize is not found")
+    return (<nvvmResult (*)(nvvmProgram, size_t*) nogil>__nvvmGetProgramLogSize)(  # call through the looked-up pointer
+        prog, bufferSizeRet)
+
+
+cdef nvvmResult _nvvmGetProgramLog(nvvmProgram prog, char* buffer) except* nogil:
+    global __nvvmGetProgramLog
+    _check_or_init_nvvm()  # lazy one-time symbol lookup; returns immediately once initialized
+    if __nvvmGetProgramLog == NULL:  # the symbol was not found during initialization
+        with gil:
+            raise FunctionNotFoundError("function nvvmGetProgramLog is not found")
+    return (<nvvmResult (*)(nvvmProgram, char*) nogil>__nvvmGetProgramLog)(  # call through the looked-up pointer
+        prog, buffer)
diff --git a/cuda_bindings/cuda/bindings/_internal/nvvm_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvvm_windows.pyx
new file mode 100644
index 0000000000..b8e6795478
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_internal/nvvm_windows.pyx
@@ -0,0 +1,373 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+#
+# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly.
+
+from libc.stdint cimport intptr_t
+
+from .utils cimport get_nvvm_dso_version_suffix
+
+from .utils import FunctionNotFoundError, NotSupportedError
+
+import os
+import site
+
+import win32api
+
+
+###############################################################################
+# Wrapper init
+###############################################################################
+
+LOAD_LIBRARY_SEARCH_SYSTEM32     = 0x00000800
+LOAD_LIBRARY_SEARCH_DEFAULT_DIRS = 0x00001000
+LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR = 0x00000100
+cdef bint __py_nvvm_init = False
+cdef void* __cuDriverGetVersion = NULL
+
+cdef void* __nvvmVersion = NULL
+cdef void* __nvvmIRVersion = NULL
+cdef void* __nvvmCreateProgram = NULL
+cdef void* __nvvmDestroyProgram = NULL
+cdef void* __nvvmAddModuleToProgram = NULL
+cdef void* __nvvmLazyAddModuleToProgram = NULL
+cdef void* __nvvmCompileProgram = NULL
+cdef void* __nvvmVerifyProgram = NULL
+cdef void* __nvvmGetCompiledResultSize = NULL
+cdef void* __nvvmGetCompiledResult = NULL
+cdef void* __nvvmGetProgramLogSize = NULL
+cdef void* __nvvmGetProgramLog = NULL
+
+
+cdef inline list get_site_packages():  # candidate site-packages directories, user site first
+    return [site.getusersitepackages()] + site.getsitepackages()
+
+
+cdef load_library(const int driver_ver):
+    # Return a handle to the nvvm DLL: reuse an already-loaded module, else probe each
+    # pip site-packages nvvm/bin directory, else fall back to the default DLL search.
+    # Raises RuntimeError when no attempt succeeds.
+    handle = 0
+    for suffix in get_nvvm_dso_version_suffix(driver_ver):
+        if len(suffix) == 0:
+            continue
+        dll_name = "nvvm64_40_0"
+        # First check if the DLL has been loaded by 3rd parties
+        try:
+            handle = win32api.GetModuleHandle(dll_name)
+        except Exception:
+            pass
+        else:
+            break
+        # Next, check if DLLs are installed via pip; try every existing directory, not only the last one probed
+        for sp in get_site_packages():
+            mod_path = os.path.join(sp, "nvidia", "cuda_nvcc", "nvvm", "bin")
+            if not os.path.isdir(mod_path):
+                continue
+            os.add_dll_directory(mod_path)
+            try:
+                handle = win32api.LoadLibraryEx(
+                    # Note: LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR needs an abs path...
+                    os.path.join(mod_path, dll_name),
+                    0, LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR)
+            except Exception:
+                pass
+            else:
+                break
+        if handle != 0:  # a site-packages copy was loaded; stop searching
+            break
+        # Finally, try default search
+        try:
+            handle = win32api.LoadLibrary(dll_name)
+        except Exception:
+            pass
+        else:
+            break
+    else:
+        raise RuntimeError('Failed to load nvvm')
+    assert handle != 0
+    return handle
+
+
+cdef int _check_or_init_nvvm() except -1 nogil:  # one-time lazy lookup of all nvvm entry points; returns 0 or raises
+    global __py_nvvm_init
+    if __py_nvvm_init:  # already initialized by an earlier call
+        return 0
+
+    cdef int err, driver_ver
+    with gil:  # everything below goes through win32api (Python calls), so the GIL is required
+        # Load the CUDA driver and query its version (passed to load_library() below)
+        try:
+            handle = win32api.LoadLibraryEx("nvcuda.dll", 0, LOAD_LIBRARY_SEARCH_SYSTEM32)
+        except Exception as e:
+            raise NotSupportedError(f'CUDA driver is not found ({e})')
+        global __cuDriverGetVersion
+        if __cuDriverGetVersion == NULL:
+            __cuDriverGetVersion = <void*><intptr_t>win32api.GetProcAddress(handle, 'cuDriverGetVersion')
+            if __cuDriverGetVersion == NULL:  # lookup returned a null address
+                raise RuntimeError('something went wrong')
+        err = (<int (*)(int*) nogil>__cuDriverGetVersion)(&driver_ver)
+        if err != 0:  # cuDriverGetVersion returned a non-zero status
+            raise RuntimeError('something went wrong')
+
+        # Load the nvvm library; `handle` now refers to it instead of the driver DLL
+        handle = load_library(driver_ver)
+
+        # Load functions: best-effort lookup of each export; a failed lookup leaves the pointer NULL
+        global __nvvmVersion
+        try:
+            __nvvmVersion = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvvmVersion')
+        except:  # swallowed on purpose here; the wrapper raises FunctionNotFoundError when the pointer is NULL
+            pass
+
+        global __nvvmIRVersion
+        try:
+            __nvvmIRVersion = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvvmIRVersion')
+        except:
+            pass
+
+        global __nvvmCreateProgram
+        try:
+            __nvvmCreateProgram = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvvmCreateProgram')
+        except:
+            pass
+
+        global __nvvmDestroyProgram
+        try:
+            __nvvmDestroyProgram = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvvmDestroyProgram')
+        except:
+            pass
+
+        global __nvvmAddModuleToProgram
+        try:
+            __nvvmAddModuleToProgram = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvvmAddModuleToProgram')
+        except:
+            pass
+
+        global __nvvmLazyAddModuleToProgram
+        try:
+            __nvvmLazyAddModuleToProgram = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvvmLazyAddModuleToProgram')
+        except:
+            pass
+
+        global __nvvmCompileProgram
+        try:
+            __nvvmCompileProgram = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvvmCompileProgram')
+        except:
+            pass
+
+        global __nvvmVerifyProgram
+        try:
+            __nvvmVerifyProgram = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvvmVerifyProgram')
+        except:
+            pass
+
+        global __nvvmGetCompiledResultSize
+        try:
+            __nvvmGetCompiledResultSize = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvvmGetCompiledResultSize')
+        except:
+            pass
+
+        global __nvvmGetCompiledResult
+        try:
+            __nvvmGetCompiledResult = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvvmGetCompiledResult')
+        except:
+            pass
+
+        global __nvvmGetProgramLogSize
+        try:
+            __nvvmGetProgramLogSize = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvvmGetProgramLogSize')
+        except:
+            pass
+
+        global __nvvmGetProgramLog
+        try:
+            __nvvmGetProgramLog = <void*><intptr_t>win32api.GetProcAddress(handle, 'nvvmGetProgramLog')
+        except:
+            pass
+
+    __py_nvvm_init = True  # set even if some pointers stayed NULL; each wrapper checks its own pointer
+    return 0
+
+
+cdef dict func_ptrs = None
+
+
+cpdef dict _inspect_function_pointers():  # return {"__nvvmXxx": address as int} for every wrapped symbol
+    global func_ptrs
+    if func_ptrs is not None:  # built by an earlier call; reuse it
+        return func_ptrs
+
+    _check_or_init_nvvm()  # make sure the pointers below have been looked up
+    cdef dict data = {}
+
+    global __nvvmVersion
+    data["__nvvmVersion"] = <intptr_t>__nvvmVersion  # pointer stored as an integer (a NULL pointer becomes 0)
+
+    global __nvvmIRVersion
+    data["__nvvmIRVersion"] = <intptr_t>__nvvmIRVersion
+
+    global __nvvmCreateProgram
+    data["__nvvmCreateProgram"] = <intptr_t>__nvvmCreateProgram
+
+    global __nvvmDestroyProgram
+    data["__nvvmDestroyProgram"] = <intptr_t>__nvvmDestroyProgram
+
+    global __nvvmAddModuleToProgram
+    data["__nvvmAddModuleToProgram"] = <intptr_t>__nvvmAddModuleToProgram
+
+    global __nvvmLazyAddModuleToProgram
+    data["__nvvmLazyAddModuleToProgram"] = <intptr_t>__nvvmLazyAddModuleToProgram
+
+    global __nvvmCompileProgram
+    data["__nvvmCompileProgram"] = <intptr_t>__nvvmCompileProgram
+
+    global __nvvmVerifyProgram
+    data["__nvvmVerifyProgram"] = <intptr_t>__nvvmVerifyProgram
+
+    global __nvvmGetCompiledResultSize
+    data["__nvvmGetCompiledResultSize"] = <intptr_t>__nvvmGetCompiledResultSize
+
+    global __nvvmGetCompiledResult
+    data["__nvvmGetCompiledResult"] = <intptr_t>__nvvmGetCompiledResult
+
+    global __nvvmGetProgramLogSize
+    data["__nvvmGetProgramLogSize"] = <intptr_t>__nvvmGetProgramLogSize
+
+    global __nvvmGetProgramLog
+    data["__nvvmGetProgramLog"] = <intptr_t>__nvvmGetProgramLog
+
+    func_ptrs = data  # cache for subsequent calls
+    return data
+
+
+cpdef _inspect_function_pointer(str name):  # address (as int) stored under one key such as "__nvvmVersion"
+    global func_ptrs
+    if func_ptrs is None:  # table not built yet
+        func_ptrs = _inspect_function_pointers()
+    return func_ptrs[name]  # plain dict lookup: an unknown name raises KeyError
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cdef nvvmResult _nvvmVersion(int* major, int* minor) except* nogil:
+    global __nvvmVersion
+    _check_or_init_nvvm()  # lazy one-time symbol lookup; returns immediately once initialized
+    if __nvvmVersion == NULL:  # the export was not found during initialization
+        with gil:
+            raise FunctionNotFoundError("function nvvmVersion is not found")
+    return (<nvvmResult (*)(int*, int*) nogil>__nvvmVersion)(  # call through the looked-up pointer
+        major, minor)
+
+
+cdef nvvmResult _nvvmIRVersion(int* majorIR, int* minorIR, int* majorDbg, int* minorDbg) except* nogil:
+    global __nvvmIRVersion
+    _check_or_init_nvvm()  # lazy one-time symbol lookup; returns immediately once initialized
+    if __nvvmIRVersion == NULL:  # the export was not found during initialization
+        with gil:
+            raise FunctionNotFoundError("function nvvmIRVersion is not found")
+    return (<nvvmResult (*)(int*, int*, int*, int*) nogil>__nvvmIRVersion)(  # call through the looked-up pointer
+        majorIR, minorIR, majorDbg, minorDbg)
+
+
+cdef nvvmResult _nvvmCreateProgram(nvvmProgram* prog) except* nogil:
+    global __nvvmCreateProgram
+    _check_or_init_nvvm()  # lazy one-time symbol lookup; returns immediately once initialized
+    if __nvvmCreateProgram == NULL:  # the export was not found during initialization
+        with gil:
+            raise FunctionNotFoundError("function nvvmCreateProgram is not found")
+    return (<nvvmResult (*)(nvvmProgram*) nogil>__nvvmCreateProgram)(  # call through the looked-up pointer
+        prog)
+
+
+cdef nvvmResult _nvvmDestroyProgram(nvvmProgram* prog) except* nogil:
+    global __nvvmDestroyProgram
+    _check_or_init_nvvm()  # lazy one-time symbol lookup; returns immediately once initialized
+    if __nvvmDestroyProgram == NULL:  # the export was not found during initialization
+        with gil:
+            raise FunctionNotFoundError("function nvvmDestroyProgram is not found")
+    return (<nvvmResult (*)(nvvmProgram*) nogil>__nvvmDestroyProgram)(  # call through the looked-up pointer
+        prog)
+
+
+cdef nvvmResult _nvvmAddModuleToProgram(nvvmProgram prog, const char* buffer, size_t size, const char* name) except* nogil:
+    global __nvvmAddModuleToProgram
+    _check_or_init_nvvm()  # lazy one-time symbol lookup; returns immediately once initialized
+    if __nvvmAddModuleToProgram == NULL:  # the export was not found during initialization
+        with gil:
+            raise FunctionNotFoundError("function nvvmAddModuleToProgram is not found")
+    return (<nvvmResult (*)(nvvmProgram, const char*, size_t, const char*) nogil>__nvvmAddModuleToProgram)(  # call through the looked-up pointer
+        prog, buffer, size, name)
+
+
+cdef nvvmResult _nvvmLazyAddModuleToProgram(nvvmProgram prog, const char* buffer, size_t size, const char* name) except* nogil:
+    global __nvvmLazyAddModuleToProgram
+    _check_or_init_nvvm()  # lazy one-time symbol lookup; returns immediately once initialized
+    if __nvvmLazyAddModuleToProgram == NULL:  # the export was not found during initialization
+        with gil:
+            raise FunctionNotFoundError("function nvvmLazyAddModuleToProgram is not found")
+    return (<nvvmResult (*)(nvvmProgram, const char*, size_t, const char*) nogil>__nvvmLazyAddModuleToProgram)(  # call through the looked-up pointer
+        prog, buffer, size, name)
+
+
+cdef nvvmResult _nvvmCompileProgram(nvvmProgram prog, int numOptions, const char** options) except* nogil:
+    global __nvvmCompileProgram
+    _check_or_init_nvvm()  # lazy one-time symbol lookup; returns immediately once initialized
+    if __nvvmCompileProgram == NULL:  # the export was not found during initialization
+        with gil:
+            raise FunctionNotFoundError("function nvvmCompileProgram is not found")
+    return (<nvvmResult (*)(nvvmProgram, int, const char**) nogil>__nvvmCompileProgram)(  # call through the looked-up pointer
+        prog, numOptions, options)
+
+
+cdef nvvmResult _nvvmVerifyProgram(nvvmProgram prog, int numOptions, const char** options) except* nogil:
+    global __nvvmVerifyProgram
+    _check_or_init_nvvm()  # lazy one-time symbol lookup; returns immediately once initialized
+    if __nvvmVerifyProgram == NULL:  # the export was not found during initialization
+        with gil:
+            raise FunctionNotFoundError("function nvvmVerifyProgram is not found")
+    return (<nvvmResult (*)(nvvmProgram, int, const char**) nogil>__nvvmVerifyProgram)(  # call through the looked-up pointer
+        prog, numOptions, options)
+
+
+cdef nvvmResult _nvvmGetCompiledResultSize(nvvmProgram prog, size_t* bufferSizeRet) except* nogil:
+    global __nvvmGetCompiledResultSize
+    _check_or_init_nvvm()  # lazy one-time symbol lookup; returns immediately once initialized
+    if __nvvmGetCompiledResultSize == NULL:  # the export was not found during initialization
+        with gil:
+            raise FunctionNotFoundError("function nvvmGetCompiledResultSize is not found")
+    return (<nvvmResult (*)(nvvmProgram, size_t*) nogil>__nvvmGetCompiledResultSize)(  # call through the looked-up pointer
+        prog, bufferSizeRet)
+
+
+cdef nvvmResult _nvvmGetCompiledResult(nvvmProgram prog, char* buffer) except* nogil:
+    global __nvvmGetCompiledResult
+    _check_or_init_nvvm()  # lazy one-time symbol lookup; returns immediately once initialized
+    if __nvvmGetCompiledResult == NULL:  # the export was not found during initialization
+        with gil:
+            raise FunctionNotFoundError("function nvvmGetCompiledResult is not found")
+    return (<nvvmResult (*)(nvvmProgram, char*) nogil>__nvvmGetCompiledResult)(  # call through the looked-up pointer
+        prog, buffer)
+
+
+cdef nvvmResult _nvvmGetProgramLogSize(nvvmProgram prog, size_t* bufferSizeRet) except* nogil:
+    global __nvvmGetProgramLogSize
+    _check_or_init_nvvm()  # lazy one-time symbol lookup; returns immediately once initialized
+    if __nvvmGetProgramLogSize == NULL:  # the export was not found during initialization
+        with gil:
+            raise FunctionNotFoundError("function nvvmGetProgramLogSize is not found")
+    return (<nvvmResult (*)(nvvmProgram, size_t*) nogil>__nvvmGetProgramLogSize)(  # call through the looked-up pointer
+        prog, bufferSizeRet)
+
+
+cdef nvvmResult _nvvmGetProgramLog(nvvmProgram prog, char* buffer) except* nogil:
+    global __nvvmGetProgramLog
+    _check_or_init_nvvm()  # lazy one-time symbol lookup; returns immediately once initialized
+    if __nvvmGetProgramLog == NULL:  # the export was not found during initialization
+        with gil:
+            raise FunctionNotFoundError("function nvvmGetProgramLog is not found")
+    return (<nvvmResult (*)(nvvmProgram, char*) nogil>__nvvmGetProgramLog)(  # call through the looked-up pointer
+        prog, buffer)
diff --git a/cuda_bindings/cuda/bindings/_internal/utils.pxd b/cuda_bindings/cuda/bindings/_internal/utils.pxd
new file mode 100644
index 0000000000..67f88e9f20
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_internal/utils.pxd
@@ -0,0 +1,169 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+from libc.stdint cimport int32_t, int64_t, intptr_t
+from libcpp.vector cimport vector
+from libcpp cimport bool as cppbool
+from libcpp cimport nullptr_t, nullptr
+from libcpp.memory cimport unique_ptr
+
+
+cdef extern from * nogil:
+    """
+    template<typename T>
+    class nullable_unique_ptr {  // holds either an owned T (through std::unique_ptr) or a borrowed raw T*
+      public:
+        nullable_unique_ptr() noexcept = default;  // empty, non-owning
+
+        nullable_unique_ptr(std::nullptr_t) noexcept = delete;
+
+        explicit nullable_unique_ptr(T* data, bool own_data):
+            own_data_(own_data)
+        {
+            if (own_data)
+                manager_.reset(data);  // owned: released by manager_ when this object goes away
+            else
+                raw_data_ = data;  // borrowed: never deleted by this class
+        }
+
+        nullable_unique_ptr(const nullable_unique_ptr&) = delete;  // move-only, like std::unique_ptr
+
+        nullable_unique_ptr& operator=(const nullable_unique_ptr&) = delete;
+
+        nullable_unique_ptr(nullable_unique_ptr&& other) noexcept
+        {
+            own_data_ = other.own_data_;
+            other.own_data_ = false;  // ownership is transferred
+            if (own_data_)
+            {
+                manager_ = std::move(other.manager_);
+                raw_data_ = nullptr;  // just in case
+            }
+            else
+            {
+                manager_.reset(nullptr);  // just in case
+                raw_data_ = other.raw_data_;  // other keeps its copy of the borrowed pointer
+            }
+        }
+
+        nullable_unique_ptr& operator=(nullable_unique_ptr&& other) noexcept
+        {
+            own_data_ = other.own_data_;  // NOTE(review): no self-assignment guard; x = std::move(x) would drop an owned payload
+            other.own_data_ = false;  // ownership is transferred
+            if (own_data_)
+            {
+                manager_ = std::move(other.manager_);  // also releases whatever this object owned before
+                raw_data_ = nullptr;  // just in case
+            }
+            else
+            {
+                manager_.reset(nullptr);  // just in case
+                raw_data_ = other.raw_data_;
+            }
+            return *this;
+        }
+
+        ~nullable_unique_ptr() = default;  // manager_ frees owned data; raw_data_ is left alone
+
+        void reset(T* data, bool own_data)  // replace the payload; a previously owned payload is freed in both branches
+        {
+            own_data_ = own_data;
+            if (own_data_)
+            {
+                manager_.reset(data);
+                raw_data_ = nullptr;
+            }
+            else
+            {
+                manager_.reset(nullptr);
+                raw_data_ = data;
+            }
+        }
+
+        void swap(nullable_unique_ptr& other) noexcept
+        {
+            std::swap(manager_, other.manager_);
+            std::swap(raw_data_, other.raw_data_);
+            std::swap(own_data_, other.own_data_);
+        }
+
+        /*
+         * Get the pointer to the underlying object (this is different from data()!).
+         */
+        T* get() const noexcept
+        {
+            if (own_data_)
+                return manager_.get();
+            else
+                return raw_data_;
+        }
+
+        /*
+         * Get the pointer to the underlying buffer (this is different from get()!).
+         */
+        void* data() noexcept
+        {
+            if (own_data_)
+                return manager_.get()->data();  // requires T to provide data(); manager_ must be non-null here
+            else
+                return raw_data_;  // borrowed: the stored pointer itself is handed back, T::data() is not called
+        }
+
+        T& operator*()
+        {
+            if (own_data_)
+                return *manager_;
+            else
+                return *raw_data_;
+        }
+
+      private:
+        std::unique_ptr<T> manager_{};  // set only while own_data_ is true
+        T* raw_data_{nullptr};  // set only while own_data_ is false
+        bool own_data_{false};  // selects which of the two members above is live
+    };
+    """
+    # xref: cython/Cython/Includes/libcpp/memory.pxd
+    cdef cppclass nullable_unique_ptr[T]:  # Cython-side declaration of the C++ template defined in the verbatim block
+        nullable_unique_ptr()
+        nullable_unique_ptr(T*, cppbool)  # (data, own_data)
+        nullable_unique_ptr(nullable_unique_ptr[T]&)  # NOTE(review): the C++ copy constructor is deleted; presumably declared only so move() type-checks - confirm
+
+        # Modifiers
+        void reset(T*, cppbool)  # (data, own_data)
+        void swap(nullable_unique_ptr&)
+
+        # Observers
+        T* get()  # the managed object itself
+        T& operator*()
+        void* data()  # the managed object's buffer when owned; the stored raw pointer otherwise
+
+
+ctypedef fused ResT:  # element types the get_resource_ptr / get_nested_resource_ptr helpers are specialized for
+    int
+    int32_t
+    int64_t
+    char
+    float
+    double
+
+
+ctypedef fused PtrT:  # pointee type used by get_resource_ptrs
+    void  # the only specialization listed
+
+
+cdef cppclass nested_resource[T]:  # output bundle filled by get_nested_resource_ptr
+    nullable_unique_ptr[ vector[intptr_t] ] ptrs  # one address per inner item
+    nullable_unique_ptr[ vector[vector[T]] ] nested_resource_ptr  # copied item storage the addresses point into; reset to NULL when the caller supplied addresses
+
+
+# Each helper fills the caller-supplied in_out_ptr so that the int return value stays free for exception propagation (except 1).
+cdef int get_resource_ptr(nullable_unique_ptr[vector[ResT]] &in_out_ptr, object obj, ResT* __unused) except 1  # __unused only selects the fused specialization
+cdef int get_resource_ptrs(nullable_unique_ptr[ vector[PtrT*] ] &in_out_ptr, object obj, PtrT* __unused) except 1
+cdef int get_nested_resource_ptr(nested_resource[ResT] &in_out_ptr, object obj, ResT* __unused) except 1
+
+cdef bint is_nested_sequence(data)
+cdef void* get_buffer_pointer(buf, Py_ssize_t size, readonly=*) except*  # readonly defaults to True in utils.pyx
+
+cdef tuple get_nvvm_dso_version_suffix(int driver_ver)
diff --git a/cuda_bindings/cuda/bindings/_internal/utils.pyx b/cuda_bindings/cuda/bindings/_internal/utils.pyx
new file mode 100644
index 0000000000..d4fd1d813c
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_internal/utils.pyx
@@ -0,0 +1,137 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+cimport cpython
+from libc.stdint cimport intptr_t
+from libcpp.utility cimport move
+from cython.operator cimport dereference as deref
+
+
+cdef bint is_nested_sequence(data):
+    """Return True when ``data`` and every item of it pass ``PySequence_Check`` (an empty sequence qualifies)."""
+    if not cpython.PySequence_Check(data):
+        return False
+    # a single non-sequence item is enough to disqualify the whole container
+    for item in data:
+        if not cpython.PySequence_Check(item):
+            return False
+    return True
+
+
+cdef void* get_buffer_pointer(buf, Py_ssize_t size, readonly=True) except*:
+    """Return the raw address for ``buf`` (an int address, or an object exposing the buffer protocol).
+
+    Buffers must be 1D contiguous, writable unless ``readonly``, and ``size`` bytes long (-1: unchecked);
+    otherwise ValueError is raised. The caller must ensure ``buf`` is alive while the pointer is in use."""
+    cdef void* bufPtr
+    cdef int flags = cpython.PyBUF_ANY_CONTIGUOUS | (0 if readonly else cpython.PyBUF_WRITABLE)
+    cdef int status = -1
+    cdef cpython.Py_buffer view
+
+    if isinstance(buf, int):
+        bufPtr = <void*><intptr_t>buf
+    else:  # try buffer protocol
+        try:
+            status = cpython.PyObject_GetBuffer(buf, &view, flags)
+            # size == -1: no size given (set at generate-time by cybind); raise, not assert, so the check cannot be compiled/optimized out
+            if (size != -1 and view.len != size) or view.ndim != 1:
+                raise ValueError(f"got a buffer with len={view.len} and ndim={view.ndim}")
+        except Exception as e:
+            adj = "writable " if not readonly else ""
+            raise ValueError(
+                 "buf must be either a Python int representing the pointer "
+                f"address to a valid buffer, or a 1D contiguous {adj}"
+                 "buffer, of size bytes") from e
+        else:
+            bufPtr = view.buf
+        finally:
+            if status == 0:  # only a successfully acquired view may be released
+                cpython.PyBuffer_Release(&view)
+
+    return bufPtr
+
+
+# Cython can't infer the ResT overload when it is wrapped in nullable_unique_ptr,
+# so we need a dummy (__unused) input argument to help it
+cdef int get_resource_ptr(nullable_unique_ptr[vector[ResT]] &in_out_ptr, object obj, ResT* __unused) except 1:
+    if cpython.PySequence_Check(obj):  # a sequence of values: copy them into a new vector owned by in_out_ptr
+        vec = new vector[ResT](len(obj))
+        # set the ownership immediately to avoid leaking the `vec` memory in
+        # case of exception in the following loop
+        in_out_ptr.reset(vec, True)
+        for i in range(len(obj)):
+            deref(vec)[i] = obj[i]  # Python object -> ResT conversion
+    else:  # anything else is converted to an integer address and only borrowed (own_data=False)
+        in_out_ptr.reset(<vector[ResT]*><intptr_t>obj, False)
+    return 0  # success; 1 is the error value declared by `except 1`
+
+
+cdef int get_resource_ptrs(nullable_unique_ptr[ vector[PtrT*] ] &in_out_ptr, object obj, PtrT* __unused) except 1:
+    if cpython.PySequence_Check(obj):  # a sequence of integer addresses: store them in a new vector owned by in_out_ptr
+        vec = new vector[PtrT*](len(obj))
+        # set the ownership immediately to avoid leaking the `vec` memory in
+        # case of exception in the following loop
+        in_out_ptr.reset(vec, True)
+        for i in range(len(obj)):
+            deref(vec)[i] = <PtrT*><intptr_t>(obj[i])  # each item is reinterpreted as a pointer
+    else:  # anything else is converted to an integer address and only borrowed (own_data=False)
+        in_out_ptr.reset(<vector[PtrT*]*><intptr_t>obj, False)
+    return 0  # success; 1 is the error value declared by `except 1`
+
+
+cdef int get_nested_resource_ptr(nested_resource[ResT] &in_out_ptr, object obj, ResT* __unused) except 1:
+    cdef nullable_unique_ptr[ vector[intptr_t] ] nested_ptr  # moved into in_out_ptr.ptrs at the end
+    cdef nullable_unique_ptr[ vector[vector[ResT]] ] nested_res_ptr  # moved into in_out_ptr.nested_resource_ptr at the end
+    cdef vector[intptr_t]* nested_vec = NULL
+    cdef vector[vector[ResT]]* nested_res_vec = NULL
+    cdef size_t i = 0, length = 0
+    cdef intptr_t addr
+
+    if is_nested_sequence(obj):  # case 1: sequence of sequences - copy every item and record its address
+        length = len(obj)
+        nested_res_vec = new vector[vector[ResT]](length)
+        nested_vec = new vector[intptr_t](length)
+        # set the ownership immediately to avoid leaking memory in case of
+        # exception in the following loop
+        nested_res_ptr.reset(nested_res_vec, True)
+        nested_ptr.reset(nested_vec, True)
+        for i, obj_i in enumerate(obj):
+            if ResT is char:  # branch on the fused type: char items are expected to be str
+                obj_i_bytes = (<str?>(obj_i)).encode()  # <str?> is a checked cast
+                str_len = <size_t>(len(obj_i_bytes)) + 1  # including null termination
+                deref(nested_res_vec)[i].resize(str_len)
+                obj_i_ptr = <char*>(obj_i_bytes)
+                # cast to size_t explicitly to work around a potential Cython bug
+                deref(nested_res_vec)[i].assign(obj_i_ptr, obj_i_ptr + <size_t>str_len)
+            else:
+                deref(nested_res_vec)[i] = obj_i
+            deref(nested_vec)[i] = <intptr_t>(deref(nested_res_vec)[i].data())  # address of the i-th copied item
+    elif cpython.PySequence_Check(obj):  # case 2: flat sequence - the items are used as addresses
+        length = len(obj)
+        nested_vec = new vector[intptr_t](length)
+        nested_ptr.reset(nested_vec, True)
+        for i, addr in enumerate(obj):
+            deref(nested_vec)[i] = addr
+        nested_res_ptr.reset(NULL, False)  # nothing was copied, so there is no item storage
+    else:
+        # obj is an int (ResT**)
+        nested_res_ptr.reset(NULL, False)
+        nested_ptr.reset(<vector[intptr_t]*><intptr_t>obj, False)  # borrowed: the address is stored as-is
+
+    in_out_ptr.ptrs = move(nested_ptr)
+    in_out_ptr.nested_resource_ptr = move(nested_res_ptr)
+    return 0  # success; 1 is the error value declared by `except 1`
+
+
+class FunctionNotFoundError(RuntimeError): pass  # raised by the _internal wrappers when a library symbol pointer is NULL
+
+class NotSupportedError(RuntimeError): pass  # raised by get_nvvm_dso_version_suffix for an unsupported driver version
+
+
+cdef tuple get_nvvm_dso_version_suffix(int driver_ver):
+    # Maps a driver version in [11000, 13000) to a tuple of suffix strings; the
+    # '3' -> '4' switch happens at 11020. Anything outside that range is rejected.
+    if driver_ver < 11000 or driver_ver >= 13000:
+        raise NotSupportedError(f'CUDA driver version {driver_ver} is not supported')
+    return ('3', '') if driver_ver < 11020 else ('4', '')
diff --git a/cuda_bindings/cuda/bindings/cynvvm.pxd b/cuda_bindings/cuda/bindings/cynvvm.pxd
new file mode 100644
index 0000000000..fa27d99bbf
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/cynvvm.pxd
@@ -0,0 +1,46 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+#
+# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly.
+
+
+###############################################################################
+# Types (structs, enums, ...)
+###############################################################################
+
+# enums (Cython view of the C enum `nvvmResult`; 0 is success, every other member is an error code)
+ctypedef enum nvvmResult "nvvmResult":
+    NVVM_SUCCESS "NVVM_SUCCESS" = 0
+    NVVM_ERROR_OUT_OF_MEMORY "NVVM_ERROR_OUT_OF_MEMORY" = 1
+    NVVM_ERROR_PROGRAM_CREATION_FAILURE "NVVM_ERROR_PROGRAM_CREATION_FAILURE" = 2
+    NVVM_ERROR_IR_VERSION_MISMATCH "NVVM_ERROR_IR_VERSION_MISMATCH" = 3
+    NVVM_ERROR_INVALID_INPUT "NVVM_ERROR_INVALID_INPUT" = 4
+    NVVM_ERROR_INVALID_PROGRAM "NVVM_ERROR_INVALID_PROGRAM" = 5
+    NVVM_ERROR_INVALID_IR "NVVM_ERROR_INVALID_IR" = 6
+    NVVM_ERROR_INVALID_OPTION "NVVM_ERROR_INVALID_OPTION" = 7
+    NVVM_ERROR_NO_MODULE_IN_PROGRAM "NVVM_ERROR_NO_MODULE_IN_PROGRAM" = 8
+    NVVM_ERROR_COMPILATION "NVVM_ERROR_COMPILATION" = 9
+    NVVM_ERROR_CANCELLED "NVVM_ERROR_CANCELLED" = 10
+
+
+# types (the program handle is opaque: it is declared as a plain void*)
+ctypedef void* nvvmProgram 'nvvmProgram'
+
+
+###############################################################################
+# Functions
+###############################################################################
+
+cdef nvvmResult nvvmVersion(int* major, int* minor) except* nogil  # all functions return the raw nvvmResult; `except*` lets Python exceptions raised underneath propagate
+cdef nvvmResult nvvmIRVersion(int* majorIR, int* minorIR, int* majorDbg, int* minorDbg) except* nogil
+cdef nvvmResult nvvmCreateProgram(nvvmProgram* prog) except* nogil  # handle is passed by pointer
+cdef nvvmResult nvvmDestroyProgram(nvvmProgram* prog) except* nogil  # handle is passed by pointer
+cdef nvvmResult nvvmAddModuleToProgram(nvvmProgram prog, const char* buffer, size_t size, const char* name) except* nogil
+cdef nvvmResult nvvmLazyAddModuleToProgram(nvvmProgram prog, const char* buffer, size_t size, const char* name) except* nogil
+cdef nvvmResult nvvmCompileProgram(nvvmProgram prog, int numOptions, const char** options) except* nogil
+cdef nvvmResult nvvmVerifyProgram(nvvmProgram prog, int numOptions, const char** options) except* nogil
+cdef nvvmResult nvvmGetCompiledResultSize(nvvmProgram prog, size_t* bufferSizeRet) except* nogil
+cdef nvvmResult nvvmGetCompiledResult(nvvmProgram prog, char* buffer) except* nogil
+cdef nvvmResult nvvmGetProgramLogSize(nvvmProgram prog, size_t* bufferSizeRet) except* nogil
+cdef nvvmResult nvvmGetProgramLog(nvvmProgram prog, char* buffer) except* nogil
diff --git a/cuda_bindings/cuda/bindings/cynvvm.pyx b/cuda_bindings/cuda/bindings/cynvvm.pyx
new file mode 100644
index 0000000000..1812998e18
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/cynvvm.pyx
@@ -0,0 +1,59 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+#
+# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly.
+
+from ._internal cimport nvvm as _nvvm
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cdef nvvmResult nvvmVersion(int* major, int* minor) except* nogil:
+    return _nvvm._nvvmVersion(major, minor)  # thin forwarder to ._internal.nvvm; arguments pass through unchanged
+
+
+cdef nvvmResult nvvmIRVersion(int* majorIR, int* minorIR, int* majorDbg, int* minorDbg) except* nogil:
+    return _nvvm._nvvmIRVersion(majorIR, minorIR, majorDbg, minorDbg)  # thin forwarder to ._internal.nvvm
+
+
+cdef nvvmResult nvvmCreateProgram(nvvmProgram* prog) except* nogil:
+    return _nvvm._nvvmCreateProgram(prog)  # thin forwarder to ._internal.nvvm
+
+
+cdef nvvmResult nvvmDestroyProgram(nvvmProgram* prog) except* nogil:
+    return _nvvm._nvvmDestroyProgram(prog)  # thin forwarder to ._internal.nvvm
+
+
+cdef nvvmResult nvvmAddModuleToProgram(nvvmProgram prog, const char* buffer, size_t size, const char* name) except* nogil:
+    return _nvvm._nvvmAddModuleToProgram(prog, buffer, size, name)  # thin forwarder to ._internal.nvvm
+
+
+cdef nvvmResult nvvmLazyAddModuleToProgram(nvvmProgram prog, const char* buffer, size_t size, const char* name) except* nogil:
+    return _nvvm._nvvmLazyAddModuleToProgram(prog, buffer, size, name)  # thin forwarder to ._internal.nvvm
+
+
+cdef nvvmResult nvvmCompileProgram(nvvmProgram prog, int numOptions, const char** options) except* nogil:
+    return _nvvm._nvvmCompileProgram(prog, numOptions, options)  # thin forwarder to ._internal.nvvm
+
+
+cdef nvvmResult nvvmVerifyProgram(nvvmProgram prog, int numOptions, const char** options) except* nogil:
+    return _nvvm._nvvmVerifyProgram(prog, numOptions, options)  # thin forwarder to ._internal.nvvm
+
+
+cdef nvvmResult nvvmGetCompiledResultSize(nvvmProgram prog, size_t* bufferSizeRet) except* nogil:
+    return _nvvm._nvvmGetCompiledResultSize(prog, bufferSizeRet)  # thin forwarder to ._internal.nvvm
+
+
+cdef nvvmResult nvvmGetCompiledResult(nvvmProgram prog, char* buffer) except* nogil:
+    return _nvvm._nvvmGetCompiledResult(prog, buffer)  # thin forwarder to ._internal.nvvm
+
+
+cdef nvvmResult nvvmGetProgramLogSize(nvvmProgram prog, size_t* bufferSizeRet) except* nogil:
+    return _nvvm._nvvmGetProgramLogSize(prog, bufferSizeRet)  # thin forwarder to ._internal.nvvm
+
+
+cdef nvvmResult nvvmGetProgramLog(nvvmProgram prog, char* buffer) except* nogil:
+    return _nvvm._nvvmGetProgramLog(prog, buffer)  # thin forwarder to ._internal.nvvm
diff --git a/cuda_bindings/cuda/bindings/nvvm.pxd b/cuda_bindings/cuda/bindings/nvvm.pxd
new file mode 100644
index 0000000000..dc8b2eea1b
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/nvvm.pxd
@@ -0,0 +1,40 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+#
+# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly.
+
+from libc.stdint cimport intptr_t
+
+from .cynvvm cimport *
+
+
+###############################################################################
+# Types
+###############################################################################
+
+ctypedef nvvmProgram Program  # short alias; nvvm.pyx casts its intptr_t handles to this type
+
+
+###############################################################################
+# Enum
+###############################################################################
+
+ctypedef nvvmResult _Result  # C-level enum alias; the Python-level IntEnum is `Result` in nvvm.pyx
+
+
+###############################################################################
+# Functions
+###############################################################################
+
+cpdef tuple version()  # NOTE(review): destroy_program is cpdef in nvvm.pyx but has no declaration in this list
+cpdef tuple ir_version()
+cpdef intptr_t create_program() except? 0  # 0 doubles as the "maybe an exception" marker
+cpdef add_module_to_program(intptr_t prog, buffer, size_t size, name)
+cpdef lazy_add_module_to_program(intptr_t prog, buffer, size_t size, name)
+cpdef compile_program(intptr_t prog, int num_options, options)
+cpdef verify_program(intptr_t prog, int num_options, options)
+cpdef size_t get_compiled_result_size(intptr_t prog) except? 0
+cpdef get_compiled_result(intptr_t prog, buffer)
+cpdef size_t get_program_log_size(intptr_t prog) except? 0
+cpdef get_program_log(intptr_t prog, buffer)
diff --git a/cuda_bindings/cuda/bindings/nvvm.pyx b/cuda_bindings/cuda/bindings/nvvm.pyx
new file mode 100644
index 0000000000..2a334994c2
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/nvvm.pyx
@@ -0,0 +1,284 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+#
+# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly.
+
+cimport cython  # NOQA
+
+from ._internal.utils cimport (get_buffer_pointer, get_nested_resource_ptr,
+                               nested_resource)
+
+from enum import IntEnum as _IntEnum
+
+
+###############################################################################
+# Enum
+###############################################################################
+
+class Result(_IntEnum):
+    """Python-level mirror of the C enum `nvvmResult`: one member per status code, without the ``NVVM_`` prefix."""
+    SUCCESS = NVVM_SUCCESS
+    ERROR_OUT_OF_MEMORY = NVVM_ERROR_OUT_OF_MEMORY
+    ERROR_PROGRAM_CREATION_FAILURE = NVVM_ERROR_PROGRAM_CREATION_FAILURE
+    ERROR_IR_VERSION_MISMATCH = NVVM_ERROR_IR_VERSION_MISMATCH
+    ERROR_INVALID_INPUT = NVVM_ERROR_INVALID_INPUT
+    ERROR_INVALID_PROGRAM = NVVM_ERROR_INVALID_PROGRAM
+    ERROR_INVALID_IR = NVVM_ERROR_INVALID_IR
+    ERROR_INVALID_OPTION = NVVM_ERROR_INVALID_OPTION
+    ERROR_NO_MODULE_IN_PROGRAM = NVVM_ERROR_NO_MODULE_IN_PROGRAM
+    ERROR_COMPILATION = NVVM_ERROR_COMPILATION
+    ERROR_CANCELLED = NVVM_ERROR_CANCELLED
+
+
+###############################################################################
+# Error handling
+###############################################################################
+
+class nvvmError(Exception):
+    """Error for a non-zero NVVM status (kept in ``.status``); codes unknown to ``Result`` read UNKNOWN_ERROR."""
+    def __init__(self, status):
+        self.status = status
+        name = Result(status).name if status in tuple(Result) else "UNKNOWN_ERROR"
+        cdef str err = f"{name} ({int(status)})"
+        super(nvvmError, self).__init__(err)
+
+    def __reduce__(self):
+        return (type(self), (self.status,))
+
+
+@cython.profile(False)
+cdef int check_status(int status) except 1 nogil:
+    if status != 0:  # any non-zero status is treated as an error
+        with gil:  # the GIL is needed to build and raise the Python exception
+            raise nvvmError(status)
+    return status  # always 0 here, so it never collides with the `except 1` error value
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cpdef destroy_program(intptr_t prog):
+    """Destroy a program.
+
+    Args:
+        prog (intptr_t): NVVM program handle. Only a local copy is handed to NVVM, so the caller's integer is left as is.
+
+    .. seealso:: `nvvmDestroyProgram`
+    """
+    cdef Program p = <Program>prog  # the C API takes the handle by pointer, hence the local
+    with nogil:  # the GIL is released around the native call
+        status = nvvmDestroyProgram(&p)
+    check_status(status)  # raises nvvmError on a non-zero status
+
+
+cpdef tuple version():
+    """Get the NVVM version.
+
+    Returns:
+        A 2-tuple containing:
+
+        - int: NVVM major version number.
+        - int: NVVM minor version number.
+
+    .. seealso:: `nvvmVersion`
+    """
+    cdef int major
+    cdef int minor
+    with nogil:  # the GIL is released around the native call
+        status = nvvmVersion(&major, &minor)
+    check_status(status)  # raises nvvmError on a non-zero status
+    return (major, minor)
+
+
+cpdef tuple ir_version():
+    """Get the NVVM IR version.
+
+    Returns:
+        A 4-tuple containing:
+
+        - int: NVVM IR major version number.
+        - int: NVVM IR minor version number.
+        - int: NVVM IR debug metadata major version number.
+        - int: NVVM IR debug metadata minor version number.
+
+    .. seealso:: `nvvmIRVersion`
+    """
+    cdef int major_ir
+    cdef int minor_ir
+    cdef int major_dbg
+    cdef int minor_dbg
+    with nogil:  # the GIL is released around the native call
+        status = nvvmIRVersion(&major_ir, &minor_ir, &major_dbg, &minor_dbg)
+    check_status(status)  # raises nvvmError on a non-zero status
+    return (major_ir, minor_ir, major_dbg, minor_dbg)
+
+
+cpdef intptr_t create_program() except? 0:
+    """Create a program and return its handle.
+
+    Returns:
+        intptr_t: NVVM program.
+
+    .. seealso:: `nvvmCreateProgram`
+    """
+    cdef Program prog
+    with nogil:  # the GIL is released around the native call
+        status = nvvmCreateProgram(&prog)
+    check_status(status)  # raises nvvmError on a non-zero status
+    return <intptr_t>prog  # the handle is handed to Python as a plain integer
+
+
+cpdef add_module_to_program(intptr_t prog, buffer, size_t size, name):
+    """Add a module level NVVM IR to a program.
+
+    Args:
+        prog (intptr_t): NVVM program.
+        buffer (bytes): NVVM IR module in the bitcode or text representation (an int address is accepted too).
+        size (size_t): Size of the NVVM IR module; a buffer object must be exactly this many bytes long.
+        name (str): Name of the NVVM IR module. Must be a ``str``: anything else, including ``None``, raises TypeError.
+
+    .. seealso:: `nvvmAddModuleToProgram`
+    """
+    cdef void* _buffer_ = get_buffer_pointer(buffer, size, readonly=True)
+    if not isinstance(name, str):
+        raise TypeError("name must be a Python str")
+    cdef bytes _temp_name_ = (<str>name).encode()  # kept in a local so the char* below stays valid during the call
+    cdef char* _name_ = _temp_name_
+    with nogil:  # the GIL is released around the native call
+        status = nvvmAddModuleToProgram(<Program>prog, <const char*>_buffer_, size, <const char*>_name_)
+    check_status(status)  # raises nvvmError on a non-zero status
+
+
+cpdef lazy_add_module_to_program(intptr_t prog, buffer, size_t size, name):
+    """Add a module level NVVM IR to a program.
+
+    Args:
+        prog (intptr_t): NVVM program.
+        buffer (bytes): NVVM IR module in the bitcode representation (an int address is accepted too).
+        size (size_t): Size of the NVVM IR module; a buffer object must be exactly this many bytes long.
+        name (str): Name of the NVVM IR module. Must be a ``str``: anything else, including ``None``, raises TypeError.
+
+    .. seealso:: `nvvmLazyAddModuleToProgram`
+    """
+    cdef void* _buffer_ = get_buffer_pointer(buffer, size, readonly=True)
+    if not isinstance(name, str):
+        raise TypeError("name must be a Python str")
+    cdef bytes _temp_name_ = (<str>name).encode()  # kept in a local so the char* below stays valid during the call
+    cdef char* _name_ = _temp_name_
+    with nogil:  # the GIL is released around the native call
+        status = nvvmLazyAddModuleToProgram(<Program>prog, <const char*>_buffer_, size, <const char*>_name_)
+    check_status(status)  # raises nvvmError on a non-zero status
+
+
+cpdef compile_program(intptr_t prog, int num_options, options):
+    r"""Compile the NVVM program.
+
+    Args:
+        prog (intptr_t): NVVM program.
+        num_options (int): Number of compiler ``options`` passed; ValueError if it exceeds ``len(options)``.
+        options (object): Compiler options in the form of C string array. It can be:
+
+            - an :class:`int` as the pointer address to the nested sequence, or
+            - a Python sequence of :class:`int`\s, each a pointer address to a valid sequence of 'char', or
+            - a nested Python sequence of ``str``.
+
+    .. seealso:: `nvvmCompileProgram`
+    """
+    cdef nested_resource[ char ] _options_
+    if not isinstance(options, int) and num_options > len(options):  # NVVM would read past the converted array
+        raise ValueError(f"num_options ({num_options}) exceeds the number of options provided ({len(options)})")
+    get_nested_resource_ptr[char](_options_, options, <char*>NULL)
+    with nogil:  # the GIL is released around the native call
+        status = nvvmCompileProgram(<Program>prog, num_options, <const char**>(_options_.ptrs.data()))
+    check_status(status)  # raises nvvmError on a non-zero status
+
+
+cpdef verify_program(intptr_t prog, int num_options, options):
+    r"""Verify the NVVM program.
+
+    Args:
+        prog (intptr_t): NVVM program.
+        num_options (int): Number of compiler ``options`` passed; ValueError if it exceeds ``len(options)``.
+        options (object): Compiler options in the form of C string array. It can be:
+
+            - an :class:`int` as the pointer address to the nested sequence, or
+            - a Python sequence of :class:`int`\s, each a pointer address to a valid sequence of 'char', or
+            - a nested Python sequence of ``str``.
+
+    .. seealso:: `nvvmVerifyProgram`
+    """
+    cdef nested_resource[ char ] _options_
+    if not isinstance(options, int) and num_options > len(options):  # NVVM would read past the converted array
+        raise ValueError(f"num_options ({num_options}) exceeds the number of options provided ({len(options)})")
+    get_nested_resource_ptr[char](_options_, options, <char*>NULL)
+    with nogil:  # the GIL is released around the native call
+        status = nvvmVerifyProgram(<Program>prog, num_options, <const char**>(_options_.ptrs.data()))
+    check_status(status)  # raises nvvmError on a non-zero status
+
+
+cpdef size_t get_compiled_result_size(intptr_t prog) except? 0:
+    """Get the size of the compiled result.
+
+    Args:
+        prog (intptr_t): NVVM program.
+
+    Returns:
+        size_t: Size of the compiled result (including the trailing NULL).
+
+    .. seealso:: `nvvmGetCompiledResultSize`
+    """
+    cdef size_t buffer_size_ret
+    with nogil:  # the GIL is released around the native call
+        status = nvvmGetCompiledResultSize(<Program>prog, &buffer_size_ret)
+    check_status(status)  # raises nvvmError on a non-zero status
+    return buffer_size_ret
+
+
+cpdef get_compiled_result(intptr_t prog, buffer):
+    """Get the compiled result.
+
+    Args:
+        prog (intptr_t): NVVM program.
+        buffer (bytearray): Writable buffer (or int address) that receives the compiled result; its length is not checked here.
+
+    .. seealso:: `nvvmGetCompiledResult`
+    """
+    cdef void* _buffer_ = get_buffer_pointer(buffer, -1, readonly=False)  # -1 skips the size check; a writable buffer is required
+    with nogil:  # the GIL is released around the native call
+        status = nvvmGetCompiledResult(<Program>prog, <char*>_buffer_)
+    check_status(status)  # raises nvvmError on a non-zero status
+
+
+cpdef size_t get_program_log_size(intptr_t prog) except? 0:
+    """Get the size of the compiler/verifier message.
+
+    Args:
+        prog (intptr_t): NVVM program.
+
+    Returns:
+        size_t: Size of the compilation/verification log (including the trailing NULL).
+
+    .. seealso:: `nvvmGetProgramLogSize`
+    """
+    cdef size_t buffer_size_ret
+    with nogil:  # the GIL is released around the native call
+        status = nvvmGetProgramLogSize(<Program>prog, &buffer_size_ret)
+    check_status(status)  # raises nvvmError on a non-zero status
+    return buffer_size_ret
+
+
+cpdef get_program_log(intptr_t prog, buffer):
+    """Get the compiler/verifier message.
+
+    Args:
+        prog (intptr_t): NVVM program.
+        buffer (bytearray): Writable buffer (or int address) that receives the log; its length is not checked here.
+
+    .. seealso:: `nvvmGetProgramLog`
+    """
+    cdef void* _buffer_ = get_buffer_pointer(buffer, -1, readonly=False)  # -1 skips the size check; a writable buffer is required
+    with nogil:  # the GIL is released around the native call
+        status = nvvmGetProgramLog(<Program>prog, <char*>_buffer_)
+    check_status(status)  # raises nvvmError on a non-zero status
diff --git a/cuda_bindings/pyproject.toml b/cuda_bindings/pyproject.toml
index 0abf0672f6..028c20f993 100644
--- a/cuda_bindings/pyproject.toml
+++ b/cuda_bindings/pyproject.toml
@@ -34,7 +34,8 @@ dependencies = [
 
 [project.optional-dependencies]
 all = [
-    "nvidia-cuda-nvrtc-cu11"
+    "nvidia-cuda-nvcc-cu11",
+    "nvidia-cuda-nvrtc-cu11",
 ]
 
 [project.urls]
diff --git a/cuda_bindings/setup.py b/cuda_bindings/setup.py
index 7b4884f6ff..0d2b938d40 100644
--- a/cuda_bindings/setup.py
+++ b/cuda_bindings/setup.py
@@ -228,6 +228,9 @@ def do_cythonize(extensions):
     ["cuda/bindings/*.pyx"],
     # public (deprecated, to be removed)
     ["cuda/*.pyx"],
+    # internal files used by generated bindings
+    ["cuda/bindings/_internal/nvvm.pyx"],
+    ["cuda/bindings/_internal/utils.pyx"],
 ]
 
 for sources in sources_list:
@@ -260,7 +263,15 @@ def build_extension(self, ext):
             # Allow extensions to discover libraries at runtime
             # relative their wheels installation.
             if ext.name == "cuda.bindings._bindings.cynvrtc":
-                ldflag = f"-Wl,--disable-new-dtags,-rpath,$ORIGIN/../../../nvidia/cuda_nvrtc/lib"
+                ldflag = "-Wl,--disable-new-dtags,-rpath,$ORIGIN/../../../nvidia/cuda_nvrtc/lib"
+            elif ext.name == "cuda.bindings._internal.nvvm":
+                # from <loc>/site-packages/cuda/bindings/_internal/
+                #   to <loc>/site-packages/nvidia/cuda_nvcc/nvvm/lib64/
+                rel1 = "$ORIGIN/../../../nvidia/cuda_nvcc/nvvm/lib64"
+                # from <loc>/lib/python3.*/site-packages/cuda/bindings/_internal/
+                #   to <loc>/lib/nvvm/lib64/
+                rel2 = "$ORIGIN/../../../../../../nvvm/lib64"
+                ldflag = f"-Wl,--disable-new-dtags,-rpath,{rel1},-rpath,{rel2}"
             else:
                 ldflag = None
 
diff --git a/cuda_bindings/tests/test_nvvm.py b/cuda_bindings/tests/test_nvvm.py
new file mode 100644
index 0000000000..4bf0a3cebb
--- /dev/null
+++ b/cuda_bindings/tests/test_nvvm.py
@@ -0,0 +1,193 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+import base64
+import re
+from contextlib import contextmanager
+
+import pytest
+
+from cuda.bindings import nvvm
+
+MINIMAL_NVVMIR_TXT = b"""\
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+target triple = "nvptx64-nvidia-cuda"
+
+define void @kernel() {
+entry:
+  ret void
+}
+
+!nvvm.annotations = !{!0}
+!0 = !{void ()* @kernel, !"kernel", i32 1}
+
+!nvvmir.version = !{!1}
+!1 = !{i32 2, i32 0, i32 3, i32 1}
+"""  # noqa: E501
+
+# Equivalent to MINIMAL_NVVMIR_TXT
+MINIMAL_NVVMIR_BITCODE = base64.b64decode("""
+QkPA3jUUAAAFAAAAYgwwJElZvmbu034tRAEyBQAAAAAhDAAAJAEAAAsCIQACAAAAEwAAAAeBI5FB
+yARJBhAyOZIBhAwlBQgZHgSLYoAMRQJCkgtCZBAyFDgIGEsKMjKISJAUIENGiKUAGTJC5EgOkJEh
+xFBBUYGM4YPligQZRgaJIAAACwAAADIiyAggZIUEkyGkhASTIeOEoZAUEkyGjAuEZEwQFCMAJQBl
+IGCOAAwAAAAAEyZ3sAd4oAd8sAM6aAN3sId0IId0CIc2GId6IIdw2OAS5dAG8KAHdkAHemAHdKAH
+dkAHbZAOcaAHeKAHeNAG6YAHeoAHeoAHbZAOcWAHehAHdqAHcWAHbZAOcyAHejAHcqAHcyAHbZAO
+dkAHemAHdKAHdkAHbZAOcSAHeKAHcSAHeKAHcSAHeNAG5jAHcqAHcyAHejAHctAG5mAHdKAHdkAH
+emAHdNAG9hAHdqAHcWAHehAHdtAG9jAHcqAHcyAHejAHctAG9mAHdKAHdkAHemAHdNAG9hAHcoAH
+ehAHcoAHehAHcoAHbeAOcWAHejAHcqAHdkAHGiEMGTFIgzDA8jdVxSCRvyxDIsAIAAAAAAAAAAAA
+AEBig0BRlAAAgCwQBgAAADIemAwZEUyQjAkmR8YEQ2IJFMEIQBkAALEYAABtAAAAMwiAHMThHGYU
+AT2IQziEw4xCgAd5eAdzmHEM5gAP7RAO9IAOMwxCHsLBHc6hHGYwBT2IQziEgxvMAz3IQz2MAz3M
+eIx0cAd7CAd5SIdwcAd6cAN2eIdwIIcZzBEO7JAO4TAPbjAP4/AO8FAOMxDEHd4hHNghHcJhHmYw
+iTu8gzvQQzm0Azy8gzyEAzvM8BR2YAd7aAc3aIdyaAc3gIdwkIdwYAd2KAd2+AV2eId3gIdfCIdx
+GIdymId5mIEs7vAO7uAO9cAO7DADYsihHOShHMyhHOShHNxhHMohHMSBHcphBtaQQznIQzmYQznI
+Qzm4wziUQziIAzuUwy+8gzz8gjvUAzuwwwzHaYdwWIdycIN0aAd4YId0GId0oIcZzlMP7gAP8lAO
+5JAO40AP4SAO7FAOMyAoHdzBHsJBHtIhHNyBHtzgHOThHeoBHmYYUTiwQzqcgzvMUCR2YAd7aAc3
+YId3eAd4mFFM9JAP8FAOMx5qHsphHOghHd7BHX4BHuShHMwhHfBhBlSFgzjMwzuwQz3QQzn8wjzk
+QzuIwzuww4zFCod5mId3GId0CAd6KAdyAAAAAHkgAAAeAAAAYh5IIEOIDBk5GSSQkUDGyMhoIlAI
+FDKeGBkhR8iQUQwIBQAABgAAAGtlcm5lbAAAIwgCMIJABCMIhDCCQAwjCAQxwyAEwwwEURiDjAQm
+KCE3O7s2lzA3tze6MLq0N7e5UQIjHTc7u7Y0ORe7Mrm5tDe3UYIDAAAAqRgAAAsAAAALCnIoh3eA
+B3pYcJhDPbjDOLBDOdDDguYcxqEN6EEewsEd5iEd6CEd3sEdANEQAAAGAAAAB8w8pIM7nAM7lAM9
+oIM8lEM4kMMBAAAAYSAAAAYAAAATBAGGAwEAAAIAAAAHUBDNFGEAAAAAAABxIAAAAwAAADIOECKE
+AKACAAAAAAAAAABlDAAAHQAAABIDlOgAAAAAAAAAAAYAAAAFAAAARAAAAAEAAABQAAAAAAAAAFAA
+AAABAAAAaAAAAAAAAAALAAAAEwAAAB4AAAARAAAALwAAAAAAAAAAAAAAAQAAAAAAAAAAAAAABgAA
+AAAAAAAGAAAA/////wAkAAAAAAAAXQwAAA8AAAASA5RvAAAAAGtlcm5lbDUuMC4xbnZwdHg2NC1u
+dmlkaWEtY3VkYW1pbmltYWxfbnZ2bWlyLmxsAAAAAAA=
+""")
+# To regenerate, pull and start a docker container:
+#     docker pull centos/llvm-toolset-7-centos7
+#     docker run -it centos/llvm-toolset-7-centos7 /bin/bash
+# In the docker container, copy MINIMAL_NVVMIR_TXT to a file with name minimal_nvvmir.ll
+# Then run:
+#     llvm-as minimal_nvvmir.ll -o minimal_nvvmir.bc
+# Save this to encode.py:
+#     import base64, sys, textwrap
+#     bitcode = open(sys.argv[1], "rb").read()
+#     encoded_bitcode = base64.b64encode(bitcode).decode("ascii")
+#     wrapped_base64 = "\n".join(textwrap.wrap(encoded_bitcode, width=76))
+#     print(wrapped_base64)
+# Then run:
+#     python encode.py minimal_nvvmir.bc
+
+
+@pytest.fixture(params=["txt", "bitcode"])
+def minimal_nvvmir(request):
+    return MINIMAL_NVVMIR_TXT if request.param == "txt" else MINIMAL_NVVMIR_BITCODE
+
+
+@pytest.fixture(params=[nvvm.compile_program, nvvm.verify_program])
+def compile_or_verify(request):
+    return request.param
+
+
+def match_exact(s):
+    return "^" + re.escape(s) + "$"
+
+
+@contextmanager
+def nvvm_program() -> int:
+    prog: int = nvvm.create_program()
+    try:
+        yield prog
+    finally:
+        nvvm.destroy_program(prog)
+
+
+def get_program_log(prog):
+    buffer = bytearray(nvvm.get_program_log_size(prog))
+    nvvm.get_program_log(prog, buffer)
+    return buffer.decode(errors="backslashreplace")
+
+
+def test_nvvm_version():
+    ver = nvvm.version()
+    assert len(ver) == 2
+    assert ver >= (2, 0)
+
+
+def test_nvvm_ir_version():
+    ver = nvvm.ir_version()
+    assert len(ver) == 4
+    assert ver >= (2, 0, 3, 1)
+
+
+def test_create_and_destroy():
+    with nvvm_program() as prog:
+        assert isinstance(prog, int)
+        assert prog != 0
+
+
+@pytest.mark.parametrize("add_fn", [nvvm.add_module_to_program, nvvm.lazy_add_module_to_program])
+def test_add_module_to_program_fail(add_fn):
+    with nvvm_program() as prog, pytest.raises(ValueError):
+        # Passing a C NULL pointer generates "ERROR_INVALID_INPUT (4)",
+        # but that is not possible through our Python bindings.
+        # The ValueError originates from the cython bindings code.
+        add_fn(prog, None, 0, "FileNameHere.ll")
+
+
+def test_c_or_v_program_fail_no_module(compile_or_verify):
+    with nvvm_program() as prog, pytest.raises(nvvm.nvvmError, match=match_exact("ERROR_NO_MODULE_IN_PROGRAM (8)")):
+        compile_or_verify(prog, 0, [])
+
+
+def test_c_or_v_program_fail_invalid_ir(compile_or_verify):
+    expected_error = "ERROR_COMPILATION (9)" if compile_or_verify is nvvm.compile_program else "ERROR_INVALID_IR (6)"
+    nvvm_ll = b"This is not NVVM IR"
+    with nvvm_program() as prog:
+        nvvm.add_module_to_program(prog, nvvm_ll, len(nvvm_ll), "FileNameHere.ll")
+        with pytest.raises(nvvm.nvvmError, match=match_exact(expected_error)):
+            compile_or_verify(prog, 0, [])
+        assert get_program_log(prog) == "FileNameHere.ll (1, 0): parse expected top-level entity\x00"
+
+
+def test_c_or_v_program_fail_bad_option(minimal_nvvmir, compile_or_verify):
+    with nvvm_program() as prog:
+        nvvm.add_module_to_program(prog, minimal_nvvmir, len(minimal_nvvmir), "FileNameHere.ll")
+        with pytest.raises(nvvm.nvvmError, match=match_exact("ERROR_INVALID_OPTION (7)")):
+            compile_or_verify(prog, 1, ["BadOption"])
+        assert get_program_log(prog) == "libnvvm : error: BadOption is an unsupported option\x00"
+
+
+@pytest.mark.parametrize(
+    ("get_size", "get_buffer"),
+    [
+        (nvvm.get_compiled_result_size, nvvm.get_compiled_result),
+        (nvvm.get_program_log_size, nvvm.get_program_log),
+    ],
+)
+def test_get_buffer_empty(get_size, get_buffer):
+    with nvvm_program() as prog:
+        buffer_size = get_size(prog)
+        assert buffer_size == 1
+        buffer = bytearray(buffer_size)
+        get_buffer(prog, buffer)
+        assert buffer == b"\x00"
+
+
+@pytest.mark.parametrize("options", [[], ["-opt=0"], ["-opt=3", "-g"]])
+def test_compile_program_with_minimal_nnvm_ir(minimal_nvvmir, options):
+    with nvvm_program() as prog:
+        nvvm.add_module_to_program(prog, minimal_nvvmir, len(minimal_nvvmir), "FileNameHere.ll")
+        try:
+            nvvm.compile_program(prog, len(options), options)
+        except nvvm.nvvmError as e:
+            raise RuntimeError(get_program_log(prog)) from e
+        else:
+            log_size = nvvm.get_program_log_size(prog)
+            assert log_size == 1
+            buffer = bytearray(log_size)
+            nvvm.get_program_log(prog, buffer)
+            assert buffer == b"\x00"
+        result_size = nvvm.get_compiled_result_size(prog)
+        buffer = bytearray(result_size)
+        nvvm.get_compiled_result(prog, buffer)
+        assert ".visible .entry kernel()" in buffer.decode()
+
+
+@pytest.mark.parametrize("options", [[], ["-opt=0"], ["-opt=3", "-g"]])
+def test_verify_program_with_minimal_nnvm_ir(minimal_nvvmir, options):
+    with nvvm_program() as prog:
+        nvvm.add_module_to_program(prog, minimal_nvvmir, len(minimal_nvvmir), "FileNameHere.ll")
+        nvvm.verify_program(prog, len(options), options)

From 639319e0595bc2cb9bd9cf26ce82aa0fd53fa25a Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sat, 8 Feb 2025 04:18:12 +0000
Subject: [PATCH 2/9] backport build system additions

---
 .../cuda/bindings/_internal/__init__.py       |  0
 cuda_bindings/setup.py                        | 40 ++++++++++++++++++-
 2 files changed, 39 insertions(+), 1 deletion(-)
 create mode 100644 cuda_bindings/cuda/bindings/_internal/__init__.py

diff --git a/cuda_bindings/cuda/bindings/_internal/__init__.py b/cuda_bindings/cuda/bindings/_internal/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/cuda_bindings/setup.py b/cuda_bindings/setup.py
index 0d2b938d40..4968d54eb2 100644
--- a/cuda_bindings/setup.py
+++ b/cuda_bindings/setup.py
@@ -6,11 +6,15 @@
 # this software and related documentation outside the terms of the EULA
 # is strictly prohibited.
 
+import atexit
+import contextlib
 import glob
 import os
 import platform
+import shutil
 import sys
 import sysconfig
+import tempfile
 
 from Cython import Tempita
 from Cython.Build import cythonize
@@ -145,7 +149,9 @@ def generate_output(infile, local):
              os.path.join('cuda', 'bindings'),
              os.path.join('cuda', 'bindings', '_bindings'),
              os.path.join('cuda', 'bindings', '_lib'),
-             os.path.join('cuda', 'bindings', '_lib', 'cyruntime')]
+             os.path.join('cuda', 'bindings', '_lib', 'cyruntime'),
+             os.path.join('cuda', 'bindings', '_internal'),
+            ]
 input_files = []
 for path in path_list:
     input_files += fetch_input_files(path)
@@ -206,6 +212,38 @@ def prep_extensions(sources):
     return exts
 
 
+# bindings from cybind: copy the platform-specific *_linux/_windows.pyx to its generic .pyx name
+def rename_architecture_specific_files():
+    if sys.platform == "linux":
+        src_files = glob.glob(os.path.join(path, "*_linux.pyx"))
+    elif sys.platform == "win32":
+        src_files = glob.glob(os.path.join(path, "*_windows.pyx"))
+    else:
+        raise RuntimeError(f"platform is unrecognized: {sys.platform}")
+    dst_files = []
+    for src in src_files:
+        # Set up a temporary file; it is created under the current directory so
+        # that the move to the destination stays on the same filesystem and is atomic
+        with tempfile.NamedTemporaryFile(delete=False, dir=".") as f:
+            shutil.copy2(src, f.name)
+            f_name = f.name
+        dst = src.replace("_linux", "").replace("_windows", "")
+        # atomic move with the destination guaranteed to be overwritten
+        os.replace(f_name, f"./{dst}")
+        dst_files.append(dst)
+    return dst_files
+
+
+dst_files = rename_architecture_specific_files()
+
+
+@atexit.register
+def cleanup_dst_files():
+    for dst in dst_files:
+        with contextlib.suppress(FileNotFoundError):
+            os.remove(dst)
+
+
 def do_cythonize(extensions):
     return cythonize(
         extensions,

From 66aa0380f937c96a69eda567ea8f2a439e3c9f2d Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Fri, 7 Feb 2025 22:19:03 -0800
Subject: [PATCH 3/9] CTK 11.8 IR version compatibility

---
 cuda_bindings/tests/test_nvvm.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/cuda_bindings/tests/test_nvvm.py b/cuda_bindings/tests/test_nvvm.py
index 4bf0a3cebb..86a74d4fdd 100644
--- a/cuda_bindings/tests/test_nvvm.py
+++ b/cuda_bindings/tests/test_nvvm.py
@@ -24,11 +24,11 @@
 !0 = !{void ()* @kernel, !"kernel", i32 1}
 
 !nvvmir.version = !{!1}
-!1 = !{i32 2, i32 0, i32 3, i32 1}
+!1 = !{i32 %d, i32 %d, i32 %d, i32 %d}
 """  # noqa: E501
 
-# Equivalent to MINIMAL_NVVMIR_TXT
-MINIMAL_NVVMIR_BITCODE = base64.b64decode("""
+# Equivalent to MINIMAL_NVVMIR_TXT % (2, 0, 3, 1)
+MINIMAL_NVVMIR_BITCODE_2_0_3_1 = base64.b64decode("""
 QkPA3jUUAAAFAAAAYgwwJElZvmbu034tRAEyBQAAAAAhDAAAJAEAAAsCIQACAAAAEwAAAAeBI5FB
 yARJBhAyOZIBhAwlBQgZHgSLYoAMRQJCkgtCZBAyFDgIGEsKMjKISJAUIENGiKUAGTJC5EgOkJEh
 xFBBUYGM4YPligQZRgaJIAAACwAAADIiyAggZIUEkyGkhASTIeOEoZAUEkyGjAuEZEwQFCMAJQBl
@@ -73,7 +73,12 @@
 
 @pytest.fixture(params=["txt", "bitcode"])
 def minimal_nvvmir(request):
-    return MINIMAL_NVVMIR_TXT if request.param == "txt" else MINIMAL_NVVMIR_BITCODE
+    ir_vers = nvvm.ir_version()
+    if request.param == "txt":
+        return MINIMAL_NVVMIR_TXT % ir_vers
+    if ir_vers[:2] != (3, 0):
+        pytest.skip(f"MINIMAL_NVVMIR_BITCODE_2_0_3_1 vs {ir_vers} IR version incompatibility")
+    return MINIMAL_NVVMIR_BITCODE_2_0_3_1
 
 
 @pytest.fixture(params=[nvvm.compile_program, nvvm.verify_program])
@@ -103,13 +108,13 @@ def get_program_log(prog):
 def test_nvvm_version():
     ver = nvvm.version()
     assert len(ver) == 2
-    assert ver >= (2, 0)
+    assert ver >= (1, 0)
 
 
 def test_nvvm_ir_version():
     ver = nvvm.ir_version()
     assert len(ver) == 4
-    assert ver >= (2, 0, 3, 1)
+    assert ver >= (1, 0, 0, 0)
 
 
 def test_create_and_destroy():

From 4064fc3dcaf1c11bdc502e7ed9430ef4e85cc845 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Fri, 7 Feb 2025 22:56:33 -0800
Subject: [PATCH 4/9] Use llvmlite to convert IR txt to bitcode.

---
 cuda_bindings/requirements.txt   |  1 +
 cuda_bindings/tests/test_nvvm.py | 64 +++++++-------------------------
 2 files changed, 14 insertions(+), 51 deletions(-)

diff --git a/cuda_bindings/requirements.txt b/cuda_bindings/requirements.txt
index 2fdaa17e71..2d78753841 100644
--- a/cuda_bindings/requirements.txt
+++ b/cuda_bindings/requirements.txt
@@ -7,3 +7,4 @@ setuptools
 tomli; python_version < "3.11"
 wheel
 pywin32; sys_platform == 'win32'
+llvmlite
diff --git a/cuda_bindings/tests/test_nvvm.py b/cuda_bindings/tests/test_nvvm.py
index 86a74d4fdd..8f0e8dfdfd 100644
--- a/cuda_bindings/tests/test_nvvm.py
+++ b/cuda_bindings/tests/test_nvvm.py
@@ -2,10 +2,10 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-import base64
 import re
 from contextlib import contextmanager
 
+import llvmlite.binding
 import pytest
 
 from cuda.bindings import nvvm
@@ -27,58 +27,20 @@
 !1 = !{i32 %d, i32 %d, i32 %d, i32 %d}
 """  # noqa: E501
 
-# Equivalent to MINIMAL_NVVMIR_TXT % (2, 0, 3, 1)
-MINIMAL_NVVMIR_BITCODE_2_0_3_1 = base64.b64decode("""
-QkPA3jUUAAAFAAAAYgwwJElZvmbu034tRAEyBQAAAAAhDAAAJAEAAAsCIQACAAAAEwAAAAeBI5FB
-yARJBhAyOZIBhAwlBQgZHgSLYoAMRQJCkgtCZBAyFDgIGEsKMjKISJAUIENGiKUAGTJC5EgOkJEh
-xFBBUYGM4YPligQZRgaJIAAACwAAADIiyAggZIUEkyGkhASTIeOEoZAUEkyGjAuEZEwQFCMAJQBl
-IGCOAAwAAAAAEyZ3sAd4oAd8sAM6aAN3sId0IId0CIc2GId6IIdw2OAS5dAG8KAHdkAHemAHdKAH
-dkAHbZAOcaAHeKAHeNAG6YAHeoAHeoAHbZAOcWAHehAHdqAHcWAHbZAOcyAHejAHcqAHcyAHbZAO
-dkAHemAHdKAHdkAHbZAOcSAHeKAHcSAHeKAHcSAHeNAG5jAHcqAHcyAHejAHctAG5mAHdKAHdkAH
-emAHdNAG9hAHdqAHcWAHehAHdtAG9jAHcqAHcyAHejAHctAG9mAHdKAHdkAHemAHdNAG9hAHcoAH
-ehAHcoAHehAHcoAHbeAOcWAHejAHcqAHdkAHGiEMGTFIgzDA8jdVxSCRvyxDIsAIAAAAAAAAAAAA
-AEBig0BRlAAAgCwQBgAAADIemAwZEUyQjAkmR8YEQ2IJFMEIQBkAALEYAABtAAAAMwiAHMThHGYU
-AT2IQziEw4xCgAd5eAdzmHEM5gAP7RAO9IAOMwxCHsLBHc6hHGYwBT2IQziEgxvMAz3IQz2MAz3M
-eIx0cAd7CAd5SIdwcAd6cAN2eIdwIIcZzBEO7JAO4TAPbjAP4/AO8FAOMxDEHd4hHNghHcJhHmYw
-iTu8gzvQQzm0Azy8gzyEAzvM8BR2YAd7aAc3aIdyaAc3gIdwkIdwYAd2KAd2+AV2eId3gIdfCIdx
-GIdymId5mIEs7vAO7uAO9cAO7DADYsihHOShHMyhHOShHNxhHMohHMSBHcphBtaQQznIQzmYQznI
-Qzm4wziUQziIAzuUwy+8gzz8gjvUAzuwwwzHaYdwWIdycIN0aAd4YId0GId0oIcZzlMP7gAP8lAO
-5JAO40AP4SAO7FAOMyAoHdzBHsJBHtIhHNyBHtzgHOThHeoBHmYYUTiwQzqcgzvMUCR2YAd7aAc3
-YId3eAd4mFFM9JAP8FAOMx5qHsphHOghHd7BHX4BHuShHMwhHfBhBlSFgzjMwzuwQz3QQzn8wjzk
-QzuIwzuww4zFCod5mId3GId0CAd6KAdyAAAAAHkgAAAeAAAAYh5IIEOIDBk5GSSQkUDGyMhoIlAI
-FDKeGBkhR8iQUQwIBQAABgAAAGtlcm5lbAAAIwgCMIJABCMIhDCCQAwjCAQxwyAEwwwEURiDjAQm
-KCE3O7s2lzA3tze6MLq0N7e5UQIjHTc7u7Y0ORe7Mrm5tDe3UYIDAAAAqRgAAAsAAAALCnIoh3eA
-B3pYcJhDPbjDOLBDOdDDguYcxqEN6EEewsEd5iEd6CEd3sEdANEQAAAGAAAAB8w8pIM7nAM7lAM9
-oIM8lEM4kMMBAAAAYSAAAAYAAAATBAGGAwEAAAIAAAAHUBDNFGEAAAAAAABxIAAAAwAAADIOECKE
-AKACAAAAAAAAAABlDAAAHQAAABIDlOgAAAAAAAAAAAYAAAAFAAAARAAAAAEAAABQAAAAAAAAAFAA
-AAABAAAAaAAAAAAAAAALAAAAEwAAAB4AAAARAAAALwAAAAAAAAAAAAAAAQAAAAAAAAAAAAAABgAA
-AAAAAAAGAAAA/////wAkAAAAAAAAXQwAAA8AAAASA5RvAAAAAGtlcm5lbDUuMC4xbnZwdHg2NC1u
-dmlkaWEtY3VkYW1pbmltYWxfbnZ2bWlyLmxsAAAAAAA=
-""")
-# To regenerate, pull and start a docker container:
-#     docker pull centos/llvm-toolset-7-centos7
-#     docker run -it centos/llvm-toolset-7-centos7 /bin/bash
-# In the docker container, copy MINIMAL_NVVMIR_TXT to a file with name minimal_nvvmir.ll
-# Then run:
-#     llvm-as minimal_nvvmir.ll -o minimal_nvvmir.bc
-# Save this to encode.py:
-#     import base64, sys, textwrap
-#     bitcode = open(sys.argv[1], "rb").read()
-#     encoded_bitcode = base64.b64encode(bitcode).decode("ascii")
-#     wrapped_base64 = "\n".join(textwrap.wrap(encoded_bitcode, width=76))
-#     print(wrapped_base64)
-# Then run:
-#     python encode.py minimal_nvvmir.bc
+MINIMAL_NVVMIR_CACHE = {}
 
 
 @pytest.fixture(params=["txt", "bitcode"])
 def minimal_nvvmir(request):
-    ir_vers = nvvm.ir_version()
-    if request.param == "txt":
-        return MINIMAL_NVVMIR_TXT % ir_vers
-    if ir_vers[:2] != (3, 0):
-        pytest.skip(f"MINIMAL_NVVMIR_BITCODE_2_0_3_1 vs {ir_vers} IR version incompatibility")
-    return MINIMAL_NVVMIR_BITCODE_2_0_3_1
+    for _ in range(2):
+        nvvmir = MINIMAL_NVVMIR_CACHE.get(request.param)
+        if nvvmir is not None:
+            return nvvmir
+        txt = MINIMAL_NVVMIR_TXT % nvvm.ir_version()
+        bitcode = llvmlite.binding.parse_assembly(txt.decode()).as_bitcode()
+        MINIMAL_NVVMIR_CACHE["txt"] = txt
+        MINIMAL_NVVMIR_CACHE["bitcode"] = bitcode
+    raise AssertionError("This code path is meant to be unreachable.")
 
 
 @pytest.fixture(params=[nvvm.compile_program, nvvm.verify_program])
@@ -172,7 +134,7 @@ def test_get_buffer_empty(get_size, get_buffer):
 
 
 @pytest.mark.parametrize("options", [[], ["-opt=0"], ["-opt=3", "-g"]])
-def test_compile_program_with_minimal_nnvm_ir(minimal_nvvmir, options):
+def test_compile_program_with_minimal_nvvm_ir(minimal_nvvmir, options):
     with nvvm_program() as prog:
         nvvm.add_module_to_program(prog, minimal_nvvmir, len(minimal_nvvmir), "FileNameHere.ll")
         try:
@@ -192,7 +154,7 @@ def test_compile_program_with_minimal_nnvm_ir(minimal_nvvmir, options):
 
 
 @pytest.mark.parametrize("options", [[], ["-opt=0"], ["-opt=3", "-g"]])
-def test_verify_program_with_minimal_nnvm_ir(minimal_nvvmir, options):
+def test_verify_program_with_minimal_nvvm_ir(minimal_nvvmir, options):
     with nvvm_program() as prog:
         nvvm.add_module_to_program(prog, minimal_nvvmir, len(minimal_nvvmir), "FileNameHere.ll")
         nvvm.verify_program(prog, len(options), options)

From 37a5e0fd3ab7d6562a277e02b6d50a7114e97177 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Mon, 10 Feb 2025 22:57:16 -0800
Subject: [PATCH 5/9] Make llvmlite an optional test dependency by introducing
 a MINIMAL_NVVMIR_BITCODE_STATIC dict.

---
 cuda_bindings/tests/test_nvvm.py | 95 +++++++++++++++++++++++++++++---
 1 file changed, 87 insertions(+), 8 deletions(-)

diff --git a/cuda_bindings/tests/test_nvvm.py b/cuda_bindings/tests/test_nvvm.py
index 8f0e8dfdfd..9fdfdaa231 100644
--- a/cuda_bindings/tests/test_nvvm.py
+++ b/cuda_bindings/tests/test_nvvm.py
@@ -2,14 +2,23 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+import binascii
 import re
+import textwrap
 from contextlib import contextmanager
 
-import llvmlite.binding
 import pytest
 
 from cuda.bindings import nvvm
 
+MINIMAL_NVVMIR_FIXTURE_PARAMS = ["txt", "bitcode_static"]
+try:
+    import llvmlite.binding as llvmlite_binding  # Optional test dependency.
+except ImportError:
+    llvmlite_binding = None
+else:
+    MINIMAL_NVVMIR_FIXTURE_PARAMS.append("bitcode_dynamic")
+
 MINIMAL_NVVMIR_TXT = b"""\
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 
@@ -24,22 +33,92 @@
 !0 = !{void ()* @kernel, !"kernel", i32 1}
 
 !nvvmir.version = !{!1}
-!1 = !{i32 %d, i32 %d, i32 %d, i32 %d}
+!1 = !{i32 %d, i32 0, i32 %d, i32 0}
 """  # noqa: E501
 
+MINIMAL_NVVMIR_BITCODE_STATIC = {
+    (2, 3):  # (major, debug_major)
+    "4243c0de3514000005000000620c30244a59be669dfbb4bf0b51804c01000000210c000080010000"
+    "0b02210002000000160000000781239141c80449061032399201840c250508191e048b62800c4502"
+    "42920b42641032143808184b0a3232884870c421234412878c1041920264c808b1142043468820c9"
+    "01323284182a282a90317cb05c9120c3c8000000892000000b0000003222c80820624600212b2498"
+    "0c212524980c19270c85a4906032645c20246382a01801300128030173046000132677b00778a007"
+    "7cb0033a680377b0877420877408873618877a208770d8e012e5d006f0a0077640077a600774a007"
+    "7640076d900e71a00778a00778d006e980077a80077a80076d900e7160077a100776a0077160076d"
+    "900e7320077a300772a0077320076d900e7640077a600774a0077640076d900e71200778a0077120"
+    "0778a00771200778d006e6300772a0077320077a300772d006e6600774a0077640077a600774d006"
+    "f6100776a0077160077a100776d006f6300772a0077320077a300772d006f6600774a0077640077a"
+    "600774d006f610077280077a10077280077a10077280076de00e7160077a300772a0077640071a21"
+    "4c0e11de9c2e4fbbcfbe211560040000000000000000000000000620b141a0286100004016080000"
+    "06000000321e980c19114c908c092647c60443620914c10840190000b1180000ac0000003308801c"
+    "c4e11c6614013d88433884c38c4280077978077398710ce6000fed100ef4800e330c421ec2c11dce"
+    "a11c6630053d88433884831bcc033dc8433d8c033dcc788c7470077b08077948877070077a700376"
+    "788770208719cc110eec900ee1300f6e300fe3f00ef0500e3310c41dde211cd8211dc2611e663089"
+    "3bbc833bd04339b4033cbc833c84033bccf0147660077b6807376887726807378087709087706007"
+    "76280776f8057678877780875f08877118877298877998812ceef00eeee00ef5c00eec300362c8a1"
+    "1ce4a11ccca11ce4a11cdc611cca211cc4811dca6106d6904339c84339984339c84339b8c3389443"
+    "3888033b94c32fbc833cfc823bd4033bb0c30cc7698770588772708374680778608774188774a087"
+    "19ce530fee000ff2500ee4900ee3400fe1200eec500e3320281ddcc11ec2411ed2211cdc811edce0"
+    "1ce4e11dea011e66185138b0433a9c833bcc50247660077b68073760877778077898514cf4900ff0"
+    "500e331e6a1eca611ce8211ddec11d7e011ee4a11ccc211df0610654858338ccc33bb0433dd04339"
+    "fcc23ce4433b88c33bb0c38cc50a877998877718877408077a28077298815ce3100eecc00ee5500e"
+    "f33023c1d2411ee4e117d8e11dde011e6648193bb0833db4831b84c3388c4339ccc33cb8c139c8c3"
+    "3bd4033ccc48b471080776600771088771588719dbc60eec600fede006f0200fe5300fe5200ff650"
+    "0e6e100ee3300ee5300ff3e006e9e00ee4500ef83023e2ec611cc2811dd8e117ec211de6211dc421"
+    "1dd8211de8211f66209d3bbc433db80339948339cc58bc7070077778077a08077a488777708719cb"
+    "e70eef300fe1e00ee9400fe9a00fe530c3010373a8077718875f988770708774a08774d087729881"
+    "844139e0c338b0433d904339cc40c4a01dcaa11de0411edec11c662463300ee1c00eec300fe9400f"
+    "e5000000792000001e000000721e482043880c19097232482023818c9191d144a01028643c313242"
+    "8e9021a318100a00060000006b65726e656c0000230802308240042308843082400c23080431c320"
+    "04c30c045118858c04262821373bbb36973037b737ba30bab437b7b95102231d373bbbb6343917bb"
+    "32b9b9b437b7518203000000a9180000250000000b0a7228877780077a587098433db8c338b04339"
+    "d0c382e61cc6a10de8411ec2c11de6211de8211ddec11d1634e3600ee7500fe1200fe4400fe1200f"
+    "e7500ef4b08081077928877060077678877108077a28077258709cc338b4013ba4833d94c3026b1c"
+    "d8211cdce11cdc201ce4611cdc201ce8811ec2611cd0a11cc8611cc2811dd861c1010ff4200fe150"
+    "0ff4800e00000000d11000000600000007cc3ca4833b9c033b94033da0833c94433890c301000000"
+    "6120000006000000130481860301000002000000075010cd14610000000000007120000003000000"
+    "320e10228400fc020000000000000000650c00001f000000120394f0000000000300000006000000"
+    "060000004c000000010000005800000000000000580000000100000070000000000000000c000000"
+    "130000001f0000000800000006000000000000007000000000000000000000000100000000000000"
+    "00000000060000000000000006000000ffffffff00240000000000005d0c00000d00000012039467"
+    "000000006b65726e656c31352e302e376e7670747836342d6e76696469612d637564613c73747269"
+    "6e673e0000000000",
+}
+
 MINIMAL_NVVMIR_CACHE = {}
 
 
-@pytest.fixture(params=["txt", "bitcode"])
+@pytest.fixture(params=MINIMAL_NVVMIR_FIXTURE_PARAMS)
 def minimal_nvvmir(request):
     for _ in range(2):
-        nvvmir = MINIMAL_NVVMIR_CACHE.get(request.param)
-        if nvvmir is not None:
+        nvvmir = MINIMAL_NVVMIR_CACHE.get(request.param, -1)
+        if nvvmir != -1:
+            if nvvmir is None:
+                pytest.skip(f"UNAVAILABLE: {request.param}")
             return nvvmir
-        txt = MINIMAL_NVVMIR_TXT % nvvm.ir_version()
-        bitcode = llvmlite.binding.parse_assembly(txt.decode()).as_bitcode()
+        major, minor, debug_major, debug_minor = nvvm.ir_version()
+        txt = MINIMAL_NVVMIR_TXT % (major, debug_major)
+        if llvmlite_binding is None:
+            bitcode_dynamic = None
+        else:
+            bitcode_dynamic = llvmlite_binding.parse_assembly(txt.decode()).as_bitcode()
+        bitcode_static = MINIMAL_NVVMIR_BITCODE_STATIC.get((major, debug_major))
+        if bitcode_static is not None:
+            bitcode_static = binascii.unhexlify(bitcode_static)
         MINIMAL_NVVMIR_CACHE["txt"] = txt
-        MINIMAL_NVVMIR_CACHE["bitcode"] = bitcode
+        MINIMAL_NVVMIR_CACHE["bitcode_dynamic"] = bitcode_dynamic
+        MINIMAL_NVVMIR_CACHE["bitcode_static"] = bitcode_static
+        if bitcode_static is None:
+            if bitcode_dynamic is None:
+                raise RuntimeError("Please `pip install llvmlite` to generate `bitcode_static`")
+            bitcode_hex = binascii.hexlify(bitcode_dynamic).decode("ascii")
+            print("\n\nMINIMAL_NVVMIR_BITCODE_STATIC = { # PLEASE ADD TO test_nvvm.py")
+            print(f"    ({major}, {debug_major}):  # (major, debug_major)")
+            lines = textwrap.wrap(bitcode_hex, width=80)
+            for line in lines[:-1]:
+                print(f'    "{line}"')
+            print(f'    "{lines[-1]}",')
+            print("}\n", flush=True)
     raise AssertionError("This code path is meant to be unreachable.")
 
 

From 03cb2ee2a591f06a6e79344c938d885a22099d1a Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Mon, 10 Feb 2025 23:26:36 -0800
Subject: [PATCH 6/9] Add MINIMAL_NVVMIR_BITCODE_STATIC entry for CTK 11.8

---
 cuda_bindings/tests/test_nvvm.py | 46 ++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/cuda_bindings/tests/test_nvvm.py b/cuda_bindings/tests/test_nvvm.py
index 9fdfdaa231..e356a077d1 100644
--- a/cuda_bindings/tests/test_nvvm.py
+++ b/cuda_bindings/tests/test_nvvm.py
@@ -37,6 +37,52 @@
 """  # noqa: E501
 
 MINIMAL_NVVMIR_BITCODE_STATIC = {
+    (1, 3):  # (major, debug_major)
+    "4243c0de3514000005000000620c30244a59be669dfbb4bf0b51804c01000000210c00007f010000"
+    "0b02210002000000160000000781239141c80449061032399201840c250508191e048b62800c4502"
+    "42920b42641032143808184b0a3232884870c421234412878c1041920264c808b1142043468820c9"
+    "01323284182a282a90317cb05c9120c3c8000000892000000b0000003222c80820624600212b2498"
+    "0c212524980c19270c85a4906032645c20246382a01801300128030173046000132677b00778a007"
+    "7cb0033a680377b0877420877408873618877a208770d8e012e5d006f0a0077640077a600774a007"
+    "7640076d900e71a00778a00778d006e980077a80077a80076d900e7160077a100776a0077160076d"
+    "900e7320077a300772a0077320076d900e7640077a600774a0077640076d900e71200778a0077120"
+    "0778a00771200778d006e6300772a0077320077a300772d006e6600774a0077640077a600774d006"
+    "f6100776a0077160077a100776d006f6300772a0077320077a300772d006f6600774a0077640077a"
+    "600774d006f610077280077a10077280077a10077280076de00e7160077a300772a0077640071a21"
+    "4c0e11de9c2e4fbbcfbe211560040000000000000000000000000620b141a0e86000004016080000"
+    "06000000321e980c19114c908c092647c6044362098c009401000000b1180000ac0000003308801c"
+    "c4e11c6614013d88433884c38c4280077978077398710ce6000fed100ef4800e330c421ec2c11dce"
+    "a11c6630053d88433884831bcc033dc8433d8c033dcc788c7470077b08077948877070077a700376"
+    "788770208719cc110eec900ee1300f6e300fe3f00ef0500e3310c41dde211cd8211dc2611e663089"
+    "3bbc833bd04339b4033cbc833c84033bccf0147660077b6807376887726807378087709087706007"
+    "76280776f8057678877780875f08877118877298877998812ceef00eeee00ef5c00eec300362c8a1"
+    "1ce4a11ccca11ce4a11cdc611cca211cc4811dca6106d6904339c84339984339c84339b8c3389443"
+    "3888033b94c32fbc833cfc823bd4033bb0c30cc7698770588772708374680778608774188774a087"
+    "19ce530fee000ff2500ee4900ee3400fe1200eec500e3320281ddcc11ec2411ed2211cdc811edce0"
+    "1ce4e11dea011e66185138b0433a9c833bcc50247660077b68073760877778077898514cf4900ff0"
+    "500e331e6a1eca611ce8211ddec11d7e011ee4a11ccc211df0610654858338ccc33bb0433dd04339"
+    "fcc23ce4433b88c33bb0c38cc50a877998877718877408077a28077298815ce3100eecc00ee5500e"
+    "f33023c1d2411ee4e117d8e11dde011e6648193bb0833db4831b84c3388c4339ccc33cb8c139c8c3"
+    "3bd4033ccc48b471080776600771088771588719dbc60eec600fede006f0200fe5300fe5200ff650"
+    "0e6e100ee3300ee5300ff3e006e9e00ee4500ef83023e2ec611cc2811dd8e117ec211de6211dc421"
+    "1dd8211de8211f66209d3bbc433db80339948339cc58bc7070077778077a08077a488777708719cb"
+    "e70eef300fe1e00ee9400fe9a00fe530c3010373a8077718875f988770708774a08774d087729881"
+    "844139e0c338b0433d904339cc40c4a01dcaa11de0411edec11c662463300ee1c00eec300fe9400f"
+    "e5000000792000001d000000721e482043880c19097232482023818c9191d144a01028643c313242"
+    "8e9021a318100a00060000006b65726e656c0000230802308240042308843082400c330c4230cc40"
+    "0c4441c84860821272b3b36b730973737ba30ba34b7b739b1b2528d271b3b36b4b9373b12b939b4b"
+    "7b731b2530000000a9180000250000000b0a7228877780077a587098433db8c338b04339d0c382e6"
+    "1cc6a10de8411ec2c11de6211de8211ddec11d1634e3600ee7500fe1200fe4400fe1200fe7500ef4"
+    "b08081077928877060077678877108077a28077258709cc338b4013ba4833d94c3026b1cd8211cdc"
+    "e11cdc201ce4611cdc201ce8811ec2611cd0a11cc8611cc2811dd861c1010ff4200fe1500ff4800e"
+    "00000000d11000000600000007cc3ca4833b9c033b94033da0833c94433890c30100000061200000"
+    "06000000130481860301000002000000075010cd14610000000000007120000003000000320e1022"
+    "8400fb020000000000000000650c00001f000000120394f000000000030000000600000006000000"
+    "4c000000010000005800000000000000580000000100000070000000000000000c00000013000000"
+    "1f000000080000000600000000000000700000000000000000000000010000000000000000000000"
+    "060000000000000006000000ffffffff00240000000000005d0c00000d0000001203946700000000"
+    "6b65726e656c31352e302e376e7670747836342d6e76696469612d637564613c737472696e673e00"
+    "00000000",
     (2, 3):  # (major, debug_major)
     "4243c0de3514000005000000620c30244a59be669dfbb4bf0b51804c01000000210c000080010000"
     "0b02210002000000160000000781239141c80449061032399201840c250508191e048b62800c4502"

From d388b624ff243941b1dc1f6f6b8650d3c30f1f98 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Mon, 10 Feb 2025 23:31:52 -0800
Subject: [PATCH 7/9] Comment out llvmlite in requirements.txt

---
 cuda_bindings/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_bindings/requirements.txt b/cuda_bindings/requirements.txt
index 2d78753841..eeffb93cb9 100644
--- a/cuda_bindings/requirements.txt
+++ b/cuda_bindings/requirements.txt
@@ -7,4 +7,4 @@ setuptools
 tomli; python_version < "3.11"
 wheel
 pywin32; sys_platform == 'win32'
-llvmlite
+# llvmlite # Uncomment to generate MINIMAL_NVVMIR_BITCODE_STATIC for test_nvvm.py

From e1950d5efeaa4c0bb3f739ae566cc26c10f50696 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 11 Feb 2025 10:19:47 -0800
Subject: [PATCH 8/9] Add "(see PR #443)"

---
 cuda_bindings/requirements.txt   | 2 +-
 cuda_bindings/tests/test_nvvm.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuda_bindings/requirements.txt b/cuda_bindings/requirements.txt
index eeffb93cb9..1ad127ac28 100644
--- a/cuda_bindings/requirements.txt
+++ b/cuda_bindings/requirements.txt
@@ -7,4 +7,4 @@ setuptools
 tomli; python_version < "3.11"
 wheel
 pywin32; sys_platform == 'win32'
-# llvmlite # Uncomment to generate MINIMAL_NVVMIR_BITCODE_STATIC for test_nvvm.py
+# llvmlite # Uncomment to generate MINIMAL_NVVMIR_BITCODE_STATIC for test_nvvm.py (see PR #443).
diff --git a/cuda_bindings/tests/test_nvvm.py b/cuda_bindings/tests/test_nvvm.py
index e356a077d1..753fdd9702 100644
--- a/cuda_bindings/tests/test_nvvm.py
+++ b/cuda_bindings/tests/test_nvvm.py
@@ -156,7 +156,7 @@ def minimal_nvvmir(request):
         MINIMAL_NVVMIR_CACHE["bitcode_static"] = bitcode_static
         if bitcode_static is None:
             if bitcode_dynamic is None:
-                raise RuntimeError("Please `pip install llvmlite` to generate `bitcode_static`")
+                raise RuntimeError("Please `pip install llvmlite` to generate `bitcode_static` (see PR #443)")
             bitcode_hex = binascii.hexlify(bitcode_dynamic).decode("ascii")
             print("\n\nMINIMAL_NVVMIR_BITCODE_STATIC = { # PLEASE ADD TO test_nvvm.py")
             print(f"    ({major}, {debug_major}):  # (major, debug_major)")

From 236fe897440a80933739e1c11d6329128f5bb493 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 11 Feb 2025 12:21:40 -0800
Subject: [PATCH 9/9] Make minimal_nvvmir fixture implementation slightly more
 readable.

---
 cuda_bindings/tests/test_nvvm.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/cuda_bindings/tests/test_nvvm.py b/cuda_bindings/tests/test_nvvm.py
index 753fdd9702..db8481a925 100644
--- a/cuda_bindings/tests/test_nvvm.py
+++ b/cuda_bindings/tests/test_nvvm.py
@@ -136,12 +136,15 @@
 
 @pytest.fixture(params=MINIMAL_NVVMIR_FIXTURE_PARAMS)
 def minimal_nvvmir(request):
-    for _ in range(2):
+    for pass_counter in range(2):
         nvvmir = MINIMAL_NVVMIR_CACHE.get(request.param, -1)
         if nvvmir != -1:
             if nvvmir is None:
                 pytest.skip(f"UNAVAILABLE: {request.param}")
             return nvvmir
+        if pass_counter:
+            raise AssertionError("This code path is meant to be unreachable.")
+        # Build cache entries, then try again (above).
         major, minor, debug_major, debug_minor = nvvm.ir_version()
         txt = MINIMAL_NVVMIR_TXT % (major, debug_major)
         if llvmlite_binding is None:
@@ -165,7 +168,6 @@ def minimal_nvvmir(request):
                 print(f'    "{line}"')
             print(f'    "{lines[-1]}",')
             print("}\n", flush=True)
-    raise AssertionError("This code path is meant to be unreachable.")
 
 
 @pytest.fixture(params=[nvvm.compile_program, nvvm.verify_program])