From dc3222ddb41f442d599eafdab53d923987f176d7 Mon Sep 17 00:00:00 2001
From: Abhilash Majumder
Date: Tue, 26 Aug 2025 19:09:04 +0530
Subject: [PATCH 01/42] nvvm ir integration

---
 cuda_core/cuda/core/experimental/_module.py  | 18 +++++
 cuda_core/cuda/core/experimental/_program.py | 65 +++++++++++++++++--
 .../core/experimental/_utils/cuda_utils.pyx  | 27 +++++++-
 3 files changed, 103 insertions(+), 7 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py
index 63bb6ff260..8b3146fee3 100644
--- a/cuda_core/cuda/core/experimental/_module.py
+++ b/cuda_core/cuda/core/experimental/_module.py
@@ -544,6 +544,24 @@ def from_ltoir(module: Union[bytes, str], *, name: str = "", symbol_mapping: Opt
         them (default to no mappings).
         """
         return ObjectCode._init(module, "ltoir", name=name, symbol_mapping=symbol_mapping)
+
+    @staticmethod
+    def from_nvvm(module: Union[bytes, str], *, name: str = "", symbol_mapping: Optional[dict] = None) -> "ObjectCode":
+        """Create an :class:`ObjectCode` instance from an existing NVVM IR.
+
+        Parameters
+        ----------
+        module : Union[bytes, str]
+            Either a bytes object containing the in-memory NVVM IR code to load, or
+            a file path string pointing to the on-disk NVVM IR file to load.
+        name : Optional[str]
+            A human-readable identifier representing this code object.
+        symbol_mapping : Optional[dict]
+            A dictionary specifying how the unmangled symbol names (as keys)
+            should be mapped to the mangled names before trying to retrieve
+            them (default to no mappings).
+        """
+        return ObjectCode._init(module, "nvvm", name=name, symbol_mapping=symbol_mapping)
 
     @staticmethod
     def from_fatbin(
diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index 3df8894d50..7782cfe5ed 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -24,6 +24,7 @@
     is_nested_sequence,
     is_sequence,
     nvrtc,
+    nvvm,
 )
 
 
@@ -370,22 +371,26 @@ class Program:
     code : Any
         String of the CUDA Runtime Compilation program.
     code_type : Any
-        String of the code type. Currently ``"ptx"`` and ``"c++"`` are supported.
+        String of the code type. Currently ``"ptx"``, ``"c++"``, and ``"nvvm"`` are supported.
     options : ProgramOptions, optional
         A ProgramOptions object to customize the compilation process.
         See :obj:`ProgramOptions` for more information.
     """

     class _MembersNeededForFinalize:
-        __slots__ = "handle"
+        __slots__ = "handle", "backend"

-        def __init__(self, program_obj, handle):
+        def __init__(self, program_obj, handle, backend="NVRTC"):
             self.handle = handle
+            self.backend = backend
             weakref.finalize(program_obj, self.close)

         def close(self):
             if self.handle is not None:
-                handle_return(nvrtc.nvrtcDestroyProgram(self.handle))
+                if self.backend == "NVRTC":
+                    handle_return(nvrtc.nvrtcDestroyProgram(self.handle))
+                elif self.backend == "NVVM":
+                    handle_return(nvvm.destroy_program(self.handle))
                 self.handle = None

     __slots__ = ("__weakref__", "_mnff", "_backend", "_linker", "_options")
@@ -402,6 +407,7 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
 
             # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
             self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), options._name, 0, [], []))
+            self._mnff.backend = "NVRTC"
             self._backend = "NVRTC"
             self._linker = None
 
@@ -411,8 +417,21 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
                 ObjectCode._init(code.encode(), code_type), options=self._translate_program_options(options)
             )
             self._backend = self._linker.backend
+
+        elif code_type == "nvvm":
+            if isinstance(code, str):
+                code = code.encode('utf-8')
+            elif not isinstance(code, (bytes, bytearray)):
+                raise TypeError("NVVM IR code must be provided as str, bytes, or bytearray")
+
+            self._mnff.handle = nvvm.create_program()
+            self._mnff.backend = "NVVM"
+            nvvm.add_module_to_program(self._mnff.handle, code, len(code), options._name.decode())
+            self._backend = "NVVM"
+            self._linker = None
+
         else:
-            supported_code_types = ("c++", "ptx")
+            supported_code_types = ("c++", "ptx", "nvvm")
             assert code_type not in supported_code_types, f"{code_type=}"
             raise RuntimeError(f"Unsupported {code_type=} ({supported_code_types=})")
 
@@ -513,6 +532,42 @@ def compile(self, target_type, name_expressions=(), logs=None):
 
             return ObjectCode._init(data, target_type, symbol_mapping=symbol_mapping, name=self._options.name)
 
+        elif self._backend == "NVVM":
+            if target_type != "ptx":
+                raise ValueError(f'NVVM backend only supports target_type="ptx", got "{target_type}"')
+
+            nvvm_options = []
+            if self._options.arch is not None:
+                arch = self._options.arch
+                if arch.startswith("sm_"):
+                    arch = f"compute_{arch[3:]}"
+                nvvm_options.append(f"-arch={arch}")
+            else:
+                major, minor = Device().compute_capability
+                nvvm_options.append(f"-arch=compute_{major}{minor}")
+
+            if self._options.debug:
+                nvvm_options.append("-g")
+            if self._options.device_code_optimize is False:
+                nvvm_options.append("-opt=0")
+            elif self._options.device_code_optimize is True:
+                nvvm_options.append("-opt=3")
+
+            nvvm.compile_program(self._mnff.handle, len(nvvm_options), nvvm_options)
+
+            size = nvvm.get_compiled_result_size(self._mnff.handle)
+            data = bytearray(size)
+            nvvm.get_compiled_result(self._mnff.handle, data)
+
+            if logs is not None:
+                logsize = nvvm.get_program_log_size(self._mnff.handle)
+                if logsize > 1:
+                    log = bytearray(logsize)
+                    nvvm.get_program_log(self._mnff.handle, log)
+                    logs.write(log.decode("utf-8", errors="backslashreplace"))
+
+            return ObjectCode._init(data, target_type, name=self._options.name)
+
         supported_backends = ("nvJitLink", "driver")
         if self._backend not in supported_backends:
             raise ValueError(f'Unsupported backend="{self._backend}" ({supported_backends=})')
diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx
index 86588f733c..a51b0c766c 100644
--- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx @@ -9,11 +9,11 @@ from collections.abc import Sequence from typing import Callable try: - from cuda.bindings import driver, nvrtc, runtime + from cuda.bindings import driver, nvrtc, nvvm, runtime except ImportError: from cuda import cuda as driver from cuda import cudart as runtime - from cuda import nvrtc + from cuda import nvrtc, nvvm from cuda.core.experimental._utils.driver_cu_result_explanations import DRIVER_CU_RESULT_EXPLANATIONS from cuda.core.experimental._utils.runtime_cuda_error_explanations import RUNTIME_CUDA_ERROR_EXPLANATIONS @@ -27,6 +27,10 @@ class NVRTCError(CUDAError): pass +class NVVMError(CUDAError): + pass + + ComputeCapability = namedtuple("ComputeCapability", ("major", "minor")) @@ -55,6 +59,7 @@ def _reduce_3_tuple(t: tuple): cdef object _DRIVER_SUCCESS = driver.CUresult.CUDA_SUCCESS cdef object _RUNTIME_SUCCESS = runtime.cudaError_t.cudaSuccess cdef object _NVRTC_SUCCESS = nvrtc.nvrtcResult.NVRTC_SUCCESS +cdef object _NVVM_SUCCESS = nvvm.Result.SUCCESS cpdef inline int _check_driver_error(error) except?-1: @@ -103,6 +108,22 @@ cpdef inline int _check_nvrtc_error(error, handle=None) except?-1: raise NVRTCError(err) +cpdef inline int _check_nvvm_error(error, handle=None) except?-1: + if error == _NVVM_SUCCESS: + return 0 + err = f"{error}: {nvvm.get_error_string(error)}" + if handle is not None: + try: + logsize = nvvm.get_program_log_size(handle) + if logsize > 1: + log = bytearray(logsize) + nvvm.get_program_log(handle, log) + err += f", compilation log:\n\n{log.decode('utf-8', errors='backslashreplace')}" + except: + pass # Log extraction failed, but we still have the error + raise NVVMError(err) + + cdef inline int _check_error(error, handle=None) except?-1: if isinstance(error, driver.CUresult): return _check_driver_error(error) @@ -110,6 +131,8 @@ cdef inline int _check_error(error, handle=None) except?-1: return _check_runtime_error(error) elif isinstance(error, nvrtc.nvrtcResult): return _check_nvrtc_error(error, handle=handle) + elif isinstance(error, nvvm.Result): + return _check_nvvm_error(error, handle=handle) else: raise RuntimeError(f"Unknown error type: {error}") From 028a2948e96d5b5b2c88066a1e4f0bfd8ef3c328 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder Date: Wed, 27 Aug 2025 20:10:18 +0530 Subject: [PATCH 02/42] add test --- .../core/experimental/_utils/cuda_utils.pyx | 4 +- cuda_core/tests/test_program.py | 72 ++++++++++++++++++- 2 files changed, 73 insertions(+), 3 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx index a51b0c766c..01d39e4f1f 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx @@ -119,8 +119,8 @@ cpdef inline int _check_nvvm_error(error, handle=None) except?-1: log = bytearray(logsize) nvvm.get_program_log(handle, log) err += f", compilation log:\n\n{log.decode('utf-8', errors='backslashreplace')}" - except: - pass # Log extraction failed, but we still have the error + except Exception as e: + raise NVVMError(err) from e raise NVVMError(err) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index e5c873f1fb..8c8b29ecfe 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -12,6 +12,37 @@ is_culink_backend = _linker._decide_nvjitlink_or_driver() +nvvm_ir = """ +target triple = 
"nvptx64-unknown-cuda" +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + +define i32 @ave(i32 %a, i32 %b) { +entry: + %add = add nsw i32 %a, %b + %div = sdiv i32 %add, 2 + ret i32 %div +} + +define void @simple(i32* %data) { +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() + %1 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + %mul = mul i32 %0, %1 + %2 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %add = add i32 %mul, %2 + %call = call i32 @ave(i32 %add, i32 %add) + %idxprom = sext i32 %add to i64 + store i32 %call, i32* %data, align 4 + ret void +} + +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() nounwind readnone + +declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() nounwind readnone + +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() nounwind readnone + +""" @pytest.fixture(scope="module") def ptx_code_object(): @@ -92,7 +123,7 @@ def test_program_init_valid_code_type(): def test_program_init_invalid_code_type(): code = "goto 100" with pytest.raises( - RuntimeError, match=r"^Unsupported code_type='fortran' \(supported_code_types=\('c\+\+', 'ptx'\)\)$" + RuntimeError, match=r"^Unsupported code_type='fortran' \(supported_code_types=\('c\+\+', 'ptx', 'nvvm'\)\)$" ): Program(code, "FORTRAN") @@ -150,3 +181,42 @@ def test_program_close(): program = Program(code, "c++") program.close() assert program.handle is None + +nvvm_options = [ + ProgramOptions(name="nvvm_test"), + ProgramOptions(device_code_optimize=True), + ProgramOptions(arch="sm_90"), + ProgramOptions(debug=True), +] + +@pytest.mark.parametrize("options", nvvm_options) +def test_nvvm_program_with_various_options(init_cuda, options): + program = Program(nvvm_ir, "nvvm", options) + assert program.backend == "NVVM" + program.compile("ptx") + program.close() + assert program.handle is None + + +def test_nvvm_program_creation(): + program = Program(nvvm_ir, "nvvm") + assert program.backend == "NVVM" + assert program.handle is not None + + +def test_nvvm_compile_invalid_target(): + program = Program(nvvm_ir, "nvvm") + with pytest.raises(ValueError): + program.compile("cubin") + + +def test_nvvm_compile_valid_target_type(init_cuda): + program = Program(nvvm_ir, "nvvm", options={"name": "nvvm_test"}) + ptx_object_code = program.compile("ptx") + assert isinstance(ptx_object_code, ObjectCode) + assert ptx_object_code.name == "nvvm_test" + + ptx_kernel = ptx_object_code.get_kernel("nvvm_kernel") + assert isinstance(ptx_kernel, Kernel) + + program.close() From a418f4bab98b4278b354c18841afabaca6674abc Mon Sep 17 00:00:00 2001 From: Abhilash Majumder Date: Mon, 1 Sep 2025 11:49:34 +0530 Subject: [PATCH 03/42] remove nvvm error handling from utils --- .../core/experimental/_utils/cuda_utils.pyx | 26 ------------------- 1 file changed, 26 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx index 01d39e4f1f..af7d212642 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx @@ -26,11 +26,6 @@ class CUDAError(Exception): class NVRTCError(CUDAError): pass - -class NVVMError(CUDAError): - pass - - ComputeCapability = namedtuple("ComputeCapability", ("major", "minor")) @@ -59,8 +54,6 @@ def _reduce_3_tuple(t: tuple): cdef object _DRIVER_SUCCESS = driver.CUresult.CUDA_SUCCESS cdef object _RUNTIME_SUCCESS = runtime.cudaError_t.cudaSuccess cdef object _NVRTC_SUCCESS = 
nvrtc.nvrtcResult.NVRTC_SUCCESS -cdef object _NVVM_SUCCESS = nvvm.Result.SUCCESS - cpdef inline int _check_driver_error(error) except?-1: if error == _DRIVER_SUCCESS: @@ -107,23 +100,6 @@ cpdef inline int _check_nvrtc_error(error, handle=None) except?-1: err += f", compilation log:\n\n{log.decode('utf-8', errors='backslashreplace')}" raise NVRTCError(err) - -cpdef inline int _check_nvvm_error(error, handle=None) except?-1: - if error == _NVVM_SUCCESS: - return 0 - err = f"{error}: {nvvm.get_error_string(error)}" - if handle is not None: - try: - logsize = nvvm.get_program_log_size(handle) - if logsize > 1: - log = bytearray(logsize) - nvvm.get_program_log(handle, log) - err += f", compilation log:\n\n{log.decode('utf-8', errors='backslashreplace')}" - except Exception as e: - raise NVVMError(err) from e - raise NVVMError(err) - - cdef inline int _check_error(error, handle=None) except?-1: if isinstance(error, driver.CUresult): return _check_driver_error(error) @@ -131,8 +107,6 @@ cdef inline int _check_error(error, handle=None) except?-1: return _check_runtime_error(error) elif isinstance(error, nvrtc.nvrtcResult): return _check_nvrtc_error(error, handle=handle) - elif isinstance(error, nvvm.Result): - return _check_nvvm_error(error, handle=handle) else: raise RuntimeError(f"Unknown error type: {error}") From 92af3dd5c53146e2734262528ae9ecde95c06322 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder Date: Mon, 1 Sep 2025 15:20:41 +0530 Subject: [PATCH 04/42] use version dependent nvvm inclusion --- cuda_core/cuda/core/experimental/_program.py | 65 +++++++++++++++++-- .../core/experimental/_utils/cuda_utils.pyx | 4 +- 2 files changed, 63 insertions(+), 6 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 7782cfe5ed..63c903736d 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -21,12 +21,63 @@ check_or_create_options, driver, handle_return, + get_binding_version, is_nested_sequence, is_sequence, - nvrtc, - nvvm, + nvrtc ) +_nvvm_module = None +_nvvm_import_attempted = False + + +def _get_nvvm_module(): + """ + Handles the import of NVVM module with version and availability checks. + NVVM bindings were added in CUDA 12.9.0, so we need to handle cases where: + 1. cuda.bindings is not new enough (< 12.9.0) + 2. libnvvm is not found in the Python environment + + Returns: + The nvvm module if available and working + + Raises: + ImportError: If NVVM is not available due to version or library issues + """ + global _nvvm_module, _nvvm_import_attempted + + if _nvvm_import_attempted: + if _nvvm_module is None: + raise ImportError("NVVM module is not available (previous import attempt failed)") + return _nvvm_module + + _nvvm_import_attempted = True + + try: + version = get_binding_version() + if version < (12, 9): + raise ImportError( + f"NVVM bindings require cuda-bindings >= 12.9.0, but found {version[0]}.{version[1]}.x. " + "Please update cuda-bindings to use NVVM features." + ) + + from cuda.bindings import nvvm + try: + from cuda.bindings._internal.nvvm import _inspect_function_pointers + _inspect_function_pointers() + except Exception as e: + raise ImportError( + "NVVM library (libnvvm) is not available in this Python environment. 
" + f"Original error: {e}" + ) + + _nvvm_module = nvvm + return _nvvm_module + + except ImportError as e: + _nvvm_module = None + raise e + def _process_define_macro_inner(formatted_options, macro): if isinstance(macro, str): @@ -390,7 +441,11 @@ def close(self): if self.backend == "NVRTC": handle_return(nvrtc.nvrtcDestroyProgram(self.handle)) elif self.backend == "NVVM": - handle_return(nvvm.destroy_program(self.handle)) + try: + nvvm = _get_nvvm_module() + handle_return(nvvm.destroy_program(self.handle)) + except ImportError as e: + pass self.handle = None __slots__ = ("__weakref__", "_mnff", "_backend", "_linker", "_options") @@ -424,6 +479,7 @@ def __init__(self, code, code_type, options: ProgramOptions = None): elif not isinstance(code, (bytes, bytearray)): raise TypeError("NVVM IR code must be provided as str, bytes, or bytearray") + nvvm = _get_nvvm_module() self._mnff.handle = nvvm.create_program() self._mnff.backend = "NVVM" nvvm.add_module_to_program(self._mnff.handle, code, len(code), options._name.decode()) @@ -552,7 +608,8 @@ def compile(self, target_type, name_expressions=(), logs=None): nvvm_options.append("-opt=0") elif self._options.device_code_optimize is True: nvvm_options.append("-opt=3") - + + nvvm = _get_nvvm_module() nvvm.compile_program(self._mnff.handle, len(nvvm_options), nvvm_options) size = nvvm.get_compiled_result_size(self._mnff.handle) diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx index af7d212642..4a9d61aee9 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx @@ -9,11 +9,11 @@ from collections.abc import Sequence from typing import Callable try: - from cuda.bindings import driver, nvrtc, nvvm, runtime + from cuda.bindings import driver, nvrtc, runtime except ImportError: from cuda import cuda as driver from cuda import cudart as runtime - from cuda import nvrtc, nvvm + from cuda import nvrtc from cuda.core.experimental._utils.driver_cu_result_explanations import DRIVER_CU_RESULT_EXPLANATIONS from cuda.core.experimental._utils.runtime_cuda_error_explanations import RUNTIME_CUDA_ERROR_EXPLANATIONS From bdd167111e88b6a45ab4d36df66713a2724094e9 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder Date: Mon, 1 Sep 2025 22:54:58 +0530 Subject: [PATCH 05/42] fix nvvm compilation flow and test --- cuda_core/cuda/core/experimental/_program.py | 5 +- cuda_core/tests/test_program.py | 179 +++++++++++++++---- 2 files changed, 147 insertions(+), 37 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 63c903736d..7034e68a1c 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -443,7 +443,7 @@ def close(self): elif self.backend == "NVVM": try: nvvm = _get_nvvm_module() - handle_return(nvvm.destroy_program(self.handle)) + nvvm.destroy_program(self.handle) except ImportError as e: pass self.handle = None @@ -623,7 +623,8 @@ def compile(self, target_type, name_expressions=(), logs=None): nvvm.get_program_log(self._mnff.handle, log) logs.write(log.decode("utf-8", errors="backslashreplace")) - return ObjectCode._init(data, target_type, name=self._options.name) + data_bytes = bytes(data) + return ObjectCode._init(data_bytes, target_type, name=self._options.name) supported_backends = ("nvJitLink", "driver") if self._backend not in supported_backends: diff --git a/cuda_core/tests/test_program.py 
b/cuda_core/tests/test_program.py index 8c8b29ecfe..bb1ba02d9b 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -12,8 +12,52 @@ is_culink_backend = _linker._decide_nvjitlink_or_driver() -nvvm_ir = """ -target triple = "nvptx64-unknown-cuda" +def get_nvvm_ir(): + """Generate working NVVM IR with proper version metadata""" + try: + from cuda.core.experimental._program import _get_nvvm_module + nvvm = _get_nvvm_module() + major, minor, debug_major, debug_minor = nvvm.ir_version() + + + nvvm_ir_template = '''target triple = "nvptx64-unknown-cuda" +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + +define i32 @ave(i32 %a, i32 %b) {{ +entry: + %add = add nsw i32 %a, %b + %div = sdiv i32 %add, 2 + ret i32 %div +}} + +define void @simple(i32* %data) {{ +entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() + %1 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + %mul = mul i32 %0, %1 + %2 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %add = add i32 %mul, %2 + %call = call i32 @ave(i32 %add, i32 %add) + %idxprom = sext i32 %add to i64 + store i32 %call, i32* %data, align 4 + ret void +}} + +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() nounwind readnone + +!nvvm.annotations = !{{!0}} +!0 = !{{void (i32*)* @simple, !"kernel", i32 1}} + +!nvvmir.version = !{{!1}} +!1 = !{{i32 {major}, i32 0, i32 {debug_major}, i32 0}} +''' + + return nvvm_ir_template.format(major=major, debug_major=debug_major) + except Exception: + + return """target triple = "nvptx64-unknown-cuda" target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" define i32 @ave(i32 %a, i32 %b) { @@ -37,13 +81,15 @@ } declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() nounwind readnone - declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() nounwind readnone - declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() nounwind readnone +!nvvm.annotations = !{!0} +!0 = !{void (i32*)* @simple, !"kernel", i32 1} """ +nvvm_ir = get_nvvm_ir() + @pytest.fixture(scope="module") def ptx_code_object(): code = 'extern "C" __global__ void my_kernel() {}' @@ -182,41 +228,104 @@ def test_program_close(): program.close() assert program.handle is None -nvvm_options = [ - ProgramOptions(name="nvvm_test"), - ProgramOptions(device_code_optimize=True), - ProgramOptions(arch="sm_90"), - ProgramOptions(debug=True), -] -@pytest.mark.parametrize("options", nvvm_options) -def test_nvvm_program_with_various_options(init_cuda, options): - program = Program(nvvm_ir, "nvvm", options) - assert program.backend == "NVVM" - program.compile("ptx") - program.close() - assert program.handle is None -def test_nvvm_program_creation(): - program = Program(nvvm_ir, "nvvm") - assert program.backend == "NVVM" - assert program.handle is not None +def test_nvvm_deferred_import(): + """Test that our deferred NVVM import works correctly""" + try: + from cuda.core.experimental._program import _get_nvvm_module + nvvm = _get_nvvm_module() + assert nvvm is not None + except ImportError as e: + pytest.skip(f"NVVM not available: {e}") -def test_nvvm_compile_invalid_target(): - program = Program(nvvm_ir, "nvvm") - with pytest.raises(ValueError): - program.compile("cubin") +def test_nvvm_program_creation(): + """Test basic NVVM program 
creation""" + try: + program = Program(nvvm_ir, "nvvm") + assert program.backend == "NVVM" + assert program.handle is not None + program.close() + except ImportError as e: + pytest.skip(f"NVVM not available: {e}") -def test_nvvm_compile_valid_target_type(init_cuda): - program = Program(nvvm_ir, "nvvm", options={"name": "nvvm_test"}) - ptx_object_code = program.compile("ptx") - assert isinstance(ptx_object_code, ObjectCode) - assert ptx_object_code.name == "nvvm_test" - - ptx_kernel = ptx_object_code.get_kernel("nvvm_kernel") - assert isinstance(ptx_kernel, Kernel) - - program.close() +def test_nvvm_compile_invalid_target(): + """Test that NVVM programs reject invalid compilation targets""" + try: + program = Program(nvvm_ir, "nvvm") + with pytest.raises(ValueError, match="NVVM backend only supports target_type=\"ptx\""): + program.compile("cubin") + program.close() + except ImportError as e: + pytest.skip(f"NVVM not available: {e}") + + +def test_nvvm_compile_to_ptx(init_cuda): + """Test NVVM IR compilation to PTX""" + try: + options = ProgramOptions(name="nvvm_test", arch="sm_90", device_code_optimize=False) + program = Program(nvvm_ir, "nvvm", options=options) + try: + ptx_object_code = program.compile("ptx") + assert isinstance(ptx_object_code, ObjectCode) + assert ptx_object_code.name == "nvvm_test" + assert ptx_object_code._code_type == "ptx" + + ptx_code = ptx_object_code.code + if isinstance(ptx_code, bytes): + ptx_text = ptx_code.decode() + else: + ptx_text = str(ptx_code) + assert ".visible .entry simple(" in ptx_text + + ptx_kernel = ptx_object_code.get_kernel("simple") + assert isinstance(ptx_kernel, Kernel) + + except Exception as e: + if any(error in str(e) for error in ["ERROR_IR_VERSION_MISMATCH", "ERROR_INVALID_OPTION", "ERROR_COMPILATION"]): + pytest.skip(f"NVVM IR not compatible with this CUDA version: {e}") + else: + raise + finally: + program.close() + + except ImportError as e: + pytest.skip(f"NVVM not available: {e}") + + +@pytest.mark.parametrize("options", [ + ProgramOptions(name="test1", arch="sm_90", device_code_optimize=False), + ProgramOptions(name="test2", arch="sm_90", device_code_optimize=False), + ProgramOptions(name="test3", arch="sm_90", device_code_optimize=True), +]) +def test_nvvm_program_options(init_cuda, options): + """Test NVVM programs with different options""" + try: + program = Program(nvvm_ir, "nvvm", options) + assert program.backend == "NVVM" + + try: + ptx_code = program.compile("ptx") + assert isinstance(ptx_code, ObjectCode) + assert ptx_code.name == options.name + + code_content = ptx_code.code + if isinstance(code_content, bytes): + ptx_text = code_content.decode() + else: + ptx_text = str(code_content) + assert ".visible .entry simple(" in ptx_text + + except Exception as e: + if any(error in str(e) for error in ["ERROR_IR_VERSION_MISMATCH", "ERROR_INVALID_OPTION", "ERROR_COMPILATION"]): + pytest.skip(f"NVVM compilation not supported: {e}") + else: + raise + finally: + program.close() + + except ImportError as e: + pytest.skip(f"NVVM not available: {e}") \ No newline at end of file From dc9a4e3dd3ba8e169cf46371c7687ae754136a45 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder Date: Wed, 3 Sep 2025 17:38:31 +0530 Subject: [PATCH 06/42] refactor --- cuda_core/cuda/core/experimental/_module.py | 18 ------ cuda_core/cuda/core/experimental/_program.py | 63 +++++++++++--------- 2 files changed, 35 insertions(+), 46 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py index 
8b3146fee3..63bb6ff260 100644 --- a/cuda_core/cuda/core/experimental/_module.py +++ b/cuda_core/cuda/core/experimental/_module.py @@ -544,24 +544,6 @@ def from_ltoir(module: Union[bytes, str], *, name: str = "", symbol_mapping: Opt them (default to no mappings). """ return ObjectCode._init(module, "ltoir", name=name, symbol_mapping=symbol_mapping) - - @staticmethod - def from_nvvm(module: Union[bytes, str], *, name: str = "", symbol_mapping: Optional[dict] = None) -> "ObjectCode": - """Create an :class:`ObjectCode` instance from an existing NVVM IR. - - Parameters - ---------- - module : Union[bytes, str] - Either a bytes object containing the in-memory NVVM IR code to load, or - a file path string pointing to the on-disk NVVM IR file to load. - name : Optional[str] - A human-readable identifier representing this code object. - symbol_mapping : Optional[dict] - A dictionary specifying how the unmangled symbol names (as keys) - should be mapped to the mangled names before trying to retrieve - them (default to no mappings). - """ - return ObjectCode._init(module, "nvvm", name=name, symbol_mapping=symbol_mapping) @staticmethod def from_fatbin( diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index e2c8bf6463..497b68f669 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -62,13 +62,12 @@ def _get_nvvm_module(): ) from cuda.bindings import nvvm - try: - from cuda.bindings._internal.nvvm import _inspect_function_pointers - _inspect_function_pointers() - except Exception as e: + from cuda.bindings._internal.nvvm import _inspect_function_pointer + if _inspect_function_pointer("__nvvmCreateProgram") == 0: raise ImportError( "NVVM library (libnvvm) is not available in this Python environment. 
" f"Original error: {e}" + ) _nvvm_module = nvvm @@ -431,7 +430,7 @@ class Program: class _MembersNeededForFinalize: __slots__ = "handle", "backend" - def __init__(self, program_obj, handle, backend="NVRTC"): + def __init__(self, program_obj, handle, backend): self.handle = handle self.backend = backend weakref.finalize(program_obj, self.close) @@ -441,17 +440,14 @@ def close(self): if self.backend == "NVRTC": handle_return(nvrtc.nvrtcDestroyProgram(self.handle)) elif self.backend == "NVVM": - try: - nvvm = _get_nvvm_module() - nvvm.destroy_program(self.handle) - except ImportError as e: - pass + nvvm = _get_nvvm_module() + nvvm.destroy_program(self.handle) self.handle = None __slots__ = ("__weakref__", "_mnff", "_backend", "_linker", "_options") def __init__(self, code, code_type, options: ProgramOptions = None): - self._mnff = Program._MembersNeededForFinalize(self, None) + self._mnff = Program._MembersNeededForFinalize(self, None, None) self._options = options = check_or_create_options(ProgramOptions, options, "Program options") code_type = code_type.lower() @@ -507,6 +503,27 @@ def _translate_program_options(self, options: ProgramOptions) -> LinkerOptions: split_compile=options.split_compile, ptxas_options=options.ptxas_options, ) + + def _translate_program_options_to_nvvm(self, options: ProgramOptions) -> List[str]: + """Translate ProgramOptions to NVVM-specific compilation options.""" + nvvm_options = [] + + if options.arch is not None: + arch = options.arch + if arch.startswith("sm_"): + arch = f"compute_{arch[3:]}" + nvvm_options.append(f"-arch={arch}") + else: + major, minor = Device().compute_capability + nvvm_options.append(f"-arch=compute_{major}{minor}") + if options.debug: + nvvm_options.append("-g") + if options.device_code_optimize is False: + nvvm_options.append("-opt=0") + elif options.device_code_optimize is True: + nvvm_options.append("-opt=3") + + return nvvm_options def close(self): """Destroy this program.""" @@ -592,23 +609,7 @@ def compile(self, target_type, name_expressions=(), logs=None): if target_type != "ptx": raise ValueError(f'NVVM backend only supports target_type="ptx", got "{target_type}"') - nvvm_options = [] - if self._options.arch is not None: - arch = self._options.arch - if arch.startswith("sm_"): - arch = f"compute_{arch[3:]}" - nvvm_options.append(f"-arch={arch}") - else: - major, minor = Device().compute_capability - nvvm_options.append(f"-arch=compute_{major}{minor}") - - if self._options.debug: - nvvm_options.append("-g") - if self._options.device_code_optimize is False: - nvvm_options.append("-opt=0") - elif self._options.device_code_optimize is True: - nvvm_options.append("-opt=3") - + nvvm_options = self._translate_program_options_to_nvvm(self._options) nvvm = _get_nvvm_module() nvvm.compile_program(self._mnff.handle, len(nvvm_options), nvvm_options) @@ -650,3 +651,9 @@ def handle(self) -> ProgramHandleT: handle, call ``int(Program.handle)``. """ return self._mnff.handle +aution:: + + This handle is a Python object. To get the memory address of the underlying C + handle, call ``int(Program.handle)``. 
+ """ + return self._mnff.handle From 22d18b9bba81c10e31ee261d9c86a7bd107671aa Mon Sep 17 00:00:00 2001 From: Abhilash Majumder Date: Wed, 3 Sep 2025 17:41:25 +0530 Subject: [PATCH 07/42] fix unwanted rebase --- cuda_core/cuda/core/experimental/_program.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 497b68f669..3b566df5b6 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -651,9 +651,4 @@ def handle(self) -> ProgramHandleT: handle, call ``int(Program.handle)``. """ return self._mnff.handle -aution:: - - This handle is a Python object. To get the memory address of the underlying C - handle, call ``int(Program.handle)``. - """ - return self._mnff.handle + \ No newline at end of file From 0f7fda4979a10868b5a74ecdc23668edbe7e8233 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder Date: Wed, 3 Sep 2025 18:40:25 +0530 Subject: [PATCH 08/42] fix core linter errors --- cuda_core/cuda/core/experimental/_program.py | 39 +++++++++----------- cuda_core/tests/test_program.py | 26 ++++++------- 2 files changed, 31 insertions(+), 34 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 3b566df5b6..8927534d2a 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -20,11 +20,11 @@ _handle_boolean_option, check_or_create_options, driver, - handle_return, get_binding_version, + handle_return, is_nested_sequence, is_sequence, - nvrtc + nvrtc, ) _nvvm_module = None @@ -37,22 +37,22 @@ def _get_nvvm_module(): NVVM bindings were added in CUDA 12.9.0, so we need to handle cases where: 1. cuda.bindings is not new enough (< 12.9.0) 2. libnvvm is not found in the Python environment - + Returns: The nvvm module if available and working - + Raises: ImportError: If NVVM is not available due to version or library issues """ global _nvvm_module, _nvvm_import_attempted - + if _nvvm_import_attempted: if _nvvm_module is None: raise ImportError("NVVM module is not available (previous import attempt failed)") return _nvvm_module - + _nvvm_import_attempted = True - + try: version = get_binding_version() if version < (12, 9): @@ -60,19 +60,17 @@ def _get_nvvm_module(): f"NVVM bindings require cuda-bindings >= 12.9.0, but found {version[0]}.{version[1]}.x. " "Please update cuda-bindings to use NVVM features." ) - + from cuda.bindings import nvvm from cuda.bindings._internal.nvvm import _inspect_function_pointer if _inspect_function_pointer("__nvvmCreateProgram") == 0: raise ImportError( "NVVM library (libnvvm) is not available in this Python environment. 
" - f"Original error: {e}" - ) - + _nvvm_module = nvvm return _nvvm_module - + except ImportError as e: _nvvm_module = None raise e @@ -474,7 +472,7 @@ def __init__(self, code, code_type, options: ProgramOptions = None): code = code.encode('utf-8') elif not isinstance(code, (bytes, bytearray)): raise TypeError("NVVM IR code must be provided as str, bytes, or bytearray") - + nvvm = _get_nvvm_module() self._mnff.handle = nvvm.create_program() self._mnff.backend = "NVVM" @@ -503,11 +501,11 @@ def _translate_program_options(self, options: ProgramOptions) -> LinkerOptions: split_compile=options.split_compile, ptxas_options=options.ptxas_options, ) - + def _translate_program_options_to_nvvm(self, options: ProgramOptions) -> List[str]: """Translate ProgramOptions to NVVM-specific compilation options.""" nvvm_options = [] - + if options.arch is not None: arch = options.arch if arch.startswith("sm_"): @@ -522,7 +520,7 @@ def _translate_program_options_to_nvvm(self, options: ProgramOptions) -> List[st nvvm_options.append("-opt=0") elif options.device_code_optimize is True: nvvm_options.append("-opt=3") - + return nvvm_options def close(self): @@ -608,22 +606,22 @@ def compile(self, target_type, name_expressions=(), logs=None): elif self._backend == "NVVM": if target_type != "ptx": raise ValueError(f'NVVM backend only supports target_type="ptx", got "{target_type}"') - + nvvm_options = self._translate_program_options_to_nvvm(self._options) nvvm = _get_nvvm_module() nvvm.compile_program(self._mnff.handle, len(nvvm_options), nvvm_options) - + size = nvvm.get_compiled_result_size(self._mnff.handle) data = bytearray(size) nvvm.get_compiled_result(self._mnff.handle, data) - + if logs is not None: logsize = nvvm.get_program_log_size(self._mnff.handle) if logsize > 1: log = bytearray(logsize) nvvm.get_program_log(self._mnff.handle, log) logs.write(log.decode("utf-8", errors="backslashreplace")) - + data_bytes = bytes(data) return ObjectCode._init(data_bytes, target_type, name=self._options.name) @@ -651,4 +649,3 @@ def handle(self) -> ProgramHandleT: handle, call ``int(Program.handle)``. 
""" return self._mnff.handle - \ No newline at end of file diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index bb1ba02d9b..c258e5b4d6 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -18,8 +18,8 @@ def get_nvvm_ir(): from cuda.core.experimental._program import _get_nvvm_module nvvm = _get_nvvm_module() major, minor, debug_major, debug_minor = nvvm.ir_version() - - + + nvvm_ir_template = '''target triple = "nvptx64-unknown-cuda" target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" @@ -53,10 +53,10 @@ def get_nvvm_ir(): !nvvmir.version = !{{!1}} !1 = !{{i32 {major}, i32 0, i32 {debug_major}, i32 0}} ''' - + return nvvm_ir_template.format(major=major, debug_major=debug_major) except Exception: - + return """target triple = "nvptx64-unknown-cuda" target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" @@ -273,17 +273,17 @@ def test_nvvm_compile_to_ptx(init_cuda): assert isinstance(ptx_object_code, ObjectCode) assert ptx_object_code.name == "nvvm_test" assert ptx_object_code._code_type == "ptx" - + ptx_code = ptx_object_code.code if isinstance(ptx_code, bytes): ptx_text = ptx_code.decode() else: ptx_text = str(ptx_code) assert ".visible .entry simple(" in ptx_text - + ptx_kernel = ptx_object_code.get_kernel("simple") assert isinstance(ptx_kernel, Kernel) - + except Exception as e: if any(error in str(e) for error in ["ERROR_IR_VERSION_MISMATCH", "ERROR_INVALID_OPTION", "ERROR_COMPILATION"]): pytest.skip(f"NVVM IR not compatible with this CUDA version: {e}") @@ -291,7 +291,7 @@ def test_nvvm_compile_to_ptx(init_cuda): raise finally: program.close() - + except ImportError as e: pytest.skip(f"NVVM not available: {e}") @@ -306,19 +306,19 @@ def test_nvvm_program_options(init_cuda, options): try: program = Program(nvvm_ir, "nvvm", options) assert program.backend == "NVVM" - + try: ptx_code = program.compile("ptx") assert isinstance(ptx_code, ObjectCode) assert ptx_code.name == options.name - + code_content = ptx_code.code if isinstance(code_content, bytes): ptx_text = code_content.decode() else: ptx_text = str(code_content) assert ".visible .entry simple(" in ptx_text - + except Exception as e: if any(error in str(e) for error in ["ERROR_IR_VERSION_MISMATCH", "ERROR_INVALID_OPTION", "ERROR_COMPILATION"]): pytest.skip(f"NVVM compilation not supported: {e}") @@ -326,6 +326,6 @@ def test_nvvm_program_options(init_cuda, options): raise finally: program.close() - + except ImportError as e: - pytest.skip(f"NVVM not available: {e}") \ No newline at end of file + pytest.skip(f"NVVM not available: {e}") From 9ed80511963941c873754cdc6ed2cc05e748e56c Mon Sep 17 00:00:00 2001 From: Abhilash Majumder Date: Wed, 3 Sep 2025 20:58:45 +0530 Subject: [PATCH 09/42] refactor tests --- cuda_core/tests/test_program.py | 144 ++++++++++++-------------------- 1 file changed, 52 insertions(+), 92 deletions(-) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index c258e5b4d6..c511b22567 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -12,14 +12,29 @@ is_culink_backend = _linker._decide_nvjitlink_or_driver() -def get_nvvm_ir(): +def _is_nvvm_available(): + """Check if NVVM is available.""" + try: + from cuda.core.experimental._program import _get_nvvm_module + 
_get_nvvm_module() + return True + except ImportError: + return False + +nvvm_available = pytest.mark.skipif( + not _is_nvvm_available(), + reason="NVVM not available (libNVVM not found or cuda-bindings < 12.9.0)" +) + +@pytest.fixture(scope="session") +def nvvm_ir(): """Generate working NVVM IR with proper version metadata""" try: from cuda.core.experimental._program import _get_nvvm_module nvvm = _get_nvvm_module() major, minor, debug_major, debug_minor = nvvm.ir_version() - - + + nvvm_ir_template = '''target triple = "nvptx64-unknown-cuda" target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" @@ -53,10 +68,10 @@ def get_nvvm_ir(): !nvvmir.version = !{{!1}} !1 = !{{i32 {major}, i32 0, i32 {debug_major}, i32 0}} ''' - + return nvvm_ir_template.format(major=major, debug_major=debug_major) except Exception: - + return """target triple = "nvptx64-unknown-cuda" target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" @@ -88,8 +103,6 @@ def get_nvvm_ir(): !0 = !{void (i32*)* @simple, !"kernel", i32 1} """ -nvvm_ir = get_nvvm_ir() - @pytest.fixture(scope="module") def ptx_code_object(): code = 'extern "C" __global__ void my_kernel() {}' @@ -231,101 +244,48 @@ def test_program_close(): +@nvvm_available def test_nvvm_deferred_import(): """Test that our deferred NVVM import works correctly""" - try: - from cuda.core.experimental._program import _get_nvvm_module - nvvm = _get_nvvm_module() - assert nvvm is not None - except ImportError as e: - pytest.skip(f"NVVM not available: {e}") + from cuda.core.experimental._program import _get_nvvm_module + nvvm = _get_nvvm_module() + assert nvvm is not None -def test_nvvm_program_creation(): +@nvvm_available +def test_nvvm_program_creation(nvvm_ir): """Test basic NVVM program creation""" - try: - program = Program(nvvm_ir, "nvvm") - assert program.backend == "NVVM" - assert program.handle is not None - program.close() - except ImportError as e: - pytest.skip(f"NVVM not available: {e}") + program = Program(nvvm_ir, "nvvm") + assert program.backend == "NVVM" + assert program.handle is not None + program.close() -def test_nvvm_compile_invalid_target(): +@nvvm_available +def test_nvvm_compile_invalid_target(nvvm_ir): """Test that NVVM programs reject invalid compilation targets""" - try: - program = Program(nvvm_ir, "nvvm") - with pytest.raises(ValueError, match="NVVM backend only supports target_type=\"ptx\""): - program.compile("cubin") - program.close() - except ImportError as e: - pytest.skip(f"NVVM not available: {e}") - - -def test_nvvm_compile_to_ptx(init_cuda): - """Test NVVM IR compilation to PTX""" - try: - options = ProgramOptions(name="nvvm_test", arch="sm_90", device_code_optimize=False) - program = Program(nvvm_ir, "nvvm", options=options) - try: - ptx_object_code = program.compile("ptx") - assert isinstance(ptx_object_code, ObjectCode) - assert ptx_object_code.name == "nvvm_test" - assert ptx_object_code._code_type == "ptx" - - ptx_code = ptx_object_code.code - if isinstance(ptx_code, bytes): - ptx_text = ptx_code.decode() - else: - ptx_text = str(ptx_code) - assert ".visible .entry simple(" in ptx_text - - ptx_kernel = ptx_object_code.get_kernel("simple") - assert isinstance(ptx_kernel, Kernel) - - except Exception as e: - if any(error in str(e) for error in ["ERROR_IR_VERSION_MISMATCH", "ERROR_INVALID_OPTION", 
"ERROR_COMPILATION"]): - pytest.skip(f"NVVM IR not compatible with this CUDA version: {e}") - else: - raise - finally: - program.close() - - except ImportError as e: - pytest.skip(f"NVVM not available: {e}") - + program = Program(nvvm_ir, "nvvm") + with pytest.raises(ValueError, match="NVVM backend only supports target_type=\"ptx\""): + program.compile("cubin") + program.close() +@nvvm_available @pytest.mark.parametrize("options", [ ProgramOptions(name="test1", arch="sm_90", device_code_optimize=False), - ProgramOptions(name="test2", arch="sm_90", device_code_optimize=False), - ProgramOptions(name="test3", arch="sm_90", device_code_optimize=True), + ProgramOptions(name="test2", arch="sm_100", device_code_optimize=False), + ProgramOptions(name="test3", arch="sm_110", device_code_optimize=True), ]) -def test_nvvm_program_options(init_cuda, options): +def test_nvvm_program_options(init_cuda, nvvm_ir, options): """Test NVVM programs with different options""" - try: - program = Program(nvvm_ir, "nvvm", options) - assert program.backend == "NVVM" - - try: - ptx_code = program.compile("ptx") - assert isinstance(ptx_code, ObjectCode) - assert ptx_code.name == options.name - - code_content = ptx_code.code - if isinstance(code_content, bytes): - ptx_text = code_content.decode() - else: - ptx_text = str(code_content) - assert ".visible .entry simple(" in ptx_text - - except Exception as e: - if any(error in str(e) for error in ["ERROR_IR_VERSION_MISMATCH", "ERROR_INVALID_OPTION", "ERROR_COMPILATION"]): - pytest.skip(f"NVVM compilation not supported: {e}") - else: - raise - finally: - program.close() - - except ImportError as e: - pytest.skip(f"NVVM not available: {e}") + program = Program(nvvm_ir, "nvvm", options) + assert program.backend == "NVVM" + + ptx_code = program.compile("ptx") + assert isinstance(ptx_code, ObjectCode) + assert ptx_code.name == options.name + + code_content = ptx_code.code + ptx_text = code_content.decode() if isinstance(code_content, bytes) else str(code_content) + assert ".visible .entry simple(" in ptx_text + + program.close() \ No newline at end of file From bccda4709650b9c9042f16cf5de951844b961db4 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder Date: Wed, 3 Sep 2025 20:59:49 +0530 Subject: [PATCH 10/42] refactor --- cuda_core/tests/test_program.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index c511b22567..64ecfb9c75 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -28,7 +28,12 @@ def _is_nvvm_available(): @pytest.fixture(scope="session") def nvvm_ir(): - """Generate working NVVM IR with proper version metadata""" + """Generate working NVVM IR with proper version metadata + The try clause here is used for older nvvm modules which + might not have an ir_version() method. 
In which case the + fallback assumes no version metadata will be present in + the input nvvm ir + """ try: from cuda.core.experimental._program import _get_nvvm_module nvvm = _get_nvvm_module() From 64436c174f63a0e5b0210da9903fbc83023188c3 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder Date: Wed, 3 Sep 2025 21:01:35 +0530 Subject: [PATCH 11/42] refactor --- cuda_core/tests/test_program.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 64ecfb9c75..126deb2d03 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -28,18 +28,18 @@ def _is_nvvm_available(): @pytest.fixture(scope="session") def nvvm_ir(): - """Generate working NVVM IR with proper version metadata + """Generate working NVVM IR with proper version metadata. The try clause here is used for older nvvm modules which - might not have an ir_version() method. In which case the - fallback assumes no version metadata will be present in + might not have an ir_version() method. In which case the + fallback assumes no version metadata will be present in the input nvvm ir """ try: from cuda.core.experimental._program import _get_nvvm_module nvvm = _get_nvvm_module() major, minor, debug_major, debug_minor = nvvm.ir_version() - - + + nvvm_ir_template = '''target triple = "nvptx64-unknown-cuda" target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" @@ -73,10 +73,10 @@ def nvvm_ir(): !nvvmir.version = !{{!1}} !1 = !{{i32 {major}, i32 0, i32 {debug_major}, i32 0}} ''' - + return nvvm_ir_template.format(major=major, debug_major=debug_major) except Exception: - + return """target triple = "nvptx64-unknown-cuda" target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" @@ -284,13 +284,13 @@ def test_nvvm_program_options(init_cuda, nvvm_ir, options): """Test NVVM programs with different options""" program = Program(nvvm_ir, "nvvm", options) assert program.backend == "NVVM" - + ptx_code = program.compile("ptx") assert isinstance(ptx_code, ObjectCode) assert ptx_code.name == options.name - + code_content = ptx_code.code ptx_text = code_content.decode() if isinstance(code_content, bytes) else str(code_content) assert ".visible .entry simple(" in ptx_text - - program.close() \ No newline at end of file + + program.close() From 58317d0601a56a7418d1e17dc3594c3273da660b Mon Sep 17 00:00:00 2001 From: Abhilash Majumder Date: Thu, 4 Sep 2025 08:59:16 +0530 Subject: [PATCH 12/42] ruff format --- cuda_core/cuda/core/experimental/_program.py | 7 ++- cuda_core/tests/test_program.py | 48 ++++++++++++-------- 2 files changed, 31 insertions(+), 24 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 8927534d2a..20159b03e0 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -63,10 +63,9 @@ def _get_nvvm_module(): from cuda.bindings import nvvm from cuda.bindings._internal.nvvm import _inspect_function_pointer + if _inspect_function_pointer("__nvvmCreateProgram") == 0: - raise ImportError( - "NVVM library (libnvvm) is not available in this Python environment. " - ) + raise ImportError("NVVM library (libnvvm) is not available in this Python environment. 
") _nvvm_module = nvvm return _nvvm_module @@ -469,7 +468,7 @@ def __init__(self, code, code_type, options: ProgramOptions = None): elif code_type == "nvvm": if isinstance(code, str): - code = code.encode('utf-8') + code = code.encode("utf-8") elif not isinstance(code, (bytes, bytearray)): raise TypeError("NVVM IR code must be provided as str, bytes, or bytearray") diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 126deb2d03..49665564b5 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -12,36 +12,40 @@ is_culink_backend = _linker._decide_nvjitlink_or_driver() + def _is_nvvm_available(): """Check if NVVM is available.""" try: from cuda.core.experimental._program import _get_nvvm_module + _get_nvvm_module() return True except ImportError: return False + nvvm_available = pytest.mark.skipif( - not _is_nvvm_available(), - reason="NVVM not available (libNVVM not found or cuda-bindings < 12.9.0)" + not _is_nvvm_available(), reason="NVVM not available (libNVVM not found or cuda-bindings < 12.9.0)" ) + @pytest.fixture(scope="session") def nvvm_ir(): """Generate working NVVM IR with proper version metadata. - The try clause here is used for older nvvm modules which - might not have an ir_version() method. In which case the - fallback assumes no version metadata will be present in - the input nvvm ir + The try clause here is used for older nvvm modules which + might not have an ir_version() method. In which case the + fallback assumes no version metadata will be present in + the input nvvm ir """ try: from cuda.core.experimental._program import _get_nvvm_module + nvvm = _get_nvvm_module() major, minor, debug_major, debug_minor = nvvm.ir_version() - - nvvm_ir_template = '''target triple = "nvptx64-unknown-cuda" -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + nvvm_ir_template = """target triple = "nvptx64-unknown-cuda" +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-" \ +"f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" define i32 @ave(i32 %a, i32 %b) {{ entry: @@ -72,13 +76,13 @@ def nvvm_ir(): !nvvmir.version = !{{!1}} !1 = !{{i32 {major}, i32 0, i32 {debug_major}, i32 0}} -''' +""" return nvvm_ir_template.format(major=major, debug_major=debug_major) except Exception: - return """target triple = "nvptx64-unknown-cuda" -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-" \ +"f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" define i32 @ave(i32 %a, i32 %b) { entry: @@ -108,6 +112,7 @@ def nvvm_ir(): !0 = !{void (i32*)* @simple, !"kernel", i32 1} """ + @pytest.fixture(scope="module") def ptx_code_object(): code = 'extern "C" __global__ void my_kernel() {}' @@ -247,12 +252,11 @@ def test_program_close(): assert program.handle is None - - @nvvm_available def test_nvvm_deferred_import(): """Test that our deferred NVVM import works correctly""" from cuda.core.experimental._program import _get_nvvm_module + nvvm = _get_nvvm_module() assert nvvm is not None @@ -270,16 +274,20 @@ def test_nvvm_program_creation(nvvm_ir): def test_nvvm_compile_invalid_target(nvvm_ir): """Test that NVVM programs reject invalid compilation 
targets""" program = Program(nvvm_ir, "nvvm") - with pytest.raises(ValueError, match="NVVM backend only supports target_type=\"ptx\""): + with pytest.raises(ValueError, match='NVVM backend only supports target_type="ptx"'): program.compile("cubin") program.close() + @nvvm_available -@pytest.mark.parametrize("options", [ - ProgramOptions(name="test1", arch="sm_90", device_code_optimize=False), - ProgramOptions(name="test2", arch="sm_100", device_code_optimize=False), - ProgramOptions(name="test3", arch="sm_110", device_code_optimize=True), -]) +@pytest.mark.parametrize( + "options", + [ + ProgramOptions(name="test1", arch="sm_90", device_code_optimize=False), + ProgramOptions(name="test2", arch="sm_100", device_code_optimize=False), + ProgramOptions(name="test3", arch="sm_110", device_code_optimize=True), + ], +) def test_nvvm_program_options(init_cuda, nvvm_ir, options): """Test NVVM programs with different options""" program = Program(nvvm_ir, "nvvm", options) From caf4f22f1fe84ff4a016a94fecd8ff82f09d785e Mon Sep 17 00:00:00 2001 From: Abhilash Majumder Date: Thu, 4 Sep 2025 09:03:15 +0530 Subject: [PATCH 13/42] ruff format --- cuda_core/cuda/core/experimental/_program.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 20159b03e0..c3bca7ac7e 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -9,6 +9,8 @@ from typing import TYPE_CHECKING, List, Tuple, Union from warnings import warn +import list + if TYPE_CHECKING: import cuda.bindings @@ -501,7 +503,7 @@ def _translate_program_options(self, options: ProgramOptions) -> LinkerOptions: ptxas_options=options.ptxas_options, ) - def _translate_program_options_to_nvvm(self, options: ProgramOptions) -> List[str]: + def _translate_program_options_to_nvvm(self, options: ProgramOptions) -> list[str]: """Translate ProgramOptions to NVVM-specific compilation options.""" nvvm_options = [] From 88237bcdb18d9025f7e566d73f1e63c07704e5da Mon Sep 17 00:00:00 2001 From: Abhilash Majumder Date: Thu, 4 Sep 2025 09:10:18 +0530 Subject: [PATCH 14/42] revert changes to cuda_utils --- cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx index 4a9d61aee9..60bcaeba6c 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx @@ -26,6 +26,7 @@ class CUDAError(Exception): class NVRTCError(CUDAError): pass + ComputeCapability = namedtuple("ComputeCapability", ("major", "minor")) @@ -55,6 +56,7 @@ cdef object _DRIVER_SUCCESS = driver.CUresult.CUDA_SUCCESS cdef object _RUNTIME_SUCCESS = runtime.cudaError_t.cudaSuccess cdef object _NVRTC_SUCCESS = nvrtc.nvrtcResult.NVRTC_SUCCESS + cpdef inline int _check_driver_error(error) except?-1: if error == _DRIVER_SUCCESS: return 0 @@ -100,6 +102,7 @@ cpdef inline int _check_nvrtc_error(error, handle=None) except?-1: err += f", compilation log:\n\n{log.decode('utf-8', errors='backslashreplace')}" raise NVRTCError(err) + cdef inline int _check_error(error, handle=None) except?-1: if isinstance(error, driver.CUresult): return _check_driver_error(error) @@ -218,4 +221,4 @@ def get_binding_version(): major_minor = importlib.metadata.version("cuda-bindings").split(".")[:2] except 
importlib.metadata.PackageNotFoundError: major_minor = importlib.metadata.version("cuda-python").split(".")[:2] - return tuple(int(v) for v in major_minor) + return tuple(int(v) for v in major_minor) \ No newline at end of file From 5e2e137659df87a15a0b950e042a9d2c30883618 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Thu, 4 Sep 2025 09:13:10 +0530 Subject: [PATCH 15/42] new line --- cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx index 60bcaeba6c..86588f733c 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx @@ -221,4 +221,4 @@ def get_binding_version(): major_minor = importlib.metadata.version("cuda-bindings").split(".")[:2] except importlib.metadata.PackageNotFoundError: major_minor = importlib.metadata.version("cuda-python").split(".")[:2] - return tuple(int(v) for v in major_minor) \ No newline at end of file + return tuple(int(v) for v in major_minor) From 0301a4d670e8533ef66367e23c28f1960dda2b9f Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Fri, 5 Sep 2025 08:44:19 +0530 Subject: [PATCH 16/42] fix CI rm list import --- cuda_core/cuda/core/experimental/_program.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index c3bca7ac7e..0c59298e60 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -9,8 +9,6 @@ from typing import TYPE_CHECKING, List, Tuple, Union from warnings import warn -import list - if TYPE_CHECKING: import cuda.bindings From 28e2d4b9e8872de93f66cc59ff81d4f13c8e1ab6 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Sun, 7 Sep 2025 14:12:36 +0530 Subject: [PATCH 17/42] use noqa --- cuda_core/tests/test_program.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 49665564b5..d8dc3af31d 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -44,8 +44,7 @@ def nvvm_ir(): major, minor, debug_major, debug_minor = nvvm.ir_version() nvvm_ir_template = """target triple = "nvptx64-unknown-cuda" -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-" \ -"f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" define i32 @ave(i32 %a, i32 %b) {{ entry: @@ -76,13 +75,12 @@ def nvvm_ir(): !nvvmir.version = !{{!1}} !1 = !{{i32 {major}, i32 0, i32 {debug_major}, i32 0}} -""" +""" # noqa: E501 return nvvm_ir_template.format(major=major, debug_major=debug_major) except Exception: return """target triple = "nvptx64-unknown-cuda" -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-" \ -"f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" define i32 @ave(i32 %a, i32 %b) { entry: @@ -110,7 +108,7 @@ def nvvm_ir(): !nvvm.annotations = !{!0} !0 = !{void (i32*)* @simple, !"kernel", i32 1} -""" +""" # noqa: 
E501 @pytest.fixture(scope="module") From af1008ff076accd6b40270c7e9be21e38ab897c9 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Sun, 7 Sep 2025 14:23:08 +0530 Subject: [PATCH 18/42] format --- cuda_core/tests/test_program.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index d8dc3af31d..656cdc428a 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -75,7 +75,7 @@ def nvvm_ir(): !nvvmir.version = !{{!1}} !1 = !{{i32 {major}, i32 0, i32 {debug_major}, i32 0}} -""" # noqa: E501 +""" # noqa: E501 return nvvm_ir_template.format(major=major, debug_major=debug_major) except Exception: @@ -108,7 +108,7 @@ def nvvm_ir(): !nvvm.annotations = !{!0} !0 = !{void (i32*)* @simple, !"kernel", i32 1} -""" # noqa: E501 +""" # noqa: E501 @pytest.fixture(scope="module") From a85a44f7293dfe4578b4d30a50fed6b6a5f38eaf Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Mon, 8 Sep 2025 20:32:58 +0530 Subject: [PATCH 19/42] verify and skip 110 --- cuda_core/cuda/core/experimental/_program.py | 1 + cuda_core/tests/test_program.py | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 0c59298e60..da1ef315a2 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -608,6 +608,7 @@ def compile(self, target_type, name_expressions=(), logs=None): nvvm_options = self._translate_program_options_to_nvvm(self._options) nvvm = _get_nvvm_module() + nvvm.verify_program(self._mnff.handle, len(nvvm_options), nvvm_options) nvvm.compile_program(self._mnff.handle, len(nvvm_options), nvvm_options) size = nvvm.get_compiled_result_size(self._mnff.handle) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 656cdc428a..a471df754f 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -9,7 +9,9 @@ from cuda.core.experimental import _linker from cuda.core.experimental._module import Kernel, ObjectCode from cuda.core.experimental._program import Program, ProgramOptions +from cuda.core.experimental._utils.cuda_utils import driver, handle_return +cuda_driver_version = handle_return(driver.cuDriverGetVersion()) is_culink_backend = _linker._decide_nvjitlink_or_driver() @@ -283,7 +285,13 @@ def test_nvvm_compile_invalid_target(nvvm_ir): [ ProgramOptions(name="test1", arch="sm_90", device_code_optimize=False), ProgramOptions(name="test2", arch="sm_100", device_code_optimize=False), - ProgramOptions(name="test3", arch="sm_110", device_code_optimize=True), + pytest.param( + ProgramOptions(name="test3", arch="sm_110", device_code_optimize=True), + marks=pytest.mark.skipif( + 12000 <= cuda_driver_version < 13000, + reason="Compute capability 110 not supported with CUDA 12.x", + ), + ), ], ) def test_nvvm_program_options(init_cuda, nvvm_ir, options): From 0be06aa2bae81eb230fec3cf0e39dc7e755eabc4 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Mon, 8 Sep 2025 20:51:28 +0530 Subject: [PATCH 20/42] add flags and lto --- cuda_core/cuda/core/experimental/_program.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index da1ef315a2..1535a726cf 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -520,6 +520,15 
@@ def _translate_program_options_to_nvvm(self, options: ProgramOptions) -> list[st elif options.device_code_optimize is True: nvvm_options.append("-opt=3") + if options.ftz is not None: + nvvm_options.append(f"-ftz={'true' if options.ftz else 'false'}") + if options.prec_sqrt is not None: + nvvm_options.append(f"-prec-sqrt={'true' if options.prec_sqrt else 'false'}") + if options.prec_div is not None: + nvvm_options.append(f"-prec-div={'true' if options.prec_div else 'false'}") + if options.fma is not None: + nvvm_options.append(f"-fma={'true' if options.fma else 'false'}") + return nvvm_options def close(self): @@ -603,10 +612,12 @@ def compile(self, target_type, name_expressions=(), logs=None): return ObjectCode._init(data, target_type, symbol_mapping=symbol_mapping, name=self._options.name) elif self._backend == "NVVM": - if target_type != "ptx": - raise ValueError(f'NVVM backend only supports target_type="ptx", got "{target_type}"') + if target_type not in ("ptx", "ltoir"): + raise ValueError(f'NVVM backend only supports target_type="ptx", "ltoir", got "{target_type}"') nvvm_options = self._translate_program_options_to_nvvm(self._options) + if target_type == "ltoir" and "-gen-lto" not in nvvm_options: + nvvm_options.append("-gen-lto") nvvm = _get_nvvm_module() nvvm.verify_program(self._mnff.handle, len(nvvm_options), nvvm_options) nvvm.compile_program(self._mnff.handle, len(nvvm_options), nvvm_options) From 1c63a11da6cc29314d425cf3021a97cf0c98011d Mon Sep 17 00:00:00 2001 From: Abhilash Majumder Date: Mon, 15 Sep 2025 12:51:58 +0530 Subject: [PATCH 21/42] rename gpu-arch to arch --- cuda_core/cuda/core/experimental/_program.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 1535a726cf..afccb5c191 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -277,10 +277,10 @@ def __post_init__(self): self._formatted_options = [] if self.arch is not None: - self._formatted_options.append(f"--gpu-architecture={self.arch}") + self._formatted_options.append(f"-arch={self.arch}") else: self._formatted_options.append( - "--gpu-architecture=sm_" + "".join(f"{i}" for i in Device().compute_capability) + "-arch=sm_" + "".join(f"{i}" for i in Device().compute_capability) ) if self.relocatable_device_code is not None: self._formatted_options.append( From cab6db0c69fac0fd3497a008c1b22b7d9d838f83 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Mon, 15 Sep 2025 14:02:57 +0530 Subject: [PATCH 22/42] change libnvvm version check --- cuda_core/cuda/core/experimental/_program.py | 4 +- cuda_core/tests/test_program.py | 105 ++++++++++++++++++- 2 files changed, 103 insertions(+), 6 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index afccb5c191..6a285a9819 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -279,9 +279,7 @@ def __post_init__(self): if self.arch is not None: self._formatted_options.append(f"-arch={self.arch}") else: - self._formatted_options.append( - "-arch=sm_" + "".join(f"{i}" for i in Device().compute_capability) - ) + self._formatted_options.append("-arch=sm_" + "".join(f"{i}" for i in Device().compute_capability)) if self.relocatable_device_code is not None: self._formatted_options.append( f"--relocatable-device-code={_handle_boolean_option(self.relocatable_device_code)}" diff --git 
a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index a471df754f..209dfa1b0f 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -30,6 +30,83 @@ def _is_nvvm_available(): not _is_nvvm_available(), reason="NVVM not available (libNVVM not found or cuda-bindings < 12.9.0)" ) +try: + from cuda.core.experimental._utils.cuda_utils import driver, handle_return + + _cuda_driver_version = handle_return(driver.cuDriverGetVersion()) +except Exception: + _cuda_driver_version = 0 + +_libnvvm_version = None +_libnvvm_version_attempted = False + +precheck_nvvm_ir = """ +target triple = "nvptx64-nvidia-cuda" +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + +define void @dummy_kernel() { +entry: + ret void +} + +!nvvm.annotations = !{!0} +!0 = !{void ()* @dummy_kernel, !"kernel", i32 1} +""" # noqa: E501 + + +def _get_libnvvm_version_for_tests(): + """ + Detect libNVVM version by compiling dummy IR and analyzing the PTX output. + + Workaround for the lack of direct libNVVM version API (nvbugs 5312315). + The approach: + - Compile a small dummy NVVM IR to PTX + - Use PTX version analysis APIs if available to infer libNVVM version + - Cache the result for future use + """ + global _libnvvm_version, _libnvvm_version_attempted + + if _libnvvm_version_attempted: + return _libnvvm_version + + _libnvvm_version_attempted = True + + try: + from cuda.core.experimental._program import _get_nvvm_module + + nvvm = _get_nvvm_module() + + try: + from cuda.bindings.utils import get_ptx_ver, get_minimal_required_cuda_ver_from_ptx_ver + except ImportError: + _libnvvm_version = None + return _libnvvm_version + + program = nvvm.create_program() + try: + precheck_ir_bytes = precheck_nvvm_ir.encode("utf-8") + nvvm.add_module_to_program(program, precheck_ir_bytes, len(precheck_ir_bytes), "precheck.ll") + + options = ["-arch=compute_70"] + nvvm.verify_program(program, len(options), options) + nvvm.compile_program(program, len(options), options) + + ptx_size = nvvm.get_compiled_result_size(program) + ptx_data = bytearray(ptx_size) + nvvm.get_compiled_result(program, ptx_data) + ptx_str = ptx_data.decode("utf-8") + ptx_version = get_ptx_ver(ptx_str) + cuda_version = get_minimal_required_cuda_ver_from_ptx_ver(ptx_version) + _libnvvm_version = cuda_version + return _libnvvm_version + + finally: + nvvm.destroy_program(program) + + except Exception: + _libnvvm_version = None + return _libnvvm_version + @pytest.fixture(scope="session") def nvvm_ir(): @@ -286,10 +363,32 @@ def test_nvvm_compile_invalid_target(nvvm_ir): ProgramOptions(name="test1", arch="sm_90", device_code_optimize=False), ProgramOptions(name="test2", arch="sm_100", device_code_optimize=False), pytest.param( - ProgramOptions(name="test3", arch="sm_110", device_code_optimize=True), + ProgramOptions(name="test_sm110_1", arch="sm_110", device_code_optimize=False), + marks=pytest.mark.skipif( + (_get_libnvvm_version_for_tests() or 0) < 13000, + reason="Compute capability 110 requires libNVVM >= 13.0", + ), + ), + pytest.param( + ProgramOptions( + name="test_sm110_2", + arch="sm_110", + ftz=True, + prec_sqrt=False, + prec_div=False, + fma=True, + device_code_optimize=True, + ), + marks=pytest.mark.skipif( + (_get_libnvvm_version_for_tests() or 0) < 13000, + reason="Compute capability 110 requires libNVVM >= 13.0", + ), + ), + pytest.param( + ProgramOptions(name="test_sm110_3", arch="sm_110", 
link_time_optimization=True), marks=pytest.mark.skipif( - 12000 <= cuda_driver_version < 13000, - reason="Compute capability 110 not supported with CUDA 12.x", + (_get_libnvvm_version_for_tests() or 0) < 13000, + reason="Compute capability 110 requires libNVVM >= 13.0", ), ), ], From 3abcd389713af1e24498c74e585307e3ab0918ec Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Mon, 15 Sep 2025 14:05:13 +0530 Subject: [PATCH 23/42] format --- cuda_core/tests/test_program.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 209dfa1b0f..e4653d1164 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -77,7 +77,7 @@ def _get_libnvvm_version_for_tests(): nvvm = _get_nvvm_module() try: - from cuda.bindings.utils import get_ptx_ver, get_minimal_required_cuda_ver_from_ptx_ver + from cuda.bindings.utils import get_minimal_required_cuda_ver_from_ptx_ver, get_ptx_ver except ImportError: _libnvvm_version = None return _libnvvm_version From caf634cb95399b98f8e93ca4c472e943db452b1b Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Mon, 15 Sep 2025 14:10:20 +0530 Subject: [PATCH 24/42] compute 90 --- cuda_core/tests/test_program.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index e4653d1164..effa1f0cfd 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -87,7 +87,7 @@ def _get_libnvvm_version_for_tests(): precheck_ir_bytes = precheck_nvvm_ir.encode("utf-8") nvvm.add_module_to_program(program, precheck_ir_bytes, len(precheck_ir_bytes), "precheck.ll") - options = ["-arch=compute_70"] + options = ["-arch=compute_90"] nvvm.verify_program(program, len(options), options) nvvm.compile_program(program, len(options), options) From 64d19abb88e752e7324dbc578b45f239472faf93 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Mon, 15 Sep 2025 19:24:48 -0400 Subject: [PATCH 25/42] Apply suggestions from code review --- cuda_core/cuda/core/experimental/_program.py | 28 +++++++++----------- cuda_core/tests/test_program.py | 2 +- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 6a285a9819..211ae50fbe 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -34,7 +34,7 @@ def _get_nvvm_module(): """ Handles the import of NVVM module with version and availability checks. - NVVM bindings were added in CUDA 12.9.0, so we need to handle cases where: + NVVM bindings were added in cuda-bindings 12.9.0, so we need to handle cases where: 1. cuda.bindings is not new enough (< 12.9.0) 2. 
libnvvm is not found in the Python environment @@ -42,13 +42,13 @@ def _get_nvvm_module(): The nvvm module if available and working Raises: - ImportError: If NVVM is not available due to version or library issues + RuntimeError: If NVVM is not available due to version or library issues """ global _nvvm_module, _nvvm_import_attempted if _nvvm_import_attempted: if _nvvm_module is None: - raise ImportError("NVVM module is not available (previous import attempt failed)") + raise RuntimeError("NVVM module is not available (previous import attempt failed)") return _nvvm_module _nvvm_import_attempted = True @@ -56,7 +56,7 @@ def _get_nvvm_module(): try: version = get_binding_version() if version < (12, 9): - raise ImportError( + raise RuntimeError( f"NVVM bindings require cuda-bindings >= 12.9.0, but found {version[0]}.{version[1]}.x. " "Please update cuda-bindings to use NVVM features." ) @@ -65,12 +65,12 @@ def _get_nvvm_module(): from cuda.bindings._internal.nvvm import _inspect_function_pointer if _inspect_function_pointer("__nvvmCreateProgram") == 0: - raise ImportError("NVVM library (libnvvm) is not available in this Python environment. ") + raise RuntimeError("NVVM library (libnvvm) is not available in this Python environment. ") _nvvm_module = nvvm return _nvvm_module - except ImportError as e: + except RuntimeError as e: _nvvm_module = None raise e @@ -279,7 +279,8 @@ def __post_init__(self): if self.arch is not None: self._formatted_options.append(f"-arch={self.arch}") else: - self._formatted_options.append("-arch=sm_" + "".join(f"{i}" for i in Device().compute_capability)) + self.arch = f"sm_{Device().arch}" + self._formatted_options.append(f"-arch={self.arch}") if self.relocatable_device_code is not None: self._formatted_options.append( f"--relocatable-device-code={_handle_boolean_option(self.relocatable_device_code)}" @@ -503,14 +504,11 @@ def _translate_program_options_to_nvvm(self, options: ProgramOptions) -> list[st """Translate ProgramOptions to NVVM-specific compilation options.""" nvvm_options = [] - if options.arch is not None: - arch = options.arch - if arch.startswith("sm_"): - arch = f"compute_{arch[3:]}" - nvvm_options.append(f"-arch={arch}") - else: - major, minor = Device().compute_capability - nvvm_options.append(f"-arch=compute_{major}{minor}") + assert options.arch is not None + arch = options.arch + if arch.startswith("sm_"): + arch = f"compute_{arch[3:]}" + nvvm_options.append(f"-arch={arch}") if options.debug: nvvm_options.append("-g") if options.device_code_optimize is False: diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index effa1f0cfd..a2035ca391 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -22,7 +22,7 @@ def _is_nvvm_available(): _get_nvvm_module() return True - except ImportError: + except RuntimeError: return False From c5993dcc12d82267f0938251d86810a957b41b9d Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Tue, 16 Sep 2025 06:30:49 +0000 Subject: [PATCH 26/42] update test --- cuda_core/tests/test_program.py | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index a2035ca391..a31650572a 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -40,17 +40,38 @@ def _is_nvvm_available(): _libnvvm_version = None _libnvvm_version_attempted = False -precheck_nvvm_ir = """ -target triple = "nvptx64-nvidia-cuda" +precheck_nvvm_ir = """target triple 
= "nvptx64-unknown-cuda" target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" -define void @dummy_kernel() { +define i32 @ave(i32 %a, i32 %b) {{ +entry: + %add = add nsw i32 %a, %b + %div = sdiv i32 %add, 2 + ret i32 %div +}} + +define void @simple(i32* %data) {{ entry: + %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() + %1 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + %mul = mul i32 %0, %1 + %2 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %add = add i32 %mul, %2 + %call = call i32 @ave(i32 %add, i32 %add) + %idxprom = sext i32 %add to i64 + store i32 %call, i32* %data, align 4 ret void -} +}} -!nvvm.annotations = !{!0} -!0 = !{void ()* @dummy_kernel, !"kernel", i32 1} +declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() nounwind readnone + +!nvvm.annotations = !{{!0}} +!0 = !{{void (i32*)* @simple, !"kernel", i32 1}} + +!nvvmir.version = !{{!1}} +!1 = !{{i32 {major}, i32 0, i32 {debug_major}, i32 0}} """ # noqa: E501 From 5d5b1d3a78483907696cd046ef8be2a7c0555ffd Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Tue, 16 Sep 2025 07:02:46 +0000 Subject: [PATCH 27/42] use exception manager --- cuda_core/tests/test_program.py | 34 +++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index a31650572a..8bd7d778b2 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -10,11 +10,40 @@ from cuda.core.experimental._module import Kernel, ObjectCode from cuda.core.experimental._program import Program, ProgramOptions from cuda.core.experimental._utils.cuda_utils import driver, handle_return +from contextlib import contextmanager cuda_driver_version = handle_return(driver.cuDriverGetVersion()) is_culink_backend = _linker._decide_nvjitlink_or_driver() +@contextmanager +def _nvvm_exception_manager(nvvm, program_handle): + """ + Taken from _linker.py + """ + try: + yield + except Exception as e: + error_log = "" + try: + # Try to get the NVVM program log + logsize = nvvm.get_program_log_size(program_handle) + if logsize > 1: + log = bytearray(logsize) + nvvm.get_program_log(program_handle, log) + error_log = log.decode("utf-8", errors="backslashreplace") + except Exception: + # If we can't get the log, continue without it + pass + + # Starting Python 3.11 we could also use Exception.add_note() for the same purpose, but + # unfortunately we are still supporting Python 3.9/3.10... 
+ # Append the NVVM program log to the original exception message + if error_log: + e.args = (e.args[0] + f"\nNVVM program log: {error_log}", *e.args[1:]) + raise e + + def _is_nvvm_available(): """Check if NVVM is available.""" try: @@ -109,8 +138,9 @@ def _get_libnvvm_version_for_tests(): nvvm.add_module_to_program(program, precheck_ir_bytes, len(precheck_ir_bytes), "precheck.ll") options = ["-arch=compute_90"] - nvvm.verify_program(program, len(options), options) - nvvm.compile_program(program, len(options), options) + with _nvvm_exception_manager(nvvm, program): + nvvm.verify_program(program, len(options), options) + nvvm.compile_program(program, len(options), options) ptx_size = nvvm.get_compiled_result_size(program) ptx_data = bytearray(ptx_size) From f6b55281f61e511d1b7976219b8c008970ff45aa Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Tue, 16 Sep 2025 07:09:25 +0000 Subject: [PATCH 28/42] format --- cuda_core/tests/test_program.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 8bd7d778b2..fb6c0b7d56 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -3,14 +3,14 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE import warnings +from contextlib import contextmanager import pytest +from cuda.core.experimental._utils.cuda_utils import driver, handle_return from cuda.core.experimental import _linker from cuda.core.experimental._module import Kernel, ObjectCode from cuda.core.experimental._program import Program, ProgramOptions -from cuda.core.experimental._utils.cuda_utils import driver, handle_return -from contextlib import contextmanager cuda_driver_version = handle_return(driver.cuDriverGetVersion()) is_culink_backend = _linker._decide_nvjitlink_or_driver() From c7fad0a8fe6a8029e2b23caac47c8288b668dc19 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Tue, 16 Sep 2025 09:18:30 +0000 Subject: [PATCH 29/42] format ruff --- cuda_core/tests/test_program.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index fb6c0b7d56..08b2ad3486 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -33,8 +33,7 @@ def _nvvm_exception_manager(nvvm, program_handle): nvvm.get_program_log(program_handle, log) error_log = log.decode("utf-8", errors="backslashreplace") except Exception: - # If we can't get the log, continue without it - pass + error_log = "" # Starting Python 3.11 we could also use Exception.add_note() for the same purpose, but # unfortunately we are still supporting Python 3.9/3.10... 
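
For reference, the log-capture idiom built up over the last few patches distills into a small standalone helper. This is a minimal sketch only, assuming cuda-bindings >= 12.9 with libNVVM discoverable (the same preconditions _get_nvvm_module() checks); the helper name is illustrative.

    from contextlib import contextmanager

    from cuda.bindings import nvvm


    @contextmanager
    def nvvm_log_on_error(handle):
        """Re-raise NVVM verify/compile failures with the program log attached (sketch)."""
        try:
            yield
        except Exception as e:
            text = ""
            try:
                size = nvvm.get_program_log_size(handle)
                if size > 1:
                    log = bytearray(size)
                    nvvm.get_program_log(handle, log)
                    text = log.decode("utf-8", errors="backslashreplace")
            except Exception:
                text = ""
            if text and e.args:
                e.args = (f"{e.args[0]}\nNVVM program log: {text}", *e.args[1:])
            raise

Wrapping the verify_program and compile_program calls in this context manager turns an otherwise opaque NVVM error into one that carries the compiler diagnostics.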
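Taken together, the option handling added in the earlier patches amounts to a small translation from ProgramOptions-style fields onto libNVVM command-line flags. A condensed sketch of that mapping follows; the function name and defaults are illustrative, and the real translation lives in _translate_program_options_to_nvvm. Note that libNVVM spells several flags differently from NVRTC: the architecture is compute_XY rather than sm_XY, and the floating-point controls take numeric 0/1 values.

    def nvvm_flags(arch="sm_90", debug=False, optimize=True,
                   ftz=None, prec_sqrt=None, prec_div=None, fma=None, lto=False):
        # Map ProgramOptions-style fields onto libNVVM flags.
        flags = [f"-arch=compute_{arch[3:]}" if arch.startswith("sm_") else f"-arch={arch}"]
        if debug:
            flags.append("-g")
        flags.append("-opt=3" if optimize else "-opt=0")
        for name, value in (("ftz", ftz), ("prec-sqrt", prec_sqrt),
                            ("prec-div", prec_div), ("fma", fma)):
            if value is not None:
                flags.append(f"-{name}={1 if value else 0}")
        if lto:
            flags.append("-gen-lto")  # emit LTO IR instead of PTX
        return flags

For example, nvvm_flags(arch="sm_100", ftz=True, fma=True) yields ['-arch=compute_100', '-opt=3', '-ftz=1', '-fma=1'].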
From 2e6e02be8e7e0827f2253ca7bc48cc6db809ee73 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 Sep 2025 09:30:24 +0000 Subject: [PATCH 30/42] [pre-commit.ci] auto code formatting --- cuda_core/tests/test_program.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 08b2ad3486..7c5586f48f 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -6,11 +6,11 @@ from contextlib import contextmanager import pytest -from cuda.core.experimental._utils.cuda_utils import driver, handle_return from cuda.core.experimental import _linker from cuda.core.experimental._module import Kernel, ObjectCode from cuda.core.experimental._program import Program, ProgramOptions +from cuda.core.experimental._utils.cuda_utils import driver, handle_return cuda_driver_version = handle_return(driver.cuDriverGetVersion()) is_culink_backend = _linker._decide_nvjitlink_or_driver() From 680d790af7fb2e332c261b36d67a00919768906f Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Tue, 16 Sep 2025 09:52:19 +0000 Subject: [PATCH 31/42] add release notes --- cuda_core/docs/source/release/0.X.Y-notes.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cuda_core/docs/source/release/0.X.Y-notes.rst b/cuda_core/docs/source/release/0.X.Y-notes.rst index 8024a14f62..c549769807 100644 --- a/cuda_core/docs/source/release/0.X.Y-notes.rst +++ b/cuda_core/docs/source/release/0.X.Y-notes.rst @@ -28,6 +28,8 @@ New features - Added :attr:`Device.arch` property that returns the compute capability as a string (e.g., '75' for CC 7.5), providing a convenient alternative to manually concatenating the compute capability tuple. - CUDA 13.x testing support through new ``test-cu13`` dependency group. +- Added `NVVM IR` support for cuda_core frontend, which would allow cuda_python to take `NVVM IR` as an input format. New tests are added inside + ```test_program.py```. New examples From c55fa59ead57ebd184197e97f50bb9ba5b306aea Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 Sep 2025 09:53:24 +0000 Subject: [PATCH 32/42] [pre-commit.ci] auto code formatting --- cuda_core/docs/source/release/0.X.Y-notes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/docs/source/release/0.X.Y-notes.rst b/cuda_core/docs/source/release/0.X.Y-notes.rst index c549769807..5cbe0bb7a0 100644 --- a/cuda_core/docs/source/release/0.X.Y-notes.rst +++ b/cuda_core/docs/source/release/0.X.Y-notes.rst @@ -28,7 +28,7 @@ New features - Added :attr:`Device.arch` property that returns the compute capability as a string (e.g., '75' for CC 7.5), providing a convenient alternative to manually concatenating the compute capability tuple. - CUDA 13.x testing support through new ``test-cu13`` dependency group. -- Added `NVVM IR` support for cuda_core frontend, which would allow cuda_python to take `NVVM IR` as an input format. New tests are added inside +- Added `NVVM IR` support for cuda_core frontend, which would allow cuda_python to take `NVVM IR` as an input format. New tests are added inside ```test_program.py```. 
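
In end-user terms, the feature described in that release note looks roughly like the following. This is a sketch assuming a CUDA-capable device and libNVVM available; the IR module and kernel name mirror the trivial ones used by the tests in this series.

    from cuda.bindings import nvvm
    from cuda.core.experimental import Device, Program

    Device().set_current()

    # Ask libNVVM which IR version it expects and stamp it into the module
    # metadata, as the nvvm_ir test fixture does.
    major, minor, debug_major, debug_minor = nvvm.ir_version()

    nvvm_ir = f"""target triple = "nvptx64-unknown-cuda"
    target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

    define void @dummy_kernel() {{
    entry:
      ret void
    }}

    !nvvm.annotations = !{{!0}}
    !0 = !{{void ()* @dummy_kernel, !"kernel", i32 1}}

    !nvvmir.version = !{{!1}}
    !1 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
    """

    program = Program(nvvm_ir, "nvvm")       # arch defaults to the current device
    ptx = program.compile("ptx")             # ObjectCode wrapping the generated PTX
    kernel = ptx.get_kernel("dummy_kernel")  # retrieve the kernel from the loaded module
    program.close()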
From 2cbee7f5242e204406cd243f4a813a0812d6bab4 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Tue, 16 Sep 2025 09:58:13 +0000 Subject: [PATCH 33/42] rectify quotes --- cuda_core/docs/source/release/0.X.Y-notes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/docs/source/release/0.X.Y-notes.rst b/cuda_core/docs/source/release/0.X.Y-notes.rst index 5cbe0bb7a0..b8b02bae70 100644 --- a/cuda_core/docs/source/release/0.X.Y-notes.rst +++ b/cuda_core/docs/source/release/0.X.Y-notes.rst @@ -29,7 +29,7 @@ New features - Added :attr:`Device.arch` property that returns the compute capability as a string (e.g., '75' for CC 7.5), providing a convenient alternative to manually concatenating the compute capability tuple. - CUDA 13.x testing support through new ``test-cu13`` dependency group. - Added `NVVM IR` support for cuda_core frontend, which would allow cuda_python to take `NVVM IR` as an input format. New tests are added inside - ```test_program.py```. + ``test_program.py``. New examples From 63e8d57016ed5fee1c2f651c2bd62e4cef1d7a4b Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Tue, 16 Sep 2025 11:19:50 +0000 Subject: [PATCH 34/42] refix format --- cuda_core/docs/source/release/0.X.Y-notes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/docs/source/release/0.X.Y-notes.rst b/cuda_core/docs/source/release/0.X.Y-notes.rst index b8b02bae70..c981ca01fe 100644 --- a/cuda_core/docs/source/release/0.X.Y-notes.rst +++ b/cuda_core/docs/source/release/0.X.Y-notes.rst @@ -28,7 +28,7 @@ New features - Added :attr:`Device.arch` property that returns the compute capability as a string (e.g., '75' for CC 7.5), providing a convenient alternative to manually concatenating the compute capability tuple. - CUDA 13.x testing support through new ``test-cu13`` dependency group. -- Added `NVVM IR` support for cuda_core frontend, which would allow cuda_python to take `NVVM IR` as an input format. New tests are added inside +- Added NVVM IR support for cuda_core frontend, which would allow cuda_python to take NVVM IR as an input format. New tests are added inside ``test_program.py``. 
From 94c2e5646c9787f6fc3e048e2e6d63222c9efc85 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Wed, 17 Sep 2025 08:35:30 +0000 Subject: [PATCH 35/42] refresh --- cuda_core/cuda/core/experimental/_program.py | 30 ++++++++- cuda_core/docs/source/release/0.X.Y-notes.rst | 4 +- cuda_core/tests/test_program.py | 61 ++----------------- 3 files changed, 35 insertions(+), 60 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 7c28db21cc..cabcfffe40 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -6,6 +6,7 @@ import weakref from dataclasses import dataclass +from contextlib import contextmanager from typing import TYPE_CHECKING, Union from warnings import warn @@ -27,6 +28,30 @@ nvrtc, ) +@contextmanager +def _nvvm_exception_manager(self): + """ + Taken from _linker.py + """ + try: + yield + except Exception as e: + error_log = "" + if hasattr(self, "_mnff"): + try: + nvvm = _get_nvvm_module() + logsize = nvvm.get_program_log_size(self._mnff.handle) + if logsize > 1: + log = bytearray(logsize) + nvvm.get_program_log(self._mnff.handle, log) + error_log = log.decode("utf-8", errors="backslashreplace") + except Exception: + error_log = "" + # Starting Python 3.11 we could also use Exception.add_note() for the same purpose, but + # unfortunately we are still supporting Python 3.9/3.10... + e.args = (e.args[0] + (f"\nNVVM program log: {error_log}" if error_log else ""), *e.args[1:]) + raise e + _nvvm_module = None _nvvm_import_attempted = False @@ -615,8 +640,9 @@ def compile(self, target_type, name_expressions=(), logs=None): if target_type == "ltoir" and "-gen-lto" not in nvvm_options: nvvm_options.append("-gen-lto") nvvm = _get_nvvm_module() - nvvm.verify_program(self._mnff.handle, len(nvvm_options), nvvm_options) - nvvm.compile_program(self._mnff.handle, len(nvvm_options), nvvm_options) + with _nvvm_exception_manager(self): + nvvm.verify_program(self._mnff.handle, len(nvvm_options), nvvm_options) + nvvm.compile_program(self._mnff.handle, len(nvvm_options), nvvm_options) size = nvvm.get_compiled_result_size(self._mnff.handle) data = bytearray(size) diff --git a/cuda_core/docs/source/release/0.X.Y-notes.rst b/cuda_core/docs/source/release/0.X.Y-notes.rst index c981ca01fe..a1295e8eaa 100644 --- a/cuda_core/docs/source/release/0.X.Y-notes.rst +++ b/cuda_core/docs/source/release/0.X.Y-notes.rst @@ -28,9 +28,7 @@ New features - Added :attr:`Device.arch` property that returns the compute capability as a string (e.g., '75' for CC 7.5), providing a convenient alternative to manually concatenating the compute capability tuple. - CUDA 13.x testing support through new ``test-cu13`` dependency group. -- Added NVVM IR support for cuda_core frontend, which would allow cuda_python to take NVVM IR as an input format. New tests are added inside - ``test_program.py``. - +- Added NVVM IR support to :class:`Program`. NVVM IR is now understood with ``code_type="nvvm"``. 
New examples ------------ diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 7c5586f48f..c9caa7e463 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE import warnings -from contextlib import contextmanager import pytest @@ -15,34 +14,6 @@ cuda_driver_version = handle_return(driver.cuDriverGetVersion()) is_culink_backend = _linker._decide_nvjitlink_or_driver() - -@contextmanager -def _nvvm_exception_manager(nvvm, program_handle): - """ - Taken from _linker.py - """ - try: - yield - except Exception as e: - error_log = "" - try: - # Try to get the NVVM program log - logsize = nvvm.get_program_log_size(program_handle) - if logsize > 1: - log = bytearray(logsize) - nvvm.get_program_log(program_handle, log) - error_log = log.decode("utf-8", errors="backslashreplace") - except Exception: - error_log = "" - - # Starting Python 3.11 we could also use Exception.add_note() for the same purpose, but - # unfortunately we are still supporting Python 3.9/3.10... - # Append the NVVM program log to the original exception message - if error_log: - e.args = (e.args[0] + f"\nNVVM program log: {error_log}", *e.args[1:]) - raise e - - def _is_nvvm_available(): """Check if NVVM is available.""" try: @@ -71,29 +42,10 @@ def _is_nvvm_available(): precheck_nvvm_ir = """target triple = "nvptx64-unknown-cuda" target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" -define i32 @ave(i32 %a, i32 %b) {{ -entry: - %add = add nsw i32 %a, %b - %div = sdiv i32 %add, 2 - ret i32 %div -}} - -define void @simple(i32* %data) {{ -entry: - %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() - %1 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() - %mul = mul i32 %0, %1 - %2 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %add = add i32 %mul, %2 - %call = call i32 @ave(i32 %add, i32 %add) - %idxprom = sext i32 %add to i64 - store i32 %call, i32* %data, align 4 - ret void -}} - -declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() nounwind readnone -declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() nounwind readnone -declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() nounwind readnone +define void @dummy_kernel() { + entry: + ret void +} !nvvm.annotations = !{{!0}} !0 = !{{void (i32*)* @simple, !"kernel", i32 1}} @@ -137,9 +89,8 @@ def _get_libnvvm_version_for_tests(): nvvm.add_module_to_program(program, precheck_ir_bytes, len(precheck_ir_bytes), "precheck.ll") options = ["-arch=compute_90"] - with _nvvm_exception_manager(nvvm, program): - nvvm.verify_program(program, len(options), options) - nvvm.compile_program(program, len(options), options) + nvvm.verify_program(program, len(options), options) + nvvm.compile_program(program, len(options), options) ptx_size = nvvm.get_compiled_result_size(program) ptx_data = bytearray(ptx_size) From 34bf2cc4f3d52bf0db3d38f4a3ccbe82f07bf902 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 17 Sep 2025 08:39:00 +0000 Subject: [PATCH 36/42] [pre-commit.ci] auto code formatting --- cuda_core/cuda/core/experimental/_program.py | 4 +++- cuda_core/tests/test_program.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index cabcfffe40..20050faec0 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ 
b/cuda_core/cuda/core/experimental/_program.py @@ -5,8 +5,8 @@ from __future__ import annotations import weakref -from dataclasses import dataclass from contextlib import contextmanager +from dataclasses import dataclass from typing import TYPE_CHECKING, Union from warnings import warn @@ -28,6 +28,7 @@ nvrtc, ) + @contextmanager def _nvvm_exception_manager(self): """ @@ -52,6 +53,7 @@ def _nvvm_exception_manager(self): e.args = (e.args[0] + (f"\nNVVM program log: {error_log}" if error_log else ""), *e.args[1:]) raise e + _nvvm_module = None _nvvm_import_attempted = False diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index c9caa7e463..9cb54e6dd9 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -14,6 +14,7 @@ cuda_driver_version = handle_return(driver.cuDriverGetVersion()) is_culink_backend = _linker._decide_nvjitlink_or_driver() + def _is_nvvm_available(): """Check if NVVM is available.""" try: From 6b130bb73151b21a9b2fcbb7c6359bb3d0657320 Mon Sep 17 00:00:00 2001 From: abhilash1910 Date: Wed, 17 Sep 2025 09:57:26 +0000 Subject: [PATCH 37/42] user major minor --- cuda_core/tests/test_program.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 9cb54e6dd9..5d77b4ae6f 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -155,10 +155,10 @@ def nvvm_ir(): !0 = !{{void (i32*)* @simple, !"kernel", i32 1}} !nvvmir.version = !{{!1}} -!1 = !{{i32 {major}, i32 0, i32 {debug_major}, i32 0}} +!1 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}} """ # noqa: E501 - return nvvm_ir_template.format(major=major, debug_major=debug_major) + return nvvm_ir_template.format(major=major, minor=minor, debug_major=debug_major, debug_minor=debug_minor) except Exception: return """target triple = "nvptx64-unknown-cuda" target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" From d96c848e8f187f2c334d107716b51dc312f9173e Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 17 Sep 2025 15:58:58 -0400 Subject: [PATCH 38/42] fix test --- cuda_core/tests/test_program.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 5d77b4ae6f..c0257b5b19 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -52,7 +52,7 @@ def _is_nvvm_available(): !0 = !{{void (i32*)* @simple, !"kernel", i32 1}} !nvvmir.version = !{{!1}} -!1 = !{{i32 {major}, i32 0, i32 {debug_major}, i32 0}} +!1 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}} """ # noqa: E501 @@ -86,6 +86,8 @@ def _get_libnvvm_version_for_tests(): program = nvvm.create_program() try: + major, minor, debug_major, debug_minor = nvvm.ir_version() + precheck_nvvm_ir = precheck_nvvm_ir.format(major=major, minor=minor, debug_major=debug_major, debug_minor=debug_minor) precheck_ir_bytes = precheck_nvvm_ir.encode("utf-8") nvvm.add_module_to_program(program, precheck_ir_bytes, len(precheck_ir_bytes), "precheck.ll") From 8331ecf0bf2ebf72aad9573a4338b3d44edfe74b Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 17 Sep 2025 20:15:07 +0000 Subject: [PATCH 39/42] fix IR - again --- cuda_core/tests/test_program.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cuda_core/tests/test_program.py 
b/cuda_core/tests/test_program.py index c0257b5b19..a9f560ae95 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -43,13 +43,13 @@ def _is_nvvm_available(): precheck_nvvm_ir = """target triple = "nvptx64-unknown-cuda" target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" -define void @dummy_kernel() { +define void @dummy_kernel() {{ entry: ret void -} +}} !nvvm.annotations = !{{!0}} -!0 = !{{void (i32*)* @simple, !"kernel", i32 1}} +!0 = !{{void ()* @dummy_kernel, !"kernel", i32 1}} !nvvmir.version = !{{!1}} !1 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}} @@ -87,6 +87,7 @@ def _get_libnvvm_version_for_tests(): program = nvvm.create_program() try: major, minor, debug_major, debug_minor = nvvm.ir_version() + global precheck_nvvm_ir precheck_nvvm_ir = precheck_nvvm_ir.format(major=major, minor=minor, debug_major=debug_major, debug_minor=debug_minor) precheck_ir_bytes = precheck_nvvm_ir.encode("utf-8") nvvm.add_module_to_program(program, precheck_ir_bytes, len(precheck_ir_bytes), "precheck.ll") @@ -103,7 +104,6 @@ def _get_libnvvm_version_for_tests(): cuda_version = get_minimal_required_cuda_ver_from_ptx_ver(ptx_version) _libnvvm_version = cuda_version return _libnvvm_version - finally: nvvm.destroy_program(program) From 2fa944ef821c1f855587e2ac7349e513c3fa4381 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 17 Sep 2025 20:18:34 +0000 Subject: [PATCH 40/42] fix nvvm option handling --- cuda_core/cuda/core/experimental/_program.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 20050faec0..9b28115d4e 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -542,15 +542,15 @@ def _translate_program_options_to_nvvm(self, options: ProgramOptions) -> list[st nvvm_options.append("-opt=0") elif options.device_code_optimize is True: nvvm_options.append("-opt=3") - + # NVVM is not consistent with NVRTC, it uses 0/1 instead... 
if options.ftz is not None: - nvvm_options.append(f"-ftz={'true' if options.ftz else 'false'}") + nvvm_options.append(f"-ftz={'1' if options.ftz else '0'}") if options.prec_sqrt is not None: - nvvm_options.append(f"-prec-sqrt={'true' if options.prec_sqrt else 'false'}") + nvvm_options.append(f"-prec-sqrt={'1' if options.prec_sqrt else '0'}") if options.prec_div is not None: - nvvm_options.append(f"-prec-div={'true' if options.prec_div else 'false'}") + nvvm_options.append(f"-prec-div={'1' if options.prec_div else '0'}") if options.fma is not None: - nvvm_options.append(f"-fma={'true' if options.fma else 'false'}") + nvvm_options.append(f"-fma={'1' if options.fma else '0'}") return nvvm_options From 4d32276c54e9015f2a8e7748bf498d3244d8db89 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 17 Sep 2025 20:22:17 +0000 Subject: [PATCH 41/42] remove redundant IR & fix linter --- cuda_core/tests/test_program.py | 47 ++++++--------------------------- 1 file changed, 8 insertions(+), 39 deletions(-) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index a9f560ae95..a842b293eb 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -88,7 +88,9 @@ def _get_libnvvm_version_for_tests(): try: major, minor, debug_major, debug_minor = nvvm.ir_version() global precheck_nvvm_ir - precheck_nvvm_ir = precheck_nvvm_ir.format(major=major, minor=minor, debug_major=debug_major, debug_minor=debug_minor) + precheck_nvvm_ir = precheck_nvvm_ir.format( + major=major, minor=minor, debug_major=debug_major, debug_minor=debug_minor + ) precheck_ir_bytes = precheck_nvvm_ir.encode("utf-8") nvvm.add_module_to_program(program, precheck_ir_bytes, len(precheck_ir_bytes), "precheck.ll") @@ -120,13 +122,12 @@ def nvvm_ir(): fallback assumes no version metadata will be present in the input nvvm ir """ - try: - from cuda.core.experimental._program import _get_nvvm_module + from cuda.core.experimental._program import _get_nvvm_module - nvvm = _get_nvvm_module() - major, minor, debug_major, debug_minor = nvvm.ir_version() + nvvm = _get_nvvm_module() + major, minor, debug_major, debug_minor = nvvm.ir_version() - nvvm_ir_template = """target triple = "nvptx64-unknown-cuda" + nvvm_ir_template = """target triple = "nvptx64-unknown-cuda" target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" define i32 @ave(i32 %a, i32 %b) {{ @@ -159,39 +160,7 @@ def nvvm_ir(): !nvvmir.version = !{{!1}} !1 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}} """ # noqa: E501 - - return nvvm_ir_template.format(major=major, minor=minor, debug_major=debug_major, debug_minor=debug_minor) - except Exception: - return """target triple = "nvptx64-unknown-cuda" -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" - -define i32 @ave(i32 %a, i32 %b) { -entry: - %add = add nsw i32 %a, %b - %div = sdiv i32 %add, 2 - ret i32 %div -} - -define void @simple(i32* %data) { -entry: - %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() - %1 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() - %mul = mul i32 %0, %1 - %2 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %add = add i32 %mul, %2 - %call = call i32 @ave(i32 %add, i32 %add) - %idxprom = sext i32 %add to i64 - store i32 %call, i32* %data, align 4 - ret void -} - -declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() nounwind readnone 
-declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() nounwind readnone -declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() nounwind readnone - -!nvvm.annotations = !{!0} -!0 = !{void (i32*)* @simple, !"kernel", i32 1} -""" # noqa: E501 + return nvvm_ir_template.format(major=major, minor=minor, debug_major=debug_major, debug_minor=debug_minor) @pytest.fixture(scope="module") From e5b5ea4c7ef2d35ca13d60deb698f3a110a266dd Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Wed, 17 Sep 2025 20:31:59 +0000 Subject: [PATCH 42/42] avoid extra copy + ensure compiled objcode loadable --- cuda_core/cuda/core/experimental/_module.py | 6 +++--- cuda_core/cuda/core/experimental/_program.py | 3 +-- .../cuda/core/experimental/_utils/clear_error_support.py | 6 +++--- cuda_core/tests/test_program.py | 4 +++- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py index c659a8d782..71293be4d1 100644 --- a/cuda_core/cuda/core/experimental/_module.py +++ b/cuda_core/cuda/core/experimental/_module.py @@ -11,7 +11,7 @@ from cuda.core.experimental._stream import Stream from cuda.core.experimental._utils.clear_error_support import ( assert_type, - assert_type_str_or_bytes, + assert_type_str_or_bytes_like, raise_code_path_meant_to_be_unreachable, ) from cuda.core.experimental._utils.cuda_utils import driver, get_binding_version, handle_return, precondition @@ -615,14 +615,14 @@ def _lazy_load_module(self, *args, **kwargs): if self._handle is not None: return module = self._module - assert_type_str_or_bytes(module) + assert_type_str_or_bytes_like(module) if isinstance(module, str): if self._backend_version == "new": self._handle = handle_return(self._loader["file"](module.encode(), [], [], 0, [], [], 0)) else: # "old" backend self._handle = handle_return(self._loader["file"](module.encode())) return - if isinstance(module, bytes): + if isinstance(module, (bytes, bytearray)): if self._backend_version == "new": self._handle = handle_return(self._loader["data"](module, [], [], 0, [], [], 0)) else: # "old" backend diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 9b28115d4e..dee6f001e7 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -657,8 +657,7 @@ def compile(self, target_type, name_expressions=(), logs=None): nvvm.get_program_log(self._mnff.handle, log) logs.write(log.decode("utf-8", errors="backslashreplace")) - data_bytes = bytes(data) - return ObjectCode._init(data_bytes, target_type, name=self._options.name) + return ObjectCode._init(data, target_type, name=self._options.name) supported_backends = ("nvJitLink", "driver") if self._backend not in supported_backends: diff --git a/cuda_core/cuda/core/experimental/_utils/clear_error_support.py b/cuda_core/cuda/core/experimental/_utils/clear_error_support.py index b13a3d6b02..0410e7aa2f 100644 --- a/cuda_core/cuda/core/experimental/_utils/clear_error_support.py +++ b/cuda_core/cuda/core/experimental/_utils/clear_error_support.py @@ -9,10 +9,10 @@ def assert_type(obj, expected_type): raise TypeError(f"Expected type {expected_type.__name__}, but got {type(obj).__name__}") -def assert_type_str_or_bytes(obj): +def assert_type_str_or_bytes_like(obj): """Ensure obj is of type str or bytes, else raise AssertionError with a clear message.""" - if not isinstance(obj, (str, bytes)): - raise TypeError(f"Expected type str or bytes, but got {type(obj).__name__}") + if not 
isinstance(obj, (str, bytes, bytearray)): + raise TypeError(f"Expected type str or bytes or bytearray, but got {type(obj).__name__}") def raise_code_path_meant_to_be_unreachable(): diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index a842b293eb..d30b845c20 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -312,11 +312,13 @@ def test_nvvm_deferred_import(): @nvvm_available -def test_nvvm_program_creation(nvvm_ir): +def test_nvvm_program_creation_compilation(nvvm_ir): """Test basic NVVM program creation""" program = Program(nvvm_ir, "nvvm") assert program.backend == "NVVM" assert program.handle is not None + obj = program.compile("ptx") + ker = obj.get_kernel("simple") # noqa: F841 program.close()
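
Because libNVVM exposes no direct version query (nvbugs 5312315, noted in patch 22), the tests settle on probing it: compile a trivial module, read the PTX ISA version of the output, and map that back onto a CUDA version. A condensed sketch of that probe, assuming cuda-bindings >= 12.9 with get_ptx_ver and get_minimal_required_cuda_ver_from_ptx_ver available in cuda.bindings.utils, and any valid NVVM module (such as the dummy kernel used throughout the series) as input:

    from cuda.bindings import nvvm
    from cuda.bindings.utils import get_minimal_required_cuda_ver_from_ptx_ver, get_ptx_ver


    def guess_libnvvm_cuda_version(ir: str) -> int:
        # Compile a trivial NVVM module and map the PTX ISA version of the
        # output back onto the minimal CUDA version that understands it.
        prog = nvvm.create_program()
        try:
            buf = ir.encode("utf-8")
            nvvm.add_module_to_program(prog, buf, len(buf), "probe.ll")
            opts = ["-arch=compute_90"]
            nvvm.verify_program(prog, len(opts), opts)
            nvvm.compile_program(prog, len(opts), opts)
            out = bytearray(nvvm.get_compiled_result_size(prog))
            nvvm.get_compiled_result(prog, out)
            ptx_ver = get_ptx_ver(out.decode("utf-8"))
            return get_minimal_required_cuda_ver_from_ptx_ver(ptx_ver)
        finally:
            nvvm.destroy_program(prog)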