From e06bee67b3d7f64fb8867fce75a40bb0c685eb7c Mon Sep 17 00:00:00 2001
From: ksimpson <ksimpson@nvidia.com>
Date: Tue, 4 Feb 2025 14:21:51 -0800
Subject: [PATCH 1/8] remove jit options from lazy module load

---
 cuda_core/cuda/core/experimental/_module.py | 26 ++-------------------
 1 file changed, 2 insertions(+), 24 deletions(-)
diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py
index 36178f5d71..4ac0c33b7f 100644
--- a/cuda_core/cuda/core/experimental/_module.py
+++ b/cuda_core/cuda/core/experimental/_module.py
@@ -128,37 +128,15 @@ def __init__(self, module, code_type, jit_options=None, *, symbol_mapping=None):
     def _lazy_load_module(self, *args, **kwargs):
         if self._handle is not None:
             return
-        jit_options = self._jit_options
         module = self._module
         if isinstance(module, str):
-            # TODO: this option is only taken by the new library APIs, but we have
-            # a bug that we can't easily support it just yet (NVIDIA/cuda-python#73).
-            if jit_options is not None:
-                raise ValueError
             self._handle = handle_return(self._loader["file"](module))
         else:
             assert isinstance(module, bytes)
-            if jit_options is None:
-                jit_options = {}
             if self._backend_version == "new":
-                args = (
-                    module,
-                    list(jit_options.keys()),
-                    list(jit_options.values()),
-                    len(jit_options),
-                    # TODO: support library options
-                    [],
-                    [],
-                    0,
-                )
+                self._handle = handle_return(self._loader["data"](module, [], [], 0, [], [], 0))
             else:  # "old" backend
-                args = (
-                    module,
-                    len(jit_options),
-                    list(jit_options.keys()),
-                    list(jit_options.values()),
-                )
-            self._handle = handle_return(self._loader["data"](*args))
+                self._handle = handle_return(self._loader["data"](module, 0, [], []))
 
     @precondition(_lazy_load_module)
     def get_kernel(self, name):

From f2d3d85a0bfdda009835f0d0faaa3d7f25674aa0 Mon Sep 17 00:00:00 2001
From: ksimpson <ksimpson@nvidia.com>
Date: Tue, 4 Feb 2025 14:34:20 -0800
Subject: [PATCH 2/8] add note to release notes

---
 cuda_core/docs/source/release/0.2.0-notes.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cuda_core/docs/source/release/0.2.0-notes.rst b/cuda_core/docs/source/release/0.2.0-notes.rst
index 0a34f825a9..4fdb7dbfa7 100644
--- a/cuda_core/docs/source/release/0.2.0-notes.rst
+++ b/cuda_core/docs/source/release/0.2.0-notes.rst
@@ -3,7 +3,7 @@
 ``cuda.core`` 0.2.0 Release Notes
 =================================
 
-Released on <TODO>, 2024
+Released on <TODO>, 2025
 
 Highlights
 ----------
@@ -20,3 +20,4 @@ Breaking Changes
 
 - Change ``__cuda_stream__`` from attribute to method
 - The :meth:`~Program.compile` method no longer accepts the `options` argument. Instead, you can optionally pass an instance of :class:`~ProgramOptions` to the constructor of :obj:`~Program`.
+- The internal constructor of :class:`~ObjectCode` no longer accepts the jit_options argument. Options are provided to upstream :class:`~ProgramOptions` or :class:`~LinkerOptions` instead.

From 9bb4651c6c1c040eef9b1577a5c52059464f1709 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sun, 16 Feb 2025 05:55:08 +0000
Subject: [PATCH 3/8] organize ObjectCode __init__

---
 cuda_core/cuda/core/experimental/_linker.py  |  2 +-
 cuda_core/cuda/core/experimental/_module.py  | 47 ++++++++++++--------
 cuda_core/cuda/core/experimental/_program.py |  4 +-
 3 files changed, 32 insertions(+), 21 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py
index 7736d7b2d2..2f84f3502a 100644
--- a/cuda_core/cuda/core/experimental/_linker.py
+++ b/cuda_core/cuda/core/experimental/_linker.py
@@ -439,7 +439,7 @@ def link(self, target_type) -> ObjectCode:
                 addr, size = handle_return(_driver.cuLinkComplete(self._mnff.handle))
                 code = (ctypes.c_char * size).from_address(addr)
 
-        return ObjectCode(bytes(code), target_type)
+        return ObjectCode._init(bytes(code), target_type)
 
     def get_error_log(self) -> str:
         """Get the error log generated by the linker.
diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py
index 0687dc69e5..d713eb9fca 100644
--- a/cuda_core/cuda/core/experimental/_module.py
+++ b/cuda_core/cuda/core/experimental/_module.py
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-
+from typing import Union
 from warnings import warn
 
 from cuda.core.experimental._utils import driver, get_binding_version, handle_return, precondition
@@ -220,6 +220,12 @@ class ObjectCode:
 
     Loads the module library with specified module code and JIT options.
 
+    Note
+    ----
+    The public constructor assumes that ``module`` is of code type "cubin".
+    For all other possible code types (ex: "ptx"), only :class:`~cuda.core.experimental.Program`
+    accepts them and returns an `ObjectCode` instance with its ``compile`` method.
+
     Note
     ----
     Usage under CUDA 11.x will only load to the current device
@@ -228,32 +234,32 @@ class ObjectCode:
     Parameters
     ----------
     module : Union[bytes, str]
-        Either a bytes object containing the module to load, or
-        a file path string containing that module for loading.
-    code_type : Any
-        String of the compiled type.
-        Supported options are "ptx", "cubin", "ltoir" and "fatbin".
-    jit_options : Optional
-        Mapping of JIT options to use during module loading.
-        (Default to no options)
-    symbol_mapping : Optional
-        Keyword argument dictionary specifying how symbol names
-        should be mapped before trying to retrieve them.
-        (Default to no mappings)
-
+        Either a bytes object containing the cubin to load, or
+        a file path string pointing to the cubin to load.
     """
 
-    __slots__ = ("_handle", "_backend_version", "_jit_options", "_code_type", "_module", "_loader", "_sym_map")
+    __slots__ = ("_handle", "_backend_version", "_code_type", "_module", "_loader", "_sym_map")
     _supported_code_type = ("cubin", "ptx", "ltoir", "fatbin")
 
-    def __init__(self, module, code_type, jit_options=None, *, symbol_mapping=None):
+    def __init__(self, module: Union[bytes, str]):
+        _lazy_init()
+
+        # handle is assigned during _lazy_load
+        self._handle = None
+        self._backend_version = "new" if (_py_major_ver >= 12 and _driver_ver >= 12000) else "old"
+        self._loader = _backend[self._backend_version]
+        self._code_type = "cubin"
+        self._module = module
+        self._sym_map = {}
+
+    def _init(module, code_type, *, symbol_mapping=None):
+        self = ObjectCode.__new__(ObjectCode)
         if code_type not in self._supported_code_type:
             raise ValueError
         _lazy_init()
 
         # handle is assigned during _lazy_load
         self._handle = None
-        self._jit_options = jit_options
 
         self._backend_version = "new" if (_py_major_ver >= 12 and _driver_ver >= 12000) else "old"
         self._loader = _backend[self._backend_version]
@@ -262,6 +268,8 @@ def __init__(self, module, code_type, jit_options=None, *, symbol_mapping=None):
         self._module = module
         self._sym_map = {} if symbol_mapping is None else symbol_mapping
 
+        return self
+
     # TODO: do we want to unload in a finalizer? Probably not..
 
     def _lazy_load_module(self, *args, **kwargs):
@@ -269,7 +277,10 @@ def _lazy_load_module(self, *args, **kwargs):
             return
         module = self._module
         if isinstance(module, str):
-            self._handle = handle_return(self._loader["file"](module))
+            if self._backend_version == "new":
+                self._handle = handle_return(self._loader["file"](module, [], [], 0, [], [], 0))
+            else:  # "old" backend
+                self._handle = handle_return(self._loader["file"](module))
         else:
             assert isinstance(module, bytes)
             if self._backend_version == "new":
diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index f938895ed7..b1fb0d90f9 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -386,7 +386,7 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
             if not isinstance(code, str):
                 raise TypeError("ptx Program expects code argument to be a string")
             self._linker = Linker(
-                ObjectCode(code.encode(), code_type), options=self._translate_program_options(options)
+                ObjectCode._init(code.encode(), code_type), options=self._translate_program_options(options)
             )
             self._backend = "linker"
         else:
@@ -472,7 +472,7 @@ def compile(self, target_type, name_expressions=(), logs=None):
                     handle_return(nvrtc.nvrtcGetProgramLog(self._mnff.handle, log), handle=self._mnff.handle)
                     logs.write(log.decode())
 
-            return ObjectCode(data, target_type, symbol_mapping=symbol_mapping)
+            return ObjectCode._init(data, target_type, symbol_mapping=symbol_mapping)
 
         if self._backend == "linker":
             return self._linker.link(target_type)

From 5349bd19b43f11efcb1bbd3936aaf009636fd6b9 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sun, 16 Feb 2025 06:30:32 +0000
Subject: [PATCH 4/8] expose ObjectCode to public + fix file loading

---
 cuda_core/cuda/core/experimental/__init__.py  |  1 +
 cuda_core/cuda/core/experimental/_module.py   | 16 +++++++-----
 cuda_core/docs/source/release/0.2.0-notes.rst | 18 ++++++++-----
 cuda_core/tests/test_module.py                | 26 ++++++++++++++++---
 4 files changed, 45 insertions(+), 16 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py
index 3db9e8abb4..6e289d49b3 100644
--- a/cuda_core/cuda/core/experimental/__init__.py
+++ b/cuda_core/cuda/core/experimental/__init__.py
@@ -7,6 +7,7 @@
 from cuda.core.experimental._event import EventOptions
 from cuda.core.experimental._launcher import LaunchConfig, launch
 from cuda.core.experimental._linker import Linker, LinkerOptions
+from cuda.core.experimental._module import ObjectCode
 from cuda.core.experimental._program import Program, ProgramOptions
 from cuda.core.experimental._stream import Stream, StreamOptions
 from cuda.core.experimental._system import System
diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py
index d713eb9fca..3c55bccdb9 100644
--- a/cuda_core/cuda/core/experimental/_module.py
+++ b/cuda_core/cuda/core/experimental/_module.py
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-from typing import Union
+from typing import Optional, Union
 from warnings import warn
 
 from cuda.core.experimental._utils import driver, get_binding_version, handle_return, precondition
@@ -236,12 +236,16 @@ class ObjectCode:
     module : Union[bytes, str]
         Either a bytes object containing the cubin to load, or
         a file path string pointing to the cubin to load.
+    symbol_mapping : Optional[dict]
+        A dictionary specifying how the unmangled symbol names (as keys)
+        should be mapped to the mangled names before trying to retrieve
+        them (default to no mappings).
     """
 
     __slots__ = ("_handle", "_backend_version", "_code_type", "_module", "_loader", "_sym_map")
     _supported_code_type = ("cubin", "ptx", "ltoir", "fatbin")
 
-    def __init__(self, module: Union[bytes, str]):
+    def __init__(self, module: Union[bytes, str], *, symbol_mapping: Optional[dict]=None):
         _lazy_init()
 
         # handle is assigned during _lazy_load
@@ -250,9 +254,9 @@ def __init__(self, module: Union[bytes, str]):
         self._loader = _backend[self._backend_version]
         self._code_type = "cubin"
         self._module = module
-        self._sym_map = {}
+        self._sym_map = {} if symbol_mapping is None else symbol_mapping
 
-    def _init(module, code_type, *, symbol_mapping=None):
+    def _init(module, code_type, *, symbol_mapping: Optional[dict]=None):
         self = ObjectCode.__new__(ObjectCode)
         if code_type not in self._supported_code_type:
             raise ValueError
@@ -278,9 +282,9 @@ def _lazy_load_module(self, *args, **kwargs):
         module = self._module
         if isinstance(module, str):
             if self._backend_version == "new":
-                self._handle = handle_return(self._loader["file"](module, [], [], 0, [], [], 0))
+                self._handle = handle_return(self._loader["file"](module.encode(), [], [], 0, [], [], 0))
             else:  # "old" backend
-                self._handle = handle_return(self._loader["file"](module))
+                self._handle = handle_return(self._loader["file"](module.encode()))
         else:
             assert isinstance(module, bytes)
             if self._backend_version == "new":
diff --git a/cuda_core/docs/source/release/0.2.0-notes.rst b/cuda_core/docs/source/release/0.2.0-notes.rst
index bf5705839a..47217453b0 100644
--- a/cuda_core/docs/source/release/0.2.0-notes.rst
+++ b/cuda_core/docs/source/release/0.2.0-notes.rst
@@ -12,15 +12,19 @@ Highlights
 - Add :class:`~DeviceProperties` to provide pythonic access to device properties.
 - Add kernel attributes to :class:`~Kernel`
 
-Limitations
------------
-
-- <TODO>
-
 Breaking Changes
 ----------------
 
 - Change ``__cuda_stream__`` from attribute to method
 - The :meth:`~Program.compile` method no longer accepts the `options` argument. Instead, you can optionally pass an instance of :class:`~ProgramOptions` to the constructor of :obj:`~Program`.
-- The internal constructor of :class:`~ObjectCode` no longer accepts the jit_options argument. Options are provided to upstream :class:`~ProgramOptions` or :class:`~LinkerOptions` instead.
-- :meth: `~Device.properties` now provides an instance of :class:`~DeviceProperties` instead of a dictionary. 
+- :meth: `~Device.properties` now provides an instance of :class:`~DeviceProperties` instead of a dictionary.
+
+New features
+------------
+
+- Expose :class:`ObjectCode` as a public API, which allows loading cubins from memory or disk. To load other code types, please continue using :class:`Program`.
+
+Limitations
+-----------
+
+- <TODO>
diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py
index 9f126fa179..33501999df 100644
--- a/cuda_core/tests/test_module.py
+++ b/cuda_core/tests/test_module.py
@@ -10,7 +10,7 @@
 import pytest
 from conftest import can_load_generated_ptx
 
-from cuda.core.experimental import Program, ProgramOptions, system
+from cuda.core.experimental import ObjectCode, Program, ProgramOptions, system
 
 
 @pytest.fixture(scope="function")
@@ -37,7 +37,7 @@ def get_saxpy_kernel(init_cuda):
     )
 
     # run in single precision
-    return mod.get_kernel("saxpy<float>")
+    return mod.get_kernel("saxpy<float>"), mod
 
 
 @pytest.mark.xfail(not can_load_generated_ptx(), reason="PTX version too new")
@@ -72,7 +72,7 @@ def test_get_kernel(init_cuda):
     ],
 )
 def test_read_only_kernel_attributes(get_saxpy_kernel, attr, expected_type):
-    kernel = get_saxpy_kernel
+    kernel, _ = get_saxpy_kernel
     method = getattr(kernel.attributes, attr)
     # get the value without providing a device ordinal
     value = method()
@@ -82,3 +82,23 @@ def test_read_only_kernel_attributes(get_saxpy_kernel, attr, expected_type):
     for device in system.devices:
         value = method(device.device_id)
     assert isinstance(value, expected_type), f"Expected {attr} to be of type {expected_type}, but got {type(value)}"
+
+
+def test_object_code_load_cubin(get_saxpy_kernel):
+    _, mod = get_saxpy_kernel
+    cubin = mod._module
+    sym_map = mod._sym_map
+    assert isinstance(cubin, bytes)
+    mod = ObjectCode(cubin, symbol_mapping=sym_map)
+    ker = mod.get_kernel("saxpy<double>")
+
+
+def test_object_code_load_cubin_from_file(get_saxpy_kernel, tmp_path):
+    _, mod = get_saxpy_kernel
+    cubin = mod._module
+    sym_map = mod._sym_map
+    assert isinstance(cubin, bytes)
+    cubin_file = tmp_path / "test.cubin"
+    cubin_file.write_bytes(cubin)
+    mod = ObjectCode(str(cubin_file), symbol_mapping=sym_map)
+    ker = mod.get_kernel("saxpy<double>")

From 57a9b19026b3d15b4ad4af5bce1ccf3ebc05096d Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sun, 16 Feb 2025 06:31:01 +0000
Subject: [PATCH 5/8] update multi-gpu example

---
 .../examples/simple_multi_gpu_example.py      | 20 ++++---------------
 1 file changed, 4 insertions(+), 16 deletions(-)

diff --git a/cuda_core/examples/simple_multi_gpu_example.py b/cuda_core/examples/simple_multi_gpu_example.py
index 7b83d844c6..98969e8fab 100644
--- a/cuda_core/examples/simple_multi_gpu_example.py
+++ b/cuda_core/examples/simple_multi_gpu_example.py
@@ -34,14 +34,8 @@
 }
 """
 arch0 = "".join(f"{i}" for i in dev0.compute_capability)
-prog_add = Program(code_add, code_type="c++")
-mod_add = prog_add.compile(
-    "cubin",
-    options=(
-        "-std=c++17",
-        "-arch=sm_" + arch0,
-    ),
-)
+prog_add = Program(code_add, code_type="c++", options={"std": "c++17", "arch": f"sm_{arch0}"})
+mod_add = prog_add.compile("cubin")
 ker_add = mod_add.get_kernel("vector_add")
 
 # Set GPU 1
@@ -63,14 +57,8 @@
 }
 """
 arch1 = "".join(f"{i}" for i in dev1.compute_capability)
-prog_sub = Program(code_sub, code_type="c++")
-mod_sub = prog_sub.compile(
-    "cubin",
-    options=(
-        "-std=c++17",
-        "-arch=sm_" + arch1,
-    ),
-)
+prog_sub = Program(code_sub, code_type="c++", options={"std": "c++17", "arch": f"sm_{arch0}"})
+mod_sub = prog_sub.compile("cubin")
 ker_sub = mod_sub.get_kernel("vector_sub")
 
 

From 1d4576747cf4372ceaa27b5a3362b54817314c6a Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sun, 16 Feb 2025 06:33:51 +0000
Subject: [PATCH 6/8] make linter happy

---
 cuda_core/cuda/core/experimental/_module.py | 4 ++--
 cuda_core/tests/test_module.py              | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py
index 3c55bccdb9..7119583af2 100644
--- a/cuda_core/cuda/core/experimental/_module.py
+++ b/cuda_core/cuda/core/experimental/_module.py
@@ -245,7 +245,7 @@ class ObjectCode:
     __slots__ = ("_handle", "_backend_version", "_code_type", "_module", "_loader", "_sym_map")
     _supported_code_type = ("cubin", "ptx", "ltoir", "fatbin")
 
-    def __init__(self, module: Union[bytes, str], *, symbol_mapping: Optional[dict]=None):
+    def __init__(self, module: Union[bytes, str], *, symbol_mapping: Optional[dict] = None):
         _lazy_init()
 
         # handle is assigned during _lazy_load
@@ -256,7 +256,7 @@ def __init__(self, module: Union[bytes, str], *, symbol_mapping: Optional[dict]=
         self._module = module
         self._sym_map = {} if symbol_mapping is None else symbol_mapping
 
-    def _init(module, code_type, *, symbol_mapping: Optional[dict]=None):
+    def _init(module, code_type, *, symbol_mapping: Optional[dict] = None):
         self = ObjectCode.__new__(ObjectCode)
         if code_type not in self._supported_code_type:
             raise ValueError
diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py
index 33501999df..814a9e517e 100644
--- a/cuda_core/tests/test_module.py
+++ b/cuda_core/tests/test_module.py
@@ -90,7 +90,7 @@ def test_object_code_load_cubin(get_saxpy_kernel):
     sym_map = mod._sym_map
     assert isinstance(cubin, bytes)
     mod = ObjectCode(cubin, symbol_mapping=sym_map)
-    ker = mod.get_kernel("saxpy<double>")
+    mod.get_kernel("saxpy<double>")  # force loading
 
 
 def test_object_code_load_cubin_from_file(get_saxpy_kernel, tmp_path):
@@ -101,4 +101,4 @@ def test_object_code_load_cubin_from_file(get_saxpy_kernel, tmp_path):
     cubin_file = tmp_path / "test.cubin"
     cubin_file.write_bytes(cubin)
     mod = ObjectCode(str(cubin_file), symbol_mapping=sym_map)
-    ker = mod.get_kernel("saxpy<double>")
+    mod.get_kernel("saxpy<double>")  # force loading

From 87c157175620d2240403f974e105da96661435e5 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Tue, 18 Feb 2025 21:15:21 +0000
Subject: [PATCH 7/8] implement from_cubin; add docs; ensure get_kernel cannot
 be called with lto-ir

---
 cuda_core/cuda/core/experimental/_event.py  |  4 +-
 cuda_core/cuda/core/experimental/_module.py | 63 +++++++++++----------
 cuda_core/docs/source/api.rst               |  1 +
 cuda_core/tests/test_module.py              |  4 +-
 4 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py
index 07e87fb664..06005f95ae 100644
--- a/cuda_core/cuda/core/experimental/_event.py
+++ b/cuda_core/cuda/core/experimental/_event.py
@@ -65,9 +65,7 @@ def close(self):
     __slots__ = ("__weakref__", "_mnff", "_timing_disabled", "_busy_waited")
 
     def __init__(self):
-        raise NotImplementedError(
-            "directly creating an Event object can be ambiguous. Please call call Stream.record()."
-        )
+        raise NotImplementedError("directly creating an Event object can be ambiguous. Please call Stream.record().")
 
     @staticmethod
     def _init(options: Optional[EventOptions] = None):
diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py
index 7119583af2..c4535c2799 100644
--- a/cuda_core/cuda/core/experimental/_module.py
+++ b/cuda_core/cuda/core/experimental/_module.py
@@ -213,53 +213,38 @@ def attributes(self):
 
 
 class ObjectCode:
-    """Represent a compiled program that was loaded onto the device.
+    """Represent a compiled program to be loaded onto the device.
 
     This object provides a unified interface for different types of
-    compiled programs that are loaded onto the device.
-
-    Loads the module library with specified module code and JIT options.
+    compiled programs that will be loaded onto the device.
 
     Note
     ----
-    The public constructor assumes that ``module`` is of code type "cubin".
-    For all other possible code types (ex: "ptx"), only :class:`~cuda.core.experimental.Program`
-    accepts them and returns an `ObjectCode` instance with its ``compile`` method.
+    This class cannot be instantiated directly. If you already have a cubin that you
+    would like to load, use the :meth:`from_cubin` alternative constructor. For all
+    other code types (e.g. "ptx"), use :class:`~cuda.core.experimental.Program`, whose
+    :meth:`~cuda.core.experimental.Program.compile` method returns an
+    :class:`ObjectCode` instance.
 
     Note
     ----
     Usage under CUDA 11.x will only load to the current device
     context.
-
-    Parameters
-    ----------
-    module : Union[bytes, str]
-        Either a bytes object containing the cubin to load, or
-        a file path string pointing to the cubin to load.
-    symbol_mapping : Optional[dict]
-        A dictionary specifying how the unmangled symbol names (as keys)
-        should be mapped to the mangled names before trying to retrieve
-        them (default to no mappings).
     """
 
     __slots__ = ("_handle", "_backend_version", "_code_type", "_module", "_loader", "_sym_map")
     _supported_code_type = ("cubin", "ptx", "ltoir", "fatbin")
 
-    def __init__(self, module: Union[bytes, str], *, symbol_mapping: Optional[dict] = None):
-        _lazy_init()
-
-        # handle is assigned during _lazy_load
-        self._handle = None
-        self._backend_version = "new" if (_py_major_ver >= 12 and _driver_ver >= 12000) else "old"
-        self._loader = _backend[self._backend_version]
-        self._code_type = "cubin"
-        self._module = module
-        self._sym_map = {} if symbol_mapping is None else symbol_mapping
+    def __init__(self):
+        raise NotImplementedError(
+            "directly creating an ObjectCode object can be ambiguous. Please either call Program.compile() "
+            "or one of the ObjectCode.from_*() constructors"
+        )
 
+    @staticmethod
     def _init(module, code_type, *, symbol_mapping: Optional[dict] = None):
         self = ObjectCode.__new__(ObjectCode)
-        if code_type not in self._supported_code_type:
-            raise ValueError
+        assert code_type in self._supported_code_type, f"{code_type=} is not supported"
         _lazy_init()
 
         # handle is assigned during _lazy_load
@@ -274,6 +259,22 @@ def _init(module, code_type, *, symbol_mapping: Optional[dict] = None):
 
         return self
 
+    @staticmethod
+    def from_cubin(module: Union[bytes, str], *, symbol_mapping: Optional[dict] = None) -> "ObjectCode":
+        """Create an :class:`ObjectCode` instance from an existing cubin.
+
+        Parameters
+        ----------
+        module : Union[bytes, str]
+            Either a bytes object containing the in-memory cubin to load, or
+            a file path string pointing to the on-disk cubin to load.
+        symbol_mapping : Optional[dict]
+            A dictionary specifying how the unmangled symbol names (as keys)
+            should be mapped to the mangled names before trying to retrieve
+            them (defaults to no mappings).
+        """
+        return ObjectCode._init(module, "cubin", symbol_mapping=symbol_mapping)
+
     # TODO: do we want to unload in a finalizer? Probably not..
 
     def _lazy_load_module(self, *args, **kwargs):
@@ -307,6 +308,8 @@ def get_kernel(self, name):
             Newly created kernel object.
 
         """
+        if self._code_type not in ("cubin", "ptx", "fatbin"):
+            raise RuntimeError(f"get_kernel() is not supported for {self._code_type}")
         try:
             name = self._sym_map[name]
         except KeyError:
@@ -314,5 +317,3 @@ def get_kernel(self, name):
 
         data = handle_return(self._loader["kernel"](self._handle, name))
         return Kernel._from_obj(data, self)
-
-    # TODO: implement from_handle()
diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst
index f5ee30c1af..b52fda55d0 100644
--- a/cuda_core/docs/source/api.rst
+++ b/cuda_core/docs/source/api.rst
@@ -32,6 +32,7 @@ CUDA compilation toolchain
 
    Program
    Linker
+   ObjectCode
 
    :template: dataclass.rst
 
diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py
index 814a9e517e..f859142c97 100644
--- a/cuda_core/tests/test_module.py
+++ b/cuda_core/tests/test_module.py
@@ -89,7 +89,7 @@ def test_object_code_load_cubin(get_saxpy_kernel):
     cubin = mod._module
     sym_map = mod._sym_map
     assert isinstance(cubin, bytes)
-    mod = ObjectCode(cubin, symbol_mapping=sym_map)
+    mod = ObjectCode.from_cubin(cubin, symbol_mapping=sym_map)
     mod.get_kernel("saxpy<double>")  # force loading
 
 
@@ -100,5 +100,5 @@ def test_object_code_load_cubin_from_file(get_saxpy_kernel, tmp_path):
     assert isinstance(cubin, bytes)
     cubin_file = tmp_path / "test.cubin"
     cubin_file.write_bytes(cubin)
-    mod = ObjectCode(str(cubin_file), symbol_mapping=sym_map)
+    mod = ObjectCode.from_cubin(str(cubin_file), symbol_mapping=sym_map)
     mod.get_kernel("saxpy<double>")  # force loading

From 380896cce8db4d8c8f1bdee3441437714303202b Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Tue, 18 Feb 2025 17:58:13 -0500
Subject: [PATCH 8/8] Apply suggestions from code review

Co-authored-by: Vladislav Zhurba <53052066+vzhurba01@users.noreply.github.com>
---
 cuda_core/docs/source/release/0.2.0-notes.rst  | 2 +-
 cuda_core/examples/simple_multi_gpu_example.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuda_core/docs/source/release/0.2.0-notes.rst b/cuda_core/docs/source/release/0.2.0-notes.rst
index 47217453b0..81e4e12924 100644
--- a/cuda_core/docs/source/release/0.2.0-notes.rst
+++ b/cuda_core/docs/source/release/0.2.0-notes.rst
@@ -17,7 +17,7 @@ Breaking Changes
 
 - Change ``__cuda_stream__`` from attribute to method
 - The :meth:`~Program.compile` method no longer accepts the `options` argument. Instead, you can optionally pass an instance of :class:`~ProgramOptions` to the constructor of :obj:`~Program`.
-- :meth: `~Device.properties` now provides an instance of :class:`~DeviceProperties` instead of a dictionary.
+- :meth:`~Device.properties` now provides an instance of :class:`~DeviceProperties` instead of a dictionary.
 
 New features
 ------------
diff --git a/cuda_core/examples/simple_multi_gpu_example.py b/cuda_core/examples/simple_multi_gpu_example.py
index 98969e8fab..baa9547773 100644
--- a/cuda_core/examples/simple_multi_gpu_example.py
+++ b/cuda_core/examples/simple_multi_gpu_example.py
@@ -57,7 +57,7 @@
 }
 """
 arch1 = "".join(f"{i}" for i in dev1.compute_capability)
-prog_sub = Program(code_sub, code_type="c++", options={"std": "c++17", "arch": f"sm_{arch0}"})
+prog_sub = Program(code_sub, code_type="c++", options={"std": "c++17", "arch": f"sm_{arch1}"})
 mod_sub = prog_sub.compile("cubin")
 ker_sub = mod_sub.get_kernel("vector_sub")