diff --git a/build2cmake/src/config/mod.rs b/build2cmake/src/config/mod.rs index 83a64732..8d69276a 100644 --- a/build2cmake/src/config/mod.rs +++ b/build2cmake/src/config/mod.rs @@ -44,6 +44,7 @@ pub struct General { pub python_depends: Option>, pub cuda: Option, + pub neuron: Option, pub xpu: Option, } @@ -106,6 +107,10 @@ pub struct XpuGeneral { pub python_depends: Option>, } +pub struct NeuronGeneral { + pub python_depends: Option>, +} + pub struct Hub { pub repo_id: Option, pub branch: Option, @@ -237,16 +242,18 @@ pub enum Backend { Cpu, Cuda, Metal, + Neuron, Rocm, Xpu, } impl Backend { - pub const fn all() -> [Backend; 5] { + pub const fn all() -> [Backend; 6] { [ Backend::Cpu, Backend::Cuda, Backend::Metal, + Backend::Neuron, Backend::Rocm, Backend::Xpu, ] @@ -259,6 +266,7 @@ impl Display for Backend { Backend::Cpu => write!(f, "cpu"), Backend::Cuda => write!(f, "cuda"), Backend::Metal => write!(f, "metal"), + Backend::Neuron => write!(f, "neuron"), Backend::Rocm => write!(f, "rocm"), Backend::Xpu => write!(f, "xpu"), } @@ -273,6 +281,7 @@ impl FromStr for Backend { "cpu" => Ok(Backend::Cpu), "cuda" => Ok(Backend::Cuda), "metal" => Ok(Backend::Metal), + "neuron" => Ok(Backend::Neuron), "rocm" => Ok(Backend::Rocm), "xpu" => Ok(Backend::Xpu), _ => Err(format!("Unknown backend: {s}")), diff --git a/build2cmake/src/config/v1.rs b/build2cmake/src/config/v1.rs index 687b0eea..3c69bbea 100644 --- a/build2cmake/src/config/v1.rs +++ b/build2cmake/src/config/v1.rs @@ -86,6 +86,7 @@ impl TryFrom for super::Build { Backend::Cpu, Backend::Cuda, Backend::Metal, + Backend::Neuron, Backend::Rocm, Backend::Xpu, ] @@ -102,6 +103,7 @@ impl TryFrom for super::Build { license: None, backends, hub: None, + neuron: None, python_depends: None, cuda: None, xpu: None, diff --git a/build2cmake/src/config/v2.rs b/build2cmake/src/config/v2.rs index b404aa06..871cb093 100644 --- a/build2cmake/src/config/v2.rs +++ b/build2cmake/src/config/v2.rs @@ -132,6 +132,7 @@ impl TryFrom for 
super::Build { Backend::Cpu, Backend::Cuda, Backend::Metal, + Backend::Neuron, Backend::Rocm, Backend::Xpu, ] @@ -168,6 +169,7 @@ impl General { backends, cuda, hub: general.hub.map(Into::into), + neuron: None, python_depends: None, xpu: None, } diff --git a/build2cmake/src/config/v3.rs b/build2cmake/src/config/v3.rs index 948b4563..6592c29d 100644 --- a/build2cmake/src/config/v3.rs +++ b/build2cmake/src/config/v3.rs @@ -31,6 +31,8 @@ pub struct General { pub hub: Option, + pub neuron: Option, + pub python_depends: Option>, pub xpu: Option, @@ -44,6 +46,12 @@ pub struct CudaGeneral { pub python_depends: Option>, } +#[derive(Debug, Deserialize, Serialize)] +#[serde(deny_unknown_fields, rename_all = "kebab-case")] +pub struct NeuronGeneral { + pub python_depends: Option>, +} + #[derive(Debug, Deserialize, Serialize)] #[serde(deny_unknown_fields, rename_all = "kebab-case")] pub struct XpuGeneral { @@ -121,6 +129,7 @@ pub enum Backend { Cpu, Cuda, Metal, + Neuron, Rocm, Xpu, } @@ -150,6 +159,7 @@ impl From for super::General { backends: general.backends.into_iter().map(Into::into).collect(), cuda: general.cuda.map(Into::into), hub: general.hub.map(Into::into), + neuron: general.neuron.map(Into::into), python_depends: general.python_depends, xpu: general.xpu.map(Into::into), } @@ -166,6 +176,14 @@ impl From for super::CudaGeneral { } } +impl From for super::NeuronGeneral { + fn from(neuron: NeuronGeneral) -> Self { + Self { + python_depends: neuron.python_depends, + } + } +} + impl From for super::XpuGeneral { fn from(xpu: XpuGeneral) -> Self { Self { @@ -201,6 +219,7 @@ impl From for super::Backend { Backend::Cpu => super::Backend::Cpu, Backend::Cuda => super::Backend::Cuda, Backend::Metal => super::Backend::Metal, + Backend::Neuron => super::Backend::Neuron, Backend::Rocm => super::Backend::Rocm, Backend::Xpu => super::Backend::Xpu, } @@ -304,6 +323,7 @@ impl From for General { backends: general.backends.into_iter().map(Into::into).collect(), cuda: 
general.cuda.map(Into::into), hub: general.hub.map(Into::into), + neuron: general.neuron.map(Into::into), python_depends: general.python_depends, xpu: general.xpu.map(Into::into), } @@ -320,6 +340,14 @@ impl From for CudaGeneral { } } +impl From for NeuronGeneral { + fn from(neuron: super::NeuronGeneral) -> Self { + Self { + python_depends: neuron.python_depends, + } + } +} + impl From for XpuGeneral { fn from(xpu: super::XpuGeneral) -> Self { Self { @@ -355,6 +383,7 @@ impl From for Backend { super::Backend::Cpu => Backend::Cpu, super::Backend::Cuda => Backend::Cuda, super::Backend::Metal => Backend::Metal, + super::Backend::Neuron => Backend::Neuron, super::Backend::Rocm => Backend::Rocm, super::Backend::Xpu => Backend::Xpu, } diff --git a/build2cmake/src/python_dependencies.json b/build2cmake/src/python_dependencies.json index 25b5d9e3..a944b3aa 100644 --- a/build2cmake/src/python_dependencies.json +++ b/build2cmake/src/python_dependencies.json @@ -14,6 +14,12 @@ } }, "metal": {}, + "neuron": { + "nki": { + "nix": [], + "python": ["nki"] + } + }, "rocm": {}, "xpu": { "onednn": { diff --git a/build2cmake/src/templates/noarch/setup.py b/build2cmake/src/templates/noarch/setup.py old mode 100755 new mode 100644 index 3e88a51a..9e1a2227 --- a/build2cmake/src/templates/noarch/setup.py +++ b/build2cmake/src/templates/noarch/setup.py @@ -1,8 +1,9 @@ #!/usr/bin/env python -import shutil -from pathlib import Path from typing import Any +from pathlib import Path +import shutil +import sys from setuptools import setup from setuptools.command.build import build @@ -30,7 +31,10 @@ def run(self) -> None: """Execute the build command.""" project_root = Path(__file__).parent - import tomllib + if sys.version_info >= (3, 11): + import tomllib + else: + import tomli as tomllib with open(project_root / "build.toml", "rb") as f: build_toml: dict[str, Any] = tomllib.load(f) diff --git a/builder/examples/relu-nki/build.toml b/builder/examples/relu-nki/build.toml new file mode 100644 
index 00000000..70e4ef7e --- /dev/null +++ b/builder/examples/relu-nki/build.toml @@ -0,0 +1,9 @@ +[general] +name = "relu-nki" +version = 1 +backends = [ + "neuron", +] + +[general.neuron] +python-depends = ["nki"] diff --git a/builder/examples/relu-nki/torch-ext/relu_nki/__init__.py b/builder/examples/relu-nki/torch-ext/relu_nki/__init__.py new file mode 100644 index 00000000..0e3e6e96 --- /dev/null +++ b/builder/examples/relu-nki/torch-ext/relu_nki/__init__.py @@ -0,0 +1,27 @@ +import nki +import nki.language as nl +import nki.isa as nisa + +from ._ops import ops + + +@nki.jit(platform_target="trn2") +def relu(x): + # Check the first dimension's size to ensure it does not exceed on-chip + # memory tile size, since this simple kernel does not tile inputs. + assert x.shape[0] <= nl.tile_size.pmax + x_tile = sbuf.view(dtype=x.dtype, shape=x.shape) + nisa.dma_copy(dst=x_tile, src=x) + out_tile = sbuf.view(dtype=x.dtype, shape=x.shape) + nisa.tensor_scalar(dst=out_tile, data=x_tile, operand0=0, op0=nl.maximum) + c_output = hbm.view(dtype=x.dtype, shape=x.shape) + nisa.dma_copy(dst=c_output, src=out_tile) + return c_output + + +from . import layers + +__all__ = [ + "layers", + "relu", +] diff --git a/builder/examples/relu-nki/torch-ext/relu_nki/layers/__init__.py b/builder/examples/relu-nki/torch-ext/relu_nki/layers/__init__.py new file mode 100644 index 00000000..7800a2ef --- /dev/null +++ b/builder/examples/relu-nki/torch-ext/relu_nki/layers/__init__.py @@ -0,0 +1,9 @@ +import torch +import torch.nn as nn + +from .. 
import relu + + +class ReLU(nn.Module): + def forward(self, x: torch.Tensor) -> torch.Tensor: + return relu(x) diff --git a/kernels/src/kernels/layer/kernelize.py b/kernels/src/kernels/layer/kernelize.py index 9b318427..4ed37de0 100644 --- a/kernels/src/kernels/layer/kernelize.py +++ b/kernels/src/kernels/layer/kernelize.py @@ -274,7 +274,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: def _validate_device_type(device_type: str) -> None: """Validate that the device type is supported.""" - supported_devices = {"cpu", "cuda", "mps", "npu", "rocm", "xpu"} + supported_devices = {"cpu", "cuda", "mps", "neuron", "npu", "rocm", "xpu"} if device_type not in supported_devices: raise ValueError( f"Unsupported device type '{device_type}'. Supported device types are: {', '.join(sorted(supported_devices))}" @@ -310,3 +310,9 @@ def _is_rocm_platform(): import torch return torch.version.hip is not None + + +def _has_neuron_ops(): + import torch + + return hasattr(torch, "neuron") diff --git a/kernels/src/kernels/layer/repos.py b/kernels/src/kernels/layer/repos.py index 6eee11b3..7b3148c4 100644 --- a/kernels/src/kernels/layer/repos.py +++ b/kernels/src/kernels/layer/repos.py @@ -36,6 +36,8 @@ def create_repo(device: Device) -> "DeviceRepos": return _XPURepos() elif device.type == "npu": return _NPURepos() + elif device.type == "neuron": + return _NeuronRepos() else: raise ValueError(f"Unknown device type: {device.type}") @@ -93,6 +95,26 @@ def insert(self, device: Device, repos: dict[Mode, RepositoryProtocol]): self._repos = repos +class _NeuronRepos(DeviceRepos): + _repos: dict[Mode, RepositoryProtocol] + + def __init__(self): + super().__init__() + self._repos = {} + + @property + def repos( + self, + ) -> dict[Mode, RepositoryProtocol] | None: + return self._repos + + def insert(self, device: Device, repos: dict[Mode, RepositoryProtocol]): + if device.type != "neuron": + raise ValueError(f"Device type must be 'neuron', got {device.type}") + + self._repos = repos + + 
class _NPURepos(DeviceRepos): _repos: dict[Mode, RepositoryProtocol] diff --git a/kernels/src/kernels/python_depends.json b/kernels/src/kernels/python_depends.json index 25b5d9e3..a944b3aa 100644 --- a/kernels/src/kernels/python_depends.json +++ b/kernels/src/kernels/python_depends.json @@ -14,6 +14,12 @@ } }, "metal": {}, + "neuron": { + "nki": { + "nix": [], + "python": ["nki"] + } + }, "rocm": {}, "xpu": { "onednn": { diff --git a/kernels/src/kernels/utils.py b/kernels/src/kernels/utils.py index 7ee546f1..2dcdd3e6 100644 --- a/kernels/src/kernels/utils.py +++ b/kernels/src/kernels/utils.py @@ -22,7 +22,7 @@ from kernels.lockfile import KernelLock, VariantLock from kernels.metadata import Metadata -KNOWN_BACKENDS = {"cpu", "cuda", "metal", "rocm", "xpu", "npu"} +KNOWN_BACKENDS = {"cpu", "cuda", "metal", "neuron", "rocm", "xpu", "npu"} def _get_cache_dir() -> str | None: @@ -74,7 +74,11 @@ def _get_privateuse_backend_name() -> str | None: def _backend() -> str: import torch - if torch.version.cuda is not None: + if hasattr(torch, "neuron"): + # Needs to be sorted before specific Torch builds, since Neuron + # extension can be loaded into e.g. CUDA Torch builds. + return "neuron" + elif torch.version.cuda is not None: return "cuda" elif torch.version.hip is not None: return "rocm" @@ -88,7 +92,11 @@ def _backend() -> str: return "cpu" -def _build_variant(backend: str | None) -> str: +def _build_variant(backend: str | None) -> str | None: + """ + Build variant for arch kernels, returns `None` when the backend + does not (yet) support arch kernels.
+ """ backend = _select_backend(backend) import torch @@ -101,6 +109,8 @@ def _build_variant(backend: str | None) -> str: compute_framework = f"rocm{rocm_version.major}{rocm_version.minor}" elif backend == "metal": compute_framework = "metal" + elif backend == "neuron": + return None elif backend == "xpu" and torch.version.xpu is not None: version = torch.version.xpu compute_framework = f"xpu{version[0:4]}{version[5:6]}" @@ -149,6 +159,8 @@ def _build_variant_noarch(backend: str | None) -> str: if backend == "cuda": return "torch-cuda" + elif backend == "neuron": + return "torch-neuron" elif backend == "rocm": return "torch-rocm" elif backend == "metal": @@ -168,11 +180,15 @@ def _build_variant_universal() -> str: def _build_variants(backend: str | None) -> list[str]: """Return compatible build variants in preferred order.""" - return [ - _build_variant(backend), - _build_variant_noarch(backend), - _build_variant_universal(), - ] + arch_variant = _build_variant(backend) + variants = [arch_variant] if arch_variant is not None else [] + variants.extend( + [ + _build_variant_noarch(backend), + _build_variant_universal(), + ] + ) + return variants def _import_from_path(module_name: str, variant_path: Path) -> ModuleType: diff --git a/kernels/tests/conftest.py b/kernels/tests/conftest.py index 4867b9c0..08fe3f28 100644 --- a/kernels/tests/conftest.py +++ b/kernels/tests/conftest.py @@ -10,6 +10,9 @@ and torch.version.cuda is not None and torch.cuda.device_count() > 0 ) + +has_neuron = hasattr(torch, "neuron") and torch.neuron.device_count() > 0 + has_rocm = ( hasattr(torch.version, "hip") and torch.version.hip is not None @@ -46,6 +49,8 @@ def device(): def pytest_runtest_setup(item): if "cuda_only" in item.keywords and not has_cuda: pytest.skip("skipping CUDA-only test on host without CUDA") + if "neuron_only" in item.keywords and not has_neuron: + pytest.skip("skipping Neuron-only test on host without Neuron") if "rocm_only" in item.keywords and not has_rocm: 
pytest.skip("skipping ROCm-only test on host without ROCm") if "darwin_only" in item.keywords and not sys.platform.startswith("darwin"): diff --git a/kernels/tests/test_basic.py b/kernels/tests/test_basic.py index 241c9b2a..5a12ef50 100644 --- a/kernels/tests/test_basic.py +++ b/kernels/tests/test_basic.py @@ -262,6 +262,13 @@ def test_local_overrides(monkeypatch, local_kernel_path): get_kernel("kernels-test/activation") +@pytest.mark.neuron_only +def test_neuron(): + relu = get_kernel("kernels-test/relu-nki", version=1) + x = torch.randn((16, 16), dtype=torch.float16).to(device="neuron") + torch.testing.assert_close(relu.relu(x), x.relu()) + + def silu_and_mul_torch(x: torch.Tensor): d = x.shape[-1] // 2 return F.silu(x[..., :d]) * x[..., d:] diff --git a/kernels/tests/test_init.py b/kernels/tests/test_init.py index 67ec09f0..84e3cddb 100644 --- a/kernels/tests/test_init.py +++ b/kernels/tests/test_init.py @@ -34,7 +34,8 @@ def e2e_init(backends: list[str]) -> None: expected_backend_dirs.remove(Path(f"{expected_normalized_name}_rocm")) expected_backend_dirs.add(Path(f"{expected_normalized_name}_cuda")) - # TODO: npu is not yet supported in the template + # TODO: neuron/npu are not yet supported in the template + expected_backend_dirs.discard(Path(f"{expected_normalized_name}_neuron")) expected_backend_dirs.discard(Path(f"{expected_normalized_name}_npu")) with tempfile.TemporaryDirectory() as tmpdir: diff --git a/kernels/tests/test_layer.py b/kernels/tests/test_layer.py index 05a586b0..0e203ab6 100644 --- a/kernels/tests/test_layer.py +++ b/kernels/tests/test_layer.py @@ -94,6 +94,23 @@ class RMSNormWithKernel(RMSNorm): pass +class ReLU(nn.Module): + def __init__(self): + super().__init__() + # Used to check that we called hub kernel. 
+ self.n_calls = 0 + + def forward(self, input: torch.Tensor) -> torch.Tensor: + self.n_calls += 1 + d = input.shape[-1] // 2 + return F.relu(input) + + +@use_kernel_forward_from_hub("ReLU") +class ReLUWithKernel(ReLU): + pass + + class SiluAndMul(nn.Module): def __init__(self): super().__init__() @@ -198,6 +215,55 @@ def test_hub_func(cls): assert silu_and_mul_with_kernel.n_calls == 0 +@pytest.mark.neuron_only +def test_hub_forward_neuron(): + torch.manual_seed(0) + + mapping = { + "ReLU": { + "neuron": LayerRepository( + repo_id="kernels-test/relu-nki", version=1, layer_name="ReLU" + ) + } + } + + relu = ReLU() + X = torch.randn((16, 16), device="neuron") + Y = relu(X) + + with use_kernel_mapping(mapping): + relu_with_kernel = kernelize( + ReLUWithKernel(), device="neuron", mode=Mode.INFERENCE + ) + Y_kernel = relu_with_kernel(X) + + torch.testing.assert_close(Y_kernel, Y) + + assert relu.n_calls == 1 + assert relu_with_kernel.n_calls == 0 + + # Check that the device type can be determined automatically. + class SMOL(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(16, 16) + self.relu = ReLUWithKernel() + + def forward(self, x): + return self.relu(self.linear(x)) + + smol = SMOL().to("neuron") + + Y = smol(X) + + with use_kernel_mapping(mapping): + smol = kernelize(smol, mode=Mode.INFERENCE) + Y_kernel = smol(X) + + torch.testing.assert_close(Y, Y_kernel) + assert smol.relu.n_calls == 1 + + @pytest.mark.rocm_only def test_hub_forward_rocm(): torch.manual_seed(0)