diff --git a/build2cmake/src/config/mod.rs b/build2cmake/src/config/mod.rs index 83a64732..8d69276a 100644 --- a/build2cmake/src/config/mod.rs +++ b/build2cmake/src/config/mod.rs @@ -44,6 +44,7 @@ pub struct General { pub python_depends: Option>, pub cuda: Option, + pub neuron: Option, pub xpu: Option, } @@ -106,6 +107,10 @@ pub struct XpuGeneral { pub python_depends: Option>, } +pub struct NeuronGeneral { + pub python_depends: Option>, +} + pub struct Hub { pub repo_id: Option, pub branch: Option, @@ -237,16 +242,18 @@ pub enum Backend { Cpu, Cuda, Metal, + Neuron, Rocm, Xpu, } impl Backend { - pub const fn all() -> [Backend; 5] { + pub const fn all() -> [Backend; 6] { [ Backend::Cpu, Backend::Cuda, Backend::Metal, + Backend::Neuron, Backend::Rocm, Backend::Xpu, ] @@ -259,6 +266,7 @@ impl Display for Backend { Backend::Cpu => write!(f, "cpu"), Backend::Cuda => write!(f, "cuda"), Backend::Metal => write!(f, "metal"), + Backend::Neuron => write!(f, "neuron"), Backend::Rocm => write!(f, "rocm"), Backend::Xpu => write!(f, "xpu"), } @@ -273,6 +281,7 @@ impl FromStr for Backend { "cpu" => Ok(Backend::Cpu), "cuda" => Ok(Backend::Cuda), "metal" => Ok(Backend::Metal), + "neuron" => Ok(Backend::Neuron), "rocm" => Ok(Backend::Rocm), "xpu" => Ok(Backend::Xpu), _ => Err(format!("Unknown backend: {s}")), diff --git a/build2cmake/src/config/v1.rs b/build2cmake/src/config/v1.rs index 687b0eea..3c69bbea 100644 --- a/build2cmake/src/config/v1.rs +++ b/build2cmake/src/config/v1.rs @@ -86,6 +86,7 @@ impl TryFrom for super::Build { Backend::Cpu, Backend::Cuda, Backend::Metal, + Backend::Neuron, Backend::Rocm, Backend::Xpu, ] @@ -102,6 +103,7 @@ impl TryFrom for super::Build { license: None, backends, hub: None, + neuron: None, python_depends: None, cuda: None, xpu: None, diff --git a/build2cmake/src/config/v2.rs b/build2cmake/src/config/v2.rs index b404aa06..871cb093 100644 --- a/build2cmake/src/config/v2.rs +++ b/build2cmake/src/config/v2.rs @@ -132,6 +132,7 @@ impl TryFrom for 
super::Build { Backend::Cpu, Backend::Cuda, Backend::Metal, + Backend::Neuron, Backend::Rocm, Backend::Xpu, ] @@ -168,6 +169,7 @@ impl General { backends, cuda, hub: general.hub.map(Into::into), + neuron: None, python_depends: None, xpu: None, } diff --git a/build2cmake/src/config/v3.rs b/build2cmake/src/config/v3.rs index 948b4563..6592c29d 100644 --- a/build2cmake/src/config/v3.rs +++ b/build2cmake/src/config/v3.rs @@ -31,6 +31,8 @@ pub struct General { pub hub: Option, + pub neuron: Option, + pub python_depends: Option>, pub xpu: Option, @@ -44,6 +46,12 @@ pub struct CudaGeneral { pub python_depends: Option>, } +#[derive(Debug, Deserialize, Serialize)] +#[serde(deny_unknown_fields, rename_all = "kebab-case")] +pub struct NeuronGeneral { + pub python_depends: Option>, +} + #[derive(Debug, Deserialize, Serialize)] #[serde(deny_unknown_fields, rename_all = "kebab-case")] pub struct XpuGeneral { @@ -121,6 +129,7 @@ pub enum Backend { Cpu, Cuda, Metal, + Neuron, Rocm, Xpu, } @@ -150,6 +159,7 @@ impl From for super::General { backends: general.backends.into_iter().map(Into::into).collect(), cuda: general.cuda.map(Into::into), hub: general.hub.map(Into::into), + neuron: general.neuron.map(Into::into), python_depends: general.python_depends, xpu: general.xpu.map(Into::into), } @@ -166,6 +176,14 @@ impl From for super::CudaGeneral { } } +impl From for super::NeuronGeneral { + fn from(neuron: NeuronGeneral) -> Self { + Self { + python_depends: neuron.python_depends, + } + } +} + impl From for super::XpuGeneral { fn from(xpu: XpuGeneral) -> Self { Self { @@ -201,6 +219,7 @@ impl From for super::Backend { Backend::Cpu => super::Backend::Cpu, Backend::Cuda => super::Backend::Cuda, Backend::Metal => super::Backend::Metal, + Backend::Neuron => super::Backend::Neuron, Backend::Rocm => super::Backend::Rocm, Backend::Xpu => super::Backend::Xpu, } @@ -304,6 +323,7 @@ impl From for General { backends: general.backends.into_iter().map(Into::into).collect(), cuda: 
general.cuda.map(Into::into), hub: general.hub.map(Into::into), + neuron: general.neuron.map(Into::into), python_depends: general.python_depends, xpu: general.xpu.map(Into::into), } @@ -320,6 +340,14 @@ impl From for CudaGeneral { } } +impl From for NeuronGeneral { + fn from(neuron: super::NeuronGeneral) -> Self { + Self { + python_depends: neuron.python_depends, + } + } +} + impl From for XpuGeneral { fn from(xpu: super::XpuGeneral) -> Self { Self { @@ -355,6 +383,7 @@ impl From for Backend { super::Backend::Cpu => Backend::Cpu, super::Backend::Cuda => Backend::Cuda, super::Backend::Metal => Backend::Metal, + super::Backend::Neuron => Backend::Neuron, super::Backend::Rocm => Backend::Rocm, super::Backend::Xpu => Backend::Xpu, } diff --git a/build2cmake/src/python_dependencies.json b/build2cmake/src/python_dependencies.json index 25b5d9e3..a944b3aa 100644 --- a/build2cmake/src/python_dependencies.json +++ b/build2cmake/src/python_dependencies.json @@ -14,6 +14,12 @@ } }, "metal": {}, + "neuron": { + "nki": { + "nix": [], + "python": ["nki"] + } + }, "rocm": {}, "xpu": { "onednn": { diff --git a/build2cmake/src/templates/noarch/setup.py b/build2cmake/src/templates/noarch/setup.py old mode 100755 new mode 100644 index 3e88a51a..9e1a2227 --- a/build2cmake/src/templates/noarch/setup.py +++ b/build2cmake/src/templates/noarch/setup.py @@ -1,8 +1,9 @@ #!/usr/bin/env python -import shutil -from pathlib import Path from typing import Any +from pathlib import Path +import shutil +import sys from setuptools import setup from setuptools.command.build import build @@ -30,7 +31,10 @@ def run(self) -> None: """Execute the build command.""" project_root = Path(__file__).parent - import tomllib + if sys.version_info >= (3, 11): + import tomllib + else: + import tomli as tomllib with open(project_root / "build.toml", "rb") as f: build_toml: dict[str, Any] = tomllib.load(f) diff --git a/builder/examples/relu-nki/build.toml b/builder/examples/relu-nki/build.toml new file mode 100644 
index 00000000..70e4ef7e --- /dev/null +++ b/builder/examples/relu-nki/build.toml @@ -0,0 +1,9 @@ +[general] +name = "relu-nki" +version = 1 +backends = [ + "neuron", +] + +[general.neuron] +python-depends = ["nki"] diff --git a/builder/examples/relu-nki/torch-ext/relu_nki/__init__.py b/builder/examples/relu-nki/torch-ext/relu_nki/__init__.py new file mode 100644 index 00000000..0e3e6e96 --- /dev/null +++ b/builder/examples/relu-nki/torch-ext/relu_nki/__init__.py @@ -0,0 +1,27 @@ +import nki +import nki.language as nl +import nki.isa as nisa + +from ._ops import ops + + +@nki.jit(platform_target="trn2") +def relu(x): + # Check the first dimension's size to ensure it does not exceed on-chip + # memory tile size, since this simple kernel does not tile inputs. + assert x.shape[0] <= nl.tile_size.pmax + x_tile = sbuf.view(dtype=x.dtype, shape=x.shape) + nisa.dma_copy(dst=x_tile, src=x) + out_tile = sbuf.view(dtype=x.dtype, shape=x.shape) + nisa.tensor_scalar(dst=out_tile, data=x_tile, operand0=0, op0=nl.maximum) + c_output = hbm.view(dtype=x.dtype, shape=x.shape) + nisa.dma_copy(dst=c_output, src=out_tile) + return c_output + + +from . import layers + +__all__ = [ + "layers", + "relu", +] diff --git a/builder/examples/relu-nki/torch-ext/relu_nki/layers/__init__.py b/builder/examples/relu-nki/torch-ext/relu_nki/layers/__init__.py new file mode 100644 index 00000000..7800a2ef --- /dev/null +++ b/builder/examples/relu-nki/torch-ext/relu_nki/layers/__init__.py @@ -0,0 +1,9 @@ +import torch +import torch.nn as nn + +from .. 
import relu + + +class ReLU(nn.Module): + def forward(self, x: torch.Tensor) -> torch.Tensor: + return relu(x) diff --git a/kernels/src/kernels/layer/kernelize.py b/kernels/src/kernels/layer/kernelize.py index 9b318427..4ed37de0 100644 --- a/kernels/src/kernels/layer/kernelize.py +++ b/kernels/src/kernels/layer/kernelize.py @@ -274,7 +274,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: def _validate_device_type(device_type: str) -> None: """Validate that the device type is supported.""" - supported_devices = {"cpu", "cuda", "mps", "npu", "rocm", "xpu"} + supported_devices = {"cpu", "cuda", "mps", "neuron", "npu", "rocm", "xpu"} if device_type not in supported_devices: raise ValueError( f"Unsupported device type '{device_type}'. Supported device types are: {', '.join(sorted(supported_devices))}" @@ -310,3 +310,9 @@ def _is_rocm_platform(): import torch return torch.version.hip is not None + + +def _has_neuron_ops(): + import torch + + return hasattr(torch, "neuron") diff --git a/kernels/src/kernels/layer/repos.py b/kernels/src/kernels/layer/repos.py index 6eee11b3..7b3148c4 100644 --- a/kernels/src/kernels/layer/repos.py +++ b/kernels/src/kernels/layer/repos.py @@ -36,6 +36,8 @@ def create_repo(device: Device) -> "DeviceRepos": return _XPURepos() elif device.type == "npu": return _NPURepos() + elif device.type == "neuron": + return _NeuronRepos() else: raise ValueError(f"Unknown device type: {device.type}") @@ -93,6 +95,26 @@ def insert(self, device: Device, repos: dict[Mode, RepositoryProtocol]): self._repos = repos +class _NeuronRepos(DeviceRepos): + _repos: dict[Mode, RepositoryProtocol] + + def __init__(self): + super().__init__() + self._repos = {} + + @property + def repos( + self, + ) -> dict[Mode, RepositoryProtocol] | None: + return self._repos + + def insert(self, device: Device, repos: dict[Mode, RepositoryProtocol]): + if device.type != "neuron": + raise ValueError(f"Device type must be 'neuron', got {device.type}") + + self._repos = repos + + 
class _NPURepos(DeviceRepos): _repos: dict[Mode, RepositoryProtocol] diff --git a/kernels/src/kernels/python_depends.json b/kernels/src/kernels/python_depends.json index 25b5d9e3..a944b3aa 100644 --- a/kernels/src/kernels/python_depends.json +++ b/kernels/src/kernels/python_depends.json @@ -14,6 +14,12 @@ } }, "metal": {}, + "neuron": { + "nki": { + "nix": [], + "python": ["nki"] + } + }, "rocm": {}, "xpu": { "onednn": { diff --git a/kernels/src/kernels/utils.py b/kernels/src/kernels/utils.py index 7ee546f1..2dcdd3e6 100644 --- a/kernels/src/kernels/utils.py +++ b/kernels/src/kernels/utils.py @@ -22,7 +22,7 @@ from kernels.lockfile import KernelLock, VariantLock from kernels.metadata import Metadata -KNOWN_BACKENDS = {"cpu", "cuda", "metal", "rocm", "xpu", "npu"} +KNOWN_BACKENDS = {"cpu", "cuda", "metal", "neuron", "rocm", "xpu", "npu"} def _get_cache_dir() -> str | None: @@ -74,7 +74,11 @@ def _get_privateuse_backend_name() -> str | None: def _backend() -> str: import torch - if torch.version.cuda is not None: + if hasattr(torch, "neuron"): + # Needs to be sorted before specific Torch builds, since Neuron + # extension can be loaded into e.g. CUDA Torch builds. + return "neuron" + elif torch.version.cuda is not None: return "cuda" elif torch.version.hip is not None: return "rocm" @@ -88,7 +92,11 @@ def _backend() -> str: return "cpu" -def _build_variant(backend: str | None) -> str: +def _build_variant(backend: str | None) -> str | None: + """ + Build variant for arch kernels, returns `None` when the backend + does not (yet) support arch kernels.
+ """ backend = _select_backend(backend) import torch @@ -101,6 +109,8 @@ def _build_variant(backend: str | None) -> str: compute_framework = f"rocm{rocm_version.major}{rocm_version.minor}" elif backend == "metal": compute_framework = "metal" + elif backend == "neuron": + return None elif backend == "xpu" and torch.version.xpu is not None: version = torch.version.xpu compute_framework = f"xpu{version[0:4]}{version[5:6]}" @@ -149,6 +159,8 @@ def _build_variant_noarch(backend: str | None) -> str: if backend == "cuda": return "torch-cuda" + elif backend == "neuron": + return "torch-neuron" elif backend == "rocm": return "torch-rocm" elif backend == "metal": @@ -168,11 +180,15 @@ def _build_variant_universal() -> str: def _build_variants(backend: str | None) -> list[str]: """Return compatible build variants in preferred order.""" - return [ - _build_variant(backend), - _build_variant_noarch(backend), - _build_variant_universal(), - ] + arch_variant = _build_variant(backend) + variants = [arch_variant] if arch_variant is not None else [] + variants.extend( + [ + _build_variant_noarch(backend), + _build_variant_universal(), + ] + ) + return variants def _import_from_path(module_name: str, variant_path: Path) -> ModuleType: diff --git a/kernels/tests/conftest.py b/kernels/tests/conftest.py index 4867b9c0..08fe3f28 100644 --- a/kernels/tests/conftest.py +++ b/kernels/tests/conftest.py @@ -10,6 +10,9 @@ and torch.version.cuda is not None and torch.cuda.device_count() > 0 ) + +has_neuron = hasattr(torch, "neuron") and torch.neuron.device_count() > 0 + has_rocm = ( hasattr(torch.version, "hip") and torch.version.hip is not None @@ -46,6 +49,8 @@ def device(): def pytest_runtest_setup(item): if "cuda_only" in item.keywords and not has_cuda: pytest.skip("skipping CUDA-only test on host without CUDA") + if "neuron_only" in item.keywords and not has_neuron: + pytest.skip("skipping Neuron-only test on host without Neuron") if "rocm_only" in item.keywords and not has_rocm: 
pytest.skip("skipping ROCm-only test on host without ROCm") if "darwin_only" in item.keywords and not sys.platform.startswith("darwin"): diff --git a/kernels/tests/test_basic.py b/kernels/tests/test_basic.py index 241c9b2a..5a12ef50 100644 --- a/kernels/tests/test_basic.py +++ b/kernels/tests/test_basic.py @@ -262,6 +262,13 @@ def test_local_overrides(monkeypatch, local_kernel_path): get_kernel("kernels-test/activation") +@pytest.mark.neuron_only +def test_neuron(): + relu = get_kernel("kernels-test/relu-nki", version=1) + x = torch.randn((16, 16), dtype=torch.float16).to(device="neuron") + torch.testing.assert_close(relu.relu(x), x.relu()) + + def silu_and_mul_torch(x: torch.Tensor): d = x.shape[-1] // 2 return F.silu(x[..., :d]) * x[..., d:] diff --git a/kernels/tests/test_init.py b/kernels/tests/test_init.py index 67ec09f0..84e3cddb 100644 --- a/kernels/tests/test_init.py +++ b/kernels/tests/test_init.py @@ -34,7 +34,8 @@ def e2e_init(backends: list[str]) -> None: expected_backend_dirs.remove(Path(f"{expected_normalized_name}_rocm")) expected_backend_dirs.add(Path(f"{expected_normalized_name}_cuda")) - # TODO: npu is not yet supported in the template + # TODO: neuron/npu are not yet supported in the template + expected_backend_dirs.discard(Path(f"{expected_normalized_name}_neuron")) expected_backend_dirs.discard(Path(f"{expected_normalized_name}_npu")) with tempfile.TemporaryDirectory() as tmpdir: diff --git a/kernels/tests/test_layer.py b/kernels/tests/test_layer.py index 05a586b0..0e203ab6 100644 --- a/kernels/tests/test_layer.py +++ b/kernels/tests/test_layer.py @@ -94,6 +94,23 @@ class RMSNormWithKernel(RMSNorm): pass +class ReLU(nn.Module): + def __init__(self): + super().__init__() + # Used to check that we called hub kernel. 
+ self.n_calls = 0 + + def forward(self, input: torch.Tensor) -> torch.Tensor: + self.n_calls += 1 + d = input.shape[-1] // 2 + return F.relu(input) + + +@use_kernel_forward_from_hub("ReLU") +class ReLUWithKernel(ReLU): + pass + + class SiluAndMul(nn.Module): def __init__(self): super().__init__() @@ -198,6 +215,55 @@ def test_hub_func(cls): assert silu_and_mul_with_kernel.n_calls == 0 +@pytest.mark.neuron_only +def test_hub_forward_neuron(): + torch.manual_seed(0) + + mapping = { + "ReLU": { + "neuron": LayerRepository( + repo_id="kernels-test/relu-nki", version=1, layer_name="ReLU" + ) + } + } + + relu = ReLU() + X = torch.randn((16, 16), device="neuron") + Y = relu(X) + + with use_kernel_mapping(mapping): + relu_with_kernel = kernelize( + ReLUWithKernel(), device="neuron", mode=Mode.INFERENCE + ) + Y_kernel = relu_with_kernel(X) + + torch.testing.assert_close(Y_kernel, Y) + + assert relu.n_calls == 1 + assert relu_with_kernel.n_calls == 0 + + # Check that the device type can be determined automatically. + class SMOL(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(16, 16) + self.relu = ReLUWithKernel() + + def forward(self, x): + return self.relu(self.linear(x)) + + smol = SMOL().to("neuron") + + Y = smol(X) + + with use_kernel_mapping(mapping): + smol = kernelize(smol, mode=Mode.INFERENCE) + Y_kernel = smol(X) + + torch.testing.assert_close(Y, Y_kernel) + assert smol.relu.n_calls == 1 + + @pytest.mark.rocm_only def test_hub_forward_rocm(): torch.manual_seed(0)