diff --git a/setup.py b/setup.py index bcccd8208f..69ec9ea233 100644 --- a/setup.py +++ b/setup.py @@ -484,7 +484,7 @@ def setup_pytorch_extension() -> setuptools.Extension: ] # Compiler flags - cxx_flags = ["-O3"] + cxx_flags = ["-O3", "-fvisibility=hidden"] nvcc_flags = [ "-O3", "-gencode", @@ -536,6 +536,73 @@ def setup_pytorch_extension() -> setuptools.Extension: }, ) +def setup_sequential_extension() -> setuptools.Extension: + # Source files + src_dir = root_path / "transformer_engine" / "pytorch" / "sequential" / "nvte" / "cppsrc" + sources = [ + src_dir / "pybind.cpp" + ] + + # Header files + include_dirs = [ + root_path / "transformer_engine" / "common" / "include", + root_path / "transformer_engine", + root_path / "3rdparty" / "cudnn-frontend" / "include", + ] + + # Compiler flags + cxx_flags = ["-O3", "-fvisibility=hidden"] + nvcc_flags = [ + "-O3", + "-gencode", + "arch=compute_70,code=sm_70", + "-U__CUDA_NO_HALF_OPERATORS__", + "-U__CUDA_NO_HALF_CONVERSIONS__", + "-U__CUDA_NO_BFLOAT16_OPERATORS__", + "-U__CUDA_NO_BFLOAT16_CONVERSIONS__", + "-U__CUDA_NO_BFLOAT162_OPERATORS__", + "-U__CUDA_NO_BFLOAT162_CONVERSIONS__", + "--expt-relaxed-constexpr", + "--expt-extended-lambda", + "--use_fast_math", + ] + + # Version-dependent CUDA options + try: + version = cuda_version() + except FileNotFoundError: + print("Could not determine CUDA Toolkit version") + else: + if version >= (11, 2): + nvcc_flags.extend(["--threads", "4"]) + if version >= (11, 0): + nvcc_flags.extend(["-gencode", "arch=compute_80,code=sm_80"]) + if version >= (11, 8): + nvcc_flags.extend(["-gencode", "arch=compute_90,code=sm_90"]) + + # userbuffers support + if with_userbuffers(): + if os.getenv("MPI_HOME"): + mpi_home = Path(os.getenv("MPI_HOME")) + include_dirs.append(mpi_home / "include") + cxx_flags.append("-DNVTE_WITH_USERBUFFERS") + nvcc_flags.append("-DNVTE_WITH_USERBUFFERS") + + # Construct PyTorch CUDA extension + sources = [str(path) for path in sources] + include_dirs = [str(path) for path in include_dirs] + from torch.utils.cpp_extension import CUDAExtension + return CUDAExtension( + name="transformer_engine_cuda", + sources=sources, + include_dirs=include_dirs, + extra_compile_args={ + "cxx": cxx_flags, + "nvcc": nvcc_flags, + }, + package_data={"transformer_engine_cuda": ["py.typed", "*.pyi"]} + ) + def setup_paddle_extension() -> setuptools.Extension: """Setup CUDA extension for Paddle support""" @@ -555,7 +622,7 @@ def setup_paddle_extension() -> setuptools.Extension: ] # Compiler flags - cxx_flags = ["-O3"] + cxx_flags = ["-O3", "-fvisibility=hidden"] nvcc_flags = [ "-O3", "-gencode", @@ -614,6 +681,7 @@ def main(): ext_modules = [setup_common_extension()] if "pytorch" in frameworks(): ext_modules.append(setup_pytorch_extension()) + ext_modules.append(setup_sequential_extension()) if "paddle" in frameworks(): ext_modules.append(setup_paddle_extension()) diff --git a/tests/sequential/compare_pt_te_seq.py b/tests/sequential/compare_pt_te_seq.py new file mode 100644 index 0000000000..6d5de265cd --- /dev/null +++ b/tests/sequential/compare_pt_te_seq.py @@ -0,0 +1,162 @@ +from __future__ import annotations +import torch +import transformer_engine.pytorch.sequential as seq +from torch import nn +import transformer_engine.pytorch as te +from math import sqrt + +import torch +import torch.nn as nn + + +class RMSNorm(nn.Module): + def __init__(self, hidden_dim: int, eps: float = 1e-5): + super().__init__() + self.hidden_dim = hidden_dim + self.eps = eps + self.weight = nn.Parameter(torch.ones(hidden_dim)) + + 
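+    # Reference RMSNorm used as the PyTorch baseline for the comparisons below:
+    #   y = x / (||x||_2 / sqrt(hidden_dim) + eps) * weight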
def forward(self, x: torch.Tensor): + x_norm = x.norm(2, dim=-1, keepdim=True) + rms_x = x_norm / sqrt(self.hidden_dim) + y = x / (rms_x + self.eps) + return y * self.weight + + +torch.set_default_device("cuda") + +SEQ_LEN = 128 +HIDDEN_DIM = 768 + + +def max_abs_diff(a: torch.Tensor, b: torch.Tensor): + v = (a - b).abs().max().item() + if v >= 0.001: + return f"\033[31m{v:12.10f}\033[0m" + else: + return f"\033[32m{v:12.10f}\033[0m" + + +def cpy(dst: torch.Tensor, src: torch.Tensor): + dst.data = torch.as_tensor(src.data.clone().detach(), dtype=dst.dtype).detach() + + +def cmp_modules(te: nn.Module, seq: nn.Module, pt: nn.Module): + x_te = x_src.detach().clone().requires_grad_() + x_seq = x_src.detach().clone().requires_grad_() + x_pt = x_src.detach().clone().requires_grad_() + + y_te = te(x_te) + y_seq = seq(x_seq) + y_pt = pt(x_pt) + + y_te.sum().backward() + y_seq.sum().backward() + y_pt.sum().backward() + + print(f"mad(dx_te, dx_seq): {max_abs_diff(x_te.grad, x_seq.grad)}") + print(f"mad(dx_te, dx_pt): {max_abs_diff(x_te.grad, x_pt.grad)}") + print(f"mad(dx_seq, dx_pt): {max_abs_diff(x_seq.grad,x_pt.grad)}") + + print(f"mad( y_te, y_seq): {max_abs_diff(y_te, y_seq)}") + print(f"mad( y_te, y_pt): {max_abs_diff(y_te, y_pt)}") + print(f"mad( y_seq, y_pt): {max_abs_diff(y_seq,y_pt)}") + + +def cmp_layernorm_mlp(norm: str, act: str): + m_seq = seq.Sequential( + seq.LayerNorm(HIDDEN_DIM) if norm == "LayerNorm" else seq.RMSNorm(HIDDEN_DIM), + seq.Linear(HIDDEN_DIM, 3 * HIDDEN_DIM), + seq.GELU() if act == "gelu" else seq.ReLU(), + seq.Linear(3 * HIDDEN_DIM, HIDDEN_DIM), + ) + m_te = te.LayerNormMLP( + HIDDEN_DIM, 3 * HIDDEN_DIM, activation=act, normalization=norm + ) + m_pt = nn.Sequential( + nn.LayerNorm(HIDDEN_DIM) if norm == "LayerNorm" else RMSNorm(HIDDEN_DIM), + nn.Linear(HIDDEN_DIM, 3 * HIDDEN_DIM), + nn.GELU() if act == "gelu" else nn.ReLU(), + nn.Linear(3 * HIDDEN_DIM, HIDDEN_DIM), + ) + + cpy(m_te.layer_norm_weight, m_seq._modules["0"].weight) + if norm == "LayerNorm": + cpy(m_te.layer_norm_bias, m_seq._modules["0"].bias) + cpy(m_te.fc1_weight, m_seq._modules["1"].weight) + cpy(m_te.fc1_bias, m_seq._modules["1"].bias) + cpy(m_te.fc2_weight, m_seq._modules["3"].weight) + cpy(m_te.fc2_bias, m_seq._modules["3"].bias) + + cpy(m_pt[0].weight, m_seq._modules["0"].weight) + if norm == "LayerNorm": + cpy(m_pt[0].bias, m_seq._modules["0"].bias) + cpy(m_pt[1].weight, m_seq._modules["1"].weight) + cpy(m_pt[1].bias, m_seq._modules["1"].bias) + cpy(m_pt[3].weight, m_seq._modules["3"].weight) + cpy(m_pt[3].bias, m_seq._modules["3"].bias) + + cmp_modules(m_te, m_seq, m_pt) + + +def cmp_layernorm(): + m_seq = seq.LayerNorm(HIDDEN_DIM) + m_te = te.LayerNorm(HIDDEN_DIM) + m_pt = nn.LayerNorm(HIDDEN_DIM) + + cpy(m_te.weight, m_seq.weight) + cpy(m_te.bias, m_seq.bias) + cpy(m_pt.weight, m_seq.weight) + cpy(m_pt.bias, m_seq.bias) + + cmp_modules(m_te, m_seq, m_pt) + + +def cmp_linear(): + m_seq = seq.Linear(HIDDEN_DIM, HIDDEN_DIM) + m_te = te.Linear(HIDDEN_DIM, HIDDEN_DIM) + m_pt = nn.Linear(HIDDEN_DIM, HIDDEN_DIM) + + cpy(m_te.weight, m_seq.weight) + cpy(m_te.bias, m_seq.bias) + cpy(m_pt.weight, m_seq.weight) + cpy(m_pt.bias, m_seq.bias) + + cmp_modules(m_te, m_seq, m_pt) + + +def cmp_linear_no_bias(): + m_seq = seq.Linear(HIDDEN_DIM, HIDDEN_DIM, bias=False) + m_te = te.Linear(HIDDEN_DIM, HIDDEN_DIM, bias=False) + m_pt = nn.Linear(HIDDEN_DIM, HIDDEN_DIM, bias=False) + + cpy(m_te.weight, m_seq.weight) + cpy(m_pt.weight, m_seq.weight) + + cmp_modules(m_te, m_seq, m_pt) + + +print("\n ----- FP32 INPUT & 
WEIGHTS ------") +x_src = torch.rand(SEQ_LEN, HIDDEN_DIM, device="cuda") + +for _ in range(10): + print("\n### Comparing LayerNormMPL (gelu) ###") + cmp_layernorm_mlp("LayerNorm", "gelu") + + print("\n### Comparing LayerNormMPL (relu) ###") + cmp_layernorm_mlp("LayerNorm", "relu") + + print("\n### Comparing RMSNormMPL (gelu) ###") + cmp_layernorm_mlp("RMSNorm", "gelu") + + print("\n### Comparing RMSNormMPL (relu) ###") + cmp_layernorm_mlp("RMSNorm", "relu") + + print("\n### Comparing LayerNorm ###") + cmp_layernorm() + + print("\n### Comparing Linear ###") + cmp_linear() + + print("\n### Comparing Linear (no bias) ###") + cmp_linear_no_bias() diff --git a/tests/sequential/perf_test.py b/tests/sequential/perf_test.py new file mode 100644 index 0000000000..96fbd40883 --- /dev/null +++ b/tests/sequential/perf_test.py @@ -0,0 +1,62 @@ +import torch +import transformer_engine.pytorch.sequential as seq +from torch import nn +import transformer_engine.pytorch as te +from math import sqrt + +SEQ_LEN = 4096 +HIDDEN_DIM = 1024 + +seq.Sequential( + seq.RMSNorm(HIDDEN_DIM), +) + + +vasavani_dec = te.Sequential( + te.Residual( + te.Linear(HIDDEN_DIM, 3 * HIDDEN_DIM), + te.DotProductAttention(24), + te.Linear(HIDDEN_DIM, HIDDEN_DIM), + te.LayerNorm(HIDDEN_DIM), + ), + te.Residual( + te.Linear(HIDDEN_DIM, 4 * HIDDEN_DIM), + te.ReLU(), + te.Linear(4 * HIDDEN_DIM, HIDDEN_DIM), + te.LayerNorm(HIDDEN_DIM), + ), +) + +gpt = te.Sequential( + te.Residual( + te.LayerNorm(HIDDEN_DIM), + te.Linear(HIDDEN_DIM, 3 * HIDDEN_DIM), + te.DotProductAttention(24), + te.Linear(HIDDEN_DIM, HIDDEN_DIM), + te.Dropout(0.1), + ), + te.Residual( + te.LayerNorm(HIDDEN_DIM), + te.Linear(HIDDEN_DIM, 4 * HIDDEN_DIM), + te.GELU(), + te.Linear(4 * HIDDEN_DIM, HIDDEN_DIM), + te.Dropout(0.1), + ), +) + +llama = te.Sequential( + te.Residual( + te.RMSNorm(HIDDEN_DIM), + te.Linear(HIDDEN_DIM, 3 * HIDDEN_DIM), + te.DotProductAttention(24), + te.Linear(HIDDEN_DIM, HIDDEN_DIM), + te.Dropout(0.1), + ), + te.Residual( + te.RMSNorm(HIDDEN_DIM), + te.Linear(HIDDEN_DIM, 4 * HIDDEN_DIM), + te.SwiGLU(), + te.Linear(4 * HIDDEN_DIM, HIDDEN_DIM), + te.Dropout(0.1), + ), +) diff --git a/tests/sequential/simple_prec_compare.py b/tests/sequential/simple_prec_compare.py new file mode 100644 index 0000000000..dfae42f58d --- /dev/null +++ b/tests/sequential/simple_prec_compare.py @@ -0,0 +1,37 @@ +import torch +from torch import nn +import transformer_engine.pytorch.sequential as seq + +N = 2048 +HIDDEN_DIM = 1024 +x = torch.rand(N, HIDDEN_DIM, device="cuda", requires_grad=True) + +m = seq.Sequential( + seq.RMSNorm(HIDDEN_DIM), + seq.Linear(HIDDEN_DIM, 4 * HIDDEN_DIM), + seq.SwiGLU(), + seq.Linear(2 * HIDDEN_DIM, HIDDEN_DIM), +) +torch.set_printoptions(precision=4, sci_mode=False) + +m(x) + +with seq.Recipe(lowp=seq.nvte.DType.Float8E4M3): + opt: nn.Module = torch.compile(m, fullgraph=True, dynamic=True) + for _ in range(100): + y: torch.Tensor = opt(x) + y.sum().backward() + print(x.grad) + x.grad = None + +with seq.Recipe(lowp=seq.nvte.DType.BFloat16): + y = m(x) + y.sum().backward() + print(x.grad) + x.grad = None + +with seq.Recipe(lowp=seq.nvte.DType.Float32): + y = m(x) + y.sum().backward() + print(x.grad) + x.grad = None diff --git a/tests/sequential/test_matrix1.py b/tests/sequential/test_matrix1.py new file mode 100644 index 0000000000..f0a13106ba --- /dev/null +++ b/tests/sequential/test_matrix1.py @@ -0,0 +1,249 @@ +from __future__ import annotations +import torch +from torch import nn +import transformer_engine.pytorch.sequential as seq +import 
transformer_engine.pytorch as te + +BATCH_SIZE = 512 +IN_FEATURES = 768 +OUT_FEATURES = 4 * IN_FEATURES + + +def cpy(dst: torch.Tensor, src: torch.Tensor): + dst.data = torch.as_tensor(src.data.clone().detach(), dtype=dst.dtype).detach() + + +def max_abs_diff(ref: torch.Tensor, cand: torch.Tensor): + # ab = abs(cand-ref).max().item() + # rl = abs((cand-ref)/ref).max().item() + # s="" + # if ab < 0.001: + # s += f"a:\033[32m{ab:18.5f}\033[0m," + # elif ab< 0.1: + # s += f"a:\033[33m{ab:18.5f}\033[0m," + # else: + # s += f"a:\033[31m{ab:18.5f}\033[0m," + + # if rl < 0.001: + # s += f"r:\033[32m{rl:18.5f}\033[0m" + # elif rl< 0.1: + # s += f"r:\033[33m{rl:18.5f}\033[0m" + # else: + # s += f"r:\033[31m{rl:18.5f}\033[0m" + # return s + + try: + torch.testing.assert_close(cand, ref, atol=1e-5, rtol=1e-3) + ok = True + except AssertionError as e: + ok = False + print(str(e)) + + if ok: + return "\033[32mOK\033[0m" + else: + return "\033[31mWA\033[0m" + + +def test( + enable_first_linear: bool, + use_te_linear: bool, + use_te_act: bool, + use_relu: bool, + use_gelu: bool, + div_std: bool, + enable_second_linear: bool, + lin1_w: torch.Tensor, + lin1_b: torch.Tensor, + lin2_w: torch.Tensor, + lin2_b: torch.Tensor, + inp: torch.Tensor, +): + if enable_first_linear: + if use_te_linear: + lin1 = te.Linear(IN_FEATURES, OUT_FEATURES) + cpy(lin1.weight, lin1_w) + cpy(lin1.bias, lin1_b) + else: + lin1 = nn.Linear(IN_FEATURES, OUT_FEATURES) + cpy(lin1.weight, lin1_w) + cpy(lin1.bias, lin1_b) + else: + lin1 = lambda x: x + + if enable_second_linear: + if enable_first_linear: + if use_te_linear: + lin2 = te.Linear(OUT_FEATURES, IN_FEATURES) + cpy(lin2.weight, lin2_w) + cpy(lin2.bias, lin2_b) + else: + lin2 = nn.Linear(IN_FEATURES, OUT_FEATURES) + cpy(lin2.weight, lin2_w) + cpy(lin2.bias, lin2_b) + else: + if use_te_linear: + lin2 = te.Linear(IN_FEATURES, OUT_FEATURES) + cpy(lin2.weight, lin1_w) + cpy(lin2.bias, lin1_b) + else: + lin2 = nn.Linear(IN_FEATURES, OUT_FEATURES) + cpy(lin2.weight, lin1_w) + cpy(lin2.bias, lin1_b) + else: + lin2 = lambda x: x + + if use_relu: + if use_te_act: + relu = seq.ReLU() + else: + relu = nn.ReLU() + else: + relu = lambda x: x + + if use_gelu: + if use_te_act: + gelu = seq.GELU() + else: + gelu = nn.GELU(approximate="tanh") + else: + gelu = lambda x: x + + x = inp.detach().clone().requires_grad_() + x1 = x / x.std() if div_std else x + x2 = lin1(x1) + x3 = relu(x2) + x4 = gelu(x3) + x5 = lin2(x4) + x5.sum().backward() + assert x.grad is not None + return x.grad + + +results = {} + +for _ in range(50): + lin1 = nn.Linear(IN_FEATURES, OUT_FEATURES, device="cuda") + lin2 = nn.Linear(OUT_FEATURES, IN_FEATURES, device="cuda") + x = torch.rand(BATCH_SIZE, IN_FEATURES, device="cuda") * 2.0 - 1.0 + + for i in range(128): + ( + enable_first_linear, + use_te_linear, + use_te_act, + use_relu, + use_gelu, + div_std, + enable_second_linear, + ) = (bool(i & (1 << j)) for j in range(7)) + + if use_relu and use_gelu: + continue + ref_use_te_linear = False + ref_use_te_act = False + if ref_use_te_linear == use_te_linear and ref_use_te_act == use_te_act: + continue + if ( + not enable_first_linear + and not enable_second_linear + and not use_relu + and not use_gelu + ): + continue + if ( + not use_relu + and not use_gelu + and (use_te_act or ref_use_te_linear == use_te_linear) + ): + continue + if ( + not enable_first_linear + and not enable_second_linear + and (use_te_linear or ref_use_te_act == use_te_act) + ): + continue + if ( + not enable_first_linear + and not use_relu + and not use_gelu 
+ and enable_second_linear + ): + continue + + ref = test( + enable_first_linear, + ref_use_te_linear, + ref_use_te_act, + use_relu, + use_gelu, + div_std, + enable_second_linear, + lin1.weight, + lin1.bias, + lin2.weight, + lin2.bias, + x, + ) + cand = test( + enable_first_linear, + use_te_linear, + use_te_act, + use_relu, + use_gelu, + div_std, + enable_second_linear, + lin1.weight, + lin1.bias, + lin2.weight, + lin2.bias, + x, + ) + if i not in results: + results[i] = [max_abs_diff(ref, cand)] + else: + results[i].append(max_abs_diff(ref, cand)) + + del lin1, lin2, x + +for i, res in results.items(): + ( + enable_first_linear, + use_te_linear, + use_te_act, + use_relu, + use_gelu, + div_std, + enable_second_linear, + ) = (bool(i & (1 << j)) for j in range(7)) + + s = "" + if div_std: + s += "RMSNorm, " + if enable_first_linear: + if use_te_linear: + s += "te.Linear, " + else: + s += "nn.Linear, " + if use_relu: + if use_te_act: + s += "seq.ReLU, " + else: + s += "nn.ReLU, " + if use_gelu: + if use_te_act: + s += "seq.GELU, " + else: + s += "nn.GELU, " + if enable_second_linear: + if use_te_linear: + s += "te.Linear, " + else: + s += "nn.Linear, " + s = s[:-2] + ": " + s = s.rjust(45) + + print(s, end="") + for r in res: + print(f"{r}, ", end="") + print() diff --git a/tests/sequential/test_matrix2.py b/tests/sequential/test_matrix2.py new file mode 100644 index 0000000000..fa2ca926c1 --- /dev/null +++ b/tests/sequential/test_matrix2.py @@ -0,0 +1,369 @@ +from __future__ import annotations +import torch +from enum import Enum +from torch import nn, autocast +import torch.backends.cuda +import torch.backends.cudnn +import transformer_engine.pytorch.sequential as seq +from transformer_engine.pytorch.sequential.nvte import DType +import transformer_engine.pytorch as te +from math import sqrt + +torch.set_default_device("cuda") + + +class RMSNorm(nn.Module): + def __init__(self, hidden_dim: int, eps: float = 1e-5): + super().__init__() # type: ignore + self.hidden_dim = hidden_dim + self.eps = eps + self.weight = nn.Parameter(torch.ones(hidden_dim)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x_norm: float = x.norm(2, dim=-1, keepdim=True) # type: ignore + rms_x: float = x_norm / sqrt(self.hidden_dim) # type: ignore + y: torch.Tensor = x / (rms_x + self.eps) # type: ignore + return y * self.weight # type: ignore + + +class NormalizationType(Enum): + NONE = 0 + LAYERNORM = 1 + RMSNORM = 2 + + +class ActivationType(Enum): + NONE = 0 + RELU = 1 + GELU = 2 + + +class InputInitMethodType(Enum): + Normal01 = 0 + Uniform01 = 1 + Normal11 = 2 + Uniform11 = 3 + + +def cpy(dst: torch.Tensor, src: torch.Tensor): + dst.data = torch.as_tensor(src.data.clone().detach(), dtype=dst.dtype).detach() + + +def normal_range(x: torch.Tensor, kinda_min: float, kinda_max: float): + mean = (kinda_min + kinda_max) / 2 + range = kinda_max - kinda_min + kinda_radius = range / 2 + # if the std. dev. 
of the result is 1/2 radius, then + # about 95% of values should be within 2 deviations + # let there be some outliers for diversity + std = kinda_radius / 2 + return torch.nn.init.normal_(x, mean, std) + + +def init_input(shape: tuple[int, ...], init_method: InputInitMethodType): + in_min_val = ( + 0.0 + if init_method in [InputInitMethodType.Normal01, InputInitMethodType.Uniform01] + else -1.0 + ) + in_max_val = 1.0 + distribution = ( + torch.nn.init.uniform_ + if init_method in [InputInitMethodType.Uniform01, InputInitMethodType.Uniform11] + else normal_range + ) + + input = torch.empty(shape, device="cuda") + input = distribution(input, in_min_val, in_max_val) + return input + + +def pt_test( + normalization: NormalizationType, + first_linear: bool, + activation: ActivationType, + second_linear: bool, + lin1_weight: torch.Tensor, + lin1_bias: torch.Tensor, + lin2_weight: torch.Tensor, + lin2_bias: torch.Tensor, + x: torch.Tensor, +): + modules = list[nn.Module]() + + if normalization is NormalizationType.LAYERNORM: + modules.append(nn.LayerNorm(IN_FEATURES)) + elif normalization is NormalizationType.RMSNORM: + modules.append(RMSNorm(IN_FEATURES)) + + if first_linear: + lin1 = nn.Linear(IN_FEATURES, OUT_FEATURES) + cpy(lin1.weight, lin1_weight) + cpy(lin1.bias, lin1_bias) + modules.append(lin1) + + if activation is ActivationType.RELU: + modules.append(nn.ReLU()) + elif activation is ActivationType.GELU: + modules.append(nn.GELU(approximate="tanh")) + + if second_linear: + if not first_linear: + lin2 = nn.Linear(IN_FEATURES, OUT_FEATURES) + cpy(lin2.weight, lin1_weight) + cpy(lin2.bias, lin1_bias) + modules.append(lin2) + else: + lin2 = nn.Linear(OUT_FEATURES, IN_FEATURES) + cpy(lin2.weight, lin2_weight) + cpy(lin2.bias, lin2_bias) + modules.append(lin2) + + assert len(modules) >= 1 + + m = nn.Sequential(*modules) + inp = x.detach().clone().requires_grad_() + out = m(inp) + out.sum().backward() + assert inp.grad is not None + return inp.grad + + +def seq_test_unfused( + normalization: NormalizationType, + first_linear: bool, + activation: ActivationType, + second_linear: bool, + lin1_weight: torch.Tensor, + lin1_bias: torch.Tensor, + lin2_weight: torch.Tensor, + lin2_bias: torch.Tensor, + x: torch.Tensor, +): + modules = list[nn.Module]() + + if normalization is NormalizationType.LAYERNORM: + modules.append(seq.LayerNorm(IN_FEATURES)) + elif normalization is NormalizationType.RMSNORM: + modules.append(seq.RMSNorm(IN_FEATURES)) + + if first_linear: + lin1 = seq.Linear(IN_FEATURES, OUT_FEATURES) + cpy(lin1.weight, lin1_weight) + cpy(lin1.bias, lin1_bias) + modules.append(lin1) + + if activation is ActivationType.RELU: + modules.append(seq.ReLU()) + elif activation is ActivationType.GELU: + modules.append(seq.GELU()) + + if second_linear: + if not first_linear: + lin2 = seq.Linear(IN_FEATURES, OUT_FEATURES) + cpy(lin2.weight, lin1_weight) + cpy(lin2.bias, lin1_bias) + modules.append(lin2) + else: + lin2 = seq.Linear(OUT_FEATURES, IN_FEATURES) + cpy(lin2.weight, lin2_weight) + cpy(lin2.bias, lin2_bias) + modules.append(lin2) + + assert len(modules) >= 1 + + m = nn.Sequential(*modules) + inp = x.detach().clone().requires_grad_() + out = m(inp) + out.sum().backward() + assert inp.grad is not None + return inp.grad + + +def seq_test_fused( + normalization: NormalizationType, + first_linear: bool, + activation: ActivationType, + second_linear: bool, + lin1_weight: torch.Tensor, + lin1_bias: torch.Tensor, + lin2_weight: torch.Tensor, + lin2_bias: torch.Tensor, + x: torch.Tensor, +): + modules = 
list[nn.Module]() + + if normalization is NormalizationType.LAYERNORM: + modules.append(seq.LayerNorm(IN_FEATURES)) + elif normalization is NormalizationType.RMSNORM: + modules.append(seq.RMSNorm(IN_FEATURES)) + + if first_linear: + lin1 = seq.Linear(IN_FEATURES, OUT_FEATURES) + cpy(lin1.weight, lin1_weight) + cpy(lin1.bias, lin1_bias) + modules.append(lin1) + + if activation is ActivationType.RELU: + modules.append(seq.ReLU()) + elif activation is ActivationType.GELU: + modules.append(seq.GELU()) + + if second_linear: + if not first_linear: + lin2 = seq.Linear(IN_FEATURES, OUT_FEATURES) + cpy(lin2.weight, lin1_weight) + cpy(lin2.bias, lin1_bias) + modules.append(lin2) + else: + lin2 = seq.Linear(OUT_FEATURES, IN_FEATURES) + cpy(lin2.weight, lin2_weight) + cpy(lin2.bias, lin2_bias) + modules.append(lin2) + + assert len(modules) >= 1 + + m = seq.Sequential(*modules) + inp = x.detach().clone().requires_grad_() + out = m(inp) + out.sum().backward() + assert inp.grad is not None + return inp.grad + + +results = ( + list[bool | None](), + list[bool | None](), + list[bool | None](), + list[bool | None](), +) + + +def test( + normalization: NormalizationType, + first_linear: bool, + activation: ActivationType, + second_linear: bool, + lin1_weight: torch.Tensor, + lin1_bias: torch.Tensor, + lin2_weight: torch.Tensor, + lin2_bias: torch.Tensor, + x: torch.Tensor, +): + args = ( + normalization, + first_linear, + activation, + second_linear, + lin1_weight, + lin1_bias, + lin2_weight, + lin2_bias, + x, + ) + + # Pytorch reference implementation in FP32, no TF32 + torch.backends.cuda.matmul.allow_tf32 = False + torch.backends.cudnn.allow_tf32 = False + pt_fp32 = pt_test(*args) + # Pytorch reference implementation in FP32, with TF32 + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + pt_tf32 = pt_test(*args) + # Pytorch reference implementation with autocast to float16 + with autocast("cuda", torch.float16): + pt_fp16 = pt_test(*args) + # Pytorch reference implementation with autocast to bfloat16 + with autocast("cuda", torch.bfloat16): + pt_bf16 = pt_test(*args) + + with seq.Recipe(lowp=DType.Float32): + sequ_fp32 = seq_test_unfused(*args) + with seq.Recipe(lowp=DType.BFloat16): + sequ_bf16 = seq_test_unfused(*args) + with seq.Recipe(lowp=DType.Float16): + sequ_fp16 = seq_test_unfused(*args) + + with seq.Recipe(lowp=DType.Float32): + seqf_fp32 = seq_test_fused(*args) + with seq.Recipe(lowp=DType.BFloat16): + seqf_bf16 = seq_test_fused(*args) + with seq.Recipe(lowp=DType.Float16): + seqf_fp16 = seq_test_fused(*args) + + for i, ref in enumerate([pt_fp32, pt_tf32, pt_fp16, pt_bf16]): + for cand in [sequ_fp32, sequ_bf16, sequ_fp16, seqf_fp32, seqf_bf16, seqf_fp16]: + try: + torch.testing.assert_close(cand, ref, atol=1e-5, rtol=1e-3) + ok = True + except AssertionError: + ok = False + results[i].append(ok) + results[i].append(None) + + +def print_results(): + print("\033[2J") + for chunk in range(0, len(results[0]), 126): + for i in range(4): + for res in results[i][chunk : chunk + 126]: + if res is None: + print(" ", end="") + elif res: + print(f"\033[42;97mOK\033[0m", end="") + else: + print(f"\033[41;30mWA\033[0m", end="") + print() + print() + print() + + +BATCH_SIZE = 512 +IN_FEATURES = 768 +OUT_FEATURES = 4 * IN_FEATURES +TESTS = 10 + +for input_init_method in InputInitMethodType: + for _ in range(TESTS): + lin1 = nn.Linear( + IN_FEATURES, OUT_FEATURES, device="cuda" + ) # used for initializing weights consistently + lin2 = nn.Linear( + OUT_FEATURES, IN_FEATURES, 
device="cuda" + ) # used for initializing weights consistently + x = init_input((BATCH_SIZE, IN_FEATURES), input_init_method) + + for normalization in NormalizationType: + for first_linear in [True, False]: + for activation in ActivationType: + for second_linear in [True, False]: + # Skip invalid configurations + if ( + normalization is NormalizationType.NONE + and not first_linear + and activation is ActivationType.NONE + and not second_linear + ): + continue # noop model + if ( + not first_linear + and activation is ActivationType.NONE + and second_linear + ): + continue # one linear layer, symmetrical to: first_linear and activation is ActivationType.NONE and not second_linear + + test( + normalization, + first_linear, + activation, + second_linear, + lin1.weight, + lin1.bias, + lin2.weight, + lin2.bias, + x, + ) + + print_results() + + del lin1, lin2, x # force recreation of tensors diff --git a/tests/sequential/transformer.py b/tests/sequential/transformer.py new file mode 100644 index 0000000000..6582385fc1 --- /dev/null +++ b/tests/sequential/transformer.py @@ -0,0 +1,21 @@ +import torch +import transformer_engine.pytorch.sequential as seq + +SEQ_LEN = 128 +HIDDEN_DIM = 768 +FFN_DIM = 4 * HIDDEN_DIM + +seq.Sequential( + seq.Residual( + seq.RMSNorm(HIDDEN_DIM), + seq.Linear(HIDDEN_DIM, 3 * HIDDEN_DIM), + seq.DotProductAttention(), + seq.Linear(3 * HIDDEN_DIM, HIDDEN_DIM), + ), + seq.Residual( + seq.RMSNorm(HIDDEN_DIM), + seq.Linear(HIDDEN_DIM, FFN_DIM), + seq.GELU(), + seq.Linear(FFN_DIM, HIDDEN_DIM), + ), +) diff --git a/transformer_engine/common/gemm/cublaslt_gemm.cu b/transformer_engine/common/gemm/cublaslt_gemm.cu index 7f8b0b723d..6f957b429c 100644 --- a/transformer_engine/common/gemm/cublaslt_gemm.cu +++ b/transformer_engine/common/gemm/cublaslt_gemm.cu @@ -229,11 +229,11 @@ void cublas_gemm(const Tensor *inputA, preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize, sizeof(workspaceSize))); - NVTE_CHECK_CUBLAS(cublasLtMatmulAlgoGetHeuristic(handle, operationDesc, Adesc, Bdesc, Cdesc, - Ddesc, preference, 1, &heuristicResult, - &returnedResults)); - - if (returnedResults == 0) throw std::runtime_error("Unable to find any suitable algorithms"); + const auto status = cublasLtMatmulAlgoGetHeuristic(handle, operationDesc, Adesc, Bdesc, Cdesc, + Ddesc, preference, 1, &heuristicResult, + &returnedResults); + if (status == CUBLAS_STATUS_NOT_SUPPORTED) throw std::runtime_error("Unable to find suitable CUBLAS GEMM algorithm."); + NVTE_CHECK_CUBLAS(status); // D = alpha * (A * B) + beta * C diff --git a/transformer_engine/common/include/transformer_engine/logging.h b/transformer_engine/common/include/transformer_engine/logging.h index 9ac0bbbde2..bec58f9f88 100644 --- a/transformer_engine/common/include/transformer_engine/logging.h +++ b/transformer_engine/common/include/transformer_engine/logging.h @@ -7,68 +7,70 @@ #ifndef TRANSFORMER_ENGINE_LOGGING_H_ #define TRANSFORMER_ENGINE_LOGGING_H_ -#include #include +#include #include #include -#include #include +#include -#define NVTE_ERROR(x) \ - do { \ - throw std::runtime_error(std::string(__FILE__ ":") + std::to_string(__LINE__) + \ - " in function " + __func__ + ": " + x); \ - } while (false) - -#define NVTE_CHECK(x, ...) \ - do { \ - if (!(x)) { \ - NVTE_ERROR(std::string("Assertion failed: " #x ". 
") + std::string(__VA_ARGS__)); \ - } \ - } while (false) - -namespace { - -inline void check_cuda_(cudaError_t status) { - if ( status != cudaSuccess ) { - NVTE_ERROR("CUDA Error: " + std::string(cudaGetErrorString(status))); - } -} - -inline void check_cublas_(cublasStatus_t status) { - if ( status != CUBLAS_STATUS_SUCCESS ) { - NVTE_ERROR("CUBLAS Error: " + std::string(cublasGetStatusString(status))); - } -} - -inline void check_cudnn_(cudnnStatus_t status) { - if ( status != CUDNN_STATUS_SUCCESS ) { - std::string message; - message.reserve(1024); - message += "CUDNN Error: "; - message += cudnnGetErrorString(status); - message += (". " - "For more information, enable cuDNN error logging " - "by setting CUDNN_LOGERR_DBG=1 and " - "CUDNN_LOGDEST_DBG=stderr in the environment."); - NVTE_ERROR(message); - } -} - -inline void check_nvrtc_(nvrtcResult status) { - if ( status != NVRTC_SUCCESS ) { - NVTE_ERROR("NVRTC Error: " + std::string(nvrtcGetErrorString(status))); - } -} +#define NVTE_ERROR(x) \ + do { \ + throw std::runtime_error(std::string(__FILE__ ":") + \ + std::to_string(__LINE__) + " in function " + \ + __func__ + ": " + x); \ + } while (false) -} // namespace +#define NVTE_CHECK(x, ...) \ + do { \ + if (!(x)) { \ + NVTE_ERROR(std::string("Assertion failed: " #x ". ") + \ + std::string(__VA_ARGS__)); \ + } \ + } while (false) -#define NVTE_CHECK_CUDA(ans) { check_cuda_(ans); } +#define NVTE_CHECK_CUDA(status) \ + do { \ + if (status != cudaSuccess) { \ + NVTE_ERROR("CUDA Error: " + std::string(cudaGetErrorString(status))); \ + } \ + } while (false) -#define NVTE_CHECK_CUBLAS(ans) { check_cublas_(ans); } +#define NVTE_CHECK_CUBLAS(status) \ + do { \ + if (status != CUBLAS_STATUS_SUCCESS) { \ + std::string message; \ + message.reserve(1024); \ + message += "CUBLAS Error: "; \ + message += cublasGetStatusString(status); \ + message += (". " \ + "For more information, increase CUBLASLT_LOG_LEVEL, " \ + "by setting CUBLASLT_LOG_LEVEL=N [0-5] " \ + "in the environment."); \ + NVTE_ERROR(message); \ + } \ + } while (false) -#define NVTE_CHECK_CUDNN(ans) { check_cudnn_(ans); } +#define NVTE_CHECK_CUDNN(status) \ + do { \ + if (status != CUDNN_STATUS_SUCCESS) { \ + std::string message; \ + message.reserve(1024); \ + message += "CUDNN Error: "; \ + message += cudnnGetErrorString(status); \ + message += (". " \ + "For more information, enable cuDNN error logging " \ + "by setting CUDNN_LOGERR_DBG=1 and " \ + "CUDNN_LOGDEST_DBG=stderr in the environment."); \ + NVTE_ERROR(message); \ + } \ + } while (false) -#define NVTE_CHECK_NVRTC(ans) { check_nvrtc_(ans); } +#define NVTE_CHECK_NVRTC(status) \ + do { \ + if (status != NVRTC_SUCCESS) { \ + NVTE_ERROR("NVRTC Error: " + std::string(nvrtcGetErrorString(status))); \ + } \ + } while (false) -#endif // TRANSFORMER_ENGINE_LOGGING_H_ +#endif // TRANSFORMER_ENGINE_LOGGING_H_ diff --git a/transformer_engine/common/include/transformer_engine/transpose.h b/transformer_engine/common/include/transformer_engine/transpose.h index b12e3f8096..6eb653a359 100644 --- a/transformer_engine/common/include/transformer_engine/transpose.h +++ b/transformer_engine/common/include/transformer_engine/transpose.h @@ -146,9 +146,6 @@ void nvte_multi_cast_transpose(size_t num_tensors, * - `cast_output` is the result of the cast * - `transposed_output` is the transposed result of the cast. * - * Calling this function with workspace being an empty tensor will not perform the operation, - * but instead set the shape and type of the workspace tensor to the required values. 
- * * \param[in] input Input tensor of shape [N, H]. * \param[in] geglu_input Tensor used as input to the forward of GeGLU operation. * Shape [N, H * 2]. diff --git a/transformer_engine/common/transformer_engine.cpp b/transformer_engine/common/transformer_engine.cpp index 708712ff9a..4aaf3f988c 100644 --- a/transformer_engine/common/transformer_engine.cpp +++ b/transformer_engine/common/transformer_engine.cpp @@ -49,11 +49,11 @@ void CheckOutputTensor(const Tensor &t, const std::string &name, bool allow_empt NVTE_CHECK(t.amax.dtype == DType::kFloat32); NVTE_CHECK(t.amax.shape == std::vector{ 1 }); NVTE_CHECK(t.scale_inv.dptr != nullptr, - "FP8 output " + name + " must have scale."); + "FP8 output " + name + " must have inverse of scale."); NVTE_CHECK(t.scale_inv.dtype == DType::kFloat32); NVTE_CHECK(t.scale_inv.shape == std::vector{ 1 }); NVTE_CHECK(t.scale.dptr != nullptr, - "FP8 output " + name + " must have inverse of scale."); + "FP8 output " + name + " must have scale."); NVTE_CHECK(t.scale.dtype == DType::kFloat32); NVTE_CHECK(t.scale.shape == std::vector{ 1 }); } else { diff --git a/transformer_engine/common/util/cuda_driver.h b/transformer_engine/common/util/cuda_driver.h index 5d07e7a641..e4f9ca90d7 100644 --- a/transformer_engine/common/util/cuda_driver.h +++ b/transformer_engine/common/util/cuda_driver.h @@ -43,30 +43,28 @@ inline CUresult call(const char *symbol, ArgTs... args) { } // namespace transformer_engine -namespace { - -/*! \brief Throw exception if CUDA driver call has failed */ -inline void check_cuda_driver_(CUresult status) { - if (status != CUDA_SUCCESS) { - const char *description; - transformer_engine::cuda_driver::call("cuGetErrorString", &description); - NVTE_ERROR(transformer_engine::concat_strings("CUDA Error: ", description)); - } -} - -/*! \brief Call CUDA driver function and throw exception if it fails */ -template -inline void call_and_check_cuda_driver_(const char *symbol, - ArgTs &&... args) { - check_cuda_driver_(transformer_engine::cuda_driver::call(symbol, - std::forward(args)...)); -} - -} // namespace - -#define NVTE_CHECK_CUDA_DRIVER(ans) { check_cuda_driver_(ans); } - -#define NVTE_CALL_CHECK_CUDA_DRIVER(func, ...) \ - { call_and_check_cuda_driver_(#func, __VA_ARGS__); } - -#endif // TRANSFORMER_ENGINE_COMMON_UTIL_CUDA_DRIVER_H_ +#define NVTE_CHECK_CUDA_DRIVER(status) \ + do { \ + if (status != CUDA_SUCCESS) { \ + const char *description; \ + transformer_engine::cuda_driver::call("cuGetErrorString", status, \ + &description); \ + NVTE_ERROR( \ + transformer_engine::concat_strings("CUDA Error: ", description)); \ + } \ + } while (false) + +#define NVTE_CALL_CHECK_CUDA_DRIVER(symbol, ...) 
\ + do { \ + CUresult status = \ + transformer_engine::cuda_driver::call(#symbol, __VA_ARGS__); \ + if (status != CUDA_SUCCESS) { \ + const char *description; \ + transformer_engine::cuda_driver::call("cuGetErrorString", status, \ + &description); \ + NVTE_ERROR( \ + transformer_engine::concat_strings(#symbol": ", description)); \ + } \ + } while (false) + +#endif // TRANSFORMER_ENGINE_COMMON_UTIL_CUDA_DRIVER_H_ diff --git a/transformer_engine/pytorch/sequential/ARCHITECTURE.md b/transformer_engine/pytorch/sequential/ARCHITECTURE.md new file mode 100644 index 0000000000..5af5fe9bad --- /dev/null +++ b/transformer_engine/pytorch/sequential/ARCHITECTURE.md @@ -0,0 +1,38 @@ +# Architecure +![Module dependency diagram](import_diagram.svg) +_Generated with `pydeps .\transformer_engine\pytorch\sequential\ --only transformer_engine.pytorch.sequential --rmprefix transformer_engine.pytorch.sequential.`_ + +## `ComputePipeline` and `Op`s + +The provided modules are a PyTorch interface to a framework-oblivious implementation present in `ops`. All modules are decomposed into `Op`s. An `Op` models a practically atomic operation. For example, a `Linear` layer is split into either an `MMT` (MatMulTranspose) and `Add` `Op` or into just an `MMT` `Op`. Such an `Op` can be thought of as a combination of an `nn.Module` and an `autograd.Function`, in the sense that it: +1. Stores its trainable parameters (exposed through `require_grad`), like an `nn.Module`. +2. Provides a `forward`, `backward` (and `inference`) method, like an `autograd.Function`. +This is done to reduce the amount of needless boilerplate code. This allows for `Op` implementations to remain short, clean, and simple. + +The `Sequential` module itself is just a wrapper around a `ComputePipeline` object that is actually responsible for executing its constituent `Op`s, as well as managing the interaction between them, such as type inference or model parallelism. + +## Fusions + +Fusions of `Op`s are declared separately from them, making individual `Op`s self-contained and oblivious to the existence of other `Op`s. + +## Commands + +The implementations of the `forward`, `backward`, and `inference` passes for `Op`s and fusions use types and functions defined in `nvte`. This makes them oblivious to the framework, as instead of using `torch.Tensor`s, they use `nvte.Tensor`s, which, contrary to `torch.Tensor`s support FP8 `dtype`s. + +## Dependencies + +Currently, the code is structured in such a way, to maintain separation of concerns and the principle of least knowledge. 
While writing new code, maintain the current dependency graph: + +* `nvte` depends on `cpp_extensions` +* `cpp_extensions` depends on `cppsrc` +* `ops` depends on `nvte` +* `fusions` depends on `nvte` +* `fusions` depends on `ops` +* `compute_pipeline` depends on `ops` +* `compute_pipeline` depends on `fusions` +* `module` depends on `compute_pipeline` + +For example: +* `torch` **must not** be imported anywhere inside of the `compute_pipeline` folder +* `cpp_extensions` **must not** be imported anywhere, except for inside `nvte` +* `fusions` **must not** be imported anywhere, except for `compute_pipeline.py` diff --git a/transformer_engine/pytorch/sequential/README.md b/transformer_engine/pytorch/sequential/README.md new file mode 100644 index 0000000000..723a89625a --- /dev/null +++ b/transformer_engine/pytorch/sequential/README.md @@ -0,0 +1,88 @@ +# `te.Sequential` +While it originally started as just an implementation of an `nn.Sequential`-like module, `te.Sequential` is essentially becoming a reimplementation of the current PyTorch-side Transformer Engine API. The main goals of this refactoring are: +- **Increased expressivity**. Instead of using configuration flags, you can declare different Transformer architectures, by declaring their structure directly, within a `te.Sequential` module: + - _Old API:_ + ```python + gpt = te.TransformerLayer( + HIDDEN_SIZE, + 4 * HIDDEN_SIZE, + NUM_HEADS, + apply_residual_connection_post_layernorm=False, + output_layernorm=False, + layer_type="encoder" + ) + ``` + - _**New API:**_ + ```python + gpt = te.Sequential( + te.Residual( + te.LayerNorm(HIDDEN_SIZE), + te.Linear(HIDDEN_SIZE, 3 * HIDDEN_SIZE), + te.MultiHeadedSelfAttention( + HIDDEN_SIZE, + NUM_HEADS, + te.DotProductAttention + ), + te.Linear(3 * HIDDEN_SIZE, HIDDEN_SIZE), + ), + te.Residual( + te.LayerNorm(HIDDEN_SIZE), + te.Linear(HIDDEN_SIZE, 4 * HIDDEN_SIZE), + te.GELU(), + te.Linear(4 * HIDDEN_SIZE, HIDDEN_SIZE), + ) + ) + ``` +- **Added flexibility**. Instead of using preavailable fused modules, you can use a `te.Sequential` that will perform inter-module fusions automatically: + - _Old API:_ + ```python + mlp = te.LayerNormMLP( + HIDDEN_SIZE, + 4 * HIDDEN_SIZE, + activation="swiglu", + normalization="RMSNorm", + ) + ``` + - _**New API:**_ + ```python + mpl = te.Sequential( + te.RMSNorm(HIDDEN_SIZE), + te.Linear(HIDDEN_SIZE, 4 * HIDDEN_SIZE), + te.SwiGLU(), + te.Linear(4 * HIDDEN_SIZE, HIDDEN_SIZE), + ) + ``` +- **Improved performance**. Now, using `torch.compile(te.Sequential(...), fullgraph=True)`, you can fuse your model to a single FX graph for accelerated execution by PyTorch. **##NOT WORKING YET due to various issues in Torch Dynamo; see `compute_pipeline_function.py`##** + +## Modules +`Sequential` is meant to be used with Transformer-like models that operate on tokens. As such, provided are modules typically most used when implement such architectures: +- `te.Linear` - a PyTorch-like linear layer supporting FP8 operations for accelerated performance on Hopper and Ada architectures. +- `te.LayerNorm` - a PyTorch-like LayerNorm with custom FP8 kernels manually fine-tuned for best performance on Hopper and Ada architectures. +- `te.RMSNorm` - an alternative normalization layer [[Zhang and Sennrich, 2019]](https://arxiv.org/abs/1910.07467) beating LayerNorm in computational and training performance, with custom FP8 kernels manually fine-tuned for best performance on Hopper and Ada architectures. 
+- `te.***LU` - a collection of activation functions well suited to Transformer-based architectures, with custom kernels supporting FP8 tensors for reduced memory bandwidth consumption. Supported activation functions include `te.ReLU` (Transformer, GPT-1, T5), `te.GELU` (GPT-2, GPT-3, BERT), `te.SwiGLU` (PaLM, LLaMA), `te.GeGLU` (LaMDA), and `te.ReGLU`.
+- `te.GroupedQueryAttention` - a generalized form of the attention mechanism, of which `te.MultiQuerySelfAttention` and `te.MultiHeadedSelfAttention` are special cases. These attention layers support different attention implementations, including `te.DotProductAttention`, `te.BlockSparseAttention`, `te.HungryHungryHippoes`... **##NOT YET IMPLEMENTED##**
+- `te.Residual` - models a residual connection within a model. Its function is analogous to `te.Sequential`, except that it adds the incoming activation to its final output. **##NOT YET IMPLEMENTED##**
+
+## Input format
+During training of a Transformer model, the input is usually composed of multiple sequences, forming a batch. The `te.Sequential` module accepts such a batch in one of a few formats.
+
+Usually, batches are processed as rank-3 tensors of the form `(batch_size, seq_len, hidden_dim)`.
+The problem with this is that it requires padding all sequences to the same length. To avoid this, the input to the `te.Sequential` module is composed of two tensors: _`tokens`_`(total_tokens, hidden_dim)` + _`seq_lens`_`(batch_size)`, where _`tokens`_ is the concatenation of all sequences in the batch, and _`seq_lens`_ contains the length of each sequence in the batch. Specifying _`seq_lens`_ is necessary for self-attention.
+
+Given any `m: te.Sequential`, it can be invoked in one of three ways:
+1. `m(x, seq_lens)` where `x` and `seq_lens` are respectively a 2D and a 1D tensor, as defined above.
+2. `m(x)` where `x` is a 2D tensor - this is equivalent to `m(x, torch.Tensor([x.shape[0]]))`, i.e. `x` is treated as a single sequence.
+3. `m(x)` where `x` is a 3D tensor - this is equivalent to `m(x.view(-1, x.shape[-1]), torch.Tensor([x.shape[1]] * x.shape[0]))`, i.e. `x` is flattened from a 3D tensor to a 2D tensor, and each of its slices along the batch dimension is treated as a single sequence.
+
+## Notes
+* The GELU activation function is implemented as an approximation. For numerically equivalent results in PyTorch, use `nn.GELU(approximate="tanh")`.
+* Due to limitations of TorchDynamo, some standard modules cannot be used. Compatible replacements are provided in `utils.py`, including `contextmanager` (replacing `contextlib.contextmanager`) and `cache` (replacing `functools.cache`).
+* For optimized execution (assertions and self-consistency checks removed, decreased memory usage), invoke `python` with the `-O` flag.
+* The first iteration cannot be run inside of `torch.compile`. For example, first run `m(x)`, and only then `opt = torch.compile(m, fullgraph=True); opt(x)`.
+
+## Idea
+The main idea behind `te.Sequential` is that it doesn't have to execute eagerly, contrary to how PyTorch usually works. This is possible because its constituent modules are usually provided at initialization and do not change afterwards, which allows for optimizations such as fusions.
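+
+As a quick illustration, here is a minimal usage sketch (mirroring `tests/sequential/simple_prec_compare.py` from this PR; the `Recipe` and `DType` names are taken from that test): the same module graph runs unchanged under different low-precision recipes, because inter-module dtypes and fusions are decided by the pipeline rather than by user code.
+
+```python
+import torch
+import transformer_engine.pytorch.sequential as seq
+
+HIDDEN_DIM = 1024
+x = torch.rand(2048, HIDDEN_DIM, device="cuda", requires_grad=True)
+
+# Declared once; fusion and precision decisions happen inside the pipeline.
+m = seq.Sequential(
+    seq.RMSNorm(HIDDEN_DIM),
+    seq.Linear(HIDDEN_DIM, 4 * HIDDEN_DIM),
+    seq.SwiGLU(),                            # GLU halves the hidden dimension
+    seq.Linear(2 * HIDDEN_DIM, HIDDEN_DIM),
+)
+
+# The same model under two precisions - only the surrounding Recipe changes.
+# (The test above additionally exercises Float8E4M3 under torch.compile.)
+for lowp in (seq.nvte.DType.BFloat16, seq.nvte.DType.Float32):
+    with seq.Recipe(lowp=lowp):
+        y = m(x)
+        y.sum().backward()
+        x.grad = None
+```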
+
+The main limitation of PyTorch that Transformer Engine is dealing with is that PyTorch does not have support for FP8 `dtype`s. Meanwhile, by taking advantage of these optimized formats, performance on the Hopper and Ada architectures can be significantly increased.
+
+`te.Sequential` allows for sidestepping this issue by encapsulating the communication between subsequent modules. A bare `Linear` layer cannot return an FP8 tensor, even if the next operation supports that as an input, as there is no way to express this in PyTorch user code. However, by encapsulating both layers inside the `Sequential`, the communication between them happens in a way oblivious to the user. Only the input and output of the whole `Sequential` need to be representable as PyTorch tensors.
diff --git a/transformer_engine/pytorch/sequential/RECIPES.md b/transformer_engine/pytorch/sequential/RECIPES.md
new file mode 100644
index 0000000000..c4c5a2c213
--- /dev/null
+++ b/transformer_engine/pytorch/sequential/RECIPES.md
@@ -0,0 +1,251 @@
+# Extending `te.Sequential`
+## Recipe: Adding a new `module`
+
+Let's say you're adding `XYZLayer`:
+1. In `modules` create `xyz_layer.py`.
+2. In `modules`/`xyz_layer.py` create `class XYZLayer(BaseModule)`.
+3. In `modules`/`xyz_layer.py` implement `XYZLayer`, analogously to existing modules.
+    1. `XYZLayer.__init__` must follow this schema:
+       ```
+       def __init__(self, ...):
+       ```
+       Initialize the `BaseModule` superclass to be able to assign `nn.Parameter`s to `self`:
+       ```
+       super().__init__()
+       ```
+       Assign `nn.Parameter`s to `self`, save configurable state, and perform other necessary initialization:
+       ```
+       self.weight = nn.Parameter(
+           weight_init_method(
+               torch.empty(out_features, in_features, dtype=param_dtype, device="cuda")
+           )
+       )
+       self.bias = (
+           nn.Parameter(
+               bias_init_method(
+                   torch.empty(out_features, dtype=param_dtype, device="cuda")
+               )
+           )
+           if bias
+           else None
+       )
+       ```
+    2. Implement an `XYZLayer._ops` method returning the `Op`s constituting the implementation of the module. If (at least some of) the operations are to be executed conditionally (like adding bias in a `Linear`), you can return `None`. If (at least some of) the operations are not unary and use trainable parameters, pass them to their initializer (the parameters must be owned by the module object), converted to `nvte.Tensor` objects:
+       ```
+       def _ops(self) -> list[ops.Op | None]:
+           return [
+               ops.MMT(make_nvte_tensor(self.weight)),
+               ops.Add(make_nvte_tensor(self.bias)) if self.bias is not None else None,
+           ]
+       ```
+    3. If your module contains trainable parameters, and (at least some of) these parameters are randomly initialized (like `weight` and `bias` in `Linear`, but not `gamma` or `beta` in `LayerNorm`), allow the user to specify a custom initializer for these parameters, but provide a default one, if possible:
+       ```
+       def __init__(
+           self,
+           weight_init_method: ParameterInitMethod = _default_weight_init_method,
+           ...
+       ):
+           ...
+           self.weight = nn.Parameter(
+               weight_init_method(torch.empty(...))
+           )
+           ...
+       ```
+    4. If your module is stateful, expose all configurable state through `extra_repr`:
+       ```
+       def extra_repr(self):
+           return f"do_xyz={self.do_xyz}"
+       ```
+4. In `modules`/`__init__.py` add `from .xyz_layer import XYZLayer`.
+5. In `modules`/`__init__.py` insert `XYZLayer` into the module's `__all__` list.
+6. In `__init__.py` add `from .modules import XYZLayer`.
+7. In `__init__.py` insert `XYZLayer` into the module's `__all__` list.
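+
+Putting the steps above together, here is a condensed sketch of a hypothetical `XYZLayer` (a minimal linear-like module). The helper names and import paths (`BaseModule`, `ops.MMT`, `ops.Add`, `make_nvte_tensor`) are assumptions meant to mirror the existing modules in this folder:
+```
+import torch
+from torch import nn
+
+from .base_module import BaseModule           # assumed location of BaseModule
+from ..compute_pipeline import ops            # assumed import path for Ops
+from .common import make_nvte_tensor          # assumed helper used by existing modules
+
+
+class XYZLayer(BaseModule):
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        bias: bool = True,
+        weight_init_method=nn.init.xavier_normal_,  # in-place init returns the tensor
+    ):
+        super().__init__()  # required to assign nn.Parameters to self
+        self.weight = nn.Parameter(
+            weight_init_method(torch.empty(out_features, in_features, device="cuda"))
+        )
+        self.bias = (
+            nn.Parameter(torch.zeros(out_features, device="cuda")) if bias else None
+        )
+
+    def _ops(self) -> list[ops.Op | None]:
+        # One Op per atomic operation; conditionally executed Ops are expressed as None.
+        return [
+            ops.MMT(make_nvte_tensor(self.weight)),
+            ops.Add(make_nvte_tensor(self.bias)) if self.bias is not None else None,
+        ]
+
+    def extra_repr(self):
+        return f"in_features={self.weight.shape[1]}, out_features={self.weight.shape[0]}, bias={self.bias is not None}"
+```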
+ +## Recipe: Adding a new `Op` + +Let's say you're adding `XYZLayer`: +1. In `compute_pipeline`/`ops` create `xyz_layer.py`. +2. In `compute_pipeline`/`ops`/`awesomelu.py` create `class XYZLayer(Op)`. +3. In `compute_pipeline`/`ops`/`awesomelu.py` implement `XYZLayer`, analogically to existing operation implementations + 1. In `XYZLayer.__init__`: + 1. Take any secondary inputs to the forward pass as arguments: + ``` + def __init__( + weight: nvte.Tensor, + ``` + 2. Allow for configuring the type of: + * The primary input to the operation in the forward pass `x` (input activation). + * The input to the operation in the backward pass `dy` (partial derivative of the loss over the operation's activation `∂L/∂y`). + * The output of the operation in the forward pass `y` (activation). + * The primary output of the operation in the backward pass `dx` (partial derivative of the loss over the operation's input activation `∂L/∂x`). + * The parametrized inputs to the operation in the forward pass (ex. `weight`, `bias`) + * The secondary outputs of the operation in the backward pass (partial derivative of the loss over the operation's parametrized inputs, ex. `dweight`, `dbias`) + ``` + x_dtype: nvte.DType | None = ..., + weight_dtype: nvte.DType | None = ..., + dy_dtype: nvte.DType | None = ..., + y_dtype: nvte.DType | None = ..., + dx_dtype: nvte.DType | None = ..., + dweight_dtype: nvte.DType | None = ..., + ): + ``` + 3. Note that if `x`, `dy` or (at least some of) the parameters can be processed by the operation's computations, without changing their type, this is to be signalled by using `None`. If the output type(s) are to be automatically deduced (based on other `Op`s), this is also to be signalled by using `None`: **##TYPE INFERENCE NOT YET IMPLEMENTED##** + > ``` + > x_dtype: nvte.DType | None = ..., + > weight_dtype: nvte.DType | None = ..., + > dy_dtype: nvte.DType | None = ..., + > ``` + 4. Provide defaults for these types to allow for constructing the operation object `XYZLayer` without having to explicitly specify the types. Choose such default types that will result in optimal performance in the FP8 computational regime. + + **##TODO: Implement type deduction mechanism and multiple type recipes for training at different precisions##** + 2. In `XYZLayer.require_grad` return the list of all tensor attributes of `AwesomeLU` that require gradients. + 3. In `XYZLayer.forward` provide the implementation of the forward pass of the operation: + 1. The input activation is to be taken as an argument to the `forward` function. _Note: Contrary to Pytorch's `autograd.Function`, any parameters or configuration, can be conveniently accessed using the `self` object._ + ``` + def forward(self, x: nvte.Tensor): + ``` + 2. Remember to cast all `Tensor`-typed inputs to their requested types before performing computations on them, ex.: + ``` + x = nvte.cast_checked(x, self.x_dtype) + weight = nvte.cast_checked(self.weight, self.weight_dtype) + bias = nvte.cast_checked(self.bias, self.bias_dtype) + ``` + 3. Return all auxilary tensors needed for the backward pass in a `Context` (`dict[str, Tensor]`) object. **Do not** store auxilary tensors in the `self` object. **Do not** return non-`Tensor` objects. These **may** be stored in the `self` object, and will remain accessible in the backward pass. **Do not** rely on the context being the same object. The dictionary keys **must** be valid Python identifier names. Example: + ``` + return y, {"x": x, "weight": weight, "mu": mu, "rsigma": rsigma} + ``` + 4. 
If no auxilary tensors are needed for the backward pass, return an empty context. + 4. In `XYZLayer.inference` provide the implementation of the forward pass of the operation, optimized for inference-time use. For optimized performance, you **may** use inplace operations. **##NOT YET IMPLEMENTED: inplace operations##** + 5. In `XYZLayer.backward` provide the implementation of the backward pass of the operation: + 1. Retrieve the tensors stored in the forward pass inside the context, by using their keys. **Do not** attempt to access other keys of the dictionary. **Do not** use `Tensor`s stored in the `self` object for computations. Note: You **may** access the attributes to, for example, access the `dtype` of a tensor, but you **must not** access the tensor's `data` or other numerical data. Example: + ``` + def backward(self, ctx: Context, dy: nvte.Tensor): + x, weight, mu, rsigma = ctx["x"], ctx["weight"], ctx["mu"], ctx["rsigma"] + ``` + 2. Remember to cast `dy` to its request type, before performing computations on it: + ``` + dy = nvte.cast_checked(dy, self.dy_dtype) + ``` + 3. Return `dy` and a list of the gradients of all tensors returned by `XYZLayer.require_grad` in **the same order** (if `require_grad` returns `[weight, bias]`, `backward` **must** return `dy, [dweight, dbias]`). + 4. If `XYZLayer.require_grad` returns `[]`, return `dy, []`. + 6. Remember to use fused implementations, when possible. For example, in some cases, using a sequence of `nvte.cast_checked` calls may be suboptimal, when, for example, `nvte.multi_cast_transpose` could be used instead, if the tensors are to be later transposed. +4. In `compute_pipeline`/`ops`/`__init__.py` add `from xyz_layer import XYZLayer`. +5. In `compute_pipeline`/`ops`/`__init__.py` insert `XYZLayer` to the module's `__all__` list. +6. Remember to implement fusions concerning `XYZLayer`. + +## Recipe: Adding a new `nvte.` function + +Let's say you're adding support for `nvte_xyz`. +1. If `nvte_xyz` is not present in `nvte`/`_nvte.pyi`: + * If all parameters of `nvte_awesomelu` have one of these types... + * `NVTEDType` + * `NVTE_Fused_Attn_Backed` + * `NVTE_QKV_Layout` + * `NVTE_BiasType` + * `NVTE_Mask_Type` + * `NVTETensorPack` + * `NVTETensor` + * [the types automatically converted by Pybind11](https://pybind11.readthedocs.io/en/stable/advanced/cast/overview.html#conversion-table) + * ...then: + * In `cpp_extensions`/`pybind.cpp` register `nvte_xyz`: + ``` + m.def("nvte_xyz", wrap(nvte_xyz)); + ``` + * ...else if the mapping of C++ arguments to Python arguments is a bijection, and the semantic meaning of the arguments is preserved, and the order of the arguments is preserved, and the mapping of C++ arguments' types to their their Python-side equivalents' types is a bijection, then, assuming an argument to `nvte_awesomelu` has a C type `c_type` that is to be exposed to the Python side as `PyType` that is to be converted by Pybind to `conv_type` then: + 1. If necessary, implement a C++ wrapper `conv_type` type over `c_type` to expose to the Python side as `PyType` and register it in Pybind using `py::class_(m, "PyType", py::module_local())` or similar. + 2. Specialize the `wrapped_arg` template: + ``` + template <> struct wrapped_arg : trait {}; + ``` + 3. Register `nvte_xyz`: + ``` + m.def("nvte_xyz", wrap(nvte_xyz)); + ``` + * ...else: + * Manually implement a C++ wrapper over `nvte_xyz` + * Register the wrapper to pybind using `m.def`. 
+ * In `nvte`/`_nvte.pyi` describe the Python-side interface to `nvte_xyz`, by replacing the C++ types with their Python-side equivalents - either types defined in `nvte`/`_nvte.pyi` or according to [builtin Pybind11 conversions](https://pybind11.readthedocs.io/en/stable/advanced/cast/overview.html#conversion-table), and template specializations of `wrapped_arg`. +2. In `nvte` create `xyz.py` importing `_nvte` using `from . import cpp_extensions as _nvte`. +3. In `nvte`/`xyz.py` implement function `xyz`. + * Note: usually, if `nvtexyz` requires temporary tensors, such as `workspace` or `barrier`, construct them inside of `xyz`, rather than take them as parameters. + * Note: allow the user to specify the type of the output, if `nvte_xyz` supports that. + * Note: the current computational pass (`forward`, `backward`, or `inference`) can be accessed through `execution_state.pass_`. +4. In `nvte`/`__init__.py` add `from xyz import xyz`. +5. In `nvte`/`__init__.py` insert `xyz` to the module's `__all__` list. + +## Recipe: Adding a new fusion + +A fusions is an optimized implementation of a sequence of operations. + +There are three types of fusions: +* fusions of inference passes +* fusions of the forward passes +* fusions of the backward passes + +Specifically, there may be a fusion of forward passes that does not have a backward counterpart, and vice-versa. + +To implement a fusion of the inference passes of operations `A`, `B`, and `C`: +1. In an appropriate existing or new file in `fusions` declare a function: + ``` + @register_fusion_inference + def a_b_c_inf_fused(a: A, b: B, c: C, x: nvte.Tensor): + ``` +2. The fusion must be equivalent to the sequence of inference passes it replaces. + +To implement a fusion of the forward passes of operations `A`, `B`, and `C`: +1. In an appropriate existing or new file in `fusions` declare a function: + ``` + @register_fusion_forward + def a_b_c_fwd_fused(a: A, b: B, c: C, x: nvte.Tensor): + ``` +2. From `a_b_c_fwd_fused`, return: + ``` + y, (a_ctx, b_ctx, c_ctx) + ``` + Where `a_ctx`, `b_ctx`, and `c_ctx` are valid contexts of the corresponding `Op`s. Specifically: + ``` + y, (a_ctx, b_ctx, c_ctx) = a_b_c_fwd_fused(a, b, c, x) + dy = ... # ∂L/∂y + dx2, a_grads = a.backward(a, a_ctx, dy) + dx1, b_grads = b.backward(b, b_ctx, dx2) + dx, c_grads = c.backward(c, c_ctx, dx1) + ``` + **Must** be equivalent to: + ``` + x1, a_ctx = a.forward(x) + x2, b_ctx = b.forward(x1) + y, c_ctx = c.forward(x2) + dy = ... # `∂L/∂y` + dx2, a_grads = a.backward(a, a_ctx, dy) + dx1, b_grads = b.backward(b, b_ctx, dx2) + dx, c_grads = c.backward(c, c_ctx, dy1) + ``` + +To implement a fusion of the backward passes of operations `A`, `B`, and `C`: +1. In an appropriate existing or new file in `fusions` declare a function: + ``` + @register_fusion_backward + def a_b_c_bwd_fused(a: A, b: B, c: C, a_ctx: Context, b_ctx: Context, c_ctx: Context, dy: nvte.Tensor): + ``` + Where `a_ctx`, `b_ctx`, and `c_ctx` are valid contexts of the corresponding `Op`s. +2. From `a_b_c_bwd_fused`, return: + ``` + y, (a_grads, b_grads, c_cgrads) + ``` + Where `a_grads`, `b_grads`, and `c_grads` are valid gradients of the corresponding `Op`s. Specifically: + ``` + x1, a_ctx = a.forward(x) + x2, b_ctx = b.forward(x1) + y, c_ctx = c.forward(x2) + dy = ... # `∂L/∂y` + dx, (a_grads, b_grads, c_grads) = a_b_c_bwd_fused(a, b, c, a_ctx, b_ctx, c_ctx, dy) + ``` + **Must** be equivalent to: + ``` + x1, a_ctx = a.forward(x) + x2, b_ctx = b.forward(x1) + y, c_ctx = c.forward(x2) + dy = ... 
+
+To implement a fusion of the backward passes of operations `A`, `B`, and `C`:
+1. In an appropriate existing or new file in `fusions` declare a function:
+   ```
+   @register_fusion_backward
+   def a_b_c_bwd_fused(a: A, b: B, c: C, a_ctx: Context, b_ctx: Context, c_ctx: Context, dy: nvte.Tensor):
+   ```
+   Where `a_ctx`, `b_ctx`, and `c_ctx` are valid contexts of the corresponding `Op`s.
+2. From `a_b_c_bwd_fused`, return:
+   ```
+   dx, (a_grads, b_grads, c_grads)
+   ```
+   Where `a_grads`, `b_grads`, and `c_grads` are valid gradients of the corresponding `Op`s. Specifically:
+   ```
+   x1, a_ctx = a.forward(x)
+   x2, b_ctx = b.forward(x1)
+   y, c_ctx = c.forward(x2)
+   dy = ...  # ∂L/∂y
+   dx, (a_grads, b_grads, c_grads) = a_b_c_bwd_fused(a, b, c, a_ctx, b_ctx, c_ctx, dy)
+   ```
+   **Must** be equivalent to:
+   ```
+   x1, a_ctx = a.forward(x)
+   x2, b_ctx = b.forward(x1)
+   y, c_ctx = c.forward(x2)
+   dy = ...  # ∂L/∂y
+   dx2, c_grads = c.backward(c_ctx, dy)
+   dx1, b_grads = b.backward(b_ctx, dx2)
+   dx, a_grads = a.backward(a_ctx, dx1)
+   ```
diff --git a/transformer_engine/pytorch/sequential/TODO.md b/transformer_engine/pytorch/sequential/TODO.md
new file mode 100644
index 0000000000..08a2bc36b2
--- /dev/null
+++ b/transformer_engine/pytorch/sequential/TODO.md
@@ -0,0 +1,16 @@
+## Not Yet Implemented
+- Inplace operations:
+  - inplace `nvte.***` for use during inference
+  - using those commands in `training` methods of `Op`s
+- Torch compile fullgraph support - requires action from Meta side
+- Attention
+- Dropout
+- Type inference
+- Model parallelism
+- User buffers
+- Margin used for scaling factor calculation is currently hardcoded to be 1.0
+- Make the sources saved by `exec_saving_source` be garbage collected when there are no references to objects from within the source.
+- Cleanup `compute_pipeline_function.py` and `base.py`. Currently they are both a mess full of hacks around Torch Dynamo issues.
+- Maybe clean up `nvte/_common.py`??? It has a complicated implementation of `nvte.torch_op`. Though maybe that's just how it has to be implemented.
+- Maybe rename some files and move some code??? Files like `_common.py` or `_storage.py` were supposed to be internal to a folder, but static type checkers complain about them being private. They also export some things...
+- ...? Other things supported by the current implementation
diff --git a/transformer_engine/pytorch/sequential/__init__.py b/transformer_engine/pytorch/sequential/__init__.py
new file mode 100644
index 0000000000..e5d7e7d713
--- /dev/null
+++ b/transformer_engine/pytorch/sequential/__init__.py
@@ -0,0 +1,31 @@
+from .module import (
+    Activation,
+    ReLU,
+    GELU,
+    ReGLU,
+    GeGLU,
+    SwiGLU,
+    LayerNorm,
+    RMSNorm,
+    Linear,
+    Sequential,
+    Residual,
+)
+from .recipe import Recipe
+
+__all__ = [
+    # nn.Modules
+    "Activation",
+    "ReLU",
+    "GELU",
+    "ReGLU",
+    "GeGLU",
+    "SwiGLU",
+    "LayerNorm",
+    "RMSNorm",
+    "Linear",
+    "Sequential",
+    "Residual",
+    # Recipe context manager
+    "Recipe",
+]
diff --git a/transformer_engine/pytorch/sequential/compute_pipeline/__init__.py b/transformer_engine/pytorch/sequential/compute_pipeline/__init__.py
new file mode 100644
index 0000000000..3f88897336
--- /dev/null
+++ b/transformer_engine/pytorch/sequential/compute_pipeline/__init__.py
@@ -0,0 +1,10 @@
+from .ops import Op, Context, Grads
+from .compute_pipeline import ComputePipeline, SelfContainedOp
+
+__all__ = [
+    "Op",
+    "Context",
+    "Grads",
+    "ComputePipeline",
+    "SelfContainedOp",
+]
diff --git a/transformer_engine/pytorch/sequential/compute_pipeline/compute_pipeline.py b/transformer_engine/pytorch/sequential/compute_pipeline/compute_pipeline.py
new file mode 100644
index 0000000000..e1748f94ac
--- /dev/null
+++ b/transformer_engine/pytorch/sequential/compute_pipeline/compute_pipeline.py
@@ -0,0 +1,151 @@
+from __future__ import annotations
+from functools import reduce
+import operator
+from ..
import nvte +from .ops import Op, Grads, Context +from .fusions import FusedOp, get_fused_op_list +from ..recipe import Recipe +from ..metatensors import PersistentFP8Meta + + +class SelfContainedOp(Op): + def __init__(self, fwds: list[Op], bwds: list[Op]) -> None: + self.fwds = fwds + self.bwds = bwds + + def inference(self, x: nvte.Tensor) -> nvte.Tensor: + raise AssertionError("Not used for inference") + + def forward(self, x: nvte.Tensor): + full_ctx: Context = {} + for op in self.fwds: + x, ctx = op.forward(x) + if not isinstance(op, FusedOp): + op_name = getattr(op, "name") + ctx = {op_name + name: tensor for name, tensor in ctx.items()} + full_ctx.update(ctx) + return x, full_ctx + + def backward(self, ctx: Context, dy: nvte.Tensor): + ctxs: list[Context] = [] + for op in self.bwds: + if isinstance(op, FusedOp): + ctxs.append(ctx) + else: + op_name = getattr(op, "name") + ctxs.append( + { + name[len(op_name) :]: tensor + for name, tensor in ctx.items() + if name.startswith(op_name) + } + ) + + full_grads: Grads = [] + for op, ctx in list(zip(self.bwds, ctxs))[::-1]: + dy, grads = op.backward(ctx, dy) + full_grads += grads + return dy, full_grads + + def require_grad(self): + list_: list[nvte.Tensor] = [] + for op in self.fwds: + list_.extend(op.require_grad()) + return list_ + + +def force_use_precision(ops: list[Op], allowed: nvte.DType): + PRECISION = { + nvte.DType.Float8E4M3.value: 0, + nvte.DType.Float8E5M2.value: 0, + nvte.DType.BFloat16.value: 1, + nvte.DType.Float16.value: 2, + nvte.DType.Float32.value: 3, + nvte.DType.Int64.value: 4, + } + + for op in ops: + attributes = dir(op) + dtype_attributes = [attr for attr in attributes if attr.endswith("_dtype")] + for dtype_attribute in dtype_attributes: + attr_val = getattr(op, dtype_attribute) + if ( + isinstance(attr_val, nvte.DType) + and PRECISION[attr_val.value] < PRECISION[allowed.value] + ): + setattr(op, dtype_attribute, allowed) + + +def model_parallel_transform(ops: list[Op]): + raise NotImplementedError() # TODO + + +def name_ops(ops: list[Op]): + for i, op in enumerate(ops): + setattr(op, "name", f"{i}({op.__class__.__name__})") + + +def split_into_self_contained(fwds: list[Op], bwds: list[Op]): + functions: list[SelfContainedOp] = [] + while fwds or bwds: + fwd = fwds.pop(0) + unmatched_fwd_ops: set[Op] = { + *reduce(operator.iadd, [fwd.ops if isinstance(fwd, FusedOp) else [fwd]], []) + } + used_forwards = [fwd] + used_backwards: list[Op] = [] + unmatched_bwd_ops: set[Op] = set() + while unmatched_fwd_ops or unmatched_bwd_ops: + while unmatched_fwd_ops: + bwd = bwds.pop(0) + used_backwards.append(bwd) + ops = bwd.ops if isinstance(bwd, FusedOp) else [bwd] + for op in ops: + if op in unmatched_fwd_ops: + unmatched_fwd_ops.remove(op) + else: + unmatched_bwd_ops.add(op) + while unmatched_bwd_ops: + fwd = fwds.pop(0) + used_forwards.append(fwd) + ops = fwd.ops if isinstance(fwd, FusedOp) else [fwd] + for op in ops: + if op in unmatched_bwd_ops: + unmatched_bwd_ops.remove(op) + else: + unmatched_fwd_ops.add(op) + functions.append(SelfContainedOp(used_forwards, used_backwards)) + return functions + + +class ComputePipeline: + def __init__(self, ops: list[Op], env: Recipe): + name_ops(ops) + force_use_precision(ops, env.lowp) + if env.world_size > 1: + model_parallel_transform(ops) + + self._inf = get_fused_op_list(ops, "inference") + + self.functions = split_into_self_contained( + get_fused_op_list(ops, "forward"), get_fused_op_list(ops, "backward") + ) + self.forward = tuple(op for f in self.functions for op in f.fwds) + 
self.backward = tuple(op for f in self.functions for op in f.bwds) + self.meta_fwd = PersistentFP8Meta() + self.meta_bwd = PersistentFP8Meta() + + def run_inference(self, x: nvte.Tensor): + for op in self._inf: + x = op.inference(x) + return x + + def next_iteration(self): + self.meta_fwd.next_iteration() + self.meta_bwd.next_iteration() + + def __repr__(self): + return f"""ComputePipeline( + forward: {self.forward}, + backward: {self.backward}, +)""" diff --git a/transformer_engine/pytorch/sequential/compute_pipeline/fusions/__init__.py b/transformer_engine/pytorch/sequential/compute_pipeline/fusions/__init__.py new file mode 100644 index 0000000000..9bdb2c4edb --- /dev/null +++ b/transformer_engine/pytorch/sequential/compute_pipeline/fusions/__init__.py @@ -0,0 +1,4 @@ +from .interface import FusedOp, get_fused_op_list +from . import mmt # only for side effects + +__all__ = ["FusedOp", "get_fused_op_list"] diff --git a/transformer_engine/pytorch/sequential/compute_pipeline/fusions/_common.py b/transformer_engine/pytorch/sequential/compute_pipeline/fusions/_common.py new file mode 100644 index 0000000000..e38675d65e --- /dev/null +++ b/transformer_engine/pytorch/sequential/compute_pipeline/fusions/_common.py @@ -0,0 +1,41 @@ +from __future__ import annotations + +from typing import Callable +from typing_extensions import TypeVarTuple, Unpack +from ..ops import Context, Grads +from ... import nvte +from ._storage import FUSIONS_FWD, FUSIONS_BWD, FUSIONS_INF +from ...utils import get_arg_types + +_Ops = TypeVarTuple("_Ops") +_OpsAndCtxs = TypeVarTuple("_OpsAndCtxs") + + +def register_fusion_inference(f: Callable[[Unpack[_Ops], nvte.Tensor], nvte.Tensor]): # type: ignore[invalid-typevar-use] + fused_modules = get_arg_types(f)[:-1] + FUSIONS_INF[tuple(fused_modules)] = f + return f + + +def register_fusion_forward( + f: Callable[ + [Unpack[_Ops], nvte.Tensor], # type: ignore[invalid-typevar-use] + tuple[nvte.Tensor, tuple[Context, ...]], + ] +): + fused_modules = get_arg_types(f)[:-1] + FUSIONS_FWD[tuple(fused_modules)] = f + return f + + +def register_fusion_backward( + f: Callable[ + [Unpack[_OpsAndCtxs], nvte.Tensor], # type: ignore[invalid-typevar-use] + tuple[nvte.Tensor, tuple[Grads, ...]], + ] +): + arg_types = get_arg_types(f) + module_count = (len(arg_types) - 1) // 2 + fused_modules = arg_types[:module_count] + FUSIONS_BWD[tuple(fused_modules)] = f + return f diff --git a/transformer_engine/pytorch/sequential/compute_pipeline/fusions/_storage.py b/transformer_engine/pytorch/sequential/compute_pipeline/fusions/_storage.py new file mode 100644 index 0000000000..d6442c78c5 --- /dev/null +++ b/transformer_engine/pytorch/sequential/compute_pipeline/fusions/_storage.py @@ -0,0 +1,5 @@ +from typing import Callable, Any + +FUSIONS_INF: dict[tuple[type, ...], Callable[..., Any]] = {} +FUSIONS_FWD: dict[tuple[type, ...], Callable[..., Any]] = {} +FUSIONS_BWD: dict[tuple[type, ...], Callable[..., Any]] = {} diff --git a/transformer_engine/pytorch/sequential/compute_pipeline/fusions/interface.py b/transformer_engine/pytorch/sequential/compute_pipeline/fusions/interface.py new file mode 100644 index 0000000000..1e51f20382 --- /dev/null +++ b/transformer_engine/pytorch/sequential/compute_pipeline/fusions/interface.py @@ -0,0 +1,98 @@ +from __future__ import annotations + +from functools import partial +from ..ops import Op +from typing import Literal +from ... 
import nvte +from ..ops_types import ( + BackwardFused, + ForwardFused, + Grads, + Context, + Inference, +) +from ._storage import FUSIONS_FWD, FUSIONS_BWD, FUSIONS_INF + + +class FusedOp(Op): + def __init__( + self, + ops: list[Op], + forward: ForwardFused | None = None, + backward: BackwardFused | None = None, + inference: Inference | None = None, + ): + self.forward_ = forward + self.backward_ = backward + self.inference_ = inference + self.ops = ops + + def inference(self, x: nvte.Tensor) -> nvte.Tensor: + assert self.inference_ is not None + return self.inference_(x) + + def forward(self, x: nvte.Tensor): + assert self.forward_ is not None + y, ctxs = self.forward_(x) + full_ctx: Context = {} + for op, ctx in zip(self.ops, ctxs): + op_name = getattr(op, "name") + ctx: Context = {op_name + name: tensor for name, tensor in ctx.items()} + full_ctx.update(ctx) + return y, full_ctx + + def backward(self, ctx: Context, dy: nvte.Tensor): + assert self.backward_ is not None + ctxs: list[Context] = [] + for op in self.ops: + op_name = getattr(op, "name") + ctxs.append( + { + name[len(op_name) :]: tensor + for name, tensor in ctx.items() + if name.startswith(op_name) + } + ) + + dx, grads = self.backward_(*ctxs, dy) + grads_total: Grads = [grad for op_grads in grads for grad in op_grads] + return dx, grads_total + + def require_grad(self): + list_: list[nvte.Tensor] = [] + for op in self.ops: + list_.extend(op.require_grad()) + return list_ + + def __repr__(self): + return f"""FusedOp{self.ops}""" + + +def get_fused_op_list( + ops: list[Op], fuse_by: Literal["forward", "backward", "inference"] +): + ops = ops.copy() + if fuse_by == "forward": + fusion_dict = FUSIONS_FWD + elif fuse_by == "backward": + fusion_dict = FUSIONS_BWD + else: # pass_ == "inference": + fusion_dict = FUSIONS_INF + fusions = [(len(arg_types), arg_types, f) for arg_types, f in fusion_dict.items()] + fusions.sort(key=lambda x: x[0], reverse=True) # largest first + for cnt, arg_types, f in fusions: + startPos = 0 + while startPos < len(ops) - cnt + 1: + if all( + ops[startPos + i].fusion_type[fuse_by] is arg_types[i] + for i in range(cnt) + ): + fused_ops = ops[startPos : startPos + cnt] + func = partial(f, *fused_ops) + fused_op = FusedOp(fused_ops, **{fuse_by: func}) + ops[startPos : startPos + cnt] = [fused_op] + startPos += 1 + return ops + + +__all__ = ["FusedOp", "get_fused_op_list"] diff --git a/transformer_engine/pytorch/sequential/compute_pipeline/fusions/mmt.py b/transformer_engine/pytorch/sequential/compute_pipeline/fusions/mmt.py new file mode 100644 index 0000000000..4367afd437 --- /dev/null +++ b/transformer_engine/pytorch/sequential/compute_pipeline/fusions/mmt.py @@ -0,0 +1,243 @@ +from __future__ import annotations + +from ... import nvte +from ..ops import Context, Grads, MMT, Add, GELU, GeGLU +from ... 
import nvte +from ._common import ( + register_fusion_inference, + register_fusion_backward, + register_fusion_forward, +) + + +# MMT, Add +@register_fusion_inference +def mmt_add_inf_fused(mmt: MMT, add: Add, x: nvte.Tensor): + x = nvte.cast_checked(x, mmt.x_dtype) + weight = nvte.cast_checked(mmt.weight, mmt.weight_dtype) + bias = nvte.cast_checked(add.bias, add.bias_dtype) + + y = nvte.matmul_transpose_add( + x, weight, bias, add.y_dtype or mmt.y_dtype or x.dtype + ) + + return y + + +@register_fusion_forward +def mmt_add_fwd_fused( + mmt: MMT, add: Add, x: nvte.Tensor +) -> tuple[nvte.Tensor, tuple[Context, Context]]: + (x, x_t), (weight, weight_t) = nvte.multi_cast_transpose_checked( + (x, mmt.x_dtype), (mmt.weight, mmt.weight_dtype) + ) + bias = nvte.cast_checked(add.bias, add.bias_dtype) + + y = nvte.matmul_transpose_add( + x, weight, bias, add.y_dtype or mmt.y_dtype or x.dtype + ) + + return y, ({"x_t": x_t, "weight_t": weight_t}, {}) + + +@register_fusion_backward +def mmt_add_bwd_fused( + mmt: MMT, + add: Add, + mmt_ctx: Context, + add_ctx: Context, + dy: nvte.Tensor, +): + del add_ctx + x_t, weight_t = mmt_ctx["x_t"], mmt_ctx["weight_t"] + dy, dy_t, dbias = nvte.cast_transpose_dbias_checked( + dy, mmt.dy_dtype, add.dbias_dtype or add.bias.dtype + ) + + dx = nvte.matmul_transpose(dy, weight_t, mmt.dx_dtype or add.dx_dtype or dy.dtype) + dweight = nvte.matmul_transpose(x_t, dy_t, mmt.dweight_dtype or mmt.weight.dtype) + + return dx, ([dweight], [dbias]) + + +# MMT, Add, GELU +@register_fusion_inference +def mmt_add_gelu_inf_fused(mmt: MMT, add: Add, gelu: GELU, x: nvte.Tensor): + x = nvte.cast_checked(x, mmt.x_dtype) + weight = nvte.cast_checked(mmt.weight, mmt.weight_dtype) + bias = nvte.cast_checked(add.bias, add.bias_dtype) + + _, y = nvte.matmul_transpose_add_gelu( + x, weight, bias, gelu.y_dtype or add.y_dtype or mmt.y_dtype or x.dtype + ) + + return y + + +@register_fusion_forward +def mmt_add_gelu_fwd_fused( + mmt: MMT, add: Add, gelu: GELU, x: nvte.Tensor +) -> tuple[nvte.Tensor, tuple[Context, Context, Context]]: + (x, x_t), (weight, weight_t) = nvte.multi_cast_transpose_checked( + (x, mmt.x_dtype), (mmt.weight, mmt.weight_dtype) + ) + bias = nvte.cast_checked(add.bias, add.bias_dtype) + + pre_gelu, y = nvte.matmul_transpose_add_gelu( + x, weight, bias, gelu.y_dtype or add.y_dtype or mmt.y_dtype or x.dtype + ) + + return y, ({"x_t": x_t, "weight_t": weight_t}, {}, {"x": pre_gelu}) + + +@register_fusion_backward +def mmt_add_gelu_bwd_fused( + mmt: MMT, + add: Add, + gelu: GELU, + mmt_ctx: Context, + add_ctx: Context, + gelu_ctx: Context, + dy: nvte.Tensor, +) -> tuple[nvte.Tensor, tuple[Grads, Grads, Grads]]: + del add_ctx + x_t, weight_t, pre_gelu = mmt_ctx["x_t"], mmt_ctx["weight_t"], gelu_ctx["x"] + dy, dy_t, dbias = nvte.cast_transpose_dbias_dgelu_checked( + dy, pre_gelu, mmt.dy_dtype, add.dbias_dtype or add.bias.dtype + ) + + dx = nvte.matmul_transpose( + dy, weight_t, mmt.dx_dtype or add.dx_dtype or gelu.dx_dtype or dy.dtype + ) + dweight = nvte.matmul_transpose(x_t, dy_t, mmt.dweight_dtype or mmt.weight.dtype) + + return dx, ([dweight], [dbias], []) + + +# MMT, GELU +@register_fusion_inference +def mmt_gelu_inf_fused(mmt: MMT, gelu: GELU, x: nvte.Tensor): + x = nvte.cast_checked(x, mmt.x_dtype) + weight = nvte.cast_checked(mmt.weight, mmt.weight_dtype) + + _, y = nvte.matmul_transpose_gelu(x, weight, gelu.y_dtype or mmt.y_dtype or x.dtype) + + return y + + +@register_fusion_forward +def mmt_gelu_fwd_fused(mmt: MMT, gelu: GELU, x: nvte.Tensor): + (x, x_t), (weight, 
weight_t) = nvte.multi_cast_transpose_checked( + (x, mmt.x_dtype), (mmt.weight, mmt.weight_dtype) + ) + + pre_gelu, y = nvte.matmul_transpose_gelu( + x, weight, gelu.y_dtype or mmt.y_dtype or x.dtype + ) + + return y, ({"x_t": x_t, "weight_t": weight_t}, {"x": pre_gelu}) + + +# MMT, GELU, Add +@register_fusion_inference +def mmt_gelu_add_inf_fused(mmt: MMT, gelu: GELU, add: Add, x: nvte.Tensor): + x = nvte.cast_checked(x, mmt.x_dtype) + weight = nvte.cast_checked(mmt.weight, mmt.weight_dtype) + bias = nvte.cast_checked(add.bias, add.bias_dtype) + + _, y = nvte.matmul_transpose_gelu_add(x, weight, bias) + + return y + + +@register_fusion_forward +def mmt_gelu_add_fwd_fused(mmt: MMT, gelu: GELU, add: Add, x: nvte.Tensor): + (x, x_t), (weight, weight_t) = nvte.multi_cast_transpose_checked( + (x, mmt.x_dtype), (mmt.weight, mmt.weight_dtype) + ) + bias = nvte.cast_checked(add.bias, add.bias_dtype) + + pre_gelu, y = nvte.matmul_transpose_gelu_add(x, weight, bias) + + return y, ({"x_t": x_t, "weight_t": weight_t}, {"x": pre_gelu}) + + +# MMT, Add, Add +@register_fusion_inference +def mmt_add_add_inf_fused(mmt: MMT, add1: Add, add2: Add, x: nvte.Tensor): + x = nvte.cast_checked(x, mmt.x_dtype) + weight = nvte.cast_checked(mmt.weight, mmt.weight_dtype) + bias1 = nvte.cast_checked(add1.bias, add1.bias_dtype) + bias2 = nvte.cast_checked(add2.bias, add2.bias_dtype) + + y = nvte.matmul_transpose_add_add(x, weight, bias1, bias2) + + return y + + +@register_fusion_forward +def mmt_add_add_fwd_fused( + mmt: MMT, add1: Add, add2: Add, x: nvte.Tensor +) -> tuple[nvte.Tensor, tuple[Context, Context, Context]]: + (x, x_t), (weight, weight_t) = nvte.multi_cast_transpose_checked( + (x, mmt.x_dtype), (mmt.weight, mmt.weight_dtype) + ) + bias1 = nvte.cast_checked(add1.bias, add1.bias_dtype) + bias2 = nvte.cast_checked(add2.bias, add2.bias_dtype) + + y = nvte.matmul_transpose_add_add(x, weight, bias1, bias2) + + return y, ({"x_t": x_t, "weight_t": weight_t}, {}, {}) + + +# MMT, Add, GELU, Add +@register_fusion_inference +def mmt_add_gelu_add_inf_fused( + mmt: MMT, add1: Add, gelu: GELU, add2: Add, x: nvte.Tensor +): + x = nvte.cast_checked(x, mmt.x_dtype) + weight = nvte.cast_checked(mmt.weight, mmt.weight_dtype) + bias1 = nvte.cast_checked(add1.bias, add1.bias_dtype) + bias2 = nvte.cast_checked(add2.bias, add2.bias_dtype) + + _, y = nvte.matmul_transpose_add_gelu_add(x, weight, bias1, bias2) + + return y + + +@register_fusion_forward +def mmt_add_gelu_add_fwd_fused( + mmt: MMT, add1: Add, gelu: GELU, add2: Add, x: nvte.Tensor +) -> tuple[nvte.Tensor, tuple[Context, Context, Context, Context]]: + (x, x_t), (weight, weight_t) = nvte.multi_cast_transpose_checked( + (x, mmt.x_dtype), (mmt.weight, mmt.weight_dtype) + ) + bias1 = nvte.cast_checked(add1.bias, add1.bias_dtype) + bias2 = nvte.cast_checked(add2.bias, add2.bias_dtype) + + pre_gelu, y = nvte.matmul_transpose_add_gelu_add(x, weight, bias1, bias2) + + return y, ( + {"x_t": x_t, "weight_t": weight_t}, + {}, + {"x": pre_gelu}, + {}, + ) + + +# MMT, GEGLU +@register_fusion_backward +def mmt_geglu_bwd_fused( + mmt: MMT, geglu: GeGLU, mmt_ctx: Context, geglu_ctx: Context, grad: nvte.Tensor +) -> tuple[nvte.Tensor, tuple[Grads, Grads]]: + x_t, weight_t, pre_geglu = mmt_ctx["x_t"], mmt_ctx["weight_t"], geglu_ctx["x"] + dy, dy_t = nvte.cast_transpose_dgeglu_checked(grad, pre_geglu, mmt.dy_dtype) + + dx = nvte.matmul_transpose(dy, weight_t, mmt.dx_dtype or geglu.dx_dtype or dy.dtype) + dweight = nvte.matmul_transpose(x_t, dy_t, mmt.dweight_dtype or mmt.weight.dtype) + 
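+    # GeGLU is a parameterless Activation (its require_grad() returns []),
+    # so its Grads entry is an empty list.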
+ return dx, ([dweight], []) + + +# fusion function names (ex. mmt_add_bwd_fused) are for debugging only, as they are called from a dictionary like FUSIONS_FWD +__all__ = [] diff --git a/transformer_engine/pytorch/sequential/compute_pipeline/ops/__init__.py b/transformer_engine/pytorch/sequential/compute_pipeline/ops/__init__.py new file mode 100644 index 0000000000..e94fc84096 --- /dev/null +++ b/transformer_engine/pytorch/sequential/compute_pipeline/ops/__init__.py @@ -0,0 +1,25 @@ +from .op import Op, Context, Grads +from .activation import Activation, ReLU, GELU, ReGLU, GeGLU, SwiGLU +from .layernorm import LayerNorm +from .rmsnorm import RMSNorm +from .mmt import MMT +from .add import Add +from .residual import ResidualBegin, ResidualEnd + +__all__ = [ + "Op", + "Context", + "Grads", + "Activation", + "ReLU", + "GELU", + "ReGLU", + "GeGLU", + "SwiGLU", + "LayerNorm", + "RMSNorm", + "MMT", + "Add", + "ResidualBegin", + "ResidualEnd", +] diff --git a/transformer_engine/pytorch/sequential/compute_pipeline/ops/activation.py b/transformer_engine/pytorch/sequential/compute_pipeline/ops/activation.py new file mode 100644 index 0000000000..059448dc74 --- /dev/null +++ b/transformer_engine/pytorch/sequential/compute_pipeline/ops/activation.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +from typing import Callable +from abc import ABC +from ... import nvte +from .op import Grads, Op, Context + + +class Activation(Op, ABC): + def __init__( + self, + *, + x_dtype: nvte.DType | None = nvte.DType.BFloat16, + dy_dtype: nvte.DType | None = nvte.DType.BFloat16, + y_dtype: nvte.DType | None = nvte.DType.Float8E4M3, + dx_dtype: nvte.DType | None = nvte.DType.BFloat16, + ): + self._x_dtype = x_dtype + self._dy_dtype = dy_dtype + self._y_dtype = y_dtype + self._dx_dtype = dx_dtype + + def forward(self, x: nvte.Tensor): + x = nvte.cast_checked(x, self.x_dtype) + + y = type(self)._forward(x, self.y_dtype or self.x_dtype or x.dtype) + + return y, {"x": x} + + def backward(self, ctx: Context, dy: nvte.Tensor) -> tuple[nvte.Tensor, Grads]: + x = ctx["x"] + dy = nvte.cast_checked(dy, self.dy_dtype) + + dx = type(self)._backward(dy, x, self.dx_dtype or dy.dtype) + + return dx, [] + + def require_grad(self) -> list[nvte.Tensor]: + return [] + + _forward: Callable[[nvte.Tensor, nvte.DType], nvte.Tensor] + _backward: Callable[[nvte.Tensor, nvte.Tensor, nvte.DType], nvte.Tensor] + + +class ReLU(Activation): + _forward = nvte.relu + _backward = nvte.drelu + + +class GELU(Activation): + _forward = nvte.gelu + _backward = nvte.dgelu + + +class ReGLU(Activation): + _forward = nvte.reglu + _backward = nvte.dreglu + + +class GeGLU(Activation): + _forward = nvte.geglu + _backward = nvte.dgeglu + + +class SwiGLU(Activation): + _forward = nvte.swiglu + _backward = nvte.dswiglu + + +__all__ = [ + "Activation", + "ReLU", + "GELU", + "ReGLU", + "GeGLU", + "SwiGLU", +] diff --git a/transformer_engine/pytorch/sequential/compute_pipeline/ops/add.py b/transformer_engine/pytorch/sequential/compute_pipeline/ops/add.py new file mode 100644 index 0000000000..3a93939b42 --- /dev/null +++ b/transformer_engine/pytorch/sequential/compute_pipeline/ops/add.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +from ... 
import nvte +from .op import Op, Context + + +class Add(Op): + def __init__( + self, + bias: nvte.Tensor, + *, + x_dtype: nvte.DType | None = None, + bias_dtype: nvte.DType | None = nvte.DType.BFloat16, + dy_dtype: nvte.DType | None = None, + y_dtype: nvte.DType | None = None, + dx_dtype: nvte.DType | None = nvte.DType.BFloat16, + dbias_dtype: nvte.DType | None = nvte.DType.BFloat16, + ): + self.bias = bias + self._x_dtype = x_dtype + self.bias_dtype = bias_dtype + self._dy_dtype = dy_dtype + self._y_dtype = y_dtype + self._dx_dtype = dx_dtype + self.dbias_dtype = dbias_dtype + + def forward(self, x: nvte.Tensor) -> tuple[nvte.Tensor, Context]: + x = nvte.cast_checked(x, self.x_dtype) + bias = nvte.cast_checked(self.bias, self.bias_dtype) + + y = nvte.add(x, bias, self.y_dtype or x.dtype) + + return y, {} + + def backward(self, ctx: Context, dy: nvte.Tensor): + del ctx + dy = nvte.cast_checked(dy, self.dy_dtype) + + dx = nvte.cast_checked(dy, self.dx_dtype) + dbias = nvte.dbias(dy, self.dbias_dtype or self.bias.dtype) + + return dx, [dbias] + + def require_grad(self): + return [self.bias] + + +__all__ = ["Add"] diff --git a/transformer_engine/pytorch/sequential/compute_pipeline/ops/attention.py b/transformer_engine/pytorch/sequential/compute_pipeline/ops/attention.py new file mode 100644 index 0000000000..a44a6bdb8c --- /dev/null +++ b/transformer_engine/pytorch/sequential/compute_pipeline/ops/attention.py @@ -0,0 +1,24 @@ +from __future__ import annotations + +from typing import Callable +from abc import ABC +from ... import nvte +from .op import Grads, Op, Context + + +class DotProductAttention(Op, ABC): + def __init__( + self, + *, + x_dtype: nvte.DType | None = nvte.DType.BFloat16, + dy_dtype: nvte.DType | None = nvte.DType.BFloat16, + y_dtype: nvte.DType | None = nvte.DType.BFloat16, + dx_dtype: nvte.DType | None = nvte.DType.BFloat16, + ): + self._x_dtype = x_dtype + self._dy_dtype = dy_dtype + self._y_dtype = y_dtype + self._dx_dtype = dx_dtype + + def forward(self, qkv_packed: nvte.Tensor): + ... # TODO diff --git a/transformer_engine/pytorch/sequential/compute_pipeline/ops/layernorm.py b/transformer_engine/pytorch/sequential/compute_pipeline/ops/layernorm.py new file mode 100644 index 0000000000..5d4f1aff93 --- /dev/null +++ b/transformer_engine/pytorch/sequential/compute_pipeline/ops/layernorm.py @@ -0,0 +1,75 @@ +from __future__ import annotations + +from ... 
import nvte +from .op import Op, Context + + +class LayerNorm(Op): + def __init__( + self, + eps: float, + zero_centered_gamma: bool, + weight: nvte.Tensor, + bias: nvte.Tensor, + *, + x_dtype: nvte.DType | None = nvte.DType.BFloat16, + weight_dtype: nvte.DType | None = nvte.DType.BFloat16, + bias_dtype: nvte.DType | None = nvte.DType.BFloat16, + dy_dtype: nvte.DType | None = nvte.DType.BFloat16, + y_dtype: nvte.DType | None = nvte.DType.Float8E4M3, + dx_dtype: nvte.DType | None = nvte.DType.BFloat16, + dweight_dtype: nvte.DType | None = nvte.DType.BFloat16, + dbias_dtype: nvte.DType | None = nvte.DType.BFloat16, + ): + self.eps = eps + self.zero_centered_gamma = zero_centered_gamma + self.weight = weight + self.bias = bias + self._x_dtype = x_dtype + self.weight_dtype = weight_dtype + self.bias_dtype = bias_dtype + self._dy_dtype = dy_dtype + self._y_dtype = y_dtype + self._dx_dtype = dx_dtype + self.dweight_dtype = dweight_dtype + self.dbias_dtype = dbias_dtype + + def forward(self, x: nvte.Tensor): + x = nvte.cast_checked(x, self.x_dtype) + weight = nvte.cast_checked(self.weight, self.weight_dtype) + bias = nvte.cast_checked(self.bias, self.bias_dtype) + + y, mu, rsigma = nvte.layernorm( + x, + self.eps, + self.zero_centered_gamma, + weight, + bias, + self.y_dtype or x.dtype, + ) + + return y, {"x": x, "weight": weight, "mu": mu, "rsigma": rsigma} + + def backward(self, ctx: Context, dy: nvte.Tensor): + x, weight, mu, rsigma = ctx["x"], ctx["weight"], ctx["mu"], ctx["rsigma"] + dy = nvte.cast_checked(dy, self.dy_dtype) + + dx, dweight, dbias = nvte.dlayernorm( + dy, + self.zero_centered_gamma, + x, + weight, + mu, + rsigma, + self.dx_dtype or dy.dtype, + self.dweight_dtype or self.weight.dtype, + self.dbias_dtype or self.bias.dtype, + ) + + return dx, [dweight, dbias] + + def require_grad(self): + return [self.weight, self.bias] + + +__all__ = ["LayerNorm"] diff --git a/transformer_engine/pytorch/sequential/compute_pipeline/ops/mmt.py b/transformer_engine/pytorch/sequential/compute_pipeline/ops/mmt.py new file mode 100644 index 0000000000..b326b7e9a0 --- /dev/null +++ b/transformer_engine/pytorch/sequential/compute_pipeline/ops/mmt.py @@ -0,0 +1,59 @@ +from __future__ import annotations + +from ... 
import nvte +from .op import Op, Context + + +class MMT(Op): + def __init__( + self, + weight: nvte.Tensor, + *, + x_dtype: nvte.DType | None = nvte.DType.Float8E4M3, + weight_dtype: nvte.DType | None = nvte.DType.Float8E4M3, + dy_dtype: nvte.DType | None = nvte.DType.Float8E5M2, + y_dtype: nvte.DType | None = nvte.DType.BFloat16, + dx_dtype: nvte.DType | None = nvte.DType.BFloat16, + dweight_dtype: nvte.DType | None = nvte.DType.BFloat16, + ): + self.weight = weight + self._x_dtype = x_dtype + self.weight_dtype = weight_dtype + self._dy_dtype = dy_dtype + self._y_dtype = y_dtype + self._dx_dtype = dx_dtype + self.dweight_dtype = dweight_dtype + + def inference(self, x: nvte.Tensor): + x = nvte.cast_checked(x, self.x_dtype) + weight = nvte.cast_checked(self.weight, self.weight_dtype) + + y = nvte.matmul_transpose(x, weight, self.y_dtype or x.dtype) + + return y + + def forward(self, x: nvte.Tensor): + (x, x_t), (weight, weight_t) = nvte.multi_cast_transpose_checked( + (x, self.x_dtype), (self.weight, self.weight_dtype) + ) + + y = nvte.matmul_transpose(x, weight, self.y_dtype or x.dtype) + + return y, {"x_t": x_t, "weight_t": weight_t} + + def backward(self, ctx: Context, dy: nvte.Tensor): + x_t, weight_t = ctx["x_t"], ctx["weight_t"] + dy, dy_t = nvte.cast_transpose_checked(dy, self.dy_dtype) + + dx = nvte.matmul_transpose(dy, weight_t, self.dx_dtype or dy.dtype) + dweight = nvte.matmul_transpose( + x_t, dy_t, self.dweight_dtype or self.weight.dtype + ) + + return dx, [dweight] + + def require_grad(self): + return [self.weight] + + +__all__ = ["MMT"] diff --git a/transformer_engine/pytorch/sequential/compute_pipeline/ops/op.py b/transformer_engine/pytorch/sequential/compute_pipeline/ops/op.py new file mode 100644 index 0000000000..2cfcc08676 --- /dev/null +++ b/transformer_engine/pytorch/sequential/compute_pipeline/ops/op.py @@ -0,0 +1,70 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from ... import nvte + +Context = dict[str, nvte.Tensor] +Grads = list[nvte.Tensor] + + +class Op(ABC): + @abstractmethod + def __init__( + self, + *, + x_dtype: nvte.DType | None = None, + y_dtype: nvte.DType | None = None, + dy_dtype: nvte.DType | None = None, + dx_dtype: nvte.DType | None = None, + ): + ... + + def inference(self, x: nvte.Tensor, /): + return self.forward(x)[0] + + @abstractmethod + def forward(self, x: nvte.Tensor, /) -> tuple[nvte.Tensor, Context]: + ... + + @abstractmethod + def backward(self, ctx: Context, dy: nvte.Tensor, /) -> tuple[nvte.Tensor, Grads]: + ... + + @abstractmethod + def require_grad(self) -> list[nvte.Tensor]: + ... 
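+    # Contract: backward() must return the parameter gradients in the same
+    # order as the tensors returned by require_grad().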
+ + def __repr__(self): + return self.__class__.__name__ + + @property + def x_dtype(self): + return self._x_dtype + + @property + def y_dtype(self): + return self._y_dtype or self.x_dtype + + @property + def dy_dtype(self): + return self._dy_dtype + + @property + def dx_dtype(self): + return self._dx_dtype or self._dy_dtype + + _x_dtype: nvte.DType | None + _y_dtype: nvte.DType | None + _dy_dtype: nvte.DType | None + _dx_dtype: nvte.DType | None + + @property + def fusion_type(self): + return { + "forward": type(self), + "backward": type(self), + "inference": type(self), + } + + +__all__ = ["Op", "Context", "Grads"] diff --git a/transformer_engine/pytorch/sequential/compute_pipeline/ops/residual.py b/transformer_engine/pytorch/sequential/compute_pipeline/ops/residual.py new file mode 100644 index 0000000000..b43419c60f --- /dev/null +++ b/transformer_engine/pytorch/sequential/compute_pipeline/ops/residual.py @@ -0,0 +1,89 @@ +from __future__ import annotations + +from transformer_engine.pytorch.sequential import nvte + +from . import Op, Grads, Context +from . import Add +from ... import nvte + + +class ResidualBegin(Op): + end: ResidualEnd + residual_backward: nvte.Tensor + + def __init__( + self, + *, + x_dtype: nvte.DType | None = nvte.DType.BFloat16, + dy_dtype: nvte.DType | None = nvte.DType.BFloat16, + y_dtype: nvte.DType | None = nvte.DType.BFloat16, + dx_dtype: nvte.DType | None = nvte.DType.BFloat16, + ): + self._x_dtype = x_dtype + self._dy_dtype = dy_dtype + self._y_dtype = y_dtype + self._dx_dtype = dx_dtype + + def forward(self, x: nvte.Tensor) -> tuple[nvte.Tensor, Context]: + x = nvte.cast_checked(x, self.x_dtype) + self.end.residual_forward = x + y = nvte.cast_checked(x, self.y_dtype) + return y, {} + + def backward(self, ctx: Context, dy: nvte.Tensor) -> tuple[nvte.Tensor, Grads]: + del ctx + dy = nvte.cast_checked(dy, self.dy_dtype) + dx = nvte.add(dy, self.residual_backward, self.dx_dtype or dy.dtype) + del self.residual_backward + return dx, [] + + def require_grad(self) -> list[nvte.Tensor]: + return [] + + +class ResidualEnd(Op): + begin: ResidualBegin + residual_forward: nvte.Tensor + + def __init__( + self, + *, + x_dtype: nvte.DType | None = nvte.DType.BFloat16, + dy_dtype: nvte.DType | None = nvte.DType.BFloat16, + y_dtype: nvte.DType | None = nvte.DType.BFloat16, + dx_dtype: nvte.DType | None = nvte.DType.BFloat16, + ): + self._x_dtype = x_dtype + self._dy_dtype = dy_dtype + self._y_dtype = y_dtype + self._dx_dtype = dx_dtype + + def forward(self, x: nvte.Tensor) -> tuple[nvte.Tensor, Context]: + x = nvte.cast_checked(x, self.x_dtype) + y = nvte.add(x, self.residual_forward, self.y_dtype or x.dtype) + del self.residual_forward + return y, {} + + def backward(self, ctx: Context, dy: nvte.Tensor) -> tuple[nvte.Tensor, Grads]: + del ctx + dy = nvte.cast_checked(dy, self.dy_dtype) + self.begin.residual_backward = dy + dx = nvte.cast_checked(dy, self.dx_dtype) + return dx, [] + + def require_grad(self) -> list[nvte.Tensor]: + return [] + + @property + def bias(self): + return self.residual_forward + + @property + def bias_dtype(self): + return None + + @property + def fusion_type(self): + return super().fusion_type | { + "forward": Add, + } diff --git a/transformer_engine/pytorch/sequential/compute_pipeline/ops/rmsnorm.py b/transformer_engine/pytorch/sequential/compute_pipeline/ops/rmsnorm.py new file mode 100644 index 0000000000..de56741fe7 --- /dev/null +++ b/transformer_engine/pytorch/sequential/compute_pipeline/ops/rmsnorm.py @@ -0,0 +1,65 @@ +from __future__ 
import annotations + +from ... import nvte +from .op import Op, Context + + +class RMSNorm(Op): + def __init__( + self, + eps: float, + zero_centered_gamma: bool, + weight: nvte.Tensor, + *, + x_dtype: nvte.DType | None = nvte.DType.BFloat16, + weight_dtype: nvte.DType | None = nvte.DType.BFloat16, + dy_dtype: nvte.DType | None = nvte.DType.BFloat16, + y_dtype: nvte.DType | None = nvte.DType.Float8E4M3, + dx_dtype: nvte.DType | None = nvte.DType.BFloat16, + dweight_dtype: nvte.DType | None = nvte.DType.BFloat16, + ): + self.eps = eps + self.zero_centered_gamma = zero_centered_gamma + self.weight = weight + self._x_dtype = x_dtype + self.weight_dtype = weight_dtype + self._dy_dtype = dy_dtype + self._y_dtype = y_dtype + self._dx_dtype = dx_dtype + self.dweight_dtype = dweight_dtype + + def forward(self, x: nvte.Tensor): + x = nvte.cast_checked(x, self.x_dtype) + weight = nvte.cast_checked(self.weight, self.weight_dtype) + + y, rsigma = nvte.rmsnorm( + x, + self.eps, + self.zero_centered_gamma, + weight, + self.y_dtype or x.dtype, + ) + + return y, {"x": x, "weight": weight, "rsigma": rsigma} + + def backward(self, ctx: Context, dy: nvte.Tensor): + x, weight, rsigma = ctx["x"], ctx["weight"], ctx["rsigma"] + dy = nvte.cast_checked(dy, self.dy_dtype) + + dx, dweight = nvte.drmsnorm( + dy, + self.zero_centered_gamma, + x, + weight, + rsigma, + self.dx_dtype or dy.dtype, + self.dweight_dtype or self.weight.dtype, + ) + + return dx, [dweight] + + def require_grad(self): + return [self.weight] + + +__all__ = ["RMSNorm"] diff --git a/transformer_engine/pytorch/sequential/compute_pipeline/ops_types.py b/transformer_engine/pytorch/sequential/compute_pipeline/ops_types.py new file mode 100644 index 0000000000..602eef8672 --- /dev/null +++ b/transformer_engine/pytorch/sequential/compute_pipeline/ops_types.py @@ -0,0 +1,23 @@ +from __future__ import annotations +from typing import Callable +from typing_extensions import Unpack +from .. import nvte +from .ops import Context, Grads + +Forward = Callable[[nvte.Tensor], tuple[nvte.Tensor, Context]] +ForwardFused = Callable[[nvte.Tensor], tuple[nvte.Tensor, tuple[Context, ...]]] +Backward = Callable[[Context, nvte.Tensor], tuple[nvte.Tensor, Grads]] +BackwardFused = Callable[ + [Unpack[tuple[Context, ...]], nvte.Tensor], tuple[nvte.Tensor, tuple[Grads, ...]] +] +Inference = Callable[[nvte.Tensor], nvte.Tensor] + +__all__ = [ + "Forward", + "ForwardFused", + "Backward", + "BackwardFused", + "Inference", + "Context", + "Grads", +] diff --git a/transformer_engine/pytorch/sequential/compute_pipeline_function.py b/transformer_engine/pytorch/sequential/compute_pipeline_function.py new file mode 100644 index 0000000000..ae41ded206 --- /dev/null +++ b/transformer_engine/pytorch/sequential/compute_pipeline_function.py @@ -0,0 +1,267 @@ +from __future__ import annotations +import torch +from torch import autograd +from torch.autograd.function import FunctionCtx +from typing import Final +from .persistent import Persistent +from . 
import nvte +from .compute_pipeline import ComputePipeline, Context, Op + +FP8Meta = tuple[torch.Tensor, torch.Tensor, torch.Tensor] + + +class ForwardArgs: + nvte_x: nvte.Tensor + is_exposed_x_squished_now: bool + upcoming_backward: BackwardComm | None + op: Final[Op] + meta_tensor_provider_fwd: Final[Persistent[nvte.DType, FP8Meta]] + meta_tensor_provider_bwd: Final[Persistent[nvte.DType, FP8Meta]] + + def __init__( + self, + nvte_x: nvte.Tensor, + is_exposed_x_squished_now: bool, + upcoming_backward: BackwardComm | None, + op: Op, + meta_tensor_provider_fwd: Persistent[nvte.DType, FP8Meta], + meta_tensor_provider_bwd: Persistent[nvte.DType, FP8Meta], + ): + self.nvte_x = nvte_x + self.is_exposed_x_squished_now = is_exposed_x_squished_now + self.upcoming_backward = upcoming_backward + self.op = op + self.meta_tensor_provider_fwd = meta_tensor_provider_fwd + self.meta_tensor_provider_bwd = meta_tensor_provider_bwd + + +class BackwardComm: + nvte_grad_output: nvte.Tensor | None = None + + +class ComputePipelineFunction(autograd.Function): + @staticmethod + def forward( # type: ignore[arg-type] + ctx: FunctionCtx, + exposed_x: torch.Tensor, + *exposed_args: torch.Tensor | ForwardArgs, + ): + """ + exposed_x is used only to let autograd construct the computation graph + real input and output is in list, as nvte.Tensor is immutable + exposed_tensors are exposed for the optimizer to later apply gradients + """ + exposed_tensors, args = exposed_args[:-1], exposed_args[-1] + del exposed_tensors + assert isinstance(args, ForwardArgs) + + nvte_x = args.nvte_x + + nvte.set_execution_state("forward", args.meta_tensor_provider_fwd) + y, to_save = args.op.forward(nvte_x) + + # Expose backward context for tracing + bwd_ctx = list[torch.Tensor]() + for _, tensor in to_save.items(): + bwd_ctx.append(tensor.data) + if tensor.amax.numel(): + bwd_ctx.append(tensor.amax) + if tensor.scale.numel(): + bwd_ctx.append(tensor.scale) + if tensor.scale_inv.numel(): + bwd_ctx.append(tensor.scale_inv) + ctx.save_for_backward(*bwd_ctx) + + # Save real context + setattr(ctx, "nvte_ctx", to_save) + setattr(ctx, "nvte_op", args.op) + setattr(ctx, "nvte_meta_tensor_provider_bwd", args.meta_tensor_provider_bwd) + + # Actually store the result + args.nvte_x = y + + # Pytorch will break the computation graph + # if it will see an output tensor of an integer type. + # As fp8 tensors internally have dtype int8, + # we need to pretend that this type is actually different + # by "squishing" it into a floating point dtype. + # ("Squishing" because, while the new dtype is larger, + # the numel() gets smaller). + # This doesn't work in TorchScript, but this code + # won't run at inference anyway. + + # Unsquish x if needed: + if args.is_exposed_x_squished_now: + # Intentionally commented out - _unsquish(exposed_x) + # We don't need to perform the unsquish itself, as this + # data will not be read anyway. + # Actually, we cannot do that, as x, + # cannot be modified in place. + # It is only really neccesarry to notify + # the backward. + args.is_exposed_x_squished_now = False + # If the input to the forward was squished, + # Pytorch will expect its gradient to be squished + # as well. The backward of this forward will be + # responsible for producing the gradient of + # this squished input, so it is responsible for + # squishing it. 
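+            # ("Squishing" is a dtype reinterpret-view, e.g. int8 -> float16:
+            # the bytes are unchanged while numel() halves; see SQUISH_TABLE below.)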
+ setattr(ctx, "nvte_squish_outgoing_dgrad", True) + else: + setattr(ctx, "nvte_squish_outgoing_dgrad", False) + + # Expose result for Pytorch + x_data = exposed_x.data + exposed_x.data = torch.Tensor().cuda() # avoid copy + exposed_y = exposed_x.clone() # copy history + exposed_x.data = x_data + exposed_y.data = y.data + + # Squish y if fp8: + if exposed_y.data.dtype == torch.int8: + _squish(exposed_y) + # Because the output is squished, the gradient also needs to be. + # The backward of this forward recieves the gradient of the + # output as its input. So, the backward before it needs + # to squish it, while the backward coresponding to this + # forward needs to unsquish it. + setattr(ctx, "nvte_unsquish_incoming_dgrad", True) + args.is_exposed_x_squished_now = True + else: + setattr(ctx, "nvte_unsquish_incoming_dgrad", False) + args.is_exposed_x_squished_now = False + + # Save backward comm + # This object is allows for the current backward to + # pass data to the next backward (the backward of the + # preceding operation). This is needed to pass + # fp8 gradients properly. + setattr(ctx, "nvte_upcoming_backward_comm", args.upcoming_backward) + args.upcoming_backward = BackwardComm() + setattr(ctx, "nvte_preceding_backward_comm", args.upcoming_backward) + + return exposed_y + + @staticmethod + def backward(ctx: FunctionCtx, grad_output: torch.Tensor): # type: ignore[arg-type] + # The context needs to think that the tensors were read + _ = ctx.saved_tensors # type: ignore + + # Get real context + saved: Context = getattr(ctx, "nvte_ctx") + op: Op = getattr(ctx, "nvte_op") + preceding_backward: BackwardComm = getattr(ctx, "nvte_preceding_backward_comm") + upcoming_backward: BackwardComm | None = getattr( + ctx, "nvte_upcoming_backward_comm" + ) + + # Get real gradient + if preceding_backward.nvte_grad_output is None: + # This is the first backward in the compute pipeline + + grad_output = grad_output.contiguous() # TODO: try to avoid this + + # Check if incoming gradient needs to be unsquished + unsquish_incoming_dgrad: bool = getattr(ctx, "nvte_unsquish_incoming_dgrad") + if unsquish_incoming_dgrad: + _unsquish(grad_output) + nvte_grad = nvte.make_nvte_tensor(grad_output) + else: + nvte_grad = preceding_backward.nvte_grad_output + del grad_output + + meta_tensor_provider: Persistent[nvte.DType, FP8Meta] = getattr( + ctx, "nvte_meta_tensor_provider_bwd" + ) + nvte.set_execution_state("backward", meta_tensor_provider) + data_grad, param_grads = op.backward(saved, nvte_grad) + + # Store real gradient for next backward in pipeline + if upcoming_backward is None: + # This is the last backward in the compute pipeline + assert not nvte.is_fp8(data_grad) + else: + upcoming_backward.nvte_grad_output = data_grad + + # Check that gradients are not fp8 and can be processed by the optimizer + # TODO: change this when fp8 optimizer comes along + assert all(not nvte.is_fp8(g) for g in param_grads) + + # Check if outgoing gradient needs to be squished + exposed_dgrad = data_grad.data + squish_outgoing_dgrad: bool = getattr(ctx, "nvte_squish_outgoing_dgrad") + if squish_outgoing_dgrad: + _squish(exposed_dgrad) + + torch_grads = [exposed_dgrad] + [g.data for g in param_grads] + + return (*torch_grads, None, None, None) + + +def apply(x: torch.Tensor, pipeline: ComputePipeline, training: bool) -> torch.Tensor: + if not training: + y = pipeline.run_inference(nvte.make_nvte_tensor(x)) + assert not nvte.is_fp8(y) + return y.data + else: + pipeline.next_iteration() + nvte_x = nvte.make_nvte_tensor(x) + 
is_exposed_x_squished_now = False + upcoming_backward = None + for contained_op in pipeline.functions: + nvte_tensors = contained_op.require_grad() + exposed_tensors = list[torch.Tensor]() + for nvte_tensor in nvte_tensors: + assert not nvte.is_fp8( + nvte_tensor + ) # TODO: change when fp8 optimizer comes along + exposed_tensors.append(nvte_tensor.data) + args = ForwardArgs( + nvte_x, + is_exposed_x_squished_now, + upcoming_backward, + contained_op, + pipeline.meta_fwd, + pipeline.meta_bwd, + ) + x = ComputePipelineFunction.apply(x, *exposed_tensors, args) # type: ignore + nvte_x, is_exposed_x_squished_now, upcoming_backward = ( + args.nvte_x, + args.is_exposed_x_squished_now, + args.upcoming_backward, + ) + return x + + +# The squish needs to be invertible and +# always reduce the numel() of the tensor by the same +# amount. +# +# If a tensor is to be squished, it must have been +# 1. an fp8 result from forward +# 2. an outgoing gradient +# +# The outgoing gradient could have any type, +# but it is reasonable to assume that if someone is +# using fp8, they are also probably using bfloat16 +# rather than float16. +# +# And they probably won't be using float64. +SQUISH_TABLE = { + torch.int8: torch.float16, + torch.bfloat16: torch.float32, + torch.float32: torch.float64, +} +UNSQUISH_TABLE = {v: k for k, v in SQUISH_TABLE.items()} + + +def _unsquish(t: torch.Tensor): + assert t.data.dtype in UNSQUISH_TABLE + t.data = t.data.view(UNSQUISH_TABLE[t.data.dtype]) + + +def _squish(t: torch.Tensor): + if t.data.dtype in SQUISH_TABLE: + t.data = t.data.view(SQUISH_TABLE[t.data.dtype]) + else: + raise RuntimeError("Invalid dtype of gradient for FP8 tensor.") diff --git a/transformer_engine/pytorch/sequential/exec_saving_source.py b/transformer_engine/pytorch/sequential/exec_saving_source.py new file mode 100644 index 0000000000..1d6d9da16b --- /dev/null +++ b/transformer_engine/pytorch/sequential/exec_saving_source.py @@ -0,0 +1,42 @@ +# Need to be in seperate file as it cannot have +# from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +if __debug__ or TYPE_CHECKING: + + def exec_saving_source( + source: str, + globals: dict[str, Any] | None = None, + locals: dict[str, Any] | None = None, + ): + """Equivalent to exec, but allows for the code to be introspected by, + for example, `pdb` or `inspect`""" + import ast + import linecache + + if not hasattr(exec_saving_source, "sources"): + old_getlines = linecache.getlines + sources: list[str] = [] + + def patched_getlines(filename: str, module_globals: Any = None): + if "")[0]) + return sources[index].splitlines(True) + else: + return old_getlines(filename, module_globals) + + linecache.getlines = patched_getlines + setattr(exec_saving_source, "sources", sources) + sources: list[str] = getattr(exec_saving_source, "sources") + sources.append(source) + exec( + compile( + ast.parse(source), filename=f"", mode="exec" + ), + globals, + locals, + ) + +else: + exec_saving_source = exec diff --git a/transformer_engine/pytorch/sequential/import_diagram.svg b/transformer_engine/pytorch/sequential/import_diagram.svg new file mode 100644 index 0000000000..e3fb549f58 --- /dev/null +++ b/transformer_engine/pytorch/sequential/import_diagram.svg @@ -0,0 +1,1313 @@ + + + + + + +G + + + +transformer_engine_pytorch_sequential + +transformer_engine. +pytorch. 
+sequential + + + +transformer_engine_pytorch_sequential_compute_pipeline + +compute_pipeline + + + +transformer_engine_pytorch_sequential_compute_pipeline_function + +compute_pipeline_function + + + +transformer_engine_pytorch_sequential_compute_pipeline->transformer_engine_pytorch_sequential_compute_pipeline_function + + + + + +transformer_engine_pytorch_sequential_module_Activation + +module. +Activation + + + +transformer_engine_pytorch_sequential_compute_pipeline->transformer_engine_pytorch_sequential_module_Activation + + + + + + +transformer_engine_pytorch_sequential_module_Linear + +module.Linear + + + +transformer_engine_pytorch_sequential_compute_pipeline->transformer_engine_pytorch_sequential_module_Linear + + + + + +transformer_engine_pytorch_sequential_module_activation + +module. +activation + + + +transformer_engine_pytorch_sequential_compute_pipeline->transformer_engine_pytorch_sequential_module_activation + + + + + +transformer_engine_pytorch_sequential_module_base + +module.base + + + +transformer_engine_pytorch_sequential_compute_pipeline->transformer_engine_pytorch_sequential_module_base + + + + + +transformer_engine_pytorch_sequential_module_linear + +module.linear + + + +transformer_engine_pytorch_sequential_compute_pipeline->transformer_engine_pytorch_sequential_module_linear + + + + + +transformer_engine_pytorch_sequential_module_normalization + +module. +normalization + + + +transformer_engine_pytorch_sequential_compute_pipeline->transformer_engine_pytorch_sequential_module_normalization + + + + + + +transformer_engine_pytorch_sequential_compute_pipeline_compute_pipeline + +compute_pipeline. +compute_pipeline + + + +transformer_engine_pytorch_sequential_compute_pipeline_compute_pipeline->transformer_engine_pytorch_sequential_compute_pipeline + + + + + +transformer_engine_pytorch_sequential_compute_pipeline_compute_pipeline->transformer_engine_pytorch_sequential_compute_pipeline_function + + + + + +transformer_engine_pytorch_sequential_compute_pipeline_compute_pipeline->transformer_engine_pytorch_sequential_module_base + + + + +transformer_engine_pytorch_sequential_compute_pipeline_fusions + +compute_pipeline. +fusions + + + +transformer_engine_pytorch_sequential_compute_pipeline_fusions->transformer_engine_pytorch_sequential_compute_pipeline_compute_pipeline + + + + + +transformer_engine_pytorch_sequential_compute_pipeline_fusions__common + +compute_pipeline. +fusions. +_common + + + +transformer_engine_pytorch_sequential_compute_pipeline_fusions_mmt + +compute_pipeline. +fusions. +mmt + + + +transformer_engine_pytorch_sequential_compute_pipeline_fusions__common->transformer_engine_pytorch_sequential_compute_pipeline_fusions_mmt + + + + + +transformer_engine_pytorch_sequential_compute_pipeline_fusions__storage + +compute_pipeline. +fusions. +_storage + + + +transformer_engine_pytorch_sequential_compute_pipeline_fusions__storage->transformer_engine_pytorch_sequential_compute_pipeline_fusions__common + + + + + +transformer_engine_pytorch_sequential_compute_pipeline_fusions_interface + +compute_pipeline. +fusions. 
+interface + + + +transformer_engine_pytorch_sequential_compute_pipeline_fusions__storage->transformer_engine_pytorch_sequential_compute_pipeline_fusions_interface + + + + + +transformer_engine_pytorch_sequential_compute_pipeline_fusions_interface->transformer_engine_pytorch_sequential_compute_pipeline_fusions + + + + + +transformer_engine_pytorch_sequential_compute_pipeline_fusions_mmt->transformer_engine_pytorch_sequential_compute_pipeline_fusions + + + + + +transformer_engine_pytorch_sequential_compute_pipeline_ops + +compute_pipeline. +ops + + + +transformer_engine_pytorch_sequential_compute_pipeline_ops->transformer_engine_pytorch_sequential_compute_pipeline + + + + + +transformer_engine_pytorch_sequential_compute_pipeline_ops->transformer_engine_pytorch_sequential_compute_pipeline_compute_pipeline + + + + + +transformer_engine_pytorch_sequential_compute_pipeline_ops->transformer_engine_pytorch_sequential_compute_pipeline_fusions__common + + + + + +transformer_engine_pytorch_sequential_compute_pipeline_ops->transformer_engine_pytorch_sequential_compute_pipeline_fusions_interface + + + + + +transformer_engine_pytorch_sequential_compute_pipeline_ops->transformer_engine_pytorch_sequential_compute_pipeline_fusions_mmt + + + + + +transformer_engine_pytorch_sequential_compute_pipeline_ops_types + +compute_pipeline. +ops_types + + + +transformer_engine_pytorch_sequential_compute_pipeline_ops->transformer_engine_pytorch_sequential_compute_pipeline_ops_types + + + + + +transformer_engine_pytorch_sequential_compute_pipeline_ops->transformer_engine_pytorch_sequential_compute_pipeline_function + + + + + +transformer_engine_pytorch_sequential_compute_pipeline_ops->transformer_engine_pytorch_sequential_module_Activation + + + + + +transformer_engine_pytorch_sequential_compute_pipeline_ops->transformer_engine_pytorch_sequential_module_Linear + + + + + + +transformer_engine_pytorch_sequential_compute_pipeline_ops->transformer_engine_pytorch_sequential_module_activation + + + + + +transformer_engine_pytorch_sequential_compute_pipeline_ops->transformer_engine_pytorch_sequential_module_base + + + + + +transformer_engine_pytorch_sequential_compute_pipeline_ops->transformer_engine_pytorch_sequential_module_linear + + + + + + +transformer_engine_pytorch_sequential_compute_pipeline_ops->transformer_engine_pytorch_sequential_module_normalization + + + + + +transformer_engine_pytorch_sequential_compute_pipeline_ops_Add + +compute_pipeline. +ops. +Add + + + +transformer_engine_pytorch_sequential_compute_pipeline_ops_Add->transformer_engine_pytorch_sequential_compute_pipeline_fusions_mmt + + + + + +transformer_engine_pytorch_sequential_compute_pipeline_ops_MMT + +compute_pipeline. +ops. +MMT + + + +transformer_engine_pytorch_sequential_compute_pipeline_ops_MMT->transformer_engine_pytorch_sequential_compute_pipeline_fusions_mmt + + + + + +transformer_engine_pytorch_sequential_compute_pipeline_ops_Op + +compute_pipeline. +ops. 
+[generated SVG omitted: import-dependency graph of the transformer_engine.pytorch.sequential package, covering the compute_pipeline, module, nvte, recipe, meta, persistent and utils submodules]
+transformer_engine_pytorch_sequential_utils->transformer_engine_pytorch_sequential_nvte__common + + + + + +transformer_engine_pytorch_sequential_utils->transformer_engine_pytorch_sequential_nvte_cpp_extensions_dynamic_load + + + + + +transformer_engine_pytorch_sequential_utils->transformer_engine_pytorch_sequential_nvte_execution_state + + + + + +transformer_engine_pytorch_sequential_utils->transformer_engine_pytorch_sequential_nvte_mmt + + + + + +transformer_engine_pytorch_sequential_utils->transformer_engine_pytorch_sequential_nvte_normalization + + + + + + diff --git a/transformer_engine/pytorch/sequential/metatensors.py b/transformer_engine/pytorch/sequential/metatensors.py new file mode 100644 index 0000000000..f97c42b34f --- /dev/null +++ b/transformer_engine/pytorch/sequential/metatensors.py @@ -0,0 +1,78 @@ +from __future__ import annotations +import torch + +from .nvte import DType +from .persistent import Persistent +from .recipe import Recipe + +FP8Meta = tuple[torch.Tensor, torch.Tensor, torch.Tensor] + + +class PersistentFP8Meta(Persistent[DType, FP8Meta]): + amaxes: torch.Tensor # (amax_history_len, num_tensors) + scaling_factors: torch.Tensor # (num_tensors,) + scaling_factors_inversed: torch.Tensor # (num_tensors,) + scaling_factor_type_maximums: torch.Tensor # (num_tensors,) + + def _generate(self, fp8_dtype: DType): + if self._iteration() == 1: + if self._is_new_iteration(): + # Allocate first iteration metatensors + self._one = torch.ones(1, device="cuda") + self._first_iteration_amaxes: list[torch.Tensor] = [] + self._fp8_dtypes: list[DType] = [] + amax = torch.zeros(1, device="cuda") + self._first_iteration_amaxes.append(amax) + self._fp8_dtypes.append(fp8_dtype) + self._index_within_iteration() # increment tensor index + return (amax, self._one, self._one) + else: + if self._iteration() == 2 and self._is_new_iteration(): + # Allocate metatensors + self.amaxes = torch.zeros( + (Recipe.current().amax_history_len, self._max_index()), + device="cuda", + ) + self.scaling_factors = torch.ones(self._max_index(), device="cuda") + self.scaling_factors_inversed = torch.ones( + self._max_index(), device="cuda" + ) + # Copy amaxes from first iteration + self.amaxes[0] = torch.cat(self._first_iteration_amaxes) + # Set scaling factor type maximums + FP8E4M3_MAX = 448.0 + FP8E5M2_MAX = 57344.0 + self.scaling_factor_type_maximums = torch.Tensor( + [ + (FP8E4M3_MAX if dtype == DType.Float8E4M3 else FP8E5M2_MAX) + for dtype in self._fp8_dtypes + ], + device="cuda", + ) + # Delete first iteration data + del self._one + del self._first_iteration_amaxes + del self._fp8_dtypes + if self._iteration() % Recipe.current().amax_reduction_period == 0: + amaxes_t = self.amaxes.T # (num_tensors, amax_history_len) + reduced = Recipe.current().amax_reduction_method( + amaxes_t + ) # (num_tensors,) + Recipe.current().scaling_factor_compute_method( + reduced, + self.scaling_factor_type_maximums, + torch.zeros_like(reduced), + self.scaling_factors, + ) + torch.reciprocal( + self.scaling_factors, + out=self.scaling_factors_inversed, + ) + tensor_idx = self._index_within_iteration() + return ( + self.amaxes[ + self._iteration() % Recipe.current().amax_history_len, tensor_idx + ], + self.scaling_factors[tensor_idx], + self.scaling_factors_inversed[tensor_idx], + ) diff --git a/transformer_engine/pytorch/sequential/module/__init__.py b/transformer_engine/pytorch/sequential/module/__init__.py new file mode 100644 index 0000000000..4956f3a727 --- /dev/null +++ 
b/transformer_engine/pytorch/sequential/module/__init__.py @@ -0,0 +1,20 @@ +from .activation import Activation, ReLU, GELU, ReGLU, GeGLU, SwiGLU +from .normalization import Normalization, LayerNorm, RMSNorm +from .linear import Linear +from .sequential import Sequential +from .residual import Residual + +__all__ = [ + "Activation", + "ReLU", + "GELU", + "ReGLU", + "GeGLU", + "SwiGLU", + "Normalization", + "LayerNorm", + "RMSNorm", + "Linear", + "Sequential", + "Residual", +] diff --git a/transformer_engine/pytorch/sequential/module/_common.py b/transformer_engine/pytorch/sequential/module/_common.py new file mode 100644 index 0000000000..0614f9e697 --- /dev/null +++ b/transformer_engine/pytorch/sequential/module/_common.py @@ -0,0 +1,5 @@ +from __future__ import annotations +from typing import Callable +import torch + +ParameterInitMethod = Callable[[torch.Tensor], torch.Tensor] diff --git a/transformer_engine/pytorch/sequential/module/activation.py b/transformer_engine/pytorch/sequential/module/activation.py new file mode 100644 index 0000000000..a26413db97 --- /dev/null +++ b/transformer_engine/pytorch/sequential/module/activation.py @@ -0,0 +1,33 @@ +from abc import ABC +from .base import BaseModule +from ..compute_pipeline import ops + + +class Activation(BaseModule, ABC): + def __init__(self): + super().__init__() + + def _ops(self) -> list[ops.Op | None]: + return [type(self)._op_type()] + + _op_type: type[ops.Activation] + + +class ReLU(Activation): + _op_type = ops.ReLU + + +class GELU(Activation): + _op_type = ops.GELU + + +class ReGLU(Activation): + _op_type = ops.ReGLU + + +class GeGLU(Activation): + _op_type = ops.GeGLU + + +class SwiGLU(Activation): + _op_type = ops.SwiGLU diff --git a/transformer_engine/pytorch/sequential/module/base.py b/transformer_engine/pytorch/sequential/module/base.py new file mode 100644 index 0000000000..b149661391 --- /dev/null +++ b/transformer_engine/pytorch/sequential/module/base.py @@ -0,0 +1,70 @@ +from __future__ import annotations +from abc import ABC, abstractmethod +import torch +from torch import nn +from ..compute_pipeline.ops import Op +from ..recipe import Recipe +from ..compute_pipeline.compute_pipeline import ComputePipeline +from ..compute_pipeline_function import apply + + +class BaseModule(nn.Module, ABC): + pipeline: ComputePipeline | None + compile_env: Recipe | None + + @abstractmethod + def _ops(self) -> list[Op | None]: + ... 
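+        # Implementations may return None entries; _setup_pipeline() filters them
+        # out before building the ComputePipeline, so optional ops (e.g. Linear's
+        # bias add) can simply be written as `op if condition else None`.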
+ + def __init__(self): + super().__init__() # type: ignore + self.pipeline = None + self.compile_env = None + + def forward( + self, x: torch.Tensor, seq_lens: torch.Tensor | None = None + ) -> torch.Tensor: + self._precompiled_for(x, seq_lens) + return self._run(x) + + def _precompiled_for(self, x: torch.Tensor, seq_lens: torch.Tensor | None = None): + with torch.no_grad(): + assert x.is_cuda + assert x.is_contiguous() + if seq_lens is None: + seq_lens = BaseModule._create_seq_lens_tensor(x) + assert seq_lens.is_cuda + assert seq_lens.is_contiguous() + + self._setup_pipeline(x, seq_lens) + + return self._run + + def _run(self, x: torch.Tensor): + assert self.pipeline is not None + return apply(x, self.pipeline, self.training) + + @staticmethod + def _create_seq_lens_tensor(x: torch.Tensor): + if x.dim() == 2: + seq_lens = torch.tensor([x.shape[0]], dtype=torch.int32, device="cuda") + elif x.dim() == 3: + seq_lens = torch.tensor( + [x.shape[1]] * x.shape[0], dtype=torch.int32, device="cuda" + ) + x = x.view(x.shape[1] * x.shape[0], x.shape[2]) + else: + raise ValueError(f"Unsupported input shape: {x.shape}") + return seq_lens + + def _setup_pipeline(self, x: torch.Tensor, seq_lens: torch.Tensor): + del x, seq_lens # TODO: take x's type into account, save seq_lens + env = self._current_env() + if self.pipeline is None or env != self.compile_env: + self.pipeline = ComputePipeline( + [op for op in self._ops() if op is not None], env + ) + self.compile_env = env + + def _current_env(self) -> Recipe: + return Recipe.current() diff --git a/transformer_engine/pytorch/sequential/module/dot_product_attention.py b/transformer_engine/pytorch/sequential/module/dot_product_attention.py new file mode 100644 index 0000000000..952237f13d --- /dev/null +++ b/transformer_engine/pytorch/sequential/module/dot_product_attention.py @@ -0,0 +1,62 @@ +from abc import abstractmethod, ABC +from .base import BaseModule +from ..compute_pipeline import ops + +class Attention(ABC): + @abstractmethod + def make_op(self) -> ops.Op: + ... 
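(Editorial sketch, not part of the patch.) Everything above funnels through BaseModule._ops(): a subclass only describes itself as a list of pipeline ops, and forward() takes care of compiling and running them. A minimal custom module might look like the following, assuming the import paths used elsewhere in this patch and the ops.MMT / make_nvte_tensor helpers that Linear relies on later:

from __future__ import annotations
import torch
from torch import nn
from transformer_engine.pytorch.sequential.module.base import BaseModule
from transformer_engine.pytorch.sequential.compute_pipeline import ops
from transformer_engine.pytorch.sequential.nvte import make_nvte_tensor


class Projection(BaseModule):
    """Bias-free projection expressed as a single MMT op (illustrative only)."""

    def __init__(self, in_features: int, out_features: int):
        super().__init__()
        self.weight = nn.Parameter(
            torch.empty(out_features, in_features, device="cuda")
        )
        nn.init.xavier_uniform_(self.weight)

    def _ops(self) -> list[ops.Op | None]:
        # BaseModule.forward() compiles this list into a ComputePipeline on the
        # first call (and again whenever the active Recipe changes).
        return [ops.MMT(make_nvte_tensor(self.weight))]

An instance should then behave like any other nn.Module: Projection(1024, 4096)(x) for a contiguous CUDA input x, with the pipeline rebuilt only when the active Recipe changes.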
+
+class DotProductAttention(Attention):
+    def __init__(self, pre_softmax_scale: float, dropout_p: float, causal_mask: bool = True):
+        self.pre_softmax_scale = pre_softmax_scale
+        self.dropout_p = dropout_p
+        self.causal_mask = causal_mask
+
+    def make_op(self):
+        return ops.DotProductAttention(self.causal_mask)
+
+class GroupedQuerySelfAttention(BaseModule):
+    def __init__(
+        self,
+        token_dim: int,
+        num_query_heads: int,
+        num_kv_heads: int,
+        attention_mechanism: Attention,
+    ):
+        assert num_kv_heads <= num_query_heads
+        assert num_query_heads % num_kv_heads == 0
+        assert token_dim % num_query_heads == 0
+        self.attention_mechanism = attention_mechanism
+        super().__init__()
+
+    def _ops(self) -> list[ops.Op | None]:
+        return [self.attention_mechanism.make_op()]
+
+
+class MultiQuerySelfAttention(GroupedQuerySelfAttention):
+    def __init__(
+        self,
+        token_dim: int,
+        num_query_heads: int,
+        attention_mechanism: Attention,
+    ):
+        super().__init__(
+            token_dim,
+            num_query_heads,
+            1,
+            attention_mechanism,
+        )
+
+
+class MultiHeadedSelfAttention(GroupedQuerySelfAttention):
+    def __init__(
+        self,
+        token_dim: int,
+        num_heads: int,
+        attention_mechanism: Attention,
+    ):
+        super().__init__(
+            token_dim,
+            num_heads,
+            num_heads,
+            attention_mechanism,
+        )
diff --git a/transformer_engine/pytorch/sequential/module/linear.py b/transformer_engine/pytorch/sequential/module/linear.py
new file mode 100644
index 0000000000..ee69d43a77
--- /dev/null
+++ b/transformer_engine/pytorch/sequential/module/linear.py
@@ -0,0 +1,60 @@
+from __future__ import annotations
+from math import sqrt
+import torch
+from torch import nn
+from ..compute_pipeline import ops
+from ..nvte import make_nvte_tensor
+from ._common import ParameterInitMethod
+from .base import BaseModule
+
+
+def _default_weight_init_method(weight: torch.Tensor):
+    in_features = weight.shape[1]  # weight is laid out as (out_features, in_features)
+    k = 1 / sqrt(in_features)
+    return nn.init.uniform_(weight, -k, k)
+
+
+def _default_bias_init_method(bias: torch.Tensor):
+    out_features = bias.shape[0]
+    k = 1 / sqrt(out_features)
+    return nn.init.uniform_(bias, -k, k)
+
+
+class Linear(BaseModule):
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        bias: bool = True,
+        param_dtype: torch.dtype = torch.get_default_dtype(),
+        weight_init_method: ParameterInitMethod = _default_weight_init_method,
+        bias_init_method: ParameterInitMethod = _default_bias_init_method,
+    ):
+        super().__init__()
+
+        self.in_features = in_features
+        self.out_features = out_features
+
+        self.weight = nn.Parameter(
+            weight_init_method(
+                torch.empty(out_features, in_features, dtype=param_dtype, device="cuda")
+            )
+        )
+        self.bias = (
+            nn.Parameter(
+                bias_init_method(
+                    torch.empty(out_features, dtype=param_dtype, device="cuda")
+                )
+            )
+            if bias
+            else None
+        )
+
+    def _ops(self) -> list[ops.Op | None]:
+        return [
+            ops.MMT(make_nvte_tensor(self.weight)),
+            ops.Add(make_nvte_tensor(self.bias)) if self.bias is not None else None,
+        ]
+
+    def extra_repr(self):
+        return f"in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None}"
diff --git a/transformer_engine/pytorch/sequential/module/normalization.py b/transformer_engine/pytorch/sequential/module/normalization.py
new file mode 100644
index 0000000000..484eff8875
--- /dev/null
+++ b/transformer_engine/pytorch/sequential/module/normalization.py
@@ -0,0 +1,62 @@
+from abc import ABC
+import torch
+from torch import nn
+from .base import BaseModule
+from ..compute_pipeline import ops
+from ..nvte import make_nvte_tensor
+
+
+class Normalization(BaseModule, ABC):
+    def __init__(
+        self,
+        features: 
int, + eps: float = 1e-5, + zero_centered_gamma: bool = False, + param_dtype: torch.dtype = torch.get_default_dtype(), + ): + super().__init__() + + self.features = features + self.eps = eps + self.zero_centered_gamma = zero_centered_gamma + + self.weight = nn.Parameter( + torch.zeros(features, dtype=param_dtype, device="cuda") + if zero_centered_gamma + else torch.ones(features, dtype=param_dtype, device="cuda") + ) + self.bias = ( + nn.Parameter(torch.zeros(features, dtype=param_dtype, device="cuda")) + if type(self)._bias + else None + ) + + def _ops(self) -> list[ops.Op | None]: + return [ + type(self)._op_type( + *( + ( + self.eps, + self.zero_centered_gamma, + make_nvte_tensor(self.weight), + ) + + ((make_nvte_tensor(self.bias),) if self.bias is not None else ()) + ) + ), + ] + + def extra_repr(self): + return f"features={self.features}, eps={self.eps}, zero_centered_gamma={self.zero_centered_gamma}" + + _bias: bool + _op_type: type[ops.Op] + + +class LayerNorm(Normalization): + _bias = True + _op_type = ops.LayerNorm + + +class RMSNorm(Normalization): + _bias = False + _op_type = ops.RMSNorm diff --git a/transformer_engine/pytorch/sequential/module/residual.py b/transformer_engine/pytorch/sequential/module/residual.py new file mode 100644 index 0000000000..972999fd0e --- /dev/null +++ b/transformer_engine/pytorch/sequential/module/residual.py @@ -0,0 +1,10 @@ +from ..compute_pipeline import ops +from .sequential import Sequential + + +class Residual(Sequential): + def _ops(self): + begin, end = ops.ResidualBegin(), ops.ResidualEnd() + begin.end = end + end.begin = begin + return [begin] + super()._ops() + [end] diff --git a/transformer_engine/pytorch/sequential/module/sequential.py b/transformer_engine/pytorch/sequential/module/sequential.py new file mode 100644 index 0000000000..8f4735490a --- /dev/null +++ b/transformer_engine/pytorch/sequential/module/sequential.py @@ -0,0 +1,76 @@ +from __future__ import annotations +from typing import OrderedDict, overload + +from .base import BaseModule + + +class Sequential(BaseModule): + _modules: dict[str, BaseModule] # type: ignore[assignment] + + @overload + def __init__( + self, + *modules: BaseModule, + ) -> None: + ... + + @overload + def __init__( + self, + module_dict: OrderedDict[str, BaseModule], + /, + ) -> None: + ... + + def __init__( + self, + *args: BaseModule | OrderedDict[str, BaseModule], + ): + super().__init__() + self.contained_modules = self._modules_from_args(args) + + def _modules_from_args( + self, args: tuple[BaseModule | OrderedDict[str, BaseModule], ...] + ): + modules: list[tuple[str, BaseModule]] + if len(args) == 1 and isinstance(args[0], OrderedDict): + modules = list(args[0].items()) + else: + args1: tuple[BaseModule, ...] 
= args # type: ignore + modules = list(map(lambda p: (f"{p[0]}", p[1]), enumerate(args1))) + + for name, module in modules: + submodules: list[tuple[str, BaseModule]] + if isinstance(module, Sequential): + submodules = [(k, v) for k, v in module._modules.items()] + for i, (submodule_name, submodule) in enumerate(submodules): + submodules[i] = (f"{name}[{submodule_name}]", submodule) + else: + submodules = [(name, module)] + + for submodule_name, submodule in submodules: + self.add_module(submodule_name, submodule) + return modules + + def _ops(self): + return [op for _, module in self.contained_modules for op in module._ops()] + + def __len__(self): + return len(self._modules) + + def __add__(self, other: Sequential) -> Sequential: + return Sequential( + self, + other, + ) + + def __mul__(self, other: int): + if other <= 0: + raise ValueError("Repetition factor must be >= 1") + else: + return Sequential( + *(self for _ in range(other)), + ) + + def __rmul__(self, other: int): + return self * other diff --git a/transformer_engine/pytorch/sequential/nvte/__init__.py b/transformer_engine/pytorch/sequential/nvte/__init__.py new file mode 100644 index 0000000000..dc9d679af8 --- /dev/null +++ b/transformer_engine/pytorch/sequential/nvte/__init__.py @@ -0,0 +1,102 @@ +from ._common import make_nvte_tensor, torch_op +from .cpp_extensions import ( + QKVLayout, + BiasType, + MaskType, + FusedAttnBackend, + DType, + Tensor, +) +from .add import add, dbias +from .cast_transpose import ( + cast_checked, + cast_transpose_checked, + cast_transpose, + cast, + multi_cast_transpose_checked, + multi_cast_transpose, + transpose, +) +from .dtype import te_to_torch_dtype, torch_to_te_dtype, bit_width, dtype_name, is_fp8 +from .empty import empty, empty_like, multi_empty_share_metadata +from .execution_state import set_execution_state +from .activation import ( + relu, + drelu, + reglu, + dreglu, + gelu, + dgelu, + geglu, + dgeglu, + swiglu, + dswiglu, +) +from .normalization import layernorm, dlayernorm, rmsnorm, drmsnorm +from .misc_fusions import ( + cast_transpose_dbias_checked, + cast_transpose_dbias_dgelu_checked, + cast_transpose_dgeglu_checked, +) +from .mmt import ( + matmul_transpose_add_add, + matmul_transpose_add_gelu_add, + matmul_transpose_add_gelu, + matmul_transpose_add, + matmul_transpose_gelu_add, + matmul_transpose_gelu, + matmul_transpose, +) + +__all__ = [ + "add", + "BiasType", + "bit_width", + "cast_checked", + "cast_transpose_checked", + "cast_transpose_dbias_checked", + "cast_transpose_dbias_dgelu_checked", + "cast_transpose_dgeglu_checked", + "cast_transpose", + "cast", + "dbias", + "dgeglu", + "dgelu", + "dlayernorm", + "dreglu", + "drelu", + "drmsnorm", + "dswiglu", + "dtype_name", + "DType", + "empty_like", + "empty", + "FusedAttnBackend", + "geglu", + "gelu", + "is_fp8", + "layernorm", + "make_nvte_tensor", + "MaskType", + "matmul_transpose_add_add", + "matmul_transpose_add_gelu_add", + "matmul_transpose_add_gelu", + "matmul_transpose_add", + "matmul_transpose_gelu_add", + "matmul_transpose_gelu", + "matmul_transpose", + "multi_cast_transpose_checked", + "multi_cast_transpose", + "multi_empty_share_metadata", + "QKVLayout", + "reglu", + "relu", + "rmsnorm", + "set_execution_state", + "swiglu", + "te_to_torch_dtype", + "Tensor", + "torch_op", + "torch_to_te_dtype", + "transpose", +] diff --git a/transformer_engine/pytorch/sequential/nvte/_common.py b/transformer_engine/pytorch/sequential/nvte/_common.py new file mode 100644 index 0000000000..89ac37fe4e --- /dev/null +++ 
b/transformer_engine/pytorch/sequential/nvte/_common.py @@ -0,0 +1,400 @@ +from __future__ import annotations + +from collections import namedtuple +from typing import TYPE_CHECKING, Any, Callable, Sequence, TypeVar, overload +from types import GenericAlias, NoneType +import typing +from typing_extensions import TypeVarTuple, Unpack +import warnings +from enum import Enum + +import torch + +from torch.autograd.function import FunctionCtx +from . import cpp_extensions as _nvte + +from ..utils import ( + get_arg_names, + get_arg_types, + get_return_type, + exec_saving_source, + is_generic, +) + + +def _type_name(t: type) -> str: + if is_generic(t): + result = str(t) + else: + result = f"{t.__module__}.{t.__name__}" + + return ( + result.replace("builtins.", "") + .replace("transformer_engine.pytorch.sequential.nvte.", "") + .replace("collections.abc", "typing") + .replace("__init__.pyi", "cpp_extensions") + .replace("NoneType", "None") + ) + + +def _wrap_type( + type_wrap_func: Callable[[type], type], + arg_type_: type | GenericAlias, +) -> Any: + if is_generic(arg_type_): + origin = arg_type_.__origin__ # type: ignore + while hasattr(origin, "__origin__"): # type: ignore + origin = getattr(origin, "__origin__") # type: ignore + args: tuple[type | GenericAlias, ...] = typing.get_args(arg_type_) + new_args = tuple(_wrap_type(type_wrap_func, arg) for arg in args) + return origin.__class_getitem__(new_args) # type: ignore + else: + if TYPE_CHECKING: + assert isinstance(arg_type_, type) + return type_wrap_func(arg_type_) + + +def _arg_type_wrap_func(arg_type: type): + if arg_type is _nvte.Tensor: + return Sequence[torch.Tensor] + elif issubclass(arg_type, Enum): + return int + elif issubclass( + arg_type, (int, float, bool, str, torch.Tensor, NoneType, FunctionCtx) + ): + return arg_type + else: + raise NotImplementedError(arg_type) + + +def _wrap_arg_type(arg_type: type | GenericAlias) -> Any: + return _wrap_type(_arg_type_wrap_func, arg_type) + + +def _result_type_wrap_func(result_type: type): + if result_type is _nvte.Tensor: + return tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] + else: + return _arg_type_wrap_func(result_type) + + +def _is_generic_tuple(t: type) -> bool: + return is_generic(t) and (t.__origin__ is tuple) # type: ignore + + +def _wrap_result_type(result_type: type | GenericAlias) -> Any: + wrapped_type = _wrap_type(_result_type_wrap_func, result_type) + + # Flatten tuple of tuples of tensors + if _is_generic_tuple(wrapped_type): + arg_types = typing.get_args(wrapped_type) + if any(_is_generic_tuple(arg_type) for arg_type in arg_types): + assert all( + _is_generic_tuple(arg_type) + and typing.get_args(arg_type) + == (torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor) + for arg_type in arg_types + ) + tensors = len(arg_types) + types = (torch.Tensor,) * (4 * tensors) + return tuple.__class_getitem__(types) + return wrapped_type # type: ignore + + +def _wrap_unwrap_code( + arg_name: str, + arg_type: type, + arg_type_name: str, + wrapped_arg_type_name: str, +): + if arg_type is _nvte.Tensor: + w = f" {arg_name}_: {wrapped_arg_type_name} = te_to_torch_tensor({arg_name})\n" + u = f" {arg_name}: {arg_type_name} = torch_to_te_tensor({arg_name}_)\n" + elif _is_generic_tuple(arg_type) and all( + sub_type is _nvte.Tensor for sub_type in typing.get_args(arg_type) + ): + w = f" {arg_name}_: {wrapped_arg_type_name} = tuple(t for tensor in {arg_name} for t in te_to_torch_tensor(tensor))\n" + u = f" {arg_name}: {arg_type_name} = 
tuple(torch_to_te_tensor(tuple({arg_name}_[j] for j in range(i, i + 4, 1))) for i in range(0, len({arg_name}_), 4))\n" + elif issubclass(arg_type, Enum): + w = f" {arg_name}_: {wrapped_arg_type_name} = {arg_name}.value\n" + u = f" {arg_name}: {arg_type_name} = {arg_type_name}({arg_name}_)\n" + else: + w = f" {arg_name}_: {wrapped_arg_type_name} = {arg_name}\n" + u = f" {arg_name}: {arg_type_name} = {arg_name}_\n" + return (w, u) + + +def _arg_wrap_unwrap_code(arg_name: str, arg_type: type, arg_type_name: str): + wrapped_arg_type_name = _type_name(_wrap_arg_type(arg_type)) + return _wrap_unwrap_code(arg_name, arg_type, arg_type_name, wrapped_arg_type_name) + + +def _result_wrap_unwrap_code(result_type: type, result_type_name: str): + wrapped_result_type_name = _type_name(_wrap_result_type(result_type)) + return _wrap_unwrap_code( + "result", result_type, result_type_name, wrapped_result_type_name + ) + + +def _register_op( + func: Callable[..., Any], + abstract_impl: Callable[..., Any], + save_for_backward: Callable[..., Any] | None = None, + backward: Callable[..., Any] | None = None, +): + name = f"nvte::{func.__name__}" + # Different versions of PyTorch have different ways of registering custom ops + try: + decl, impl, aimp, save, bwd = ( # type: ignore + torch._custom_ops.custom_op, # type: ignore + torch._custom_ops.impl, # type: ignore + torch._custom_ops.impl_abstract, # type: ignore + torch._custom_ops.impl_save_for_backward, # type: ignore + torch._custom_ops.impl_backward, # type: ignore + ) + decl(name)(func) + impl(name)(func) + aimp(name)(abstract_impl) + if save_for_backward: + save(name)(save_for_backward) + if backward: + bwd(name)(backward) + return + except AttributeError: + pass + try: + decl = torch._custom_op.impl.custom_op # type: ignore + declared = decl(name)(func) # type: ignore + declared.impl("cuda")(func) # type: ignore + declared.impl_abstract()(abstract_impl) # type: ignore + if save_for_backward: + declared.impl_save_for_backward()(save_for_backward) # type: ignore + if backward: + declared.impl_backward()(backward) # type: ignore + return + except AttributeError: + pass + if not hasattr(_register_op, "warned"): # type: ignore + _register_op.warned = True # type: ignore + warnings.warn("Unable to find custom_op, decorator has no effect") + + +def _generate_wrapping_unwrapping_code( + func: Callable[..., Any], + inner_additional_setup_code: str, + inner_additional_teardown_code: str, +): + try: + arg_types = get_arg_types(func) + return_type = get_return_type(func) + except Exception as e: + raise RuntimeError( + f"Failed to get argument and return types for {func.__name__}. Make sure the function is annotated with types." 
+ ) from e + arg_names = get_arg_names(func) + arg_type_names = list(map(_type_name, arg_types)) + return_type_name = _type_name(return_type) + outer_sig = f"""({ ','.join( + f'{arg_name}: {arg_type_name}' + for arg_name, arg_type_name in zip(arg_names, arg_type_names) + ) }) -> {return_type_name}""" + arg_wrapping_code = "" + arg_unwrapping_code = "" + for arg_name, arg_type, arg_type_name in zip(arg_names, arg_types, arg_type_names): + w, u = _arg_wrap_unwrap_code(arg_name, arg_type, arg_type_name) + arg_wrapping_code += w + arg_unwrapping_code += u + wrapped_args = ",".join(f"{arg_name}_" for arg_name in arg_names) + + result_wrapping_code, result_unwrapping_code = _result_wrap_unwrap_code( + return_type, return_type_name + ) + + wrapped_arg_names = [f"{arg_name}_" for arg_name in arg_names] + wrapped_arg_types = [_wrap_arg_type(t) for t in arg_types] + wrapped_arg_type_names = [_type_name(t) for t in wrapped_arg_types] + wrapped_return_type = _wrap_result_type(return_type) + wrapped_return_type_name = _type_name(wrapped_return_type) + inner_sig = f"""({ ','.join( + f'{arg_name}: {arg_type_name}' + for arg_name, arg_type_name in zip(wrapped_arg_names, wrapped_arg_type_names) + ) }) -> {wrapped_return_type_name}""" + unwrapped_args = ",".join(f"{arg_name}" for arg_name in arg_names) + + arg_unwrapping_code = arg_unwrapping_code.lstrip() + arg_wrapping_code = arg_wrapping_code.lstrip() + result_wrapping_code = result_wrapping_code.lstrip() + result_unwrapping_code = result_unwrapping_code.lstrip() + inner_additional_setup_code = inner_additional_setup_code.lstrip() + inner_additional_teardown_code = inner_additional_teardown_code.lstrip() + + inner = f"""\ +def {func.__name__}{inner_sig}: + {arg_unwrapping_code} + {inner_additional_setup_code} + result: {return_type_name} = func({unwrapped_args}) + {inner_additional_teardown_code} + {result_wrapping_code} + return result_ +""" + outer = f"""\ +def {func.__name__}_wrap{outer_sig}: + {arg_wrapping_code} + result_: {wrapped_return_type_name} = torch.ops.nvte.{func.__name__}({wrapped_args}) + {result_unwrapping_code} + return result +""" + return inner, outer + + +def _run_full_code(*codes: str, **namespace: Any): + source = """\ +import torch +from . 
import cpp_extensions +import typing + +def te_to_torch_tensor(t: cpp_extensions.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + return (t.data, t.amax, t.scale, t.scale_inv) + +def torch_to_te_tensor(t: typing.Sequence[torch.Tensor]) -> cpp_extensions.Tensor: + return cpp_extensions.Tensor(*t) +""" + for code in codes: + source += code + "\n" + while "\n" * 3 in source: + source = source.replace("\n" * 3, "\n" * 2) + exec_saving_source(source, namespace) + return namespace + + +T1 = TypeVar("T1") +T2 = TypeVar("T2") +Ts = TypeVarTuple("Ts") + + +def _make_wrapper( + func: Callable[[Unpack[Ts]], T1], + save_for_backward: Callable[[Unpack[Ts], T1], T2] | None, + backward: Callable[[FunctionCtx, T2, Unpack[tuple[Any, ...]]], Any] | None, +) -> Callable[[Unpack[Ts]], T1]: + # Dynamically generate code of the wrappers + + impl_code, wrap_code = _generate_wrapping_unwrapping_code(func, "", "") + func.__name__ = func.__name__ + "_aimp" + aimp_code, _________ = _generate_wrapping_unwrapping_code( + func, + 'func.__globals__["_nvte"] = impostor', + 'func.__globals__["_nvte"] = cpp_extensions', + ) + func.__name__ = func.__name__[:-5] + if save_for_backward is not None or backward is not None: + assert save_for_backward is not None and backward is not None + save_for_backward_code, _ = _generate_wrapping_unwrapping_code( + save_for_backward, "", "" + ) + backward_code, _ = _generate_wrapping_unwrapping_code(backward, "", "") + else: + save_for_backward_code = "" + backward_code = "" + + try: + # Swap real cpp_extensions (_nvte) for impostor that does nothing + # This is needed so the abstract implementation is traceable by PyTorch Dynamo + class NVTEImpostor: + def __getattr__(self, attr_name: str) -> Any: + if attr_name == "Tensor": + return namedtuple("Tensor", ["data", "amax", "scale", "scale_inv"]) # type: ignore + else: + attr = getattr(_nvte, attr_name) + if isinstance(attr, type) and issubclass(attr, Enum): + return attr + elif callable(attr): + return lambda *args, **kwargs: None # type: ignore + else: + return attr + + # Create op + ns = _run_full_code( + impl_code, + wrap_code, + func=func, + __name__=__name__, + ) + op_impl: Callable[..., Any] = ns[func.__name__] # type: ignore + op_wrap: Callable[[Unpack[Ts]], T1] = ns[f"{func.__name__}_wrap"] # type: ignore + ns = _run_full_code( + aimp_code, + func=func, + __name__=__name__, + impostor=NVTEImpostor(), + ) + op_aimp: Callable[..., Any] = ns[f"{func.__name__}_aimp"] # type: ignore + + if save_for_backward is not None: + ns = _run_full_code( + save_for_backward_code, + func=save_for_backward, + __name__=__name__, + ) + op_save_for_backward = ns[f"{save_for_backward.__name__}"] # type: ignore + ns = _run_full_code( + backward_code, + func=save_for_backward, + __name__=__name__, + ) + op_backward = ns[f"{backward.__name__}"] # type: ignore + else: + op_save_for_backward = None + op_backward = None + + _register_op(op_impl, op_aimp, op_save_for_backward, op_backward) + + return op_wrap + except Exception as e: + raise RuntimeError(f"Failed to compile wrapper for {func.__name__}.") from e + + +@overload +def torch_op( + func: Callable[[Unpack[Ts]], T1], +) -> Callable[[Unpack[Ts]], T1]: + ... + + +@overload +def torch_op( + *, + save_for_backward: Callable[[tuple[Unpack[Ts]], T1], T2], + backward: Callable[[FunctionCtx, T2, Unpack[tuple[Any, ...]]], Any], +) -> Callable[[Callable[[Unpack[Ts]], T1]], Callable[[Unpack[Ts]], T1]]: + ... 
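+# torch_op is used either as a bare decorator (first overload) or, when a custom
+# autograd rule is needed, as a decorator factory taking save_for_backward and
+# backward callables (second overload). Either way, the decorated function is
+# re-emitted as a custom op registered under torch.ops.nvte, and the returned
+# wrapper routes calls through that op, converting _nvte.Tensor arguments to and
+# from plain torch.Tensor tuples on the way in and out.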
+ + +def torch_op( + func: Callable[[Unpack[Ts]], T1] | None = None, + *, + save_for_backward: Callable[[tuple[Unpack[Ts]], T1], T2] | None = None, + backward: Callable[[FunctionCtx, T2, Unpack[tuple[Any, ...]]], Any] | None = None, +) -> ( + Callable[[Unpack[Ts]], T1] + | Callable[[Callable[[Unpack[Ts]], T1]], Callable[[Unpack[Ts]], T1]] +): + if save_for_backward is not None or backward is not None: + assert save_for_backward is not None and backward is not None + assert func is None + decorator: Callable[ + [Callable[[Unpack[Ts]], T1]], Callable[[Unpack[Ts]], T1] + ] = lambda func: _make_wrapper(func, save_for_backward, backward) + return decorator + else: + assert func is not None + return _make_wrapper(func, None, None) + + +def make_nvte_tensor(t: torch.Tensor) -> _nvte.Tensor: + return _nvte.Tensor( + t.data, + torch.Tensor().cuda(), + torch.Tensor().cuda(), + torch.Tensor().cuda(), + ) diff --git a/transformer_engine/pytorch/sequential/nvte/activation.py b/transformer_engine/pytorch/sequential/nvte/activation.py new file mode 100644 index 0000000000..4595ed1656 --- /dev/null +++ b/transformer_engine/pytorch/sequential/nvte/activation.py @@ -0,0 +1,76 @@ +from __future__ import annotations +from . import cpp_extensions as _nvte +from .empty import empty +from ._common import torch_op + + +@torch_op +def relu(x: _nvte.Tensor, out_dtype: _nvte.DType) -> _nvte.Tensor: + output = empty(x.shape, out_dtype) + _nvte.relu(x, output) + return output + + +@torch_op +def drelu(grad: _nvte.Tensor, x: _nvte.Tensor, out_dtype: _nvte.DType) -> _nvte.Tensor: + output = empty(x.shape, out_dtype) + _nvte.drelu(grad, x, output) + return output + + +@torch_op +def gelu(x: _nvte.Tensor, out_dtype: _nvte.DType) -> _nvte.Tensor: + output = empty(x.shape, out_dtype) + _nvte.gelu(x, output) + return output + + +@torch_op +def dgelu(grad: _nvte.Tensor, x: _nvte.Tensor, out_dtype: _nvte.DType) -> _nvte.Tensor: + output = empty(x.shape, out_dtype) + _nvte.dgelu(grad, x, output) + return output + + +@torch_op +def reglu(x: _nvte.Tensor, out_dtype: _nvte.DType) -> _nvte.Tensor: + output = empty((x.shape[0], x.shape[1] // 2), out_dtype) + _nvte.reglu(x, output) + return output + + +@torch_op +def dreglu(grad: _nvte.Tensor, x: _nvte.Tensor, out_dtype: _nvte.DType) -> _nvte.Tensor: + output = empty(x.shape, out_dtype) + _nvte.dreglu(grad, x, output) + return output + + +@torch_op +def geglu(x: _nvte.Tensor, out_dtype: _nvte.DType) -> _nvte.Tensor: + output = empty((x.shape[0], x.shape[1] // 2), out_dtype) + _nvte.geglu(x, output) + return output + + +@torch_op +def dgeglu(grad: _nvte.Tensor, x: _nvte.Tensor, out_dtype: _nvte.DType) -> _nvte.Tensor: + output = empty(x.shape, out_dtype) + _nvte.dgeglu(grad, x, output) + return output + + +@torch_op +def swiglu(x: _nvte.Tensor, out_dtype: _nvte.DType) -> _nvte.Tensor: + output = empty((x.shape[0], x.shape[1] // 2), out_dtype) + _nvte.swiglu(x, output) + return output + + +@torch_op +def dswiglu( + grad: _nvte.Tensor, x: _nvte.Tensor, out_dtype: _nvte.DType +) -> _nvte.Tensor: + output = empty(x.shape, out_dtype) + _nvte.dswiglu(grad, x, output) + return output diff --git a/transformer_engine/pytorch/sequential/nvte/add.py b/transformer_engine/pytorch/sequential/nvte/add.py new file mode 100644 index 0000000000..e3ea3e357f --- /dev/null +++ b/transformer_engine/pytorch/sequential/nvte/add.py @@ -0,0 +1,23 @@ +from __future__ import annotations +import torch +from . 
import cpp_extensions as _nvte + +from ._common import make_nvte_tensor +from .dtype import is_fp8, te_to_torch_dtype + + +def add(A: _nvte.Tensor, B: _nvte.Tensor, out_dtype: _nvte.DType): + if is_fp8(A) or is_fp8(B): + raise NotImplementedError() # TODO + else: + output = torch.empty(A.shape, dtype=te_to_torch_dtype(out_dtype), device="cuda") + torch.add(A.data, B.data, out=output) + return make_nvte_tensor(output) + + +def dbias(grad: _nvte.Tensor, out_dtype: _nvte.DType): + if is_fp8(grad): + raise NotImplementedError() # TODO + else: + output = torch.sum(grad.data, dtype=te_to_torch_dtype(out_dtype), dim=0) + return make_nvte_tensor(output) diff --git a/transformer_engine/pytorch/sequential/nvte/attention.py b/transformer_engine/pytorch/sequential/nvte/attention.py new file mode 100644 index 0000000000..faef9305b8 --- /dev/null +++ b/transformer_engine/pytorch/sequential/nvte/attention.py @@ -0,0 +1,18 @@ +from __future__ import annotations +from . import cpp_extensions as _nvte +from .empty import empty + + +def dot_product_attention( + QKV: _nvte.Tensor, cu_seqlens: _nvte.Tensor, attn_scale: float, dropout: float +): + S = empty((), _nvte.DType.Float8E4M3) + token_count = QKV.shape[0] + assert QKV.shape[1] % 3 == 0 + token_dim = QKV.shape[1] // 3 + + _nvte.fused_attn_fwd_qkvpacked( + QKV, + empty(), + S, + ) diff --git a/transformer_engine/pytorch/sequential/nvte/cast_transpose.py b/transformer_engine/pytorch/sequential/nvte/cast_transpose.py new file mode 100644 index 0000000000..0d5ef504e6 --- /dev/null +++ b/transformer_engine/pytorch/sequential/nvte/cast_transpose.py @@ -0,0 +1,111 @@ +from __future__ import annotations + +from . import cpp_extensions as _nvte +from ._common import torch_op + +from .dtype import is_fp8 +from .empty import empty, multi_empty_share_metadata + + +@torch_op +def _fp8_quantize(t: _nvte.Tensor, out_dtype: _nvte.DType) -> _nvte.Tensor: + output = empty(t.shape, out_dtype) + _nvte.fp8_quantize(t, output) + return output + + +@torch_op +def _fp8_dequantize(t: _nvte.Tensor, out_dtype: _nvte.DType) -> _nvte.Tensor: + output = empty(t.shape, out_dtype) + _nvte.fp8_dequantize(t, output) + return output + + +def cast(t: _nvte.Tensor, out_dtype: _nvte.DType): + assert t.dtype != out_dtype + if is_fp8(t): + assert not is_fp8(out_dtype) + + if is_fp8(out_dtype): + return _fp8_quantize(t, out_dtype) + elif is_fp8(t): + return _fp8_dequantize(t, out_dtype) + else: + output = empty(t.shape, out_dtype) + output.data.copy_(t.data) + return output + + +def cast_checked(t: _nvte.Tensor, out_dtype: _nvte.DType | None): + if out_dtype is None or t.dtype == out_dtype: + return t + else: + return cast(t, out_dtype) + + +@torch_op +def transpose(t: _nvte.Tensor) -> _nvte.Tensor: + output = empty(t.shape[::-1], t.dtype) + _nvte.transpose(t, output) + return output + + +@torch_op +def cast_transpose( + t: _nvte.Tensor, out_dtype: _nvte.DType +) -> tuple[_nvte.Tensor, _nvte.Tensor]: + assert t.dtype != out_dtype + if is_fp8(t): + assert not is_fp8(out_dtype) + + out_cast, out_transpose = multi_empty_share_metadata( + (t.shape, out_dtype), (t.shape[::-1], out_dtype) + ) + + _nvte.cast_transpose(t, out_cast, out_transpose) + return out_cast, out_transpose + + +def cast_transpose_checked(t: _nvte.Tensor, out_dtype: _nvte.DType | None): + if out_dtype is None or t.dtype == out_dtype: + return t, transpose(t) + else: + return cast_transpose(t, out_dtype) + + +def multi_cast_transpose( + *desc: tuple[_nvte.Tensor, _nvte.DType] +) -> list[tuple[_nvte.Tensor, _nvte.Tensor]]: + outs = 
[ + multi_empty_share_metadata((t.shape, dtype), (t.shape[::-1], dtype)) + for t, dtype in desc + ] + out_cast_list, out_transpose_list = zip(*outs) + input_list, _ = zip(*desc) + _nvte.multi_cast_transpose( + input_list, out_cast_list, out_transpose_list # type: ignore + ) + return outs + + +def multi_cast_transpose_checked(*desc: tuple[_nvte.Tensor, _nvte.DType | None]): + transpose_results: list[tuple[_nvte.Tensor, _nvte.Tensor] | None] = [] + to_cast_transpose: list[tuple[_nvte.Tensor, _nvte.DType]] = [] + for t, dtype in desc: + if dtype is None or t.dtype == dtype: + transpose_results.append((t, transpose(t))) + else: + to_cast_transpose.append((t, dtype)) + transpose_results.append(None) + cast_transpose_results = ( + multi_cast_transpose(*to_cast_transpose) if to_cast_transpose else [] + ) + results: list[tuple[_nvte.Tensor, _nvte.Tensor]] = [] + i = 0 + for result in transpose_results: + if result is None: + results.append(cast_transpose_results[i]) + i += 1 + else: + results.append(result) + return results diff --git a/transformer_engine/pytorch/sequential/nvte/cpp_extensions/__init__.py b/transformer_engine/pytorch/sequential/nvte/cpp_extensions/__init__.py new file mode 100644 index 0000000000..36f213a655 --- /dev/null +++ b/transformer_engine/pytorch/sequential/nvte/cpp_extensions/__init__.py @@ -0,0 +1,163 @@ +from __future__ import annotations +from typing import TYPE_CHECKING +import torch +from .dynamic_load import inject_real + +inject_real(globals()) + +from .all_fp8_values import ALL_FP8E4M3_VALUES, ALL_FP8E5M2_VALUES + +if TYPE_CHECKING: + from . import * # type: ignore + + +class Tensor: + __raw: RawTensor + dtype: DType + shape: list[int] + data: torch.Tensor + amax: torch.Tensor + scale: torch.Tensor + scale_inv: torch.Tensor + + def __init__( + self, + data: torch.Tensor, + amax: torch.Tensor, + scale: torch.Tensor, + scale_inv: torch.Tensor, + /, + *, + dtype_override: DType | None = None, + ) -> None: + if dtype_override is not None: + self.dtype = dtype_override + else: + self.dtype = torch_to_te_dtype(data.dtype) + self.shape = list(data.shape) + self.data = data + self.amax = amax + self.scale = scale + self.scale_inv = scale_inv + self._raw = RawTensor( + self.data.data_ptr(), + self.shape, + getattr(DType, "__orig_type__")(self.dtype.value), + self.amax.data_ptr(), + self.scale.data_ptr(), + self.scale_inv.data_ptr(), + ) + + def query_shape_dtype(self): + self.dtype = DType(self._raw.dtype.value) + self.shape = list(self._raw.shape) + return self + + def data_ptr(self): + return self.data.data_ptr() + + def __repr__(self): + if self.dtype == DType.Float8E4M3 or DType.Float8E5M2: + conv_table = ( + torch.tensor(ALL_FP8E4M3_VALUES, device="cpu") + if self.dtype == DType.Float8E4M3 + else torch.tensor(ALL_FP8E5M2_VALUES, device="cpu") + ) + fp32_values = conv_table[self.data.cpu().int()] + data_repr = repr(fp32_values) + else: + data_repr = repr(self.data) + data_repr = data_repr[::-1][data_repr[::-1].find("]") :][::-1] + data_repr = "T" + data_repr[1:] + return f"""\ +{data_repr}, + dtype={dtype_name(self.dtype)},\ +amax={self.amax[0].item() if self.amax.numel() else None},\ +scale={self.scale.item() if self.scale.numel() else None},\ +scale_inv={self.scale_inv.item() if self.scale_inv.numel() else None}\ +)""" + + +def te_to_torch_dtype(dtype: DType): + match dtype: + case DType.Byte: + return torch.int8 + case DType.Int32: + return torch.int32 + case DType.Int64: + return torch.int64 + case DType.Float32: + return torch.float32 + case DType.Float16: + 
return torch.float16 + case DType.BFloat16: + return torch.bfloat16 + # Using different types for fp8e4m3 and fp8e5m2 + # allows for a type conversion in the other way + case DType.Float8E4M3: + return torch.int8 + case DType.Float8E5M2: + return torch.uint8 + + +def torch_to_te_dtype(dtype: torch.dtype): + match dtype: + case torch.int32: + return DType.Int32 + case torch.int64: + return DType.Int64 + case torch.float32: + return DType.Float32 + case torch.float16: + return DType.Float16 + case torch.bfloat16: + return DType.BFloat16 + case torch.int8: + # We assume that this is not a workspace (Byte) + # tensor, as these shouldn't be exposed outside + # of basic operations. + return DType.Float8E4M3 + case torch.uint8: + return DType.Float8E5M2 + case _: + raise ValueError(f"Unsupported dtype: {dtype}") + + +def bit_width(dtype: DType): + match dtype: + case DType.Byte: + return 8 + case DType.Int32: + return 32 + case DType.Int64: + return 64 + case DType.Float32: + return 32 + case DType.Float16: + return 16 + case DType.BFloat16: + return 16 + case DType.Float8E4M3: + return 8 + case DType.Float8E5M2: + return 8 + + +def dtype_name(dtype: DType): + match dtype: + case DType.Byte: + return "byte" + case DType.Int32: + return "int32" + case DType.Int64: + return "int64" + case DType.Float32: + return "fp32" + case DType.Float16: + return "fp16" + case DType.BFloat16: + return "bf16" + case DType.Float8E4M3: + return "fp8e4m3" + case DType.Float8E5M2: + return "fp8e5m2" diff --git a/transformer_engine/pytorch/sequential/nvte/cpp_extensions/__init__.pyi b/transformer_engine/pytorch/sequential/nvte/cpp_extensions/__init__.pyi new file mode 100644 index 0000000000..9bc1a7a1db --- /dev/null +++ b/transformer_engine/pytorch/sequential/nvte/cpp_extensions/__init__.pyi @@ -0,0 +1,106 @@ +from __future__ import annotations +import torch +from enum import Enum +from typing import Sequence, TYPE_CHECKING +from typing_extensions import Self + +class QKVLayout(Enum): + NOT_INTERLEAVED = 0 + QKV_INTERLEAVED = 1 + KV_INTERLEAVED = 2 + +class BiasType(Enum): + NO_BIAS = 0 + PRE_SCALE_BIAS = 1 + POST_SCALE_BIAS = 2 + +class MaskType(Enum): + NO_MASK = 0 + PADDING_MASK = 1 + CAUSAL_MASK = 2 + +class FusedAttnBackend(Enum): + No_Backend = -1 + F16_max512_seqlen = 0 + F16_arbitrary_seqlen = 1 + FP8 = 2 + +class DType(Enum): + Byte = 0 + Int32 = 1 + Int64 = 2 + Float32 = 3 + Float16 = 4 + BFloat16 = 5 + Float8E4M3 = 6 + Float8E5M2 = 7 + +class RawTensor: + dtype: DType + shape: Sequence[int] + def data_ptr(self) -> int: ... + def amax_ptr(self) -> int: ... + def scale_ptr(self) -> int: ... + def scale_inv_ptr(self) -> int: ... + def __init__(self, data_ptr: int, shape: Sequence[int], dtype: DType, amax_ptr: int, scale_ptr: int, scale_inv_ptr: int) -> None: ... + +# Expose names defined in real __init__.py +# Which are not to be imported from transformer_engine_cuda +if TYPE_CHECKING: + class Tensor: + dtype: DType + shape: Sequence[int] + data: torch.Tensor + amax: torch.Tensor + scale: torch.Tensor + scale_inv: torch.Tensor + def __init__(self, data: torch.Tensor, amax: torch.Tensor, scale: torch.Tensor, scale_inv: torch.Tensor, *, dtype_override: DType | None = None,) -> None: ... + def data_ptr(self) -> int: ... + def query_shape_dtype(self) -> Self: ... + + + def te_to_torch_dtype(dtype: DType) -> torch.dtype: ... + def torch_to_te_dtype(dtype: torch.dtype) -> DType: ... + def bit_width(dtype: DType) -> int: ... + def dtype_name(dtype: DType) -> str: ... 
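(Editorial sketch, not part of the patch.) A quick round trip through the dtype helpers defined in cpp_extensions/__init__.py above; it assumes the compiled transformer_engine_cuda extension loads successfully and a CUDA device is available:

import torch
from transformer_engine.pytorch.sequential.nvte import cpp_extensions as cppx

t = torch.randn(16, 32, device="cuda")
meta = torch.Tensor()                        # empty amax/scale/scale_inv for non-FP8 data
nt = cppx.Tensor(t, meta, meta, meta)
assert nt.dtype == cppx.DType.Float32        # inferred via torch_to_te_dtype
assert cppx.te_to_torch_dtype(nt.dtype) == torch.float32
assert cppx.bit_width(nt.dtype) == 32
assert cppx.dtype_name(nt.dtype) == "fp32"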
+ +def gelu(input: Tensor, output: Tensor) -> None: ... +def dgelu(grad: Tensor, input: Tensor, output: Tensor) -> None: ... +def geglu(input: Tensor, output: Tensor) -> None: ... +def dgeglu(grad: Tensor, input: Tensor, output: Tensor) -> None: ... +def relu(input: Tensor, output: Tensor) -> None: ... +def drelu(grad: Tensor, input: Tensor, output: Tensor) -> None: ... +def swiglu(input: Tensor, output: Tensor) -> None: ... +def dswiglu(grad: Tensor, input: Tensor, output: Tensor) -> None: ... +def reglu(input: Tensor, output: Tensor) -> None: ... +def dreglu(grad: Tensor, input: Tensor, output: Tensor) -> None: ... +def fp8_quantize(input: Tensor, output: Tensor) -> None: ... +def fp8_dequantize(input: Tensor, output: Tensor) -> None: ... +def get_fused_attn_backend(q_dtype: DType, kv_dtype: DType, qkv_layout: QKVLayout, bias_type: BiasType, attn_mask_type: MaskType, dropout: float, max_seqlen_q: int, max_seqlen_kv: int, head_dim: int) -> FusedAttnBackend: ... +def fused_attn_fwd_qkvpacked(QKV: Tensor, Bias: Tensor, S: Tensor, O: Tensor, Aux_CTX_Tensors: Sequence[Tensor], cu_seqlens: Tensor, rng_state: Tensor, max_seqlen: int, is_training: bool, attn_scale: float, dropout: float, qkv_layout: QKVLayout, bias_type: BiasType, attn_mask_type: MaskType, workspace: Tensor) -> None: ... +def fused_attn_bwd_qkvpacked(QKV: Tensor, O: Tensor, dO: Tensor, S: Tensor, dP: Tensor, Aux_CTX_Tensors: Sequence[Tensor], dQKV: Tensor, dBias: Tensor, cu_seqlens: Tensor, max_seqlen: int, attn_scale: float, dropout: float, qkv_layout: QKVLayout, bias_type: BiasType, attn_mask_type: MaskType, workspace: Tensor) -> None: ... +def fused_attn_fwd_kvpacked(Q: Tensor, KV: Tensor, Bias: Tensor, S: Tensor, O: Tensor, Aux_CTX_Tensors: Sequence[Tensor], cu_seqlens_q: Tensor, cu_seqlens_kv: Tensor, rng_state: Tensor, max_seqlen_q: int, max_seqlen_kv: int, is_training: bool, attn_scale: float, dropout: float, qkv_layout: QKVLayout, bias_type: BiasType, attn_mask_type: MaskType, workspace: Tensor) -> None: ... +def fused_attn_bwd_kvpacked(Q: Tensor, KV: Tensor, O: Tensor, dO: Tensor, S: Tensor, dP: Tensor, Aux_CTX_Tensors: Sequence[Tensor], dQ: Tensor, dKV: Tensor, dBias: Tensor, cu_seqlens_q: Tensor, cu_seqlens_kv: Tensor, max_seqlen_q: int, max_seqlen_kv: int, attn_scale: float, dropout: float, qkv_layout: QKVLayout, bias_type: BiasType, attn_mask_type: MaskType, workspace: Tensor) -> None: ... +def cublas_gemm(A: Tensor, B: Tensor, D: Tensor, bias: Tensor, pre_gelu_out: Tensor, transa: bool, transb: bool, grad: bool, workspace: Tensor, accumulate: bool, use_split_accumulator: bool, math_sm_count: int) -> None: ... +def layernorm_fwd(x: Tensor, gamma: Tensor, beta: Tensor, epsilon: float, z: Tensor, mu: Tensor, rsigma: Tensor, multiprocessorCount: int, workspace: Tensor, barrier: Tensor) -> None: ... +def layernorm1p_fwd(x: Tensor, gamma: Tensor, beta: Tensor, epsilon: float, z: Tensor, mu: Tensor, rsigma: Tensor, multiprocessorCount: int, workspace: Tensor, barrier: Tensor) -> None: ... +def layernorm_bwd(dz: Tensor, x: Tensor, mu: Tensor, rsigma: Tensor, gamma: Tensor, dx: Tensor, dgamma: Tensor, dbeta: Tensor, dgamma_part: Tensor, dbeta_part: Tensor, multiprocessorCount: int, workspace: Tensor, barrier: Tensor) -> None: ... +def layernorm1p_bwd(dz: Tensor, x: Tensor, mu: Tensor, rsigma: Tensor, gamma: Tensor, dx: Tensor, dgamma: Tensor, dbeta: Tensor, dgamma_part: Tensor, dbeta_part: Tensor, multiprocessorCount: int, workspace: Tensor, barrier: Tensor) -> None: ... 
+def rmsnorm_fwd(x: Tensor, gamma: Tensor, epsilon: float, z: Tensor, rsigma: Tensor, multiprocessorCount: int, workspace: Tensor, barrier: Tensor) -> None: ... +def rmsnorm_bwd(dz: Tensor, x: Tensor, rsigma: Tensor, gamma: Tensor, dx: Tensor, dgamma: Tensor, dgamma_part: Tensor, multiprocessorCount: int, workspace: Tensor, barrier: Tensor) -> None: ... +def scaled_softmax_forward(input: Tensor, softmax_results: Tensor, scale_factor: float) -> None: ... +def scaled_softmax_backward(incoming_grads: Tensor, softmax_results: Tensor, output_grads: Tensor, scale_factor: float) -> None: ... +def scaled_masked_softmax_forward(input: Tensor, mask: Tensor, softmax_results: Tensor, scale_factor: float) -> None: ... +def scaled_masked_softmax_backward(incoming_grads: Tensor, softmax_results: Tensor, output_grads: Tensor, scale_factor: float) -> None: ... +def scaled_upper_triang_masked_softmax_forward(input: Tensor, softmax_results: Tensor, scale_factor: float) -> None: ... +def scaled_upper_triang_masked_softmax_backward(incoming_grads: Tensor, softmax_results: Tensor, output_grads: Tensor, scale_factor: float) -> None: ... +def cast_transpose(input: Tensor, cast_output: Tensor, transposed_output: Tensor) -> None: ... +def transpose(input: Tensor, transposed_output: Tensor) -> None: ... +def cast_transpose_dbias(input: Tensor, cast_output: Tensor, transposed_output: Tensor, dbias: Tensor, workspace: Tensor) -> None: ... +def fp8_transpose_dbias(input: Tensor, transposed_output: Tensor, dbias: Tensor, workspace: Tensor) -> None: ... +def cast_transpose_dbias_dgelu(input: Tensor, gelu_input: Tensor, cast_output: Tensor, transposed_output: Tensor, dbias: Tensor, workspace: Tensor) -> None: ... +def dgeglu_cast_transpose(input: Tensor, geglu_input: Tensor, cast_output: Tensor, transposed_output: Tensor) -> None: ... +def multi_cast_transpose(input_list: Sequence[Tensor], cast_output_list: Sequence[Tensor], transposed_output_list: Sequence[Tensor]) -> None: ... + +# Don't export these names (this stub file gets loaded as a real python module) +del annotations, torch, Enum, Sequence, TYPE_CHECKING, Self # type: ignore \ No newline at end of file diff --git a/transformer_engine/pytorch/sequential/nvte/cpp_extensions/all_fp8_values.py b/transformer_engine/pytorch/sequential/nvte/cpp_extensions/all_fp8_values.py new file mode 100644 index 0000000000..777b731960 --- /dev/null +++ b/transformer_engine/pytorch/sequential/nvte/cpp_extensions/all_fp8_values.py @@ -0,0 +1,72 @@ +# fmt: off +nan = float("nan") +inf = float("inf") +ALL_FP8E4M3_VALUES = [ + 0. , 0.001953125, 0.00390625 , 0.005859375, 0.0078125 , 0.009765625, 0.01171875 , 0.013671875, + 0.015625 , 0.017578125, 0.01953125 , 0.021484375, 0.0234375 , 0.025390625, 0.02734375 , 0.029296875, + 0.03125 , 0.03515625 , 0.0390625 , 0.04296875 , 0.046875 , 0.05078125 , 0.0546875 , 0.05859375 , + 0.0625 , 0.0703125 , 0.078125 , 0.0859375 , 0.09375 , 0.1015625 , 0.109375 , 0.1171875 , + 0.125 , 0.140625 , 0.15625 , 0.171875 , 0.1875 , 0.203125 , 0.21875 , 0.234375 , + 0.25 , 0.28125 , 0.3125 , 0.34375 , 0.375 , 0.40625 , 0.4375 , 0.46875 , + 0.5 , 0.5625 , 0.625 , 0.6875 , 0.75 , 0.8125 , 0.875 , 0.9375 , + 1. , 1.125 , 1.25 , 1.375 , 1.5 , 1.625 , 1.75 , 1.875 , + 2. , 2.25 , 2.5 , 2.75 , 3. , 3.25 , 3.5 , 3.75 , + 4. , 4.5 , 5. , 5.5 , 6. , 6.5 , 7. , 7.5 , + 8. , 9. , 10. , 11. , 12. , 13. , 14. , 15. , + 16. , 18. , 20. , 22. , 24. , 26. , 28. , 30. , + 32. , 36. , 40. , 44. , 48. , 52. , 56. , 60. , + 64. , 72. , 80. , 88. , 96. , 104. , 112. , 120. , + 128. 
, 144. , 160. , 176. , 192. , 208. , 224. , 240. , + 256. , 288. , 320. , 352. , 384. , 416. , 448. , nan , + -0. , -0.001953125, -0.00390625 , -0.005859375, -0.0078125 , -0.009765625, -0.01171875 , -0.013671875, + -0.015625 , -0.017578125, -0.01953125 , -0.021484375, -0.0234375 , -0.025390625, -0.02734375 , -0.029296875, + -0.03125 , -0.03515625 , -0.0390625 , -0.04296875 , -0.046875 , -0.05078125 , -0.0546875 , -0.05859375 , + -0.0625 , -0.0703125 , -0.078125 , -0.0859375 , -0.09375 , -0.1015625 , -0.109375 , -0.1171875 , + -0.125 , -0.140625 , -0.15625 , -0.171875 , -0.1875 , -0.203125 , -0.21875 , -0.234375 , + -0.25 , -0.28125 , -0.3125 , -0.34375 , -0.375 , -0.40625 , -0.4375 , -0.46875 , + -0.5 , -0.5625 , -0.625 , -0.6875 , -0.75 , -0.8125 , -0.875 , -0.9375 , + -1. , -1.125 , -1.25 , -1.375 , -1.5 , -1.625 , -1.75 , -1.875 , + -2. , -2.25 , -2.5 , -2.75 , -3. , -3.25 , -3.5 , -3.75 , + -4. , -4.5 , -5. , -5.5 , -6. , -6.5 , -7. , -7.5 , + -8. , -9. , -10. , -11. , -12. , -13. , -14. , -15. , + -16. , -18. , -20. , -22. , -24. , -26. , -28. , -30. , + -32. , -36. , -40. , -44. , -48. , -52. , -56. , -60. , + -64. , -72. , -80. , -88. , -96. , -104. , -112. , -120. , +-128. , -144. , -160. , -176. , -192. , -208. , -224. , -240. , +-256. , -288. , -320. , -352. , -384. , -416. , -448. , nan , +] + +ALL_FP8E5M2_VALUES = [ + 0. , 0.0000152587890625, 0.000030517578125 , 0.0000457763671875, 0.00006103515625 , 0.0000762939453125, 0.000091552734375 , 0.0001068115234375, + 0.0001220703125 , 0.000152587890625 , 0.00018310546875 , 0.000213623046875 , 0.000244140625 , 0.00030517578125 , 0.0003662109375 , 0.00042724609375 , + 0.00048828125 , 0.0006103515625 , 0.000732421875 , 0.0008544921875 , 0.0009765625 , 0.001220703125 , 0.00146484375 , 0.001708984375 , + 0.001953125 , 0.00244140625 , 0.0029296875 , 0.00341796875 , 0.00390625 , 0.0048828125 , 0.005859375 , 0.0068359375 , + 0.0078125 , 0.009765625 , 0.01171875 , 0.013671875 , 0.015625 , 0.01953125 , 0.0234375 , 0.02734375 , + 0.03125 , 0.0390625 , 0.046875 , 0.0546875 , 0.0625 , 0.078125 , 0.09375 , 0.109375 , + 0.125 , 0.15625 , 0.1875 , 0.21875 , 0.25 , 0.3125 , 0.375 , 0.4375 , + 0.5 , 0.625 , 0.75 , 0.875 , 1. , 1.25 , 1.5 , 1.75 , + 2. , 2.5 , 3. , 3.5 , 4. , 5. , 6. , 7. , + 8. , 10. , 12. , 14. , 16. , 20. , 24. , 28. , + 32. , 40. , 48. , 56. , 64. , 80. , 96. , 112. , + 128. , 160. , 192. , 224. , 256. , 320. , 384. , 448. , + 512. , 640. , 768. , 896. , 1024. , 1280. , 1536. , 1792. , + 2048. , 2560. , 3072. , 3584. , 4096. , 5120. , 6144. , 7168. , + 8192. , 10240. , 12288. , 14336. , 16384. , 20480. , 24576. , 28672. , + 32768. , 40960. , 49152. , 57344. , inf , nan , nan , nan , + -0. 
, -0.0000152587890625, -0.000030517578125 , -0.0000457763671875, -0.00006103515625 , -0.0000762939453125, -0.000091552734375 , -0.0001068115234375,
+   -0.0001220703125 , -0.000152587890625 , -0.00018310546875 , -0.000213623046875 , -0.000244140625 , -0.00030517578125 , -0.0003662109375 , -0.00042724609375 ,
+   -0.00048828125 , -0.0006103515625 , -0.000732421875 , -0.0008544921875 , -0.0009765625 , -0.001220703125 , -0.00146484375 , -0.001708984375 ,
+   -0.001953125 , -0.00244140625 , -0.0029296875 , -0.00341796875 , -0.00390625 , -0.0048828125 , -0.005859375 , -0.0068359375 ,
+   -0.0078125 , -0.009765625 , -0.01171875 , -0.013671875 , -0.015625 , -0.01953125 , -0.0234375 , -0.02734375 ,
+   -0.03125 , -0.0390625 , -0.046875 , -0.0546875 , -0.0625 , -0.078125 , -0.09375 , -0.109375 ,
+   -0.125 , -0.15625 , -0.1875 , -0.21875 , -0.25 , -0.3125 , -0.375 , -0.4375 ,
+   -0.5 , -0.625 , -0.75 , -0.875 , -1. , -1.25 , -1.5 , -1.75 ,
+   -2. , -2.5 , -3. , -3.5 , -4. , -5. , -6. , -7. ,
+   -8. , -10. , -12. , -14. , -16. , -20. , -24. , -28. ,
+   -32. , -40. , -48. , -56. , -64. , -80. , -96. , -112. ,
+   -128. , -160. , -192. , -224. , -256. , -320. , -384. , -448. ,
+   -512. , -640. , -768. , -896. , -1024. , -1280. , -1536. , -1792. ,
+   -2048. , -2560. , -3072. , -3584. , -4096. , -5120. , -6144. , -7168. ,
+   -8192. , -10240. , -12288. , -14336. , -16384. , -20480. , -24576. , -28672. ,
+   -32768. , -40960. , -49152. , -57344. , -inf , nan , nan , nan ,
+]
diff --git a/transformer_engine/pytorch/sequential/nvte/cpp_extensions/dynamic_load.py b/transformer_engine/pytorch/sequential/nvte/cpp_extensions/dynamic_load.py
new file mode 100644
index 0000000000..b468e78972
--- /dev/null
+++ b/transformer_engine/pytorch/sequential/nvte/cpp_extensions/dynamic_load.py
@@ -0,0 +1,59 @@
+from enum import Enum
+import functools
+import inspect
+from typing import Any, Callable, TypeVar
+from ...utils import import_file_as_module
+import torch
+import transformer_engine_cuda  # type: ignore
+
+_T1 = TypeVar("_T1")
+_T2 = TypeVar("_T2")
+
+
+def _to_dict(l: list[tuple[_T1, _T2]], /) -> dict[_T1, _T2]:
+    return {t[0]: t[1] for t in l}
+
+
+def _wrap_function(real_func: Callable[..., Any]):
+    @functools.wraps(real_func)
+    def wrapper(*args: Any):
+        real_args: list[Any] = []
+        for arg in args:
+            if arg.__class__.__name__ == "Tensor":
+                real_args.append(arg._raw)
+            elif isinstance(arg, Enum):
+                real_args.append(getattr(type(arg), "__orig_type__")(arg.value))
+            else:
+                real_args.append(arg)
+        return real_func(*real_args, torch.cuda.current_stream().cuda_stream)
+
+    return wrapper
+
+
+def inject_real(namespace: dict[str, Any]):
+    stub = import_file_as_module("__init__.pyi")
+    real = transformer_engine_cuda
+
+    stub_functions = _to_dict(inspect.getmembers(stub, inspect.isfunction))
+    real_functions = _to_dict(inspect.getmembers(real, inspect.isroutine))
+
+    for func_name, _ in stub_functions.items():
+        if func_name not in real_functions:
+            raise RuntimeError(
+                f"Function {func_name} declared in {stub} not found in {real}"
+            )
+        namespace[func_name] = _wrap_function(real_functions[func_name])
+
+    stub_types = _to_dict(inspect.getmembers(stub, inspect.isclass))
+    real_types = _to_dict(inspect.getmembers(real, inspect.isclass))
+
+    for type_name, type_obj in stub_types.items():
+        if type_name not in real_types:
+            raise RuntimeError(
+                f"Type {type_name} declared in {stub} not found in {real}"
+            )
+        if issubclass(type_obj, Enum):
+            setattr(type_obj, "__orig_type__", real_types[type_name])
+            namespace[type_name] = type_obj
+        else:
+
namespace[type_name] = real_types[type_name] diff --git a/transformer_engine/pytorch/sequential/nvte/cpp_extensions/py.typed b/transformer_engine/pytorch/sequential/nvte/cpp_extensions/py.typed new file mode 100644 index 0000000000..e69de29bb2 diff --git a/transformer_engine/pytorch/sequential/nvte/cppsrc/pybind.cpp b/transformer_engine/pytorch/sequential/nvte/cppsrc/pybind.cpp new file mode 100644 index 0000000000..65a4a5b5f5 --- /dev/null +++ b/transformer_engine/pytorch/sequential/nvte/cppsrc/pybind.cpp @@ -0,0 +1,312 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. + * + * See LICENSE for license information. + ************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "type_list.h" + +void cuda_check() { + static const bool perform_check = []() { + const char *var = std::getenv("CUDA_LAUNCH_BLOCKING"); + if (var && var[0] == '1') { + return true; + } + return false; + }(); + + if (perform_check) { + cudaDeviceSynchronize(); + auto err = cudaGetLastError(); + if (err != cudaSuccess) { + throw std::runtime_error( + "TE kernel error: " + std::string(cudaGetErrorName(err)) + ": " + + cudaGetErrorString(err)); + } + } +} + +// ----------- Wrapper for NVTETensor ----------- +class Tensor { + static_assert(std::is_same_v); + std::shared_ptr tensor; + + static void destroy(void *tensor) { + if (tensor) + nvte_destroy_tensor(tensor); + } + +public: + Tensor() : tensor{nullptr, destroy} {} + Tensor(size_t data, const std::vector &shape, NVTEDType dtype, + size_t amax, size_t scale, size_t scale_inv) + : tensor{nvte_create_tensor(reinterpret_cast(data), + NVTEShape{shape.data(), shape.size()}, dtype, + reinterpret_cast(amax), + reinterpret_cast(scale), + reinterpret_cast(scale_inv)), + destroy} {} + Tensor(const Tensor &other) = default; + Tensor(Tensor &&other) = default; + Tensor &operator=(const Tensor &other) = default; + Tensor &operator=(Tensor &&other) = default; + operator NVTETensor() const { return tensor.get(); } + NVTEDType dtype() const { return nvte_tensor_type(tensor.get()); } + auto shape() const { + const auto shape_ = nvte_tensor_shape(tensor.get()); + return std::vector(shape_.data, shape_.data + shape_.ndim); + } + size_t data_ptr() const { + return reinterpret_cast(nvte_tensor_data(tensor.get())); + } + size_t amax_ptr() const { + return reinterpret_cast(nvte_tensor_amax(tensor.get())); + } + size_t scale_ptr() const { + return reinterpret_cast(nvte_tensor_scale(tensor.get())); + } + size_t scale_inv_ptr() const { + return reinterpret_cast(nvte_tensor_scale_inv(tensor.get())); + } +}; + +// ----------- Wrapper for NVTETensorPack ----------- +struct TensorPack : NVTETensorPack { + TensorPack(const std::vector &tensors_) : NVTETensorPack{} { + size = tensors_.size(); + if (size > MAX_SIZE) { + throw std::runtime_error("TensorPack size exceeds MAX_SIZE"); + } + for (size_t i = 0; i < size; ++i) { + tensors[i] = static_cast(tensors_[i]); + } + nvte_tensor_pack_create(this); + } + operator NVTETensorPack *() { return this; } + operator const NVTETensorPack *() const { return this; } + ~TensorPack() { nvte_tensor_pack_destroy(this); } +}; + +// ----------- Function substitution template machinery ----------- +template struct 
exposed_type { + using type = T; +}; + +template struct wrapped; +template struct wrapped : exposed_type { + static T wrap(T arg) { return arg; } + static T unwrap(T arg) { return arg; } +}; +template <> struct wrapped : exposed_type { + // Intentionally left blank + // ie. this should never be used + // because an argument cannot have + // void type, while conversion + // should be skipped for void return type. +}; +template <> struct wrapped : exposed_type { + static NVTETensor unwrap(Tensor arg) { return static_cast(arg); } +}; +template <> +struct wrapped : exposed_type> { + static TensorPack unwrap(const std::vector &arg) { + return TensorPack(arg); + } +}; +template <> +struct wrapped : exposed_type> { + static TensorPack unwrap(const std::vector &arg) { + return TensorPack(arg); + } +}; +template <> struct wrapped : exposed_type> { + static std::vector wrap(NVTEShape arg) { + return std::vector(arg.data, arg.data + arg.ndim); + } + static NVTEShape unwrap(const std::vector &arg) { + NVTEShape shape{}; + shape.ndim = arg.size(); + shape.data = arg.data(); + return shape; + } +}; + +template using wrapped_t = typename wrapped::type; +struct at_scope_exit { + void (*ptr)(); + ~at_scope_exit() { ptr(); } +}; + +// Makes the cuda stream argument always be the last argument +template +constexpr auto cuda_stream_arg_helper(Ret(func)(Args...), + type_list, + type_list) noexcept { + return [func](wrapped_t... prefixArgs, + wrapped_t... suffixArgs, + size_t stream) -> wrapped_t { + at_scope_exit _{cuda_check}; + if constexpr (!std::is_same_v) { + return wrapped::wrap( + func(wrapped::unwrap(prefixArgs)..., + reinterpret_cast(stream), + wrapped::unwrap(suffixArgs)...)); + } else { + return func(wrapped::unwrap(prefixArgs)..., + reinterpret_cast(stream), + wrapped::unwrap(suffixArgs)...); + } + }; +} + +template +constexpr auto wrap(Ret(func)(Args...)) noexcept { + using tl = type_list; + if constexpr (tl::template contains) { + constexpr size_t stream_arg_idx = tl::template find; + using prefix = typename tl::template pop_back; + using suffix = typename tl::template pop_front; + return cuda_stream_arg_helper(func, prefix(), suffix()); + } else { + return [func](wrapped_t... 
args) -> wrapped_t { + at_scope_exit _{cuda_check}; + if constexpr (!std::is_same_v) { + return wrapped::wrap(func(wrapped::unwrap(args)...)); + } else { + return func(wrapped::unwrap(args)...); + } + }; + } +} + +// Manual wrapper around nvte_multi_cast_transpose +void multi_cast_transpose(const std::vector &inputs, + const std::vector &cast_outs, + const std::vector &transposed_outs, + size_t stream) { + auto count = inputs.size(); + std::vector inputs_(count); + std::vector cast_outs_(count); + std::vector transposed_outs_(count); + for (int i = 0; i < inputs.size(); ++i) { + inputs_[i] = static_cast(inputs[i]); + cast_outs_[i] = static_cast(cast_outs[i]); + transposed_outs_[i] = static_cast(transposed_outs[i]); + } + nvte_multi_cast_transpose(count, inputs_.data(), cast_outs_.data(), + transposed_outs_.data(), + reinterpret_cast(stream)); + + cuda_check(); +} + +// ----------- Registration of module ----------- +namespace py = pybind11; +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + py::enum_(m, "DType", py::module_local()) + .value("Byte", kNVTEByte) + .value("Int32", kNVTEInt32) + .value("Int64", kNVTEInt64) + .value("Float32", kNVTEFloat32) + .value("Float16", kNVTEFloat16) + .value("BFloat16", kNVTEBFloat16) + .value("Float8E4M3", kNVTEFloat8E4M3) + .value("Float8E5M2", kNVTEFloat8E5M2); + + py::enum_(m, "FusedAttnBackend", py::module_local()) + .value("No_Backend", NVTE_No_Backend) + .value("F16_max512_seqlen", NVTE_F16_max512_seqlen) + .value("F16_arbitrary_seqlen", NVTE_F16_arbitrary_seqlen) + .value("FP8", NVTE_FP8); + + py::enum_(m, "QKVLayout", py::module_local()) + .value("NOT_INTERLEAVED", NVTE_NOT_INTERLEAVED) + .value("QKV_INTERLEAVED", NVTE_QKV_INTERLEAVED) + .value("KV_INTERLEAVED", NVTE_KV_INTERLEAVED); + + py::enum_(m, "BiasType", py::module_local()) + .value("NO_BIAS", NVTE_NO_BIAS) + .value("PRE_SCALE_BIAS", NVTE_PRE_SCALE_BIAS) + .value("POST_SCALE_BIAS", NVTE_POST_SCALE_BIAS); + + py::enum_(m, "MaskType", py::module_local()) + .value("NO_MASK", NVTE_NO_MASK) + .value("PADDING_MASK", NVTE_PADDING_MASK) + .value("CAUSAL_MASK", NVTE_CAUSAL_MASK); + + py::class_(m, "RawTensor", py::module_local()) + .def(py::init &, NVTEDType, size_t, + size_t, size_t>()) + .def_property_readonly("dtype", &Tensor::dtype) + .def_property_readonly("shape", &Tensor::shape) + .def("data_ptr", &Tensor::data_ptr) + .def("amax_ptr", &Tensor::amax_ptr) + .def("scale_ptr", &Tensor::scale_ptr) + .def("scale_inv_ptr", &Tensor::scale_inv_ptr); + + m.def("gelu", wrap(nvte_gelu)); + m.def("dgelu", wrap(nvte_dgelu)); + m.def("geglu", wrap(nvte_geglu)); + m.def("dgeglu", wrap(nvte_dgeglu)); + m.def("relu", wrap(nvte_relu)); + m.def("drelu", wrap(nvte_drelu)); + m.def("swiglu", wrap(nvte_swiglu)); + m.def("dswiglu", wrap(nvte_dswiglu)); + m.def("reglu", wrap(nvte_reglu)); + m.def("dreglu", wrap(nvte_dreglu)); + m.def("fp8_quantize", wrap(nvte_fp8_quantize)); + m.def("fp8_dequantize", wrap(nvte_fp8_dequantize)); + m.def("get_fused_attn_backend", wrap(nvte_get_fused_attn_backend)); + m.def("fused_attn_fwd_qkvpacked", wrap(nvte_fused_attn_fwd_qkvpacked)); + m.def("fused_attn_bwd_qkvpacked", wrap(nvte_fused_attn_bwd_qkvpacked)); + m.def("fused_attn_fwd_kvpacked", wrap(nvte_fused_attn_fwd_kvpacked)); + m.def("fused_attn_bwd_kvpacked", wrap(nvte_fused_attn_bwd_kvpacked)); + m.def("cublas_gemm", wrap(nvte_cublas_gemm)); + m.def("layernorm_fwd", wrap(nvte_layernorm_fwd)); + m.def("layernorm1p_fwd", wrap(nvte_layernorm1p_fwd)); + m.def("layernorm_bwd", wrap(nvte_layernorm_bwd)); + m.def("layernorm1p_bwd", 
wrap(nvte_layernorm1p_bwd)); + m.def("rmsnorm_fwd", wrap(nvte_rmsnorm_fwd)); + m.def("rmsnorm_bwd", wrap(nvte_rmsnorm_bwd)); + m.def("scaled_softmax_forward", wrap(nvte_scaled_softmax_forward)); + m.def("scaled_softmax_backward", wrap(nvte_scaled_softmax_backward)); + m.def("scaled_masked_softmax_forward", + wrap(nvte_scaled_masked_softmax_forward)); + m.def("scaled_masked_softmax_backward", + wrap(nvte_scaled_masked_softmax_backward)); + m.def("scaled_upper_triang_masked_softmax_forward", + wrap(nvte_scaled_upper_triang_masked_softmax_forward)); + m.def("scaled_upper_triang_masked_softmax_backward", + wrap(nvte_scaled_upper_triang_masked_softmax_backward)); + m.def("cast_transpose", wrap(nvte_cast_transpose)); + m.def("transpose", wrap(nvte_transpose)); + m.def("cast_transpose_dbias", wrap(nvte_cast_transpose_dbias)); + m.def("fp8_transpose_dbias", wrap(nvte_fp8_transpose_dbias)); + m.def("cast_transpose_dbias_dgelu", wrap(nvte_cast_transpose_dbias_dgelu)); + m.def("dgeglu_cast_transpose", wrap(nvte_dgeglu_cast_transpose)); + m.def("multi_cast_transpose", &multi_cast_transpose); +} diff --git a/transformer_engine/pytorch/sequential/nvte/cppsrc/type_list.h b/transformer_engine/pytorch/sequential/nvte/cppsrc/type_list.h new file mode 100644 index 0000000000..7b5459761d --- /dev/null +++ b/transformer_engine/pytorch/sequential/nvte/cppsrc/type_list.h @@ -0,0 +1,180 @@ +#include +#include +#include + +template struct type_list; + +template struct type_list_front; +template struct type_list_back; +template struct type_list_reverse_list; +template struct type_list_index; +template struct type_list_cat_list; +template struct type_list_pop_front_list; +template struct type_list_pop_back_list; +template struct type_list_contains; +template typename Pred> struct type_list_any; +template struct type_list_find; +template typename Pred> +struct type_list_first; + +template +struct type_list_front> { + using type = First; +}; + +template +struct type_list_pop_front_list, 0> { + using type = type_list; +}; +template <> struct type_list_pop_front_list, 0> { + using type = type_list<>; +}; +template +struct type_list_pop_front_list, N> { + using type = typename type_list_pop_front_list, N - 1>::type; +}; + +template +struct type_list_index, I> { +private: + using stripped = typename type_list_pop_front_list, I>::type; + +public: + using type = typename type_list_front::type; +}; + +template +struct type_list_cat_list, type_list> { + using type = type_list; +}; + +template +struct type_list_reverse_list> { +private: + using ts_reversed = typename type_list_reverse_list>::type; + using back_list = type_list; + +public: + using type = typename type_list_cat_list::type; +}; +template <> struct type_list_reverse_list> { + using type = type_list<>; +}; + +template struct type_list_back> { +private: + using reversed = typename type_list_reverse_list>::type; + +public: + using type = typename type_list_front::type; +}; + +template +struct type_list_pop_back_list, N> { +private: + using reversed = typename type_list_reverse_list>::type; + using stripped = typename type_list_pop_front_list::type; + +public: + using type = typename type_list_reverse_list::type; +}; + +template typename Pred> +struct type_list_any, Pred> { + static constexpr bool value = (Pred::value || ...); +}; + +template typename Pred> +struct type_list_first, Pred> { +private: + static constexpr bool values[] = {Pred::value...}; + +public: + static constexpr size_t value = []() { + for (size_t i = 0; i < sizeof(values) / sizeof(bool); ++i) { + 
if (values[i]) { + return i; + } + } + return sizeof(values) / sizeof(bool); + }(); +}; + +template +struct type_list_contains, T> { +private: + template struct pred { + static constexpr bool value = std::is_same_v; + }; + +public: + static constexpr bool value = type_list_any, pred>::value; +}; + +template +struct type_list_find, T> { + template struct pred { + static constexpr bool value = std::is_same_v; + }; + +public: + static constexpr size_t value = + type_list_first, pred>::value; +}; + +template +using type_list_front_t = typename type_list_front::type; +template +using type_list_back_t = typename type_list_back::type; +template +using type_list_reverse_list_t = typename type_list_reverse_list::type; +template +using type_list_index_t = typename type_list_index::type; +template +using type_list_cat_list_t = typename type_list_cat_list::type; +template +using type_list_pop_front_list_t = + typename type_list_pop_front_list::type; +template +using type_list_pop_back_list_t = typename type_list_pop_back_list::type; +template +constexpr bool type_list_contains_v = type_list_contains::value; +template typename Pred> +constexpr bool type_list_any_v = type_list_any::value; +template +constexpr size_t type_list_find_v = type_list_find::value; +template typename Pred> +constexpr size_t type_list_first_v = type_list_first::value; + +template struct type_list { + using front = type_list>; + using front_t = type_list_index_t; + + using back = type_list>; + using back_t = type_list_index_t; + + using reverse = type_list_reverse_list_t; + + template using get = type_list_index_t; + + template + using pop_front = type_list_pop_front_list_t; + + template + using pop_back = type_list_pop_back_list_t; + + template + static constexpr bool contains = type_list_contains_v; + + template