Changes from all commits
550 commits
54ec65f
dont export imported names from stub file
janekb04 Aug 27, 2023
dff1be3
dont delete future as it isnt a real module
janekb04 Aug 27, 2023
12329a6
delete names
janekb04 Aug 27, 2023
26ca103
fix if
janekb04 Aug 27, 2023
26f13ac
fix if
janekb04 Aug 27, 2023
5475b02
fix if
janekb04 Aug 27, 2023
a896f03
fix load order
janekb04 Aug 27, 2023
63545f7
change tensor type name
janekb04 Aug 27, 2023
c0ed682
fix tensor type name
janekb04 Aug 27, 2023
1390b96
turn shape to list
janekb04 Aug 27, 2023
847fb21
include missing header
janekb04 Aug 27, 2023
3c637dc
fix return of shape
janekb04 Aug 27, 2023
39c27a0
fix
janekb04 Aug 27, 2023
fb818a5
fix load
janekb04 Aug 27, 2023
557e974
fix laid
janekb04 Aug 27, 2023
8d85819
fix
janekb04 Aug 27, 2023
6d72c09
fix
janekb04 Aug 27, 2023
dbd272c
use torch ops
janekb04 Aug 27, 2023
94b4e8f
fix import
janekb04 Aug 27, 2023
ba22908
fix decorator
janekb04 Aug 27, 2023
195a391
fix warning
janekb04 Aug 27, 2023
5c735f4
fix
janekb04 Aug 27, 2023
f10b0fd
set qualname
janekb04 Aug 27, 2023
d59325a
fix qualname
janekb04 Aug 27, 2023
b90fc98
fix qualname
janekb04 Aug 27, 2023
5427d76
fix name
janekb04 Aug 27, 2023
73da5e3
fix name
janekb04 Aug 27, 2023
d01767d
fix torch op
janekb04 Aug 27, 2023
051cfe2
add return type
janekb04 Aug 27, 2023
1658de8
use qualname
janekb04 Aug 27, 2023
ac0abf7
fix name
janekb04 Aug 27, 2023
84a3292
fix type name
janekb04 Aug 27, 2023
168db18
fix
janekb04 Aug 27, 2023
0059cb5
fix
janekb04 Aug 27, 2023
df82f65
fix
janekb04 Aug 27, 2023
0ed9924
fix
janekb04 Aug 27, 2023
5b17124
fix
janekb04 Aug 27, 2023
f2eafe7
fix decorator
janekb04 Aug 27, 2023
4c73df3
fix decorator
janekb04 Aug 27, 2023
33d3cb5
fix
janekb04 Aug 27, 2023
af560d1
fix
janekb04 Aug 27, 2023
7ac79d9
add impl
janekb04 Aug 27, 2023
7874d27
fix
janekb04 Aug 27, 2023
9a7312c
fix
janekb04 Aug 27, 2023
82bdc86
fix
janekb04 Aug 27, 2023
f98cb0f
fix
janekb04 Aug 27, 2023
95dc86f
fix
janekb04 Aug 27, 2023
9c1f372
fix wrapping code
janekb04 Aug 27, 2023
8d3a74a
fix strings
janekb04 Aug 27, 2023
bb147c3
report type
janekb04 Aug 27, 2023
433d113
add missing type
janekb04 Aug 27, 2023
6a3ef79
add missing dict entry
janekb04 Aug 27, 2023
de8542f
fix
janekb04 Aug 27, 2023
c5f2155
print src
janekb04 Aug 27, 2023
9966be5
fix
janekb04 Aug 27, 2023
b534192
fix
janekb04 Aug 27, 2023
772eea1
fix
janekb04 Aug 27, 2023
a16e68e
better error reporting
janekb04 Aug 27, 2023
63756ea
better error
janekb04 Aug 27, 2023
8a09f93
better error
janekb04 Aug 27, 2023
56a0946
make te-torch dtype correspondence 1:1
janekb04 Aug 27, 2023
2e8e3a9
fix type error
janekb04 Aug 27, 2023
eddc504
fix empty
janekb04 Aug 27, 2023
c3881c3
fix
janekb04 Aug 27, 2023
231a662
code cleanup
janekb04 Aug 28, 2023
11b16fe
code cleanup
janekb04 Aug 28, 2023
bb5fe89
register abstract implementation for torch
janekb04 Aug 28, 2023
87a8123
fix abstract impl registration
janekb04 Aug 28, 2023
cef6a20
fix
janekb04 Aug 28, 2023
4c3adca
save source for debug
janekb04 Aug 28, 2023
957115f
save sources
janekb04 Aug 28, 2023
9ce8fb4
fix getlines
janekb04 Aug 28, 2023
5037b8c
fix getlines
janekb04 Aug 28, 2023
331b6c4
fix abstract impl
janekb04 Aug 28, 2023
1cb4283
move tensor op
janekb04 Aug 28, 2023
b2a39a6
fix import
janekb04 Aug 28, 2023
b90ec7b
call torch op
janekb04 Aug 28, 2023
b59ce48
add autograd function for make_nvte_tensor
janekb04 Aug 28, 2023
6b39552
Revert "add autograd function for make_nvte_tensor"
janekb04 Aug 28, 2023
fe54685
fix autograd issue
janekb04 Aug 28, 2023
296bed4
make wrappers distinguishable
janekb04 Aug 28, 2023
d26cb0b
sidestep autograd issue
janekb04 Aug 28, 2023
aaa535c
fix torch dynamo
janekb04 Aug 28, 2023
10fe00c
fix for torch dynamo
janekb04 Aug 28, 2023
8f13291
fix for torch dynamo
janekb04 Aug 28, 2023
06df8ca
fix for dynamo
janekb04 Aug 28, 2023
a056231
fix import
janekb04 Aug 28, 2023
387199d
fix for dynamo
janekb04 Aug 28, 2023
716c593
fixes
janekb04 Aug 28, 2023
9ede14a
fix
janekb04 Aug 28, 2023
cb49b40
fix for dynamo
janekb04 Aug 28, 2023
8bda559
fix
janekb04 Aug 28, 2023
b26a842
create nvte_x before compile
janekb04 Aug 28, 2023
4daf00a
fix
janekb04 Aug 28, 2023
bd53d98
fix
janekb04 Aug 28, 2023
fed7624
fix
janekb04 Aug 28, 2023
869ac9b
introduce torch ops
janekb04 Aug 28, 2023
aaee54b
fix indent error
janekb04 Aug 28, 2023
b50439e
fix indent error
janekb04 Aug 28, 2023
645c899
FIX INDENT
janekb04 Aug 28, 2023
74184da
fix
janekb04 Aug 28, 2023
587a4cc
fix
janekb04 Aug 28, 2023
018a248
fix
janekb04 Aug 28, 2023
bf64587
fix
janekb04 Aug 28, 2023
95eb75e
fix
janekb04 Aug 28, 2023
6f65718
fix result type
janekb04 Aug 28, 2023
ac207ca
fix error report
janekb04 Aug 28, 2023
0161d02
fix
janekb04 Aug 28, 2023
3a3b193
fix
janekb04 Aug 28, 2023
846be18
fix
janekb04 Aug 28, 2023
1c47e20
fix
janekb04 Aug 28, 2023
4206f1c
fix
janekb04 Aug 28, 2023
aec8cf1
fix
janekb04 Aug 28, 2023
48899f5
fix
janekb04 Aug 28, 2023
bd6685f
fix
janekb04 Aug 28, 2023
4d06e1c
fix
janekb04 Aug 28, 2023
f9399ce
fix
janekb04 Aug 28, 2023
b932ae3
fix
janekb04 Aug 28, 2023
abc0d88
fix
janekb04 Aug 28, 2023
c6a6ed1
fix
janekb04 Aug 28, 2023
f07dd18
fix
janekb04 Aug 28, 2023
2318b5e
fix
janekb04 Aug 28, 2023
ed365c8
fix
janekb04 Aug 28, 2023
51e426e
fix
janekb04 Aug 28, 2023
d61fe97
fix
janekb04 Aug 28, 2023
d1b766e
fix
janekb04 Aug 28, 2023
0886491
fix
janekb04 Aug 28, 2023
3dda165
fix
janekb04 Aug 28, 2023
9b37955
fix
janekb04 Aug 28, 2023
4d2f72d
fix
janekb04 Aug 28, 2023
efb0f55
fix
janekb04 Aug 28, 2023
62525ac
fix
janekb04 Aug 28, 2023
5e03193
fix
janekb04 Aug 28, 2023
184c70c
fix
janekb04 Aug 28, 2023
db86000
fix
janekb04 Aug 28, 2023
05ebdd7
fix
janekb04 Aug 28, 2023
b26bae2
fix
janekb04 Aug 28, 2023
de41226
fix
janekb04 Aug 28, 2023
33b5a64
fix
janekb04 Aug 28, 2023
334aa52
fix
janekb04 Aug 28, 2023
0a0eb13
fix
janekb04 Aug 28, 2023
0333055
fix
janekb04 Aug 28, 2023
a431650
fix
janekb04 Aug 29, 2023
aca4148
fix
janekb04 Aug 29, 2023
463b93f
fix
janekb04 Aug 29, 2023
676d55d
fix
janekb04 Aug 29, 2023
2593b1a
fix
janekb04 Aug 29, 2023
c78cf04
fix
janekb04 Aug 29, 2023
64576e5
fix
janekb04 Aug 29, 2023
0c21801
add backward support
janekb04 Aug 29, 2023
98a5da4
fix
janekb04 Aug 29, 2023
cacf436
fix
janekb04 Aug 29, 2023
f57ade8
fix
janekb04 Aug 29, 2023
b7c134d
fix
janekb04 Aug 29, 2023
b9137e4
fix
janekb04 Aug 29, 2023
5f1b3fb
fix
janekb04 Aug 29, 2023
0506e63
fix
janekb04 Aug 29, 2023
17456bc
fix
janekb04 Aug 29, 2023
96b06fc
fix
janekb04 Aug 29, 2023
10594e4
fix
janekb04 Aug 29, 2023
b03df20
fix
janekb04 Aug 29, 2023
3b69e3b
fix
janekb04 Aug 29, 2023
d0ecfad
fix
janekb04 Aug 29, 2023
339b480
fix
janekb04 Aug 29, 2023
af8485b
fix
janekb04 Aug 29, 2023
9e6aece
fix
janekb04 Aug 29, 2023
e6308d2
fix
janekb04 Aug 29, 2023
7a8d215
fix
janekb04 Aug 29, 2023
3585653
fix
janekb04 Aug 29, 2023
47ce893
fix
janekb04 Aug 29, 2023
d31733c
fix
janekb04 Aug 29, 2023
976f76d
fix
janekb04 Aug 29, 2023
231cc94
fix
janekb04 Aug 29, 2023
927e8a1
fix
janekb04 Aug 29, 2023
bbb2e18
fix
janekb04 Aug 29, 2023
0c89e37
fix
janekb04 Aug 29, 2023
f4a96f3
fix
janekb04 Aug 29, 2023
63f8d28
fix
janekb04 Aug 29, 2023
96521ef
fix
janekb04 Aug 29, 2023
183ad6d
fix
janekb04 Aug 29, 2023
9c6ef07
fix
janekb04 Aug 29, 2023
dfd54b0
fix
janekb04 Aug 29, 2023
5f33f49
fix
janekb04 Aug 29, 2023
a68b4ec
fix
janekb04 Aug 29, 2023
a6c4b82
fix
janekb04 Aug 29, 2023
8c53b95
Revert "fix"
janekb04 Aug 29, 2023
0a755b6
Revert "fix"
janekb04 Aug 29, 2023
0c4ccea
Revert "fix"
janekb04 Aug 29, 2023
1ad003a
revert
janekb04 Aug 29, 2023
d743e74
fix
janekb04 Aug 29, 2023
301b730
fix
janekb04 Aug 30, 2023
5beb321
fix
janekb04 Aug 30, 2023
2f9bea5
unroll loop
janekb04 Aug 30, 2023
1b88d7f
fix
janekb04 Aug 30, 2023
62caddb
fix
janekb04 Aug 30, 2023
30c8142
fix
janekb04 Aug 30, 2023
c149f53
fix
janekb04 Aug 30, 2023
f6c840b
fix
janekb04 Aug 30, 2023
304ed86
fix
janekb04 Aug 30, 2023
d6d23df
fix
janekb04 Aug 30, 2023
d0b0679
fix
janekb04 Aug 30, 2023
286dc84
fix
janekb04 Aug 30, 2023
a4abd4a
format
janekb04 Aug 30, 2023
613bb21
try fix using macro
janekb04 Aug 31, 2023
43905fe
[JAX] Fix incorrect sharding when only enable FSDP and Mem Misaligned…
mingxu1067 Aug 30, 2023
d61ad56
fix
janekb04 Aug 31, 2023
f4921e4
fix
janekb04 Aug 31, 2023
1e87341
fix
janekb04 Aug 31, 2023
76a76b2
fix
janekb04 Aug 31, 2023
85bb7d9
fix
janekb04 Aug 31, 2023
61c9e73
fix
janekb04 Aug 31, 2023
74fc98d
fix
janekb04 Aug 31, 2023
cd56285
fix
janekb04 Aug 31, 2023
23555ae
fix
janekb04 Aug 31, 2023
f654d5a
fix
janekb04 Aug 31, 2023
758515c
fix
janekb04 Aug 31, 2023
12e9f13
fix
janekb04 Aug 31, 2023
27a1f2e
add documentation
janekb04 Aug 31, 2023
2fb7b16
Add documentation
janekb04 Sep 1, 2023
987ca1c
cleanup
janekb04 Sep 1, 2023
965490d
remove prevent_import
janekb04 Sep 1, 2023
11e2e12
remove prevent_import
janekb04 Sep 1, 2023
d8b7749
reorganize file structure
janekb04 Sep 1, 2023
2933a6a
fix import
janekb04 Sep 1, 2023
77f7e7e
fix import
janekb04 Sep 1, 2023
73ffe0d
fix
janekb04 Sep 1, 2023
e2ea056
further improve docs
janekb04 Sep 1, 2023
ed0fe63
Rename readme.md to README.md
janekb04 Sep 1, 2023
df74c0e
scaling factor updates
janekb04 Sep 1, 2023
c880c9b
don't expose precompiled_for
janekb04 Sep 1, 2023
54fa882
explain torch compile usage
janekb04 Sep 1, 2023
f25b47a
update docs
janekb04 Sep 1, 2023
a440987
Merge branch 'main' into v5
janekb04 Sep 1, 2023
3d65c67
clearer wording
janekb04 Sep 1, 2023
e5125ff
Merge branch 'v5' of https://github.com/janekb04/TransformerEngine in…
janekb04 Sep 1, 2023
8cd6b59
add dropout
janekb04 Sep 1, 2023
d53c554
add Residual to import list
janekb04 Sep 1, 2023
f5117d1
fix
janekb04 Sep 1, 2023
75696c7
fix
janekb04 Sep 1, 2023
de9b763
revert
janekb04 Sep 1, 2023
5e6d2cb
revert
janekb04 Sep 1, 2023
9c655d2
fix
janekb04 Sep 1, 2023
a4d68cd
fix
janekb04 Sep 1, 2023
79a726c
fix
janekb04 Sep 1, 2023
5167371
fix
janekb04 Sep 1, 2023
006dd32
fix
janekb04 Sep 1, 2023
e444856
fix
janekb04 Sep 1, 2023
81dfc55
final tidying up
janekb04 Sep 1, 2023
b2777ba
Merge branch 'main' into v5
ksivaman Sep 13, 2023
9eb264e
Merge branch 'main' into v5
ksivaman Sep 19, 2023
72 changes: 70 additions & 2 deletions setup.py
@@ -484,7 +484,7 @@ def setup_pytorch_extension() -> setuptools.Extension:
]

# Compiler flags
-    cxx_flags = ["-O3"]
+    cxx_flags = ["-O3", "-fvisibility=hidden"]
nvcc_flags = [
"-O3",
"-gencode",
@@ -536,6 +536,73 @@ def setup_pytorch_extension() -> setuptools.Extension:
},
)

def setup_sequential_extension() -> setuptools.Extension:
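    """Setup CUDA extension for sequential API support"""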
# Source files
src_dir = root_path / "transformer_engine" / "pytorch" / "sequential" / "nvte" / "cppsrc"
sources = [
src_dir / "pybind.cpp"
]

# Header files
include_dirs = [
root_path / "transformer_engine" / "common" / "include",
root_path / "transformer_engine",
root_path / "3rdparty" / "cudnn-frontend" / "include",
]

# Compiler flags
cxx_flags = ["-O3", "-fvisibility=hidden"]
nvcc_flags = [
"-O3",
"-gencode",
"arch=compute_70,code=sm_70",
"-U__CUDA_NO_HALF_OPERATORS__",
"-U__CUDA_NO_HALF_CONVERSIONS__",
"-U__CUDA_NO_BFLOAT16_OPERATORS__",
"-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
"-U__CUDA_NO_BFLOAT162_OPERATORS__",
"-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
"--expt-relaxed-constexpr",
"--expt-extended-lambda",
"--use_fast_math",
]

# Version-dependent CUDA options
try:
version = cuda_version()
except FileNotFoundError:
print("Could not determine CUDA Toolkit version")
else:
if version >= (11, 2):
nvcc_flags.extend(["--threads", "4"])
if version >= (11, 0):
nvcc_flags.extend(["-gencode", "arch=compute_80,code=sm_80"])
if version >= (11, 8):
nvcc_flags.extend(["-gencode", "arch=compute_90,code=sm_90"])

# userbuffers support
if with_userbuffers():
if os.getenv("MPI_HOME"):
mpi_home = Path(os.getenv("MPI_HOME"))
include_dirs.append(mpi_home / "include")
cxx_flags.append("-DNVTE_WITH_USERBUFFERS")
nvcc_flags.append("-DNVTE_WITH_USERBUFFERS")

# Construct PyTorch CUDA extension
sources = [str(path) for path in sources]
include_dirs = [str(path) for path in include_dirs]
from torch.utils.cpp_extension import CUDAExtension
return CUDAExtension(
name="transformer_engine_cuda",
sources=sources,
include_dirs=include_dirs,
extra_compile_args={
"cxx": cxx_flags,
"nvcc": nvcc_flags,
},
package_data={"transformer_engine_cuda": ["py.typed", "*.pyi"]}
)


def setup_paddle_extension() -> setuptools.Extension:
"""Setup CUDA extension for Paddle support"""
@@ -555,7 +622,7 @@ def setup_paddle_extension() -> setuptools.Extension:
]

# Compiler flags
-    cxx_flags = ["-O3"]
+    cxx_flags = ["-O3", "-fvisibility=hidden"]
nvcc_flags = [
"-O3",
"-gencode",
@@ -614,6 +681,7 @@ def main():
ext_modules = [setup_common_extension()]
if "pytorch" in frameworks():
ext_modules.append(setup_pytorch_extension())
ext_modules.append(setup_sequential_extension())

if "paddle" in frameworks():
ext_modules.append(setup_paddle_extension())
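As an aside on how the new build target is consumed: below is a minimal sketch, assuming the package is built with the usual `pip install .` flow and that "pytorch" is among the selected frameworks, so `setup_sequential_extension()` is included in `ext_modules`. Only the module name `transformer_engine_cuda` comes from the `CUDAExtension` call above; the import guard and symbol listing are illustrative, not part of this PR.

# Illustrative only: import the extension produced by setup_sequential_extension()
# and list whatever symbols its pybind module exposes (contents depend on pybind.cpp).
import importlib

try:
    ext = importlib.import_module("transformer_engine_cuda")  # name from CUDAExtension(...) above
except ImportError as err:
    raise SystemExit(f"sequential extension not built: {err}")

print([name for name in dir(ext) if not name.startswith("_")])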
162 changes: 162 additions & 0 deletions tests/sequential/compare_pt_te_seq.py
@@ -0,0 +1,162 @@
from __future__ import annotations
import torch
import transformer_engine.pytorch.sequential as seq
from torch import nn
import transformer_engine.pytorch as te
from math import sqrt



class RMSNorm(nn.Module):
def __init__(self, hidden_dim: int, eps: float = 1e-5):
super().__init__()
self.hidden_dim = hidden_dim
self.eps = eps
self.weight = nn.Parameter(torch.ones(hidden_dim))

def forward(self, x: torch.Tensor):
x_norm = x.norm(2, dim=-1, keepdim=True)
rms_x = x_norm / sqrt(self.hidden_dim)
y = x / (rms_x + self.eps)
return y * self.weight


torch.set_default_device("cuda")

SEQ_LEN = 128
HIDDEN_DIM = 768


def max_abs_diff(a: torch.Tensor, b: torch.Tensor):
v = (a - b).abs().max().item()
if v >= 0.001:
return f"\033[31m{v:12.10f}\033[0m"
else:
return f"\033[32m{v:12.10f}\033[0m"


def cpy(dst: torch.Tensor, src: torch.Tensor):
dst.data = torch.as_tensor(src.data.clone().detach(), dtype=dst.dtype).detach()


def cmp_modules(te: nn.Module, seq: nn.Module, pt: nn.Module):
x_te = x_src.detach().clone().requires_grad_()
x_seq = x_src.detach().clone().requires_grad_()
x_pt = x_src.detach().clone().requires_grad_()

y_te = te(x_te)
y_seq = seq(x_seq)
y_pt = pt(x_pt)

y_te.sum().backward()
y_seq.sum().backward()
y_pt.sum().backward()

print(f"mad(dx_te, dx_seq): {max_abs_diff(x_te.grad, x_seq.grad)}")
print(f"mad(dx_te, dx_pt): {max_abs_diff(x_te.grad, x_pt.grad)}")
print(f"mad(dx_seq, dx_pt): {max_abs_diff(x_seq.grad,x_pt.grad)}")

print(f"mad( y_te, y_seq): {max_abs_diff(y_te, y_seq)}")
print(f"mad( y_te, y_pt): {max_abs_diff(y_te, y_pt)}")
print(f"mad( y_seq, y_pt): {max_abs_diff(y_seq,y_pt)}")


def cmp_layernorm_mlp(norm: str, act: str):
m_seq = seq.Sequential(
seq.LayerNorm(HIDDEN_DIM) if norm == "LayerNorm" else seq.RMSNorm(HIDDEN_DIM),
seq.Linear(HIDDEN_DIM, 3 * HIDDEN_DIM),
seq.GELU() if act == "gelu" else seq.ReLU(),
seq.Linear(3 * HIDDEN_DIM, HIDDEN_DIM),
)
m_te = te.LayerNormMLP(
HIDDEN_DIM, 3 * HIDDEN_DIM, activation=act, normalization=norm
)
m_pt = nn.Sequential(
nn.LayerNorm(HIDDEN_DIM) if norm == "LayerNorm" else RMSNorm(HIDDEN_DIM),
nn.Linear(HIDDEN_DIM, 3 * HIDDEN_DIM),
nn.GELU() if act == "gelu" else nn.ReLU(),
nn.Linear(3 * HIDDEN_DIM, HIDDEN_DIM),
)

cpy(m_te.layer_norm_weight, m_seq._modules["0"].weight)
if norm == "LayerNorm":
cpy(m_te.layer_norm_bias, m_seq._modules["0"].bias)
cpy(m_te.fc1_weight, m_seq._modules["1"].weight)
cpy(m_te.fc1_bias, m_seq._modules["1"].bias)
cpy(m_te.fc2_weight, m_seq._modules["3"].weight)
cpy(m_te.fc2_bias, m_seq._modules["3"].bias)

cpy(m_pt[0].weight, m_seq._modules["0"].weight)
if norm == "LayerNorm":
cpy(m_pt[0].bias, m_seq._modules["0"].bias)
cpy(m_pt[1].weight, m_seq._modules["1"].weight)
cpy(m_pt[1].bias, m_seq._modules["1"].bias)
cpy(m_pt[3].weight, m_seq._modules["3"].weight)
cpy(m_pt[3].bias, m_seq._modules["3"].bias)

cmp_modules(m_te, m_seq, m_pt)


def cmp_layernorm():
m_seq = seq.LayerNorm(HIDDEN_DIM)
m_te = te.LayerNorm(HIDDEN_DIM)
m_pt = nn.LayerNorm(HIDDEN_DIM)

cpy(m_te.weight, m_seq.weight)
cpy(m_te.bias, m_seq.bias)
cpy(m_pt.weight, m_seq.weight)
cpy(m_pt.bias, m_seq.bias)

cmp_modules(m_te, m_seq, m_pt)


def cmp_linear():
m_seq = seq.Linear(HIDDEN_DIM, HIDDEN_DIM)
m_te = te.Linear(HIDDEN_DIM, HIDDEN_DIM)
m_pt = nn.Linear(HIDDEN_DIM, HIDDEN_DIM)

cpy(m_te.weight, m_seq.weight)
cpy(m_te.bias, m_seq.bias)
cpy(m_pt.weight, m_seq.weight)
cpy(m_pt.bias, m_seq.bias)

cmp_modules(m_te, m_seq, m_pt)


def cmp_linear_no_bias():
m_seq = seq.Linear(HIDDEN_DIM, HIDDEN_DIM, bias=False)
m_te = te.Linear(HIDDEN_DIM, HIDDEN_DIM, bias=False)
m_pt = nn.Linear(HIDDEN_DIM, HIDDEN_DIM, bias=False)

cpy(m_te.weight, m_seq.weight)
cpy(m_pt.weight, m_seq.weight)

cmp_modules(m_te, m_seq, m_pt)


print("\n ----- FP32 INPUT & WEIGHTS ------")
x_src = torch.rand(SEQ_LEN, HIDDEN_DIM, device="cuda")

for _ in range(10):
print("\n### Comparing LayerNormMLP (gelu) ###")
cmp_layernorm_mlp("LayerNorm", "gelu")

print("\n### Comparing LayerNormMLP (relu) ###")
cmp_layernorm_mlp("LayerNorm", "relu")

print("\n### Comparing RMSNormMLP (gelu) ###")
cmp_layernorm_mlp("RMSNorm", "gelu")

print("\n### Comparing RMSNormMLP (relu) ###")
cmp_layernorm_mlp("RMSNorm", "relu")

print("\n### Comparing LayerNorm ###")
cmp_layernorm()

print("\n### Comparing Linear ###")
cmp_linear()

print("\n### Comparing Linear (no bias) ###")
cmp_linear_no_bias()
62 changes: 62 additions & 0 deletions tests/sequential/perf_test.py
@@ -0,0 +1,62 @@
import torch
import transformer_engine.pytorch.sequential as seq
from torch import nn
import transformer_engine.pytorch as te
from math import sqrt

SEQ_LEN = 4096
HIDDEN_DIM = 1024

seq.Sequential(
seq.RMSNorm(HIDDEN_DIM),
)


vasavani_dec = te.Sequential(
te.Residual(
te.Linear(HIDDEN_DIM, 3 * HIDDEN_DIM),
te.DotProductAttention(24),
te.Linear(HIDDEN_DIM, HIDDEN_DIM),
te.LayerNorm(HIDDEN_DIM),
),
te.Residual(
te.Linear(HIDDEN_DIM, 4 * HIDDEN_DIM),
te.ReLU(),
te.Linear(4 * HIDDEN_DIM, HIDDEN_DIM),
te.LayerNorm(HIDDEN_DIM),
),
)

gpt = te.Sequential(
te.Residual(
te.LayerNorm(HIDDEN_DIM),
te.Linear(HIDDEN_DIM, 3 * HIDDEN_DIM),
te.DotProductAttention(24),
te.Linear(HIDDEN_DIM, HIDDEN_DIM),
te.Dropout(0.1),
),
te.Residual(
te.LayerNorm(HIDDEN_DIM),
te.Linear(HIDDEN_DIM, 4 * HIDDEN_DIM),
te.GELU(),
te.Linear(4 * HIDDEN_DIM, HIDDEN_DIM),
te.Dropout(0.1),
),
)

llama = te.Sequential(
te.Residual(
te.RMSNorm(HIDDEN_DIM),
te.Linear(HIDDEN_DIM, 3 * HIDDEN_DIM),
te.DotProductAttention(24),
te.Linear(HIDDEN_DIM, HIDDEN_DIM),
te.Dropout(0.1),
),
te.Residual(
te.RMSNorm(HIDDEN_DIM),
te.Linear(HIDDEN_DIM, 4 * HIDDEN_DIM),
te.SwiGLU(),
te.Linear(4 * HIDDEN_DIM, HIDDEN_DIM),
te.Dropout(0.1),
),
)
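perf_test.py as shown only constructs the models; it does not yet time them. Below is a minimal sketch of a timing harness that could be appended, using CUDA events. The stand-in `torch.nn.Linear` model and the iteration counts are assumptions; one could substitute the `gpt` or `llama` modules defined above if their expected input shapes permit.

# Illustrative timing loop (not part of the file above): measure average
# forward latency with CUDA events on a stand-in module.
import torch

model = torch.nn.Linear(HIDDEN_DIM, HIDDEN_DIM, device="cuda")  # stand-in; swap in gpt/llama above
x = torch.rand(SEQ_LEN, HIDDEN_DIM, device="cuda")

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

for _ in range(10):  # warm-up iterations
    model(x)
torch.cuda.synchronize()

start.record()
for _ in range(100):
    model(x)
end.record()
torch.cuda.synchronize()  # wait for the recorded work to finish

print(f"avg forward time: {start.elapsed_time(end) / 100:.3f} ms")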
37 changes: 37 additions & 0 deletions tests/sequential/simple_prec_compare.py
@@ -0,0 +1,37 @@
import torch
from torch import nn
import transformer_engine.pytorch.sequential as seq

N = 2048
HIDDEN_DIM = 1024
x = torch.rand(N, HIDDEN_DIM, device="cuda", requires_grad=True)

m = seq.Sequential(
seq.RMSNorm(HIDDEN_DIM),
seq.Linear(HIDDEN_DIM, 4 * HIDDEN_DIM),
seq.SwiGLU(),
seq.Linear(2 * HIDDEN_DIM, HIDDEN_DIM),
)
torch.set_printoptions(precision=4, sci_mode=False)

m(x)

with seq.Recipe(lowp=seq.nvte.DType.Float8E4M3):
opt: nn.Module = torch.compile(m, fullgraph=True, dynamic=True)
for _ in range(100):
y: torch.Tensor = opt(x)
y.sum().backward()
print(x.grad)
x.grad = None

with seq.Recipe(lowp=seq.nvte.DType.BFloat16):
y = m(x)
y.sum().backward()
print(x.grad)
x.grad = None

with seq.Recipe(lowp=seq.nvte.DType.Float32):
y = m(x)
y.sum().backward()
print(x.grad)
x.grad = None