From 718996c16c78e643ba9a47fc790909f3b9314def Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Fri, 24 May 2024 22:30:55 +0000 Subject: [PATCH 1/4] Modify CUDA graph tests to use grad accumulation steps Signed-off-by: Tim Moon --- tests/pytorch/test_cuda_graphs.py | 123 +++++++++++++++++++++--------- 1 file changed, 89 insertions(+), 34 deletions(-) diff --git a/tests/pytorch/test_cuda_graphs.py b/tests/pytorch/test_cuda_graphs.py index 2b1dcb3aa3..6bcb2d3b41 100644 --- a/tests/pytorch/test_cuda_graphs.py +++ b/tests/pytorch/test_cuda_graphs.py @@ -66,9 +66,6 @@ def assert_all_equal(l1: List[torch.Tensor], l2: List[torch.Tensor], names=None) failed = False failed_tensors = "" for i, (t1, t2) in enumerate(zip(l1, l2)): - with torch.no_grad(): - t1.masked_fill_(t1.isnan(), 1.0) - t2.masked_fill_(t2.isnan(), 1.0) if not torch.equal(t1, t2): failed = True failed_tensors += f" {names[i]}\n" if names is not None else f" tensor at idx={i}\n" @@ -77,7 +74,7 @@ def assert_all_equal(l1: List[torch.Tensor], l2: List[torch.Tensor], names=None) def generate_data( s: int, b: int, h: int, nheads: int, kv: int, dtype: torch.dtype, - dpa: bool = False, warmup: bool = False, gen_labels: bool = False, + dpa: bool = False, warmup: bool = False, gen_grad_output: bool = False, ) -> Tuple[torch.Tensor, torch.Tensor]: """Generate synthetic data.""" gen_func = torch.ones if warmup else torch.randn @@ -86,11 +83,11 @@ def generate_data( else: inputs = [gen_func(s, b, h, device="cuda", requires_grad=True, dtype=dtype)] - if not gen_labels: + if not gen_grad_output: return inputs - target = torch.randn(s, b, h, device="cuda", dtype=dtype) - return inputs, target + grad_output = torch.randn(s, b, h, device="cuda", dtype=dtype) + return inputs, grad_output def get_outputs(model, output): @@ -104,7 +101,31 @@ def get_outputs(model, output): return values -def _test_cuda_graphs(config, bs, num_layers, dtype, fp8, fp8_params, graph, module, optimizer, graph_mode=""): +class 
_Sequential(torch.nn.Sequential): + + def forward(self, input_: torch.Tensor, is_first_microbatch=None) -> torch.Tensor: + kwargs = {} + if is_first_microbatch is not None: + kwargs["is_first_microbatch"] = is_first_microbatch + x = input_ + for module in self: + x = module(x, **kwargs) + return x + + +def _test_cuda_graphs( + *, + config: ModelConfig, + bs: int, + num_layers: int, + dtype: torch.dtype, + fp8: bool, + fp8_params: bool, + fp8_weight_caching: bool, + module: str, + optimizer: torch.optim.Optimizer, + graph_mode: str, +) -> List[torch.Tensor]: """Helper function for test.""" reset_rng_states() FP8GlobalStateManager.reset() @@ -150,40 +171,50 @@ def _test_cuda_graphs(config, bs, num_layers, dtype, fp8, fp8_params, graph, mod ) for _ in range(num_layers)] # Generate model and wrap API to return graphed version. - if graph: - # Graph entire module at once. - if graph_mode == "full": - model = modules[0] if dpa else torch.nn.Sequential(*modules) - model = make_graphed_callables( - model, - generate_data(config.s, bs, config.h, config.nheads, config.kv, dtype, dpa=dpa, warmup=True), - num_warmup_iters=10, - fp8_enabled=fp8) - else: - modules = [make_graphed_callables( + if graph_mode == "full": + # Graph entire model at once. 
+ model = modules[0] if dpa else torch.nn.Sequential(*modules) + model = make_graphed_callables( + model, + generate_data(config.s, bs, config.h, config.nheads, config.kv, dtype, dpa=dpa, warmup=True), + num_warmup_iters=10, + fp8_enabled=fp8, + fp8_weight_caching=fp8_weight_caching, + ) + elif graph_mode == "individual": + # Graph individual modules + modules = [ + make_graphed_callables( module, generate_data(config.s, bs, config.h, config.nheads, config.kv, dtype, dpa=dpa, warmup=True), num_warmup_iters=10, - fp8_enabled=fp8) for module in modules] - model = modules[0] if dpa else torch.nn.Sequential(*modules) + fp8_enabled=fp8, + fp8_weight_caching=fp8_weight_caching, + ) + for module in modules + ] + model = modules[0] if dpa else _Sequential(*modules) else: - model = modules[0] if dpa else torch.nn.Sequential(*modules) + model = modules[0] if dpa else _Sequential(*modules) # Loss function and optimizer. - loss_fn = torch.nn.MSELoss() if not dpa: optimizer = optimizer(model.parameters(), lr=0.001) # Launch. 
- for _ in range(10):
-        inputs, target = generate_data(config.s, bs, config.h, config.nheads, config.kv, dtype, dpa=dpa, gen_labels=True)
-        with fp8_autocast(enabled=fp8):
-            output = model(*inputs)
-        loss = loss_fn(output, target)
-        loss.backward()
+    for train_step in range(3):
+        if not dpa:
+            optimizer.zero_grad(set_to_none=False)
+        for grad_accumulation_step in range(2):
+            inputs, grad_output = generate_data(config.s, bs, config.h, config.nheads, config.kv, dtype, dpa=dpa, gen_grad_output=True)
+            with fp8_autocast(enabled=fp8):
+                kwargs = {}
+                if fp8_weight_caching:
+                    kwargs["is_first_microbatch"] = (grad_accumulation_step == 0)
+                output = model(*inputs, **kwargs)
+            (output * grad_output).sum().backward()
         if not dpa:
             optimizer.step()
-            optimizer.zero_grad()
     return get_outputs(model, output)
@@ -194,21 +225,45 @@ def _test_cuda_graphs(config, bs, num_layers, dtype, fp8, fp8_params, graph, mod
 @pytest.mark.parametrize("num_layers", [1, 10])
 @pytest.mark.parametrize("fp8", all_boolean)
 @pytest.mark.parametrize("fp8_params", all_boolean)
+@pytest.mark.parametrize("fp8_weight_caching", all_boolean)
 @pytest.mark.parametrize("module", modules)
 @pytest.mark.parametrize("optimizer", optimizers)
-def test_gpt_make_graphed_callables(dtype, bs, model, num_layers, fp8, fp8_params, module, optimizer):
+def test_gpt_make_graphed_callables(
+    dtype: torch.dtype,
+    bs: int,
+    model: str,
+    num_layers: int,
+    fp8: bool,
+    fp8_params: bool,
+    fp8_weight_caching: bool,
+    module: str,
+    optimizer: torch.optim.Optimizer,
+) -> None:
     if fp8 and not fp8_available:
         pytest.skip(reason_for_no_fp8)
     if fp8_params and not fp8:
         pytest.skip("FP8 needed for FP8 parameters.")
+    if fp8_weight_caching and not fp8:
+        pytest.skip("FP8 needed for FP8 weight caching.")
     if module == "dpa" and num_layers > 1:
         pytest.skip("Max 1 layer for DPA.")
 
     config = model_configs[model]
 
-    outputs = _test_cuda_graphs(config, bs, num_layers, dtype, fp8, fp8_params, False, module, optimizer)
-    graph_outputs_mode1 =
_test_cuda_graphs(config, bs, num_layers, dtype, fp8, fp8_params, True, module, optimizer, graph_mode="full") - graph_outputs_mode2 = _test_cuda_graphs(config, bs, num_layers, dtype, fp8, fp8_params, True, module, optimizer, graph_mode="individual") + kwargs = dict( + config=config, + bs=bs, + num_layers=num_layers, + dtype=dtype, + fp8=fp8, + fp8_params=fp8_params, + fp8_weight_caching=fp8_weight_caching, + module=module, + optimizer=optimizer, + ) + outputs = _test_cuda_graphs(graph_mode="none", **kwargs) + graph_outputs_mode1 = _test_cuda_graphs(graph_mode="full", **kwargs) + graph_outputs_mode2 = _test_cuda_graphs(graph_mode="individual", **kwargs) # Check that results match assert_all_equal(outputs, graph_outputs_mode1) From ab5eb387de5750cface50a64712ec75eb24bfea5 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Sat, 25 May 2024 00:43:35 +0000 Subject: [PATCH 2/4] Initialize grad buffers before capturing CUDA graph in CUDA graph tests Signed-off-by: Tim Moon --- tests/pytorch/test_cuda_graphs.py | 17 ++++++++--------- transformer_engine/pytorch/graph.py | 5 ----- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/tests/pytorch/test_cuda_graphs.py b/tests/pytorch/test_cuda_graphs.py index 6bcb2d3b41..98175a73d4 100644 --- a/tests/pytorch/test_cuda_graphs.py +++ b/tests/pytorch/test_cuda_graphs.py @@ -39,8 +39,6 @@ def __init__(self, hidden_size, nheads, kv, seq_len): modules = ["transformer", "layernorm_mlp", "layernorm_linear", "linear", "mha", "dpa"] -optimizers = [torch.optim.SGD, torch.optim.Adam] - all_boolean = [True, False] dtypes = [torch.float32, torch.float16] @@ -123,7 +121,6 @@ def _test_cuda_graphs( fp8_params: bool, fp8_weight_caching: bool, module: str, - optimizer: torch.optim.Optimizer, graph_mode: str, ) -> List[torch.Tensor]: """Helper function for test.""" @@ -170,6 +167,11 @@ def _test_cuda_graphs( config.h, config.h, device="cuda", params_dtype=dtype ) for _ in range(num_layers)] + # Initialize gradient buffers. 
+ for module in modules: + for param in module.parameters(): + param.grad = torch.empty_like(param) + # Generate model and wrap API to return graphed version. if graph_mode == "full": # Graph entire model at once. @@ -199,7 +201,7 @@ def _test_cuda_graphs( # Loss function and optimizer. if not dpa: - optimizer = optimizer(model.parameters(), lr=0.001) + optimizer = torch.optim.SGD(model.parameters(), lr=0.001) # Launch. for train_step in range(3): @@ -212,7 +214,7 @@ def _test_cuda_graphs( if fp8_weight_caching: kwargs["is_first_microbatch"] = (grad_accumulation_step == 0) output = model(*inputs, **kwargs) - (output * grad_output).sum().backward() + output.backward(grad_output) if not dpa: optimizer.step() @@ -222,12 +224,11 @@ def _test_cuda_graphs( @pytest.mark.parametrize("dtype", dtypes) @pytest.mark.parametrize("bs", [1, 2]) @pytest.mark.parametrize("model", model_configs.keys()) -@pytest.mark.parametrize("num_layers", [1, 10]) +@pytest.mark.parametrize("num_layers", [1, 3]) @pytest.mark.parametrize("fp8", all_boolean) @pytest.mark.parametrize("fp8_params", all_boolean) @pytest.mark.parametrize("fp8_weight_caching", all_boolean) @pytest.mark.parametrize("module", modules) -@pytest.mark.parametrize("optimizer", optimizers) def test_gpt_make_graphed_callables( dtype: torch.dtype, bs: int, @@ -237,7 +238,6 @@ def test_gpt_make_graphed_callables( fp8_params: bool, fp8_weight_caching: bool, module: str, - optimizer: torch.optim.Optimizer, ) -> None: if fp8 and not fp8_available: pytest.skip(reason_for_no_fp8) @@ -259,7 +259,6 @@ def test_gpt_make_graphed_callables( fp8_params=fp8_params, fp8_weight_caching=fp8_weight_caching, module=module, - optimizer=optimizer, ) outputs = _test_cuda_graphs(graph_mode="none", **kwargs) graph_outputs_mode1 = _test_cuda_graphs(graph_mode="full", **kwargs) diff --git a/transformer_engine/pytorch/graph.py b/transformer_engine/pytorch/graph.py index 5de3b7a342..3f73b5306d 100644 --- a/transformer_engine/pytorch/graph.py +++ 
b/transformer_engine/pytorch/graph.py @@ -536,11 +536,6 @@ def forward_func(*args, **kwargs): else: torch.cuda.set_rng_state(original_rng_states) - # Reset FP8 gradients. - for module in modules: - for p in module.parameters(): - p.grad = None - # Restore FP8 state. restore_fp8_tensors(modules, saved_fp8_tensors) From 4107434833501b017f6e53754740a2575377e6a3 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Sat, 25 May 2024 01:45:02 +0000 Subject: [PATCH 3/4] Only use BS=2 in CUDA graph tests Signed-off-by: Tim Moon --- tests/pytorch/test_cuda_graphs.py | 95 +++++++++++++++++++------------ 1 file changed, 59 insertions(+), 36 deletions(-) diff --git a/tests/pytorch/test_cuda_graphs.py b/tests/pytorch/test_cuda_graphs.py index 98175a73d4..1b296f85d2 100644 --- a/tests/pytorch/test_cuda_graphs.py +++ b/tests/pytorch/test_cuda_graphs.py @@ -2,6 +2,7 @@ # # See LICENSE for license information. +from dataclasses import dataclass from typing import List, Tuple import pytest @@ -25,17 +26,16 @@ _cpu_rng_state = torch.get_rng_state() _cuda_rng_state = torch.cuda.get_rng_state() - +@dataclass class ModelConfig: - def __init__(self, hidden_size, nheads, kv, seq_len): - self.h = hidden_size - self.nheads = nheads - self.kv = kv - self.s = seq_len + """Data tensor dimensions within Transformer model""" + sequence_length: int + batch_size: int + hidden_size: int + num_heads: int + kv_channels: int -model_configs = { - "small": ModelConfig(64, 2, 32, 32), -} +model_configs = {"small": ModelConfig(2, 32, 64, 2, 32)} modules = ["transformer", "layernorm_mlp", "layernorm_linear", "linear", "mha", "dpa"] @@ -71,20 +71,49 @@ def assert_all_equal(l1: List[torch.Tensor], l2: List[torch.Tensor], names=None) def generate_data( - s: int, b: int, h: int, nheads: int, kv: int, dtype: torch.dtype, - dpa: bool = False, warmup: bool = False, gen_grad_output: bool = False, + config: ModelConfig, + dtype: torch.dtype, + dpa: bool = False, + warmup: bool = False, + return_grad_output: bool = False, ) 
-> Tuple[torch.Tensor, torch.Tensor]: """Generate synthetic data.""" gen_func = torch.ones if warmup else torch.randn if dpa: - inputs = [gen_func(s, b, nheads, kv, device="cuda", requires_grad=True, dtype=dtype) for _ in range(3)] + inputs = [ + gen_func( + config.sequence_length, + config.batch_size, + config.num_heads, + config.kv_channels, + device="cuda", + requires_grad=True, + dtype=dtype, + ) + for _ in range(3) + ] else: - inputs = [gen_func(s, b, h, device="cuda", requires_grad=True, dtype=dtype)] + inputs = [ + gen_func( + config.sequence_length, + config.batch_size, + config.hidden_size, + device="cuda", + requires_grad=True, + dtype=dtype, + ) + ] - if not gen_grad_output: + if not return_grad_output: return inputs - grad_output = torch.randn(s, b, h, device="cuda", dtype=dtype) + grad_output = torch.randn( + config.sequence_length, + config.batch_size, + config.hidden_size, + device="cuda", + dtype=dtype, + ) return inputs, grad_output @@ -100,11 +129,9 @@ def get_outputs(model, output): class _Sequential(torch.nn.Sequential): + """Sequential model that forwards keyword arguments to modules""" - def forward(self, input_: torch.Tensor, is_first_microbatch=None) -> torch.Tensor: - kwargs = {} - if is_first_microbatch is not None: - kwargs["is_first_microbatch"] = is_first_microbatch + def forward(self, input_: torch.Tensor, **kwargs) -> torch.Tensor: x = input_ for module in self: x = module(x, **kwargs) @@ -114,7 +141,6 @@ def forward(self, input_: torch.Tensor, is_first_microbatch=None) -> torch.Tenso def _test_cuda_graphs( *, config: ModelConfig, - bs: int, num_layers: int, dtype: torch.dtype, fp8: bool, @@ -132,9 +158,9 @@ def _test_cuda_graphs( # Create modules. 
if module == "transformer": modules = [TransformerLayer( - config.h, - config.h, - config.nheads, + config.hidden_size, + config.hidden_size, + config.num_heads, hidden_dropout=0.0, attention_dropout=0.0, fuse_qkv_params=True, @@ -142,29 +168,29 @@ def _test_cuda_graphs( ) for _ in range(num_layers)] elif module == "layernorm_mlp": modules = [LayerNormMLP( - config.h, config.h, params_dtype=dtype + config.hidden_size, config.hidden_size, params_dtype=dtype ) for _ in range(num_layers)] elif module == "layernorm_linear": modules = [LayerNormLinear( - config.h, config.h, params_dtype=dtype + config.hidden_size, config.hidden_size, params_dtype=dtype ) for _ in range(num_layers)] elif module == "mha": modules = [MultiheadAttention( - config.h, - config.nheads, + config.hidden_size, + config.num_heads, attention_dropout=0.0, params_dtype=dtype, fuse_qkv_params=True, ) for _ in range(num_layers)] elif dpa: - assert config.h % config.nheads == 0, "Err." + assert config.hidden_size % config.num_heads == 0, "Err." assert num_layers == 1, "Err." modules = [DotProductAttention( - config.nheads, config.kv, attention_dropout=0.0 + config.num_heads, config.kv_channels, attention_dropout=0.0 ) for _ in range(num_layers)] else: modules = [Linear( - config.h, config.h, device="cuda", params_dtype=dtype + config.hidden_size, config.hidden_size, device="cuda", params_dtype=dtype ) for _ in range(num_layers)] # Initialize gradient buffers. 
@@ -178,7 +204,7 @@ def _test_cuda_graphs( model = modules[0] if dpa else torch.nn.Sequential(*modules) model = make_graphed_callables( model, - generate_data(config.s, bs, config.h, config.nheads, config.kv, dtype, dpa=dpa, warmup=True), + generate_data(config, dtype, dpa=dpa, warmup=True), num_warmup_iters=10, fp8_enabled=fp8, fp8_weight_caching=fp8_weight_caching, @@ -188,7 +214,7 @@ def _test_cuda_graphs( modules = [ make_graphed_callables( module, - generate_data(config.s, bs, config.h, config.nheads, config.kv, dtype, dpa=dpa, warmup=True), + generate_data(config, dtype, dpa=dpa, warmup=True), num_warmup_iters=10, fp8_enabled=fp8, fp8_weight_caching=fp8_weight_caching, @@ -208,7 +234,7 @@ def _test_cuda_graphs( if not dpa: optimizer.zero_grad(set_to_none=False) for grad_accumulation_step in range(2): - inputs, grad_output = generate_data(config.s, bs, config.h, config.nheads, config.kv, dtype, dpa=dpa, gen_grad_output=True) + inputs, grad_output = generate_data(config, dtype, dpa=dpa, return_grad_output=True) with fp8_autocast(enabled=fp8): kwargs = {} if fp8_weight_caching: @@ -222,7 +248,6 @@ def _test_cuda_graphs( @pytest.mark.parametrize("dtype", dtypes) -@pytest.mark.parametrize("bs", [1, 2]) @pytest.mark.parametrize("model", model_configs.keys()) @pytest.mark.parametrize("num_layers", [1, 3]) @pytest.mark.parametrize("fp8", all_boolean) @@ -231,7 +256,6 @@ def _test_cuda_graphs( @pytest.mark.parametrize("module", modules) def test_gpt_make_graphed_callables( dtype: torch.dtype, - bs: int, model: str, num_layers: int, fp8: bool, @@ -252,7 +276,6 @@ def test_gpt_make_graphed_callables( kwargs = dict( config=config, - bs=bs, num_layers=num_layers, dtype=dtype, fp8=fp8, From 204436d21b0d7ac97204a6d326c61550d4bd6896 Mon Sep 17 00:00:00 2001 From: Tim Moon <4406448+timmoon10@users.noreply.github.com> Date: Mon, 3 Jun 2024 11:27:42 -0700 Subject: [PATCH 4/4] Update tests/pytorch/test_cuda_graphs.py Signed-off-by: Tim Moon 
<4406448+timmoon10@users.noreply.github.com> --- tests/pytorch/test_cuda_graphs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pytorch/test_cuda_graphs.py b/tests/pytorch/test_cuda_graphs.py index 1b296f85d2..71023c32f9 100644 --- a/tests/pytorch/test_cuda_graphs.py +++ b/tests/pytorch/test_cuda_graphs.py @@ -230,7 +230,7 @@ def _test_cuda_graphs( optimizer = torch.optim.SGD(model.parameters(), lr=0.001) # Launch. - for train_step in range(3): + for _ in range(3): if not dpa: optimizer.zero_grad(set_to_none=False) for grad_accumulation_step in range(2):