From 718996c16c78e643ba9a47fc790909f3b9314def Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Fri, 24 May 2024 22:30:55 +0000 Subject: [PATCH 1/4] Modify CUDA graph tests to use grad accumulation steps Signed-off-by: Tim Moon --- tests/pytorch/test_cuda_graphs.py | 123 +++++++++++++++++++++--------- 1 file changed, 89 insertions(+), 34 deletions(-) diff --git a/tests/pytorch/test_cuda_graphs.py b/tests/pytorch/test_cuda_graphs.py index 2b1dcb3aa3..6bcb2d3b41 100644 --- a/tests/pytorch/test_cuda_graphs.py +++ b/tests/pytorch/test_cuda_graphs.py @@ -66,9 +66,6 @@ def assert_all_equal(l1: List[torch.Tensor], l2: List[torch.Tensor], names=None) failed = False failed_tensors = "" for i, (t1, t2) in enumerate(zip(l1, l2)): - with torch.no_grad(): - t1.masked_fill_(t1.isnan(), 1.0) - t2.masked_fill_(t2.isnan(), 1.0) if not torch.equal(t1, t2): failed = True failed_tensors += f" {names[i]}\n" if names is not None else f" tensor at idx={i}\n" @@ -77,7 +74,7 @@ def assert_all_equal(l1: List[torch.Tensor], l2: List[torch.Tensor], names=None) def generate_data( s: int, b: int, h: int, nheads: int, kv: int, dtype: torch.dtype, - dpa: bool = False, warmup: bool = False, gen_labels: bool = False, + dpa: bool = False, warmup: bool = False, gen_grad_output: bool = False, ) -> Tuple[torch.Tensor, torch.Tensor]: """Generate synthetic data.""" gen_func = torch.ones if warmup else torch.randn @@ -86,11 +83,11 @@ def generate_data( else: inputs = [gen_func(s, b, h, device="cuda", requires_grad=True, dtype=dtype)] - if not gen_labels: + if not gen_grad_output: return inputs - target = torch.randn(s, b, h, device="cuda", dtype=dtype) - return inputs, target + grad_output = torch.randn(s, b, h, device="cuda", dtype=dtype) + return inputs, grad_output def get_outputs(model, output): @@ -104,7 +101,31 @@ def get_outputs(model, output): return values -def _test_cuda_graphs(config, bs, num_layers, dtype, fp8, fp8_params, graph, module, optimizer, graph_mode=""): +class 
_Sequential(torch.nn.Sequential): + + def forward(self, input_: torch.Tensor, is_first_microbatch=None) -> torch.Tensor: + kwargs = {} + if is_first_microbatch is not None: + kwargs["is_first_microbatch"] = is_first_microbatch + x = input_ + for module in self: + x = module(x, **kwargs) + return x + + +def _test_cuda_graphs( + *, + config: ModelConfig, + bs: int, + num_layers: int, + dtype: torch.dtype, + fp8: bool, + fp8_params: bool, + fp8_weight_caching: bool, + module: str, + optimizer: torch.optim.Optimizer, + graph_mode: str, +) -> List[torch.Tensor]: """Helper function for test.""" reset_rng_states() FP8GlobalStateManager.reset() @@ -150,40 +171,50 @@ def _test_cuda_graphs(config, bs, num_layers, dtype, fp8, fp8_params, graph, mod ) for _ in range(num_layers)] # Generate model and wrap API to return graphed version. - if graph: - # Graph entire module at once. - if graph_mode == "full": - model = modules[0] if dpa else torch.nn.Sequential(*modules) - model = make_graphed_callables( - model, - generate_data(config.s, bs, config.h, config.nheads, config.kv, dtype, dpa=dpa, warmup=True), - num_warmup_iters=10, - fp8_enabled=fp8) - else: - modules = [make_graphed_callables( + if graph_mode == "full": + # Graph entire model at once. 
+ model = modules[0] if dpa else torch.nn.Sequential(*modules) + model = make_graphed_callables( + model, + generate_data(config.s, bs, config.h, config.nheads, config.kv, dtype, dpa=dpa, warmup=True), + num_warmup_iters=10, + fp8_enabled=fp8, + fp8_weight_caching=fp8_weight_caching, + ) + elif graph_mode == "individual": + # Graph individual modules + modules = [ + make_graphed_callables( module, generate_data(config.s, bs, config.h, config.nheads, config.kv, dtype, dpa=dpa, warmup=True), num_warmup_iters=10, - fp8_enabled=fp8) for module in modules] - model = modules[0] if dpa else torch.nn.Sequential(*modules) + fp8_enabled=fp8, + fp8_weight_caching=fp8_weight_caching, + ) + for module in modules + ] + model = modules[0] if dpa else _Sequential(*modules) else: - model = modules[0] if dpa else torch.nn.Sequential(*modules) + model = modules[0] if dpa else _Sequential(*modules) # Loss function and optimizer. - loss_fn = torch.nn.MSELoss() if not dpa: optimizer = optimizer(model.parameters(), lr=0.001) # Launch. 
- for _ in range(10):
-        inputs, target = generate_data(config.s, bs, config.h, config.nheads, config.kv, dtype, dpa=dpa, gen_labels=True)
-        with fp8_autocast(enabled=fp8):
-            output = model(*inputs)
-        loss = loss_fn(output, target)
-        loss.backward()
+    for train_step in range(3):
+        if not dpa:
+            optimizer.zero_grad(set_to_none=False)
+        for grad_accumulation_step in range(2):
+            inputs, grad_output = generate_data(config.s, bs, config.h, config.nheads, config.kv, dtype, dpa=dpa, gen_grad_output=True)
+            with fp8_autocast(enabled=fp8):
+                kwargs = {}
+                if fp8_weight_caching:
+                    kwargs["is_first_microbatch"] = (grad_accumulation_step == 0)
+                output = model(*inputs, **kwargs)
+            (output * grad_output).sum().backward()
         if not dpa:
             optimizer.step()
-            optimizer.zero_grad()
     return get_outputs(model, output)
@@ -194,21 +225,45 @@ def _test_cuda_graphs(config, bs, num_layers, dtype, fp8, fp8_params, graph, mod
 @pytest.mark.parametrize("num_layers", [1, 10])
 @pytest.mark.parametrize("fp8", all_boolean)
 @pytest.mark.parametrize("fp8_params", all_boolean)
+@pytest.mark.parametrize("fp8_weight_caching", all_boolean)
 @pytest.mark.parametrize("module", modules)
 @pytest.mark.parametrize("optimizer", optimizers)
-def test_gpt_make_graphed_callables(dtype, bs, model, num_layers, fp8, fp8_params, module, optimizer):
+def test_gpt_make_graphed_callables(
+    dtype: torch.dtype,
+    bs: int,
+    model: str,
+    num_layers: int,
+    fp8: bool,
+    fp8_params: bool,
+    fp8_weight_caching: bool,
+    module: str,
+    optimizer: torch.optim.Optimizer,
+) -> None:
     if fp8 and not fp8_available:
         pytest.skip(reason_for_no_fp8)
     if fp8_params and not fp8:
         pytest.skip("FP8 needed for FP8 parameters.")
+    if fp8_weight_caching and not fp8:
+        pytest.skip("FP8 needed for FP8 weight caching.")
     if module == "dpa" and num_layers > 1:
         pytest.skip("Max 1 layer for DPA.")
 
     config = model_configs[model]
 
-    outputs = _test_cuda_graphs(config, bs, num_layers, dtype, fp8, fp8_params, False, module, optimizer)
-    graph_outputs_mode1 =
_test_cuda_graphs(config, bs, num_layers, dtype, fp8, fp8_params, True, module, optimizer, graph_mode="full") - graph_outputs_mode2 = _test_cuda_graphs(config, bs, num_layers, dtype, fp8, fp8_params, True, module, optimizer, graph_mode="individual") + kwargs = dict( + config=config, + bs=bs, + num_layers=num_layers, + dtype=dtype, + fp8=fp8, + fp8_params=fp8_params, + fp8_weight_caching=fp8_weight_caching, + module=module, + optimizer=optimizer, + ) + outputs = _test_cuda_graphs(graph_mode="none", **kwargs) + graph_outputs_mode1 = _test_cuda_graphs(graph_mode="full", **kwargs) + graph_outputs_mode2 = _test_cuda_graphs(graph_mode="individual", **kwargs) # Check that results match assert_all_equal(outputs, graph_outputs_mode1) From ab5eb387de5750cface50a64712ec75eb24bfea5 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Sat, 25 May 2024 00:43:35 +0000 Subject: [PATCH 2/4] Initialize grad buffers before capturing CUDA graph in CUDA graph tests Signed-off-by: Tim Moon --- tests/pytorch/test_cuda_graphs.py | 17 ++++++++--------- transformer_engine/pytorch/graph.py | 5 ----- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/tests/pytorch/test_cuda_graphs.py b/tests/pytorch/test_cuda_graphs.py index 6bcb2d3b41..98175a73d4 100644 --- a/tests/pytorch/test_cuda_graphs.py +++ b/tests/pytorch/test_cuda_graphs.py @@ -39,8 +39,6 @@ def __init__(self, hidden_size, nheads, kv, seq_len): modules = ["transformer", "layernorm_mlp", "layernorm_linear", "linear", "mha", "dpa"] -optimizers = [torch.optim.SGD, torch.optim.Adam] - all_boolean = [True, False] dtypes = [torch.float32, torch.float16] @@ -123,7 +121,6 @@ def _test_cuda_graphs( fp8_params: bool, fp8_weight_caching: bool, module: str, - optimizer: torch.optim.Optimizer, graph_mode: str, ) -> List[torch.Tensor]: """Helper function for test.""" @@ -170,6 +167,11 @@ def _test_cuda_graphs( config.h, config.h, device="cuda", params_dtype=dtype ) for _ in range(num_layers)] + # Initialize gradient buffers. 
+ for module in modules: + for param in module.parameters(): + param.grad = torch.empty_like(param) + # Generate model and wrap API to return graphed version. if graph_mode == "full": # Graph entire model at once. @@ -199,7 +201,7 @@ def _test_cuda_graphs( # Loss function and optimizer. if not dpa: - optimizer = optimizer(model.parameters(), lr=0.001) + optimizer = torch.optim.SGD(model.parameters(), lr=0.001) # Launch. for train_step in range(3): @@ -212,7 +214,7 @@ def _test_cuda_graphs( if fp8_weight_caching: kwargs["is_first_microbatch"] = (grad_accumulation_step == 0) output = model(*inputs, **kwargs) - (output * grad_output).sum().backward() + output.backward(grad_output) if not dpa: optimizer.step() @@ -222,12 +224,11 @@ def _test_cuda_graphs( @pytest.mark.parametrize("dtype", dtypes) @pytest.mark.parametrize("bs", [1, 2]) @pytest.mark.parametrize("model", model_configs.keys()) -@pytest.mark.parametrize("num_layers", [1, 10]) +@pytest.mark.parametrize("num_layers", [1, 3]) @pytest.mark.parametrize("fp8", all_boolean) @pytest.mark.parametrize("fp8_params", all_boolean) @pytest.mark.parametrize("fp8_weight_caching", all_boolean) @pytest.mark.parametrize("module", modules) -@pytest.mark.parametrize("optimizer", optimizers) def test_gpt_make_graphed_callables( dtype: torch.dtype, bs: int, @@ -237,7 +238,6 @@ def test_gpt_make_graphed_callables( fp8_params: bool, fp8_weight_caching: bool, module: str, - optimizer: torch.optim.Optimizer, ) -> None: if fp8 and not fp8_available: pytest.skip(reason_for_no_fp8) @@ -259,7 +259,6 @@ def test_gpt_make_graphed_callables( fp8_params=fp8_params, fp8_weight_caching=fp8_weight_caching, module=module, - optimizer=optimizer, ) outputs = _test_cuda_graphs(graph_mode="none", **kwargs) graph_outputs_mode1 = _test_cuda_graphs(graph_mode="full", **kwargs) diff --git a/transformer_engine/pytorch/graph.py b/transformer_engine/pytorch/graph.py index 5de3b7a342..3f73b5306d 100644 --- a/transformer_engine/pytorch/graph.py +++ 
b/transformer_engine/pytorch/graph.py @@ -536,11 +536,6 @@ def forward_func(*args, **kwargs): else: torch.cuda.set_rng_state(original_rng_states) - # Reset FP8 gradients. - for module in modules: - for p in module.parameters(): - p.grad = None - # Restore FP8 state. restore_fp8_tensors(modules, saved_fp8_tensors) From 4107434833501b017f6e53754740a2575377e6a3 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Sat, 25 May 2024 01:45:02 +0000 Subject: [PATCH 3/4] Only use BS=2 in CUDA graph tests Signed-off-by: Tim Moon --- tests/pytorch/test_cuda_graphs.py | 95 +++++++++++++++++++------------ 1 file changed, 59 insertions(+), 36 deletions(-) diff --git a/tests/pytorch/test_cuda_graphs.py b/tests/pytorch/test_cuda_graphs.py index 98175a73d4..1b296f85d2 100644 --- a/tests/pytorch/test_cuda_graphs.py +++ b/tests/pytorch/test_cuda_graphs.py @@ -2,6 +2,7 @@ # # See LICENSE for license information. +from dataclasses import dataclass from typing import List, Tuple import pytest @@ -25,17 +26,16 @@ _cpu_rng_state = torch.get_rng_state() _cuda_rng_state = torch.cuda.get_rng_state() - +@dataclass class ModelConfig: - def __init__(self, hidden_size, nheads, kv, seq_len): - self.h = hidden_size - self.nheads = nheads - self.kv = kv - self.s = seq_len + """Data tensor dimensions within Transformer model""" + sequence_length: int + batch_size: int + hidden_size: int + num_heads: int + kv_channels: int -model_configs = { - "small": ModelConfig(64, 2, 32, 32), -} +model_configs = {"small": ModelConfig(2, 32, 64, 2, 32)} modules = ["transformer", "layernorm_mlp", "layernorm_linear", "linear", "mha", "dpa"] @@ -71,20 +71,49 @@ def assert_all_equal(l1: List[torch.Tensor], l2: List[torch.Tensor], names=None) def generate_data( - s: int, b: int, h: int, nheads: int, kv: int, dtype: torch.dtype, - dpa: bool = False, warmup: bool = False, gen_grad_output: bool = False, + config: ModelConfig, + dtype: torch.dtype, + dpa: bool = False, + warmup: bool = False, + return_grad_output: bool = False, ) 
-> Tuple[torch.Tensor, torch.Tensor]: """Generate synthetic data.""" gen_func = torch.ones if warmup else torch.randn if dpa: - inputs = [gen_func(s, b, nheads, kv, device="cuda", requires_grad=True, dtype=dtype) for _ in range(3)] + inputs = [ + gen_func( + config.sequence_length, + config.batch_size, + config.num_heads, + config.kv_channels, + device="cuda", + requires_grad=True, + dtype=dtype, + ) + for _ in range(3) + ] else: - inputs = [gen_func(s, b, h, device="cuda", requires_grad=True, dtype=dtype)] + inputs = [ + gen_func( + config.sequence_length, + config.batch_size, + config.hidden_size, + device="cuda", + requires_grad=True, + dtype=dtype, + ) + ] - if not gen_grad_output: + if not return_grad_output: return inputs - grad_output = torch.randn(s, b, h, device="cuda", dtype=dtype) + grad_output = torch.randn( + config.sequence_length, + config.batch_size, + config.hidden_size, + device="cuda", + dtype=dtype, + ) return inputs, grad_output @@ -100,11 +129,9 @@ def get_outputs(model, output): class _Sequential(torch.nn.Sequential): + """Sequential model that forwards keyword arguments to modules""" - def forward(self, input_: torch.Tensor, is_first_microbatch=None) -> torch.Tensor: - kwargs = {} - if is_first_microbatch is not None: - kwargs["is_first_microbatch"] = is_first_microbatch + def forward(self, input_: torch.Tensor, **kwargs) -> torch.Tensor: x = input_ for module in self: x = module(x, **kwargs) @@ -114,7 +141,6 @@ def forward(self, input_: torch.Tensor, is_first_microbatch=None) -> torch.Tenso def _test_cuda_graphs( *, config: ModelConfig, - bs: int, num_layers: int, dtype: torch.dtype, fp8: bool, @@ -132,9 +158,9 @@ def _test_cuda_graphs( # Create modules. 
if module == "transformer": modules = [TransformerLayer( - config.h, - config.h, - config.nheads, + config.hidden_size, + config.hidden_size, + config.num_heads, hidden_dropout=0.0, attention_dropout=0.0, fuse_qkv_params=True, @@ -142,29 +168,29 @@ def _test_cuda_graphs( ) for _ in range(num_layers)] elif module == "layernorm_mlp": modules = [LayerNormMLP( - config.h, config.h, params_dtype=dtype + config.hidden_size, config.hidden_size, params_dtype=dtype ) for _ in range(num_layers)] elif module == "layernorm_linear": modules = [LayerNormLinear( - config.h, config.h, params_dtype=dtype + config.hidden_size, config.hidden_size, params_dtype=dtype ) for _ in range(num_layers)] elif module == "mha": modules = [MultiheadAttention( - config.h, - config.nheads, + config.hidden_size, + config.num_heads, attention_dropout=0.0, params_dtype=dtype, fuse_qkv_params=True, ) for _ in range(num_layers)] elif dpa: - assert config.h % config.nheads == 0, "Err." + assert config.hidden_size % config.num_heads == 0, "Err." assert num_layers == 1, "Err." modules = [DotProductAttention( - config.nheads, config.kv, attention_dropout=0.0 + config.num_heads, config.kv_channels, attention_dropout=0.0 ) for _ in range(num_layers)] else: modules = [Linear( - config.h, config.h, device="cuda", params_dtype=dtype + config.hidden_size, config.hidden_size, device="cuda", params_dtype=dtype ) for _ in range(num_layers)] # Initialize gradient buffers. 
@@ -178,7 +204,7 @@ def _test_cuda_graphs( model = modules[0] if dpa else torch.nn.Sequential(*modules) model = make_graphed_callables( model, - generate_data(config.s, bs, config.h, config.nheads, config.kv, dtype, dpa=dpa, warmup=True), + generate_data(config, dtype, dpa=dpa, warmup=True), num_warmup_iters=10, fp8_enabled=fp8, fp8_weight_caching=fp8_weight_caching, @@ -188,7 +214,7 @@ def _test_cuda_graphs( modules = [ make_graphed_callables( module, - generate_data(config.s, bs, config.h, config.nheads, config.kv, dtype, dpa=dpa, warmup=True), + generate_data(config, dtype, dpa=dpa, warmup=True), num_warmup_iters=10, fp8_enabled=fp8, fp8_weight_caching=fp8_weight_caching, @@ -208,7 +234,7 @@ def _test_cuda_graphs( if not dpa: optimizer.zero_grad(set_to_none=False) for grad_accumulation_step in range(2): - inputs, grad_output = generate_data(config.s, bs, config.h, config.nheads, config.kv, dtype, dpa=dpa, gen_grad_output=True) + inputs, grad_output = generate_data(config, dtype, dpa=dpa, return_grad_output=True) with fp8_autocast(enabled=fp8): kwargs = {} if fp8_weight_caching: @@ -222,7 +248,6 @@ def _test_cuda_graphs( @pytest.mark.parametrize("dtype", dtypes) -@pytest.mark.parametrize("bs", [1, 2]) @pytest.mark.parametrize("model", model_configs.keys()) @pytest.mark.parametrize("num_layers", [1, 3]) @pytest.mark.parametrize("fp8", all_boolean) @@ -231,7 +256,6 @@ def _test_cuda_graphs( @pytest.mark.parametrize("module", modules) def test_gpt_make_graphed_callables( dtype: torch.dtype, - bs: int, model: str, num_layers: int, fp8: bool, @@ -252,7 +276,6 @@ def test_gpt_make_graphed_callables( kwargs = dict( config=config, - bs=bs, num_layers=num_layers, dtype=dtype, fp8=fp8, From 204436d21b0d7ac97204a6d326c61550d4bd6896 Mon Sep 17 00:00:00 2001 From: Tim Moon <4406448+timmoon10@users.noreply.github.com> Date: Mon, 3 Jun 2024 11:27:42 -0700 Subject: [PATCH 4/4] Update tests/pytorch/test_cuda_graphs.py Signed-off-by: Tim Moon 
<4406448+timmoon10@users.noreply.github.com> --- tests/pytorch/test_cuda_graphs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pytorch/test_cuda_graphs.py b/tests/pytorch/test_cuda_graphs.py index 1b296f85d2..71023c32f9 100644 --- a/tests/pytorch/test_cuda_graphs.py +++ b/tests/pytorch/test_cuda_graphs.py @@ -230,7 +230,7 @@ def _test_cuda_graphs( optimizer = torch.optim.SGD(model.parameters(), lr=0.001) # Launch. - for train_step in range(3): + for _ in range(3): if not dpa: optimizer.zero_grad(set_to_none=False) for grad_accumulation_step in range(2):