Closed

Commits (24)
d42f1b3
Disable distopt contiguous param buffer by default (#7095)
timmoon10 Jul 25, 2023
295df4c
Use distributed optimizer support for multiple dtypes (#7359)
timmoon10 Sep 7, 2023
9f32182
fp8 poc usage with megatron-core
sudhakarsingh27 Sep 7, 2023
264a012
Add FP8 support to distopt
timmoon10 Sep 9, 2023
a8545c8
Correctly accumulate amax when param is split across buckets
timmoon10 Sep 11, 2023
023bfe5
Debug FP8 casts in distopt
timmoon10 Sep 18, 2023
4d15023
Optimize distopt handling of FP8 scaling factors
timmoon10 Sep 20, 2023
97f6dd0
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 20, 2023
a2763a5
Debug rebase errors
timmoon10 Sep 28, 2023
c1c050d
Fix bug when constructing GPT without TE
timmoon10 Sep 28, 2023
8d6bdb3
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 28, 2023
1edb8b5
Remove unrelated changes from main branch
timmoon10 Sep 28, 2023
a99ba3a
Update with refactored Float8Tensor class
timmoon10 Sep 29, 2023
7f6adc8
Update FP8 tensor scale factors in-place
timmoon10 Oct 1, 2023
9e3c788
Use updated initialization of FP8 models
timmoon10 Oct 3, 2023
ecd883d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 3, 2023
f229efc
Merge branch 'r1.20.0_pt_23.09' into fp8-distopt-r1.20.0
timmoon10 Oct 25, 2023
c9a8179
Update TE FP8 tensor support
timmoon10 Oct 25, 2023
7c390c2
Do not precompute FP8 transposes in distopt
timmoon10 Oct 27, 2023
e4b6751
Fix import error
timmoon10 Oct 31, 2023
e924d73
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 31, 2023
7e48101
Support distopt contiguous param buffer with FP8 params
timmoon10 Oct 31, 2023
48dc4a5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 31, 2023
1a8c355
Add envvar to configure FP8 params
timmoon10 Nov 4, 2023
10 changes: 6 additions & 4 deletions Dockerfile
@@ -45,12 +45,14 @@ RUN apt-get update && \
WORKDIR /workspace/

WORKDIR /tmp/
# TODO: Remove once this Apex commit (5/12/23) is included in PyTorch
# container

# Distributed Adam support for multiple dtypes
RUN git clone https://github.com/NVIDIA/apex.git && \
cd apex && \
git checkout 8b7a1ff183741dd8f9b87e7bafd04cfde99cea28 && \
pip3 install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" --global-option="--distributed_adam" --global-option="--deprecated_fused_adam" ./
git checkout 2386a912164b0c5cfcd8be7a2b890fbac5607c82 && \
pip3 install -v --no-build-isolation --config-settings --build-option="--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" .

RUN pip3 install git+https://github.com/timmoon10/TransformerEngine.git@float8tensor_experiments

# uninstall stuff from base container
RUN pip3 uninstall -y sacrebleu torchtext
24 changes: 12 additions & 12 deletions README.rst
@@ -41,14 +41,14 @@
Introduction
------------

NVIDIA NeMo is a conversational AI toolkit built for researchers working on automatic speech recognition (ASR),
text-to-speech synthesis (TTS), large language models (LLMs), and
natural language processing (NLP).
The primary objective of NeMo is to help researchers from industry and academia to reuse prior work (code and pretrained models)
and make it easier to create new `conversational AI models <https://developer.nvidia.com/conversational-ai#started>`_.

All NeMo models are trained with `Lightning <https://github.com/Lightning-AI/lightning>`_ and
training is automatically scalable to 1000s of GPUs.
Additionally, NeMo Megatron LLM models can be trained up to 1 trillion parameters using tensor and pipeline model parallelism.
NeMo models can be optimized for inference and deployed for production use-cases with `NVIDIA Riva <https://developer.nvidia.com/riva>`_.

@@ -57,14 +57,14 @@ State of the Art pretrained NeMo models are freely available on `HuggingFace Hub
`NVIDIA NGC <https://catalog.ngc.nvidia.com/models?query=nemo&orderBy=weightPopularDESC>`_.
These models can be used to transcribe audio, synthesize speech, or translate text in just a few lines of code.

We have extensive `tutorials <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/starthere/tutorials.html>`_ that
can all be run on `Google Colab <https://colab.research.google.com>`_.

For advanced users that want to train NeMo models from scratch or finetune existing NeMo models
we have a full suite of `example scripts <https://github.com/NVIDIA/NeMo/tree/main/examples>`_ that support multi-GPU/multi-node training.

For scaling NeMo LLM training on Slurm clusters or public clouds, please see the `NVIDIA NeMo Megatron Launcher <https://github.com/NVIDIA/NeMo-Megatron-Launcher>`_.
The NM launcher has extensive recipes, scripts, utilities, and documentation for training NeMo LLMs and also has an `Autoconfigurator <https://github.com/NVIDIA/NeMo-Megatron-Launcher#53-using-autoconfigurator-to-find-the-optimal-configuration>`_
which can be used to find the optimal model parallel configuration for training on a specific cluster.

Also see our `introductory video <https://www.youtube.com/embed/wBgpMf_KQVw>`_ for a high level overview of NeMo.
@@ -245,8 +245,8 @@ To install Apex, run

git clone https://github.com/NVIDIA/apex.git
cd apex
git checkout 57057e2fcf1c084c0fcc818f55c0ff6ea1b24ae2
pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" --global-option="--distributed_adam" --global-option="--deprecated_fused_adam" ./
git checkout 2386a912164b0c5cfcd8be7a2b890fbac5607c82
pip3 install -v --no-build-isolation --config-settings --build-option="--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" .

It is highly recommended to use the NVIDIA PyTorch or NeMo container if having issues installing Apex or any other dependencies.

@@ -283,7 +283,7 @@ Transformer Engine requires PyTorch to be built with CUDA 11.8.

Flash Attention
~~~~~~~~~~~~~~~~~~~~
Transformer Engine already supports Flash Attention for GPT models. If you want to use Flash Attention for non-causal models or use with attention bias (introduced from position encoding, e.g. Alibi), please install `flash-attn <https://github.com/HazyResearch/flash-attention>`_.

.. code-block:: bash

@@ -292,7 +292,7 @@ Transformer Engine already supports Flash Attention for GPT models. If you want

NLP inference UI
~~~~~~~~~~~~~~~~~~~~
To launch the inference web UI server, please install the gradio `gradio <https://gradio.app/>`_.

.. code-block:: bash

Original file line number Diff line number Diff line change
@@ -66,7 +66,7 @@ class MegatronBaseModel(NLPModel):

- Initialize the model parallel world for nemo.
- Turn on all of the nvidia optimizations.
- If `cfg.tokenizer` is available, it loads the tokenizer and pad the vocab to the
correct size for tensor model parallelism.
- If using distributed optimizer, configure to be compatible
with O2 level optimizations and/or model parallelism.
@@ -407,9 +407,8 @@ def setup_optimization(
optim_kwargs = {} if optim_kwargs is None else optim_kwargs.copy()
if self.with_distributed_adam:

# Allocate contiguous buffers to avoid extra copies
# Allocate contiguous buffer to avoid extra copies
optim_kwargs['contiguous_grad_buffer'] = True
optim_kwargs['contiguous_param_buffer'] = True

# Make sure optimizer state is in FP32
optim_dtype = torch.float32
@@ -490,9 +489,11 @@ def configure_optimizers(self):
if self.with_distributed_adam:

# Initialize param buckets if explicitly provided
if hasattr(self, 'distributed_adam_buckets'):
if getattr(self, 'distributed_adam_buckets', None):
for bucket in self.distributed_adam_buckets:
self._optimizer.init_params_bucket(bucket)
self._optimizer.init_params_bucket(self.parameters())
if hasattr(self, 'distributed_adam_buckets'):
del self.distributed_adam_buckets

# Make sure all params are initialized so main grads are
@@ -509,7 +510,8 @@ def configure_optimizers(self):
self._optimizer.init_params(reversed(no_overlap_params))

# Initialize contiguous parameter buffer
self._optimizer.init_param_buffer()
if self._optimizer.contiguous_param_buffer:
self._optimizer.init_param_buffer()

if self._scheduler is None:
return self._optimizer
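A condensed sketch of the bucket-initialization flow that the configure_optimizers hunks above implement, assuming the Apex distributed Adam API already referenced in this diff (`init_params_bucket`, `init_param_buffer`, and the `contiguous_param_buffer` attribute); the standalone helper and its name are illustrative, not part of the PR.

```python
def init_distributed_adam_state(model, optimizer):
    """Illustrative mirror of the configure_optimizers logic above."""
    # Use explicitly provided parameter buckets when the subclass
    # registered them, otherwise fall back to one bucket with all params.
    buckets = getattr(model, 'distributed_adam_buckets', None)
    if buckets:
        for bucket in buckets:
            optimizer.init_params_bucket(bucket)
    else:
        optimizer.init_params_bucket(model.parameters())
    if hasattr(model, 'distributed_adam_buckets'):
        del model.distributed_adam_buckets

    # The contiguous param buffer is no longer forced on in
    # setup_optimization, so only allocate it when the optimizer
    # was configured with contiguous_param_buffer=True.
    if optimizer.contiguous_param_buffer:
        optimizer.init_param_buffer()
```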
Original file line number Diff line number Diff line change
@@ -16,6 +16,7 @@
import os
import queue
import warnings
from contextlib import nullcontext
from functools import partial
from typing import Any, Dict, Iterator, List, Optional, Union

@@ -226,11 +227,19 @@ def __init__(self, cfg: DictConfig, trainer: Trainer):
virtual_pipeline_model_parallel_size=self.cfg.get('virtual_pipeline_model_parallel_size', None),
)
else:
self.model = build_model(
model_provider_func=self.model_provider_func,
wrap_with_ddp=False,
virtual_pipeline_model_parallel_size=self.cfg.get('virtual_pipeline_model_parallel_size', None),
)
fp8_enabled = cfg.get('fp8', False) and int(os.getenv("NEMO_WITH_FP8_PARAMS", "1"))
make_model_context = nullcontext
if fp8_enabled and HAVE_TE:
fp8_recipe = transformer_engine.common.recipe.DelayedScaling(
margin=0, interval=1, fp8_format=transformer_engine.common.recipe.Format.E4M3,
)
make_model_context = partial(transformer_engine.pytorch.fp8_model_init, enabled=True)
with make_model_context():
self.model = build_model(
model_provider_func=self.model_provider_func,
wrap_with_ddp=False,
virtual_pipeline_model_parallel_size=self.cfg.get('virtual_pipeline_model_parallel_size', None),
)

# if we're not using interleaved, then self.model is a module.
if self.cfg.get('virtual_pipeline_model_parallel_size', None) is None:
@@ -437,10 +446,6 @@ def configure_optimizers(self):
[p for p in layer.parameters() if not getattr(p, '_disable_overlap_grad_sync', False)]
)
buckets.reverse()
used_params = set()
for bucket in buckets:
used_params.update(bucket)
buckets[-1].extend(p for p in self.parameters() if p not in used_params)
self.distributed_adam_buckets = buckets

return super().configure_optimizers()
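A minimal sketch of the FP8-parameter construction pattern introduced in the `__init__` hunk above: when `fp8` is enabled in the config and the `NEMO_WITH_FP8_PARAMS` environment variable is set, the model is built inside Transformer Engine's `fp8_model_init` context so that module weights are allocated as FP8 tensors. This assumes a Transformer Engine build that provides `fp8_model_init` (the Dockerfile pins an experimental Float8Tensor branch); the `Linear` layer and `cfg_fp8` flag below are stand-ins for the full GPT model returned by `build_model` and the `cfg.get('fp8', False)` check in the diff.

```python
import os
from contextlib import nullcontext
from functools import partial

import transformer_engine.pytorch as te_pt

# Mirror of the gating logic above: FP8 params require the config flag
# and are additionally controlled by the NEMO_WITH_FP8_PARAMS env var.
cfg_fp8 = True  # stands in for self.cfg.get('fp8', False) in the diff
fp8_params_enabled = cfg_fp8 and bool(int(os.getenv("NEMO_WITH_FP8_PARAMS", "1")))

make_model_context = nullcontext
if fp8_params_enabled:
    # Modules constructed inside this context allocate their weights
    # in FP8 (Float8Tensor) instead of higher-precision tensors.
    make_model_context = partial(te_pt.fp8_model_init, enabled=True)

with make_model_context():
    # Stand-in for build_model(...); any TE module built here gets
    # FP8 parameters that the distributed optimizer can then manage.
    model = te_pt.Linear(1024, 1024, bias=True)
```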
2 changes: 1 addition & 1 deletion nemo/collections/nlp/modules/common/megatron/clip_grads.py
@@ -200,7 +200,7 @@ def clip_grad_norm_distributed_optimizer(optimizer, max_norm, norm_type=2):
# - parameter should not be shared
# - should not be a replica due to tensor model parallelism
params_for_norm = []
for param in optimizer.parameters(with_fp32_optim_params=True):
for param in optimizer.parameters():
is_not_shared = param_is_not_shared(param)
is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param)
if is_not_shared and is_not_tp_duplicate:
2 changes: 0 additions & 2 deletions nemo/collections/nlp/modules/common/megatron/transformer.py
@@ -835,8 +835,6 @@ def __init__(
params_dtype=params_dtype,
get_rng_state_tracker=get_rng_state_tracker,
fuse_wgrad_accumulation=fuse_wgrad_accumulation,
apply_query_key_layer_scaling=apply_query_key_layer_scaling,
attention_softmax_in_fp32=attention_softmax_in_fp32,
seq_length=seq_length,
micro_batch_size=micro_batch_size,
sequence_parallel=sequence_parallel,