From cdd88590b10f8c2922dde06c2b204c2c73a741f4 Mon Sep 17 00:00:00 2001
From: Tim Moon
Date: Wed, 5 Feb 2025 22:58:08 +0000
Subject: [PATCH] Configure FSDP to use original params

Needed to avoid a bug with Transformer Engine LayerNorm, which needs to
access module parameters.

Signed-off-by: Tim Moon
---
 .../nlp/models/language_modeling/megatron_base_model.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
index 519835f74dae..41ba0b2693c7 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
@@ -1324,6 +1324,10 @@ def find_frozen_submodules(model):
             for submodule in frozen_submodule_names:
                 logging.debug(f"Ignoring state {submodule} in FSDP.")
             self.trainer.strategy.kwargs['ignored_states'] = frozen_submodules
+
+            # Transformer Engine expects that module parameters are available, so FSDP should not replace them
+            self.trainer.strategy.kwargs['use_orig_params'] = True
+
             # FSDP requires uniform status of require_grads
             # Diffusion models like SD has frozen parts and needs to be added to 'ignored_states'
             # from sharding for FSDP to work
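
For context, here is a minimal sketch (not part of the patch) of how the flag set above reaches PyTorch FSDP. It assumes the pytorch_lightning 2.x FSDPStrategy path that this NeMo model class uses: keyword arguments stored on the strategy, including the 'use_orig_params' entry the patch writes into self.trainer.strategy.kwargs, are forwarded to torch.distributed.fsdp.FullyShardedDataParallel when the model is wrapped. With use_orig_params=False (the PyTorch default), FSDP deregisters the original nn.Parameter objects in favor of flattened shards, which breaks code such as Transformer Engine LayerNorm that reads module parameters directly; use_orig_params=True keeps the original parameters registered (as views into FSDP's flat parameters).

# Sketch only: mirrors the effect of the patch outside of NeMo.
from pytorch_lightning.strategies import FSDPStrategy

# Keyword arguments not consumed by FSDPStrategy itself are stored on
# `strategy.kwargs` and later passed through to
# torch.distributed.fsdp.FullyShardedDataParallel when the module is wrapped.
strategy = FSDPStrategy(use_orig_params=True)

# NeMo sets the same flag after the strategy already exists by mutating
# that dict, which is what the hunk above does:
#     self.trainer.strategy.kwargs['use_orig_params'] = True
print(strategy.kwargs)  # expected to include {'use_orig_params': True}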