From cdd88590b10f8c2922dde06c2b204c2c73a741f4 Mon Sep 17 00:00:00 2001
From: Tim Moon
Date: Wed, 5 Feb 2025 22:58:08 +0000
Subject: [PATCH] Configure FSDP to use original params

Needed to avoid a bug with Transformer Engine LayerNorm, which needs to
access module parameters.

Signed-off-by: Tim Moon
---
 .../nlp/models/language_modeling/megatron_base_model.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
index 519835f74dae..41ba0b2693c7 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
@@ -1324,6 +1324,10 @@ def find_frozen_submodules(model):
             for submodule in frozen_submodule_names:
                 logging.debug(f"Ignoring state {submodule} in FSDP.")
             self.trainer.strategy.kwargs['ignored_states'] = frozen_submodules
+
+            # Transformer Engine expects that module parameters are available, so FSDP should not replace them
+            self.trainer.strategy.kwargs['use_orig_params'] = True
+
             # FSDP requires uniform status of require_grads
             # Diffusion models like SD has frozen parts and needs to be added to 'ignored_states'
             # from sharding for FSDP to work
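
For context, here is a minimal sketch (not part of the patch) of how the flag set above reaches PyTorch FSDP. It assumes the pytorch_lightning 2.x FSDPStrategy path that this NeMo model class uses: keyword arguments stored on the strategy, including the 'use_orig_params' entry the patch writes into self.trainer.strategy.kwargs, are forwarded to torch.distributed.fsdp.FullyShardedDataParallel when the model is wrapped. With use_orig_params=False (the PyTorch default), FSDP deregisters the original nn.Parameter objects in favor of flattened shards, which breaks code such as Transformer Engine LayerNorm that reads module parameters directly; use_orig_params=True keeps the original parameters registered (as views into FSDP's flat parameters).

# Sketch only: mirrors the effect of the patch outside of NeMo.
from pytorch_lightning.strategies import FSDPStrategy

# Keyword arguments not consumed by FSDPStrategy itself are stored on
# `strategy.kwargs` and later passed through to
# torch.distributed.fsdp.FullyShardedDataParallel when the module is wrapped.
strategy = FSDPStrategy(use_orig_params=True)

# NeMo sets the same flag after the strategy already exists by mutating
# that dict, which is what the hunk above does:
#     self.trainer.strategy.kwargs['use_orig_params'] = True
print(strategy.kwargs)  # expected to include {'use_orig_params': True}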