diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index d791505f3fe4..0624e373bca3 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -245,7 +245,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): if self.megatron_amp_O2: - if not self.with_distributed_adam: + if not self.with_distributed_adam and not self.cfg.get("use_cpu_initialization", False): # Pre-allocate the model on GPU to have master parameters allocated on the same device with matching data type if isinstance(self.model, list): for module in self.model: