From dcf6c7d95d021e923b545af83636c5d6bba20386 Mon Sep 17 00:00:00 2001 From: MaximumEntropy Date: Thu, 3 Nov 2022 21:13:07 -0700 Subject: [PATCH 01/17] Initial refactor Signed-off-by: MaximumEntropy --- .../conf/megatron_t5_config.yaml | 4 +- ...megatron_t5_config_finetune_glue_mnli.yaml | 8 +- .../megatron_t5_seq2seq_finetune.py | 110 +++++++++++++++--- .../megatron_finetune_model.py | 1 + 4 files changed, 100 insertions(+), 23 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config.yaml index 2a8c2912e395..ad6024f199e2 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_config.yaml @@ -6,7 +6,7 @@ name: megatron_t5 restore_from_path: null # used when starting from a .nemo file trainer: - devices: 1 + devices: 2 num_nodes: 1 accelerator: gpu precision: 16 @@ -47,7 +47,7 @@ model: micro_batch_size: 4 global_batch_size: 8 # will use more micro batches to reach global batch size tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 + pipeline_model_parallel_size: 2 resume_from_checkpoint: null # manually set the checkpoint file to load from pipeline_model_parallel_split_rank: 0 # rank at which decoder starts. diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml index ac68b57e0216..58793630a38d 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml @@ -28,7 +28,7 @@ exp_manager: resume_ignore_no_checkpoint: True create_checkpoint_callback: True checkpoint_callback_params: - monitor: validation_${model.data.validation_ds.metric.name} + monitor: validation_exact_string_match save_top_k: 10 mode: max always_save_nemo: False # TODO: add support @@ -37,7 +37,11 @@ exp_manager: save_best_model: True model: - restore_from_path: ??? # Path to a trained T5 .nemo file + restore_from_path: null # Path to a trained T5 .nemo file + pretrained_checkpoint: + checkpoint_dir: null # Path to a folder that contains a .ckpt file + checkpoint_name: null # Name of the .ckpt file within the checkpoint_dir. + hparams_file: null # Path to a .yaml file that contains the hyperparameters of the checkpoint. tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 pipeline_model_parallel_split_rank: 0 diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py index 22883657736f..5c1a7e67b872 100644 --- a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os +import tempfile from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks.timer import Timer @@ -29,9 +31,83 @@ PipelineMixedPrecisionPlugin, ) from nemo.core.config import hydra_runner -from nemo.utils import logging +from nemo.utils import logging, AppState from nemo.utils.exp_manager import StatelessTimer, exp_manager +from nemo.utils.model_utils import inject_model_parallel_rank +def _modify_config(t5_cfg, cfg, add_cfg_to_tree=False): + OmegaConf.set_struct(t5_cfg, True) + with open_dict(t5_cfg): + t5_cfg.megatron_amp_O2 = cfg.model.get('megatron_amp_O2', False) + if hasattr(t5_cfg, 'encoder') and hasattr(t5_cfg, 'decoder'): + t5_cfg.encoder.masked_softmax_fusion = False + t5_cfg.decoder.masked_softmax_fusion = False + t5_cfg.encoder.hidden_dropout = cfg.model.get('hidden_dropout', 0.1) + t5_cfg.decoder.hidden_dropout = cfg.model.get('hidden_dropout', 0.1) + if hasattr(t5_cfg.encoder, 'ffn_dropout'): + t5_cfg.encoder.ffn_dropout = cfg.model.get('ffn_dropout', 0.1) + if hasattr(t5_cfg.decoder, 'ffn_dropout'): + t5_cfg.decoder.ffn_dropout = cfg.model.get('ffn_dropout', 0.1) + else: + t5_cfg.hidden_dropout = cfg.model.get('hidden_dropout', 0.1) + t5_cfg.attention_dropout = cfg.model.get('attention_dropout', 0.1) + t5_cfg.masked_softmax_fusion = False + t5_cfg.data = cfg.model.data + t5_cfg.precision = cfg.trainer.precision + t5_cfg.optim = cfg.model.optim + t5_cfg.micro_batch_size = cfg.model.data.train_ds.micro_batch_size + t5_cfg.global_batch_size = cfg.model.data.train_ds.global_batch_size + # XNLI has eval languages in the yaml config. + if hasattr(cfg.model, 'eval_languages'): + t5_cfg.eval_languages = cfg.model.eval_languages + + # This is needed when modifying a hparam file directly to load `.ckpt` files. + # This is not needed to modify the cfg in `.nemo` files. 
+ if add_cfg_to_tree: + t5_cfg.cfg = cfg + + return t5_cfg + +def load_from_nemo(cls, restore_from_path, trainer, t5_cfg): + model = cls.restore_from( + restore_path=cfg.model.restore_from_path, + trainer=trainer, + override_config_path=t5_cfg, + save_restore_connector=NLPSaveRestoreConnector(), + ) + return model + +def load_from_checkpoint_dir(cls, cfg, trainer): + app_state = AppState() + if cfg.model.tensor_model_parallel_size > 1 or cfg.model.pipeline_model_parallel_size > 1: + app_state.model_parallel_size = cfg.model.tensor_model_parallel_size * cfg.pipeline_model_parallel_size + app_state.tensor_model_parallel_size = cfg.model.tensor_model_parallel_size + app_state.pipeline_model_parallel_size = cfg.model.pipeline_model_parallel_size + ( + app_state.tensor_model_parallel_rank, + app_state.pipeline_model_parallel_rank, + app_state.model_parallel_size, + app_state.data_parallel_size, + app_state.pipeline_model_parallel_split_rank, + app_state.virtual_pipeline_model_parallel_rank, + ) = fake_initialize_model_parallel( + world_size=app_state.model_parallel_size, + rank=trainer.global_rank, + tensor_model_parallel_size_=cfg.model.tensor_model_parallel_size, + pipeline_model_parallel_size_=cfg.model.pipeline_model_parallel_size, + pipeline_model_parallel_split_rank_=cfg.model.pipeline_model_parallel_split_rank, + ) + checkpoint_path = inject_model_parallel_rank(os.path.join(cfg.model.pretrained_checkpoint.checkpoint_dir, cfg.model.pretrained_checkpoint.checkpoint_name)) + hparams_file = OmegaConf.load(cfg.model.pretrained_checkpoint.hparams_file) + t5_cfg = _modify_config(hparams_file.cfg, cfg, add_cfg_to_tree=True) + with tempfile.NamedTemporaryFile(suffix='.yaml') as f: + OmegaConf.save(config=t5_cfg, f=f.name) + model = cls.load_from_checkpoint( + checkpoint_path=checkpoint_path, + trainer=trainer, + hparams_file=f.name, + ) + return model @hydra_runner(config_path="conf", config_name="megatron_t5_config_finetune_glue_mnli") def main(cfg) -> None: @@ -78,6 +154,7 @@ def main(cfg) -> None: if isinstance(callback, Timer): trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time,) + ''' # Get the T5 Base configuration. t5_cfg = MegatronT5FinetuneModel.restore_from( restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True @@ -108,28 +185,23 @@ def main(cfg) -> None: # XNLI has eval languages in the yaml config. 
if hasattr(cfg.model, 'eval_languages'): t5_cfg.eval_languages = cfg.model.eval_languages + ''' if hasattr(cfg.model.data.train_ds, 'task_name'): - model = MegatronT5GLUEModel.restore_from( - restore_path=cfg.model.restore_from_path, - trainer=trainer, - override_config_path=t5_cfg, - save_restore_connector=NLPSaveRestoreConnector(), - ) + if cfg.model.restore_from_path: + model = load_from_nemo(MegatronT5GLUEModel, cfg.model.restore_from_path, trainer, t5_cfg) + else: + model = load_from_checkpoint_dir(MegatronT5GLUEModel, cfg, trainer) elif hasattr(cfg.model.data.train_ds, 'file_names'): - model = MegatronT0Model.restore_from( - restore_path=cfg.model.restore_from_path, - trainer=trainer, - override_config_path=t5_cfg, - save_restore_connector=NLPSaveRestoreConnector(), - ) + if cfg.model.restore_from_path: + model = load_from_nemo(MegatronT0Model, cfg.model.restore_from_path, trainer, t5_cfg) + else: + model = load_from_checkpoint_dir(MegatronT0Model, cfg, trainer, t5_cfg) else: - model = MegatronT5FinetuneModel.restore_from( - restore_path=cfg.model.restore_from_path, - trainer=trainer, - override_config_path=t5_cfg, - save_restore_connector=NLPSaveRestoreConnector(), - ) + if cfg.model.restore_from_path: + model = load_from_nemo(MegatronT5FinetuneModel, cfg.model.restore_from_path, trainer, t5_cfg) + else: + model = load_from_checkpoint_dir(MegatronT5FinetuneModel, cfg, trainer, t5_cfg) trainer.fit(model) trainer.validate(model) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py index e29cea2264b6..dd4419a7c087 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py @@ -42,6 +42,7 @@ class MegatronT5FinetuneModel(MegatronT5Model): def __init__(self, cfg: DictConfig, trainer: Trainer): super().__init__(cfg, trainer=trainer) + import ipdb; ipdb.set_trace() self.val_metric, self.val_metric_name = self.setup_metric(self.cfg.data.validation_ds) self.val_metric = torch.nn.ModuleList(self.val_metric) if hasattr(self.cfg.data, "test_ds"): From f2a4fa4171f20ce0a2abe982b75443825da4681b Mon Sep 17 00:00:00 2001 From: MaximumEntropy Date: Fri, 4 Nov 2022 13:41:19 -0700 Subject: [PATCH 02/17] Resolve config before passing to load_from_checkpoint Signed-off-by: MaximumEntropy --- examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py | 3 ++- .../nlp/models/language_modeling/megatron_finetune_model.py | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py index 5c1a7e67b872..887a30871164 100644 --- a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py @@ -64,7 +64,8 @@ def _modify_config(t5_cfg, cfg, add_cfg_to_tree=False): # This is needed when modifying a hparam file directly to load `.ckpt` files. # This is not needed to modify the cfg in `.nemo` files. 
if add_cfg_to_tree: - t5_cfg.cfg = cfg + OmegaConf.resolve(t5_cfg) + t5_cfg.cfg = t5_cfg return t5_cfg diff --git a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py index dd4419a7c087..e29cea2264b6 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py @@ -42,7 +42,6 @@ class MegatronT5FinetuneModel(MegatronT5Model): def __init__(self, cfg: DictConfig, trainer: Trainer): super().__init__(cfg, trainer=trainer) - import ipdb; ipdb.set_trace() self.val_metric, self.val_metric_name = self.setup_metric(self.cfg.data.validation_ds) self.val_metric = torch.nn.ModuleList(self.val_metric) if hasattr(self.cfg.data, "test_ds"): From 09230c996691ef8d889b9293c909206f8039bbff Mon Sep 17 00:00:00 2001 From: MaximumEntropy Date: Fri, 4 Nov 2022 14:15:12 -0700 Subject: [PATCH 03/17] Fixes for model parallel and nemo restore Signed-off-by: MaximumEntropy --- .../megatron_t5_seq2seq_finetune.py | 58 +++++++------------ 1 file changed, 20 insertions(+), 38 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py index 887a30871164..b7549cddace5 100644 --- a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py @@ -20,6 +20,7 @@ from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector +from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel from nemo.collections.nlp.models.language_modeling.megatron_finetune_model import MegatronT5FinetuneModel from nemo.collections.nlp.models.language_modeling.megatron_glue_model import MegatronT5GLUEModel from nemo.collections.nlp.models.language_modeling.megatron_t0_model import MegatronT0Model @@ -36,6 +37,10 @@ from nemo.utils.model_utils import inject_model_parallel_rank def _modify_config(t5_cfg, cfg, add_cfg_to_tree=False): + """ + This function modifies the original t5 pre-training config (t5_cfg) with attributes from the finetuning config (cfg). + The `add_cfg_to_tree` arg adds `cfg` to the top of the yaml tree which is needed for all `hparams.yaml` files when passed as an arg to `load_from_checkpoint()`. 
+ """ OmegaConf.set_struct(t5_cfg, True) with open_dict(t5_cfg): t5_cfg.megatron_amp_O2 = cfg.model.get('megatron_amp_O2', False) @@ -69,7 +74,8 @@ def _modify_config(t5_cfg, cfg, add_cfg_to_tree=False): return t5_cfg -def load_from_nemo(cls, restore_from_path, trainer, t5_cfg): +def load_from_nemo(cls, cfg, trainer, t5_cfg): + t5_cfg = _modify_config(t5_cfg, cfg, add_cfg_to_tree=False) model = cls.restore_from( restore_path=cfg.model.restore_from_path, trainer=trainer, @@ -81,7 +87,7 @@ def load_from_nemo(cls, restore_from_path, trainer, t5_cfg): def load_from_checkpoint_dir(cls, cfg, trainer): app_state = AppState() if cfg.model.tensor_model_parallel_size > 1 or cfg.model.pipeline_model_parallel_size > 1: - app_state.model_parallel_size = cfg.model.tensor_model_parallel_size * cfg.pipeline_model_parallel_size + app_state.model_parallel_size = cfg.model.tensor_model_parallel_size * cfg.model.pipeline_model_parallel_size app_state.tensor_model_parallel_size = cfg.model.tensor_model_parallel_size app_state.pipeline_model_parallel_size = cfg.model.pipeline_model_parallel_size ( @@ -155,52 +161,28 @@ def main(cfg) -> None: if isinstance(callback, Timer): trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time,) - ''' - # Get the T5 Base configuration. - t5_cfg = MegatronT5FinetuneModel.restore_from( - restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True - ) - - # Override the T5 configuration with the one from the config file. - OmegaConf.set_struct(t5_cfg, True) - with open_dict(t5_cfg): - t5_cfg.megatron_amp_O2 = cfg.model.get('megatron_amp_O2', False) - if hasattr(t5_cfg, 'encoder') and hasattr(t5_cfg, 'decoder'): - t5_cfg.encoder.masked_softmax_fusion = False - t5_cfg.decoder.masked_softmax_fusion = False - t5_cfg.encoder.hidden_dropout = cfg.model.get('hidden_dropout', 0.1) - t5_cfg.decoder.hidden_dropout = cfg.model.get('hidden_dropout', 0.1) - if hasattr(t5_cfg.encoder, 'ffn_dropout'): - t5_cfg.encoder.ffn_dropout = cfg.model.get('ffn_dropout', 0.1) - if hasattr(t5_cfg.decoder, 'ffn_dropout'): - t5_cfg.decoder.ffn_dropout = cfg.model.get('ffn_dropout', 0.1) - else: - t5_cfg.hidden_dropout = cfg.model.get('hidden_dropout', 0.1) - t5_cfg.attention_dropout = cfg.model.get('attention_dropout', 0.1) - t5_cfg.masked_softmax_fusion = False - t5_cfg.data = cfg.model.data - t5_cfg.precision = cfg.trainer.precision - t5_cfg.optim = cfg.model.optim - t5_cfg.micro_batch_size = cfg.model.data.train_ds.micro_batch_size - t5_cfg.global_batch_size = cfg.model.data.train_ds.global_batch_size - # XNLI has eval languages in the yaml config. 
- if hasattr(cfg.model, 'eval_languages'): - t5_cfg.eval_languages = cfg.model.eval_languages - ''' - if hasattr(cfg.model.data.train_ds, 'task_name'): if cfg.model.restore_from_path: - model = load_from_nemo(MegatronT5GLUEModel, cfg.model.restore_from_path, trainer, t5_cfg) + t5_cfg = MegatronT5GLUEModel.restore_from( + restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True + ) + model = load_from_nemo(MegatronT5GLUEModel, cfg, trainer, t5_cfg) else: model = load_from_checkpoint_dir(MegatronT5GLUEModel, cfg, trainer) elif hasattr(cfg.model.data.train_ds, 'file_names'): if cfg.model.restore_from_path: - model = load_from_nemo(MegatronT0Model, cfg.model.restore_from_path, trainer, t5_cfg) + t5_cfg = MegatronT0Model.restore_from( + restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True + ) + model = load_from_nemo(MegatronT0Model, cfg, trainer, t5_cfg) else: model = load_from_checkpoint_dir(MegatronT0Model, cfg, trainer, t5_cfg) else: if cfg.model.restore_from_path: - model = load_from_nemo(MegatronT5FinetuneModel, cfg.model.restore_from_path, trainer, t5_cfg) + t5_cfg = MegatronT5FinetuneModel.restore_from( + restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True + ) + model = load_from_nemo(MegatronT5FinetuneModel, cfg, trainer, t5_cfg) else: model = load_from_checkpoint_dir(MegatronT5FinetuneModel, cfg, trainer, t5_cfg) From 6ba13758770e01ae317e63d4d8200f30005264ce Mon Sep 17 00:00:00 2001 From: MaximumEntropy Date: Mon, 7 Nov 2022 09:39:39 -0800 Subject: [PATCH 04/17] Fixes for eval Signed-off-by: MaximumEntropy --- .../conf/megatron_t0_config.yaml | 8 +- .../megatron_t5_config_finetune_eval.yaml | 6 +- ...megatron_t5_config_finetune_glue_eval.yaml | 9 +- ...megatron_t5_config_finetune_glue_xnli.yaml | 8 +- .../conf/megatron_t5_finetune.yaml | 6 +- .../megatron_t5_seq2seq_eval.py | 111 ++++++++++-------- .../megatron_t5_seq2seq_finetune.py | 31 +++-- 7 files changed, 113 insertions(+), 66 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_t0_config.yaml b/examples/nlp/language_modeling/conf/megatron_t0_config.yaml index 04503cac769f..503dc17d2acc 100644 --- a/examples/nlp/language_modeling/conf/megatron_t0_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t0_config.yaml @@ -36,7 +36,11 @@ exp_manager: save_best_model: True model: - restore_from_path: ??? # Path to a trained T5 or LM-adapted T5 .nemo file + restore_from_path: null # Path to a trained T5 .nemo file + pretrained_checkpoint: + checkpoint_dir: null # Path to a folder that contains a .ckpt file + checkpoint_name: null # Name of the .ckpt file within the checkpoint_dir. + hparams_file: null # Path to a .yaml file that contains the hyperparameters of the checkpoint. 
tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 pipeline_model_parallel_split_rank: 0 @@ -82,7 +86,7 @@ model: num_classes: null replace_bos_with_pad: ${data.train_ds.replace_bos_with_pad} add_bos_to_input: ${data.train_ds.add_bos_to_input} - add_eos_to_input: ${data.train_ds.replace_bos_with_pad} + add_eos_to_input: ${data.train_ds.add_eos_to_input} seed: 1234 optim: diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_eval.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_eval.yaml index 8be471a78dde..bc1a7420df48 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_eval.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_eval.yaml @@ -17,7 +17,11 @@ exp_manager: create_checkpoint_callback: False model: - restore_from_path: ??? # Path to a finetuned T5 .nemo file + restore_from_path: null # Path to a trained T5 .nemo file + pretrained_checkpoint: + checkpoint_dir: null # Path to a folder that contains a .ckpt file + checkpoint_name: null # Name of the .ckpt file within the checkpoint_dir. + hparams_file: null # Path to a .yaml file that contains the hyperparameters of the checkpoint. gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) megatron_amp_O2: False # Enable O2 optimization for megatron amp diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml index 87ce5ac03eb5..024ad5f66ae9 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml @@ -17,9 +17,16 @@ exp_manager: create_checkpoint_callback: False model: - restore_from_path: ??? # Path to a finetuned T5 .nemo file + restore_from_path: null # Path to a trained T5 .nemo file + pretrained_checkpoint: + checkpoint_dir: null # Path to a folder that contains a .ckpt file + checkpoint_name: null # Name of the .ckpt file within the checkpoint_dir. + hparams_file: null # Path to a .yaml file that contains the hyperparameters of the checkpoint. gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) megatron_amp_O2: False # Enable O2 optimization for megatron amp + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + pipeline_model_parallel_split_rank: 0 data: validation_ds: diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml index 1b08bc37246e..486a6da14135 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml @@ -37,9 +37,13 @@ exp_manager: save_best_model: True model: - restore_from_path: ??? + restore_from_path: null # Path to a trained T5 .nemo file + pretrained_checkpoint: + checkpoint_dir: null # Path to a folder that contains a .ckpt file + checkpoint_name: null # Name of the .ckpt file within the checkpoint_dir. + hparams_file: null # Path to a .yaml file that contains the hyperparameters of the checkpoint. 
tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 2 + pipeline_model_parallel_size: 1 pipeline_model_parallel_split_rank: 1 gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) resume_from_checkpoint: null diff --git a/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml b/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml index 9a5cf15cfe74..8c383aad9c78 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml @@ -36,7 +36,11 @@ exp_manager: save_best_model: True model: - restore_from_path: ??? # Path to a trained T5 .nemo file + restore_from_path: null # Path to a trained T5 .nemo file + pretrained_checkpoint: + checkpoint_dir: null # Path to a folder that contains a .ckpt file + checkpoint_name: null # Name of the .ckpt file within the checkpoint_dir. + hparams_file: null # Path to a .yaml file that contains the hyperparameters of the checkpoint. tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 pipeline_model_parallel_split_rank: 0 diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py index 25fd84d800d4..c4512fd157d6 100644 --- a/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py @@ -30,6 +30,45 @@ from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import StatelessTimer, exp_manager +from megatron_t5_seq2seq_finetune import load_from_nemo, load_from_checkpoint_dir, validate_checkpoint_loading_args + +def _modify_config(t5_cfg, cfg, add_cfg_to_tree=False): + """ + This function modifies the original t5 pre-training config (t5_cfg) with attributes from the finetuning config (cfg). + The `add_cfg_to_tree` arg adds `cfg` to the top of the yaml tree which is needed for all `hparams.yaml` files when passed as an arg to `load_from_checkpoint()`. + """ + OmegaConf.set_struct(t5_cfg, True) + with open_dict(t5_cfg): + t5_cfg.precision = cfg.trainer.precision + # Overwrite data configs + if cfg.model.data.validation_ds.get('src_file_name', None) is not None: + logging.info( + 'Found validation_ds.src_file_name in the config file. Overriding the finetuned model config file with the values from the new config file.' + ) + t5_cfg.data.validation_ds.src_file_name = cfg.model.data.validation_ds.src_file_name + if cfg.model.data.validation_ds.get('tgt_file_name', None) is not None: + logging.info( + 'Found validation_ds.tgt_file_name in the config file. Overriding the finetuned model config file with the values from the new config file.' + ) + t5_cfg.data.validation_ds.tgt_file_name = cfg.model.data.validation_ds.tgt_file_name + + if "write_predictions_to_file" in cfg.model.data.validation_ds: + t5_cfg.data.validation_ds.write_predictions_to_file = ( + cfg.model.data.validation_ds.write_predictions_to_file + ) + if "output_file_path_prefix" in cfg.model.data.validation_ds: + t5_cfg.data.validation_ds.output_file_path_prefix = cfg.model.data.validation_ds.output_file_path_prefix + + t5_cfg.data.validation_ds.micro_batch_size = cfg.model.data.validation_ds.micro_batch_size + t5_cfg.data.validation_ds.global_batch_size = cfg.model.data.validation_ds.global_batch_size + + # This is needed when modifying a hparam file directly to load `.ckpt` files. + # This is not needed to modify the cfg in `.nemo` files. 
+ if add_cfg_to_tree: + OmegaConf.resolve(t5_cfg) + t5_cfg.cfg = t5_cfg + + return t5_cfg @hydra_runner(config_path="conf", config_name="megatron_t5_config_finetune_glue_eval") @@ -69,59 +108,33 @@ def main(cfg) -> None: if isinstance(callback, Timer): trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time,) - t5_cfg = MegatronT5GLUEModel.restore_from( - restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True - ) - - # Override the T5 configuration with the one from the config file. - # NOTE: Only data can be overriden here since this the file being restored here should already correspond to a GLUE/XNLI finetuned model. - OmegaConf.set_struct(t5_cfg, True) - with open_dict(t5_cfg): - t5_cfg.precision = cfg.trainer.precision - # Overwrite data configs - if cfg.model.data.validation_ds.get('src_file_name', None) is not None: - logging.info( - 'Found validation_ds.src_file_name in the config file. Overriding the finetuned model config file with the values from the new config file.' - ) - t5_cfg.data.validation_ds.src_file_name = cfg.model.data.validation_ds.src_file_name - if cfg.model.data.validation_ds.get('tgt_file_name', None) is not None: - logging.info( - 'Found validation_ds.tgt_file_name in the config file. Overriding the finetuned model config file with the values from the new config file.' - ) - t5_cfg.data.validation_ds.tgt_file_name = cfg.model.data.validation_ds.tgt_file_name - - if "write_predictions_to_file" in cfg.model.data.validation_ds: - t5_cfg.data.validation_ds.write_predictions_to_file = ( - cfg.model.data.validation_ds.write_predictions_to_file - ) - if "output_file_path_prefix" in cfg.model.data.validation_ds: - t5_cfg.data.validation_ds.output_file_path_prefix = cfg.model.data.validation_ds.output_file_path_prefix - t5_cfg.data.validation_ds.src_file_name = cfg.model.data.validation_ds.src_file_name - - t5_cfg.data.validation_ds.micro_batch_size = cfg.model.data.validation_ds.micro_batch_size - t5_cfg.data.validation_ds.global_batch_size = cfg.model.data.validation_ds.global_batch_size - - if hasattr(cfg.model.data.validation_ds, 'task_name'): - model = MegatronT5GLUEModel.restore_from( - restore_path=cfg.model.restore_from_path, - trainer=trainer, - override_config_path=t5_cfg, - save_restore_connector=NLPSaveRestoreConnector(), + if hasattr(cfg.model.data.validation_ds, 'task_name'): + if cfg.model.restore_from_path: + t5_cfg = MegatronT5GLUEModel.restore_from( + restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True ) - elif hasattr(cfg.model.data.validation_ds, 'file_names'): - model = MegatronT0Model.restore_from( - restore_path=cfg.model.restore_from_path, - trainer=trainer, - override_config_path=t5_cfg, - save_restore_connector=NLPSaveRestoreConnector(), + model = load_from_nemo(MegatronT5GLUEModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) + else: + validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint) + model = load_from_checkpoint_dir(MegatronT5GLUEModel, cfg, trainer, modify_confg_fn=_modify_config) + elif hasattr(cfg.model.data.validation_ds, 'file_names'): + if cfg.model.restore_from_path: + t5_cfg = MegatronT0Model.restore_from( + restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True ) + model = load_from_nemo(MegatronT0Model, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) else: - model = MegatronT5FinetuneModel.restore_from( - restore_path=cfg.model.restore_from_path, - trainer=trainer, - override_config_path=t5_cfg, - 
save_restore_connector=NLPSaveRestoreConnector(), + validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint) + model = load_from_checkpoint_dir(MegatronT0Model, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) + else: + if cfg.model.restore_from_path: + t5_cfg = MegatronT5FinetuneModel.restore_from( + restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True ) + model = load_from_nemo(MegatronT5FinetuneModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) + else: + validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint) + model = load_from_checkpoint_dir(MegatronT5FinetuneModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) model.freeze() trainer.validate(model) diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py index b7549cddace5..d89b1fa28473 100644 --- a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py @@ -74,8 +74,8 @@ def _modify_config(t5_cfg, cfg, add_cfg_to_tree=False): return t5_cfg -def load_from_nemo(cls, cfg, trainer, t5_cfg): - t5_cfg = _modify_config(t5_cfg, cfg, add_cfg_to_tree=False) +def load_from_nemo(cls, cfg, trainer, t5_cfg, modify_confg_fn): + t5_cfg = modify_confg_fn(t5_cfg, cfg, add_cfg_to_tree=False) model = cls.restore_from( restore_path=cfg.model.restore_from_path, trainer=trainer, @@ -84,7 +84,7 @@ def load_from_nemo(cls, cfg, trainer, t5_cfg): ) return model -def load_from_checkpoint_dir(cls, cfg, trainer): +def load_from_checkpoint_dir(cls, cfg, trainer, modify_confg_fn): app_state = AppState() if cfg.model.tensor_model_parallel_size > 1 or cfg.model.pipeline_model_parallel_size > 1: app_state.model_parallel_size = cfg.model.tensor_model_parallel_size * cfg.model.pipeline_model_parallel_size @@ -106,7 +106,7 @@ def load_from_checkpoint_dir(cls, cfg, trainer): ) checkpoint_path = inject_model_parallel_rank(os.path.join(cfg.model.pretrained_checkpoint.checkpoint_dir, cfg.model.pretrained_checkpoint.checkpoint_name)) hparams_file = OmegaConf.load(cfg.model.pretrained_checkpoint.hparams_file) - t5_cfg = _modify_config(hparams_file.cfg, cfg, add_cfg_to_tree=True) + t5_cfg = modify_confg_fn(hparams_file.cfg, cfg, add_cfg_to_tree=True) with tempfile.NamedTemporaryFile(suffix='.yaml') as f: OmegaConf.save(config=t5_cfg, f=f.name) model = cls.load_from_checkpoint( @@ -116,6 +116,14 @@ def load_from_checkpoint_dir(cls, cfg, trainer): ) return model +def validate_checkpoint_loading_args(cfg): + if cfg.checkpoint_dir is None or not os.path.isdir(cfg.checkpoint_dir): + raise ValueError(f'Checkpoint directory {cfg.checkpoint_dir} does not exist or is not a directory.') + if cfg.checkpoint_name is None: + raise ValueError(f'Checkpoint name {cfg.checkpoint_name} is not valid.') + if cfg.hparams_file is None or not os.path.isfile(cfg.hparams_file): + raise ValueError(f'Hparams file {cfg.hparams_file} does not exist or is not a file.') + @hydra_runner(config_path="conf", config_name="megatron_t5_config_finetune_glue_mnli") def main(cfg) -> None: logging.info("\n\n************** Experiment configuration ***********") @@ -166,25 +174,28 @@ def main(cfg) -> None: t5_cfg = MegatronT5GLUEModel.restore_from( restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True ) - model = load_from_nemo(MegatronT5GLUEModel, cfg, trainer, t5_cfg) + model = load_from_nemo(MegatronT5GLUEModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) 
else: - model = load_from_checkpoint_dir(MegatronT5GLUEModel, cfg, trainer) + validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint) + model = load_from_checkpoint_dir(MegatronT5GLUEModel, cfg, trainer, modify_confg_fn=_modify_config) elif hasattr(cfg.model.data.train_ds, 'file_names'): if cfg.model.restore_from_path: t5_cfg = MegatronT0Model.restore_from( restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True ) - model = load_from_nemo(MegatronT0Model, cfg, trainer, t5_cfg) + model = load_from_nemo(MegatronT0Model, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) else: - model = load_from_checkpoint_dir(MegatronT0Model, cfg, trainer, t5_cfg) + validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint) + model = load_from_checkpoint_dir(MegatronT0Model, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) else: if cfg.model.restore_from_path: t5_cfg = MegatronT5FinetuneModel.restore_from( restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True ) - model = load_from_nemo(MegatronT5FinetuneModel, cfg, trainer, t5_cfg) + model = load_from_nemo(MegatronT5FinetuneModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) else: - model = load_from_checkpoint_dir(MegatronT5FinetuneModel, cfg, trainer, t5_cfg) + validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint) + model = load_from_checkpoint_dir(MegatronT5FinetuneModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) trainer.fit(model) trainer.validate(model) From e5204c6b8106b417dca36821d31ee645474e2d51 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 7 Nov 2022 17:45:21 +0000 Subject: [PATCH 05/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../megatron_t5_seq2seq_eval.py | 13 +++++--- .../megatron_t5_seq2seq_finetune.py | 32 +++++++++++-------- 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py index c4512fd157d6..ef2ec8a65cbd 100644 --- a/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from megatron_t5_seq2seq_finetune import load_from_checkpoint_dir, load_from_nemo, validate_checkpoint_loading_args from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks.timer import Timer @@ -30,7 +31,7 @@ from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import StatelessTimer, exp_manager -from megatron_t5_seq2seq_finetune import load_from_nemo, load_from_checkpoint_dir, validate_checkpoint_loading_args + def _modify_config(t5_cfg, cfg, add_cfg_to_tree=False): """ @@ -111,7 +112,7 @@ def main(cfg) -> None: if hasattr(cfg.model.data.validation_ds, 'task_name'): if cfg.model.restore_from_path: t5_cfg = MegatronT5GLUEModel.restore_from( - restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True + restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True ) model = load_from_nemo(MegatronT5GLUEModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) else: @@ -120,7 +121,7 @@ def main(cfg) -> None: elif hasattr(cfg.model.data.validation_ds, 'file_names'): if cfg.model.restore_from_path: t5_cfg = MegatronT0Model.restore_from( - restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True + restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True ) model = load_from_nemo(MegatronT0Model, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) else: @@ -129,12 +130,14 @@ def main(cfg) -> None: else: if cfg.model.restore_from_path: t5_cfg = MegatronT5FinetuneModel.restore_from( - restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True + restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True ) model = load_from_nemo(MegatronT5FinetuneModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) else: validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint) - model = load_from_checkpoint_dir(MegatronT5FinetuneModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) + model = load_from_checkpoint_dir( + MegatronT5FinetuneModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config + ) model.freeze() trainer.validate(model) diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py index d89b1fa28473..84b78739f673 100644 --- a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py @@ -14,16 +14,17 @@ import os import tempfile + from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks.timer import Timer from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector -from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel from nemo.collections.nlp.models.language_modeling.megatron_finetune_model import MegatronT5FinetuneModel from nemo.collections.nlp.models.language_modeling.megatron_glue_model import MegatronT5GLUEModel from nemo.collections.nlp.models.language_modeling.megatron_t0_model import MegatronT0Model +from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel from nemo.collections.nlp.parts.nlp_overrides import ( GradScaler, MegatronHalfPrecisionPlugin, @@ -32,10 +33,11 @@ PipelineMixedPrecisionPlugin, ) 
from nemo.core.config import hydra_runner -from nemo.utils import logging, AppState +from nemo.utils import AppState, logging from nemo.utils.exp_manager import StatelessTimer, exp_manager from nemo.utils.model_utils import inject_model_parallel_rank + def _modify_config(t5_cfg, cfg, add_cfg_to_tree=False): """ This function modifies the original t5 pre-training config (t5_cfg) with attributes from the finetuning config (cfg). @@ -65,7 +67,7 @@ def _modify_config(t5_cfg, cfg, add_cfg_to_tree=False): # XNLI has eval languages in the yaml config. if hasattr(cfg.model, 'eval_languages'): t5_cfg.eval_languages = cfg.model.eval_languages - + # This is needed when modifying a hparam file directly to load `.ckpt` files. # This is not needed to modify the cfg in `.nemo` files. if add_cfg_to_tree: @@ -74,6 +76,7 @@ def _modify_config(t5_cfg, cfg, add_cfg_to_tree=False): return t5_cfg + def load_from_nemo(cls, cfg, trainer, t5_cfg, modify_confg_fn): t5_cfg = modify_confg_fn(t5_cfg, cfg, add_cfg_to_tree=False) model = cls.restore_from( @@ -84,6 +87,7 @@ def load_from_nemo(cls, cfg, trainer, t5_cfg, modify_confg_fn): ) return model + def load_from_checkpoint_dir(cls, cfg, trainer, modify_confg_fn): app_state = AppState() if cfg.model.tensor_model_parallel_size > 1 or cfg.model.pipeline_model_parallel_size > 1: @@ -104,18 +108,17 @@ def load_from_checkpoint_dir(cls, cfg, trainer, modify_confg_fn): pipeline_model_parallel_size_=cfg.model.pipeline_model_parallel_size, pipeline_model_parallel_split_rank_=cfg.model.pipeline_model_parallel_split_rank, ) - checkpoint_path = inject_model_parallel_rank(os.path.join(cfg.model.pretrained_checkpoint.checkpoint_dir, cfg.model.pretrained_checkpoint.checkpoint_name)) + checkpoint_path = inject_model_parallel_rank( + os.path.join(cfg.model.pretrained_checkpoint.checkpoint_dir, cfg.model.pretrained_checkpoint.checkpoint_name) + ) hparams_file = OmegaConf.load(cfg.model.pretrained_checkpoint.hparams_file) t5_cfg = modify_confg_fn(hparams_file.cfg, cfg, add_cfg_to_tree=True) with tempfile.NamedTemporaryFile(suffix='.yaml') as f: OmegaConf.save(config=t5_cfg, f=f.name) - model = cls.load_from_checkpoint( - checkpoint_path=checkpoint_path, - trainer=trainer, - hparams_file=f.name, - ) + model = cls.load_from_checkpoint(checkpoint_path=checkpoint_path, trainer=trainer, hparams_file=f.name,) return model + def validate_checkpoint_loading_args(cfg): if cfg.checkpoint_dir is None or not os.path.isdir(cfg.checkpoint_dir): raise ValueError(f'Checkpoint directory {cfg.checkpoint_dir} does not exist or is not a directory.') @@ -124,6 +127,7 @@ def validate_checkpoint_loading_args(cfg): if cfg.hparams_file is None or not os.path.isfile(cfg.hparams_file): raise ValueError(f'Hparams file {cfg.hparams_file} does not exist or is not a file.') + @hydra_runner(config_path="conf", config_name="megatron_t5_config_finetune_glue_mnli") def main(cfg) -> None: logging.info("\n\n************** Experiment configuration ***********") @@ -172,7 +176,7 @@ def main(cfg) -> None: if hasattr(cfg.model.data.train_ds, 'task_name'): if cfg.model.restore_from_path: t5_cfg = MegatronT5GLUEModel.restore_from( - restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True + restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True ) model = load_from_nemo(MegatronT5GLUEModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) else: @@ -181,7 +185,7 @@ def main(cfg) -> None: elif hasattr(cfg.model.data.train_ds, 'file_names'): if cfg.model.restore_from_path: t5_cfg = 
MegatronT0Model.restore_from( - restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True + restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True ) model = load_from_nemo(MegatronT0Model, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) else: @@ -190,12 +194,14 @@ def main(cfg) -> None: else: if cfg.model.restore_from_path: t5_cfg = MegatronT5FinetuneModel.restore_from( - restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True + restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True ) model = load_from_nemo(MegatronT5FinetuneModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) else: validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint) - model = load_from_checkpoint_dir(MegatronT5FinetuneModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) + model = load_from_checkpoint_dir( + MegatronT5FinetuneModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config + ) trainer.fit(model) trainer.validate(model) From 711d59da4e82473f07a8ed03566cb03629cfb569 Mon Sep 17 00:00:00 2001 From: MaximumEntropy Date: Mon, 7 Nov 2022 09:46:46 -0800 Subject: [PATCH 06/17] Revert config changes Signed-off-by: MaximumEntropy --- examples/nlp/language_modeling/conf/megatron_t5_config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config.yaml index ad6024f199e2..2a8c2912e395 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_config.yaml @@ -6,7 +6,7 @@ name: megatron_t5 restore_from_path: null # used when starting from a .nemo file trainer: - devices: 2 + devices: 1 num_nodes: 1 accelerator: gpu precision: 16 @@ -47,7 +47,7 @@ model: micro_batch_size: 4 global_batch_size: 8 # will use more micro batches to reach global batch size tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 2 + pipeline_model_parallel_size: 1 resume_from_checkpoint: null # manually set the checkpoint file to load from pipeline_model_parallel_split_rank: 0 # rank at which decoder starts. 
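
The patches up to this point replace the previously mandatory `model.restore_from_path: ???` with an optional `model.pretrained_checkpoint` block so finetuning can start from a raw `.ckpt` produced during pre-training instead of a packaged `.nemo` file. As a rough sketch only (not part of the patches themselves; every path and the checkpoint name below are placeholders), a finetuning config that exercises the new code path might look like:

    model:
      restore_from_path: null                                 # leave null so main() takes the .ckpt branch
      pretrained_checkpoint:
        checkpoint_dir: /results/megatron_t5/checkpoints      # placeholder: folder containing the .ckpt file(s)
        checkpoint_name: megatron_t5--last.ckpt               # placeholder: rank is injected into this name at load time
        hparams_file: /results/megatron_t5/hparams.yaml       # placeholder: hparams.yaml saved during pre-training
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      pipeline_model_parallel_split_rank: 0

With `restore_from_path` left null, `main()` calls `validate_checkpoint_loading_args()` and then `load_from_checkpoint_dir()`, which injects the tensor/pipeline-parallel rank into `checkpoint_dir/checkpoint_name` via `inject_model_parallel_rank()`, rewrites the loaded `hparams.yaml` through `_modify_config(..., add_cfg_to_tree=True)`, and passes the resulting temporary YAML to `cls.load_from_checkpoint()`.
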
From d5468e9d5badb8bbc25563dac0385d0a5222c66b Mon Sep 17 00:00:00 2001 From: MaximumEntropy Date: Mon, 7 Nov 2022 12:44:34 -0800 Subject: [PATCH 07/17] Refactor Signed-off-by: MaximumEntropy --- ...megatron_t5_config_finetune_glue_mnli.yaml | 2 +- .../megatron_finetune_model.py | 68 ++++--------------- 2 files changed, 13 insertions(+), 57 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml index 58793630a38d..ff61c5fde20c 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml @@ -28,7 +28,7 @@ exp_manager: resume_ignore_no_checkpoint: True create_checkpoint_callback: True checkpoint_callback_params: - monitor: validation_exact_string_match + monitor: validation_${model.data.validation_ds.metric.name} save_top_k: 10 mode: max always_save_nemo: False # TODO: add support diff --git a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py index 4da8ab57f367..91455f21e477 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py @@ -132,22 +132,9 @@ def setup(self, stage=None): self.setup_training_data() def _process_global_batch(self, global_batch): - """Process a list of microbatches into a global batch.""" - # If there is no language information in the global batch (ex: English MNLI), we can use the parent global batch processor as is. - if 'lang' not in global_batch[0]: - return self._process_global_batch_without_megatron_batch_sampler(global_batch) - - # For validation data (XNLI), we need to process the global batch and and then deal with language info separately. - else: - assert all(['lang' in micro_batch for micro_batch in global_batch]) - langs_list = [] - processed_global_batch = self._process_global_batch_without_megatron_batch_sampler( - [{k: v for k, v in micro_batch.items() if k != 'lang'} for micro_batch in global_batch] - ) - for micro_batch in global_batch: - langs_list.extend(micro_batch['lang']) - processed_global_batch['lang'] = langs_list - return processed_global_batch + """Optionally processes a global batch.""" + # TODO: maybe remove this now that we've refactored data batch sizes. + return global_batch def on_validation_epoch_start(self): app_state = AppState() @@ -189,17 +176,15 @@ def on_train_epoch_start(self) -> None: return super().on_train_epoch_start() def training_step(self, batch, batch_idx): - micro_batch_size = batch[0]['text_enc'].size(0) + global_batch_size_per_gpu = batc['text_enc'].size(0) # This should happen only on the last batch of the dataset. 
- if micro_batch_size != self.cfg.data.train_ds.micro_batch_size: + if global_batch_size_per_gpu != self.cfg.data.train_ds.global_batch_size // parallel_state.get_data_parallel_world_size(): app_state = AppState() _reconfigure_microbatch_calculator( rank=app_state.global_rank, rampup_batch_size=None, - global_batch_size=micro_batch_size - * parallel_state.get_data_parallel_world_size() - * get_num_microbatches(), - micro_batch_size=micro_batch_size, + global_batch_size=global_batch_size_per_gpu * parallel_state.get_data_parallel_world_size(), + micro_batch_size=global_batch_size_per_gpu // get_num_microbatches_per_batch(), data_parallel_size=parallel_state.get_data_parallel_world_size(), ) # At this point batch is a list of dictionaries where eatch dict is a microbatch. @@ -264,17 +249,15 @@ def cast_for_metric(self, pred, label, metric_name, class_labels=None, labels_ar return pred, label def _reconfigure_and_process_inference_batch(self, batch): - micro_batch_size = batch[0]['text_enc'].size(0) + global_batch_size_per_gpu = batch['text_enc'].size(0) # This should happen only on the last batch of the dataset. - if micro_batch_size != self.cfg.data.validation_ds.micro_batch_size: + if global_batch_size_per_gpu != self.cfg.data.validation_ds.global_batch_size // parallel_state.get_data_parallel_world_size(): app_state = AppState() _reconfigure_microbatch_calculator( rank=app_state.global_rank, rampup_batch_size=None, - global_batch_size=micro_batch_size - * parallel_state.get_data_parallel_world_size() - * get_num_microbatches(), - micro_batch_size=micro_batch_size, + global_batch_size=global_batch_size_per_gpu * parallel_state.get_data_parallel_world_size(), + micro_batch_size=global_batch_size_per_gpu // get_num_microbatches(), data_parallel_size=parallel_state.get_data_parallel_world_size(), ) @@ -282,7 +265,6 @@ def _reconfigure_and_process_inference_batch(self, batch): # After the process_global_batch call, processed_batch will be a single dictionary containing the global batch. # This is required since the parent class expects a single global batch dictioanry. processed_batch = self._process_global_batch(batch) - return processed_batch def inference_step(self, batch, batch_idx, mode, dataloader_idx=0): @@ -525,20 +507,6 @@ def build_data_loader( sampler = torch.utils.data.distributed.DistributedSampler( dataset, num_replicas=world_size, rank=rank, shuffle=shuffle ) - # This check makes sure the val_check_interval is less than the number of global batches. - # Normally, PTL would do this check and properly account for gradient accumulation. - # But now, it is implicit in the apex fwd/bwd functions and so we need to check for this somewhere. - # The consequence of not doing this is that training loop will never run validation. - # NOTE: Prog bar is also broken as a result of this. 
- global_batch_size_per_gpu = micro_batch_size * get_num_microbatches() - if ( - self.trainer.val_check_interval > (sampler.num_samples // global_batch_size_per_gpu) - and check_validation_interval - ): - raise ValueError( - f"trainer.val_check_interval {self.trainer.val_check_interval} is > number of global batches {sampler.num_samples // global_batch_size}" - ) - if isinstance(dataset, ConcatMapDataset): collate_fn = dataset.datasets[0].collate_fn else: @@ -548,7 +516,7 @@ def build_data_loader( dataset, collate_fn=collate_fn, sampler=sampler, - batch_size=micro_batch_size, + batch_size=global_batch_size // parallel_state.get_data_parallel_world_size(), num_workers=num_workers, pin_memory=pin_memory, drop_last=drop_last, @@ -689,15 +657,3 @@ def build_train_valid_test_datasets(self, stage): return self._train_ds = self._build_train_dataset(self.cfg.data.train_ds) logging.info(f'Finished building datasets ...') - - def on_train_start(self) -> None: - """PTL hook used to override DataFetcher with GlobalBatchDataFetcher """ - self.trainer.fit_loop._data_fetcher = GlobalBatchDataFetcher() - - def on_validation_start(self) -> None: - """PTL hook used to override DataFetcher with GlobalBatchDataFetcher """ - self.trainer.fit_loop.epoch_loop.val_loop._data_fetcher = GlobalBatchDataFetcher() - self.trainer.validate_loop._data_fetcher = GlobalBatchDataFetcher() - - def on_test_start(self) -> None: - self.trainer.test_loop._data_fetcher = GlobalBatchDataFetcher() From 9e86322e21dc107c7e495bb51a3dbe847f9b2ea5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 7 Nov 2022 20:46:31 +0000 Subject: [PATCH 08/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../language_modeling/megatron_finetune_model.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py index 91455f21e477..a6bf63a8b741 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py @@ -178,7 +178,10 @@ def on_train_epoch_start(self) -> None: def training_step(self, batch, batch_idx): global_batch_size_per_gpu = batc['text_enc'].size(0) # This should happen only on the last batch of the dataset. - if global_batch_size_per_gpu != self.cfg.data.train_ds.global_batch_size // parallel_state.get_data_parallel_world_size(): + if ( + global_batch_size_per_gpu + != self.cfg.data.train_ds.global_batch_size // parallel_state.get_data_parallel_world_size() + ): app_state = AppState() _reconfigure_microbatch_calculator( rank=app_state.global_rank, @@ -251,7 +254,10 @@ def cast_for_metric(self, pred, label, metric_name, class_labels=None, labels_ar def _reconfigure_and_process_inference_batch(self, batch): global_batch_size_per_gpu = batch['text_enc'].size(0) # This should happen only on the last batch of the dataset. 
- if global_batch_size_per_gpu != self.cfg.data.validation_ds.global_batch_size // parallel_state.get_data_parallel_world_size(): + if ( + global_batch_size_per_gpu + != self.cfg.data.validation_ds.global_batch_size // parallel_state.get_data_parallel_world_size() + ): app_state = AppState() _reconfigure_microbatch_calculator( rank=app_state.global_rank, From 723d125071d2a8a6051226cd8df4007b3c1c8641 Mon Sep 17 00:00:00 2001 From: MaximumEntropy Date: Mon, 7 Nov 2022 12:48:08 -0800 Subject: [PATCH 09/17] Fix typo Signed-off-by: MaximumEntropy --- .../nlp/models/language_modeling/megatron_finetune_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py index 91455f21e477..e50e3de73535 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py @@ -176,7 +176,7 @@ def on_train_epoch_start(self) -> None: return super().on_train_epoch_start() def training_step(self, batch, batch_idx): - global_batch_size_per_gpu = batc['text_enc'].size(0) + global_batch_size_per_gpu = batch['text_enc'].size(0) # This should happen only on the last batch of the dataset. if global_batch_size_per_gpu != self.cfg.data.train_ds.global_batch_size // parallel_state.get_data_parallel_world_size(): app_state = AppState() From 939efdb68fccbe3334f10e587c46526b5eb6ae12 Mon Sep 17 00:00:00 2001 From: MaximumEntropy Date: Mon, 7 Nov 2022 12:52:47 -0800 Subject: [PATCH 10/17] Remove comments Signed-off-by: MaximumEntropy --- .../nlp/models/language_modeling/megatron_finetune_model.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py index 7cc6903b06bd..3b644716cdf3 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py @@ -267,9 +267,6 @@ def _reconfigure_and_process_inference_batch(self, batch): data_parallel_size=parallel_state.get_data_parallel_world_size(), ) - # At this point processed_batch is a list of dictionaries where eatch dict is a microbatch. - # After the process_global_batch call, processed_batch will be a single dictionary containing the global batch. - # This is required since the parent class expects a single global batch dictioanry. 
processed_batch = self._process_global_batch(batch) return processed_batch From 61c6fdaf5978721c5168ed248386630fadb48525 Mon Sep 17 00:00:00 2001 From: MaximumEntropy Date: Mon, 7 Nov 2022 13:09:36 -0800 Subject: [PATCH 11/17] Minor Signed-off-by: MaximumEntropy --- .../nlp/language_modeling/megatron_t5_seq2seq_eval.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py index ef2ec8a65cbd..5e37492b4f06 100644 --- a/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py @@ -25,8 +25,7 @@ from nemo.collections.nlp.parts.nlp_overrides import ( GradScaler, MegatronHalfPrecisionPlugin, - NLPDDPStrategy, - NLPSaveRestoreConnector, + NLPDDPStrategy ) from nemo.core.config import hydra_runner from nemo.utils import logging @@ -126,18 +125,16 @@ def main(cfg) -> None: model = load_from_nemo(MegatronT0Model, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) else: validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint) - model = load_from_checkpoint_dir(MegatronT0Model, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) + model = load_from_checkpoint_dir(MegatronT0Model, cfg, trainer, modify_confg_fn=_modify_config) else: if cfg.model.restore_from_path: t5_cfg = MegatronT5FinetuneModel.restore_from( restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True ) - model = load_from_nemo(MegatronT5FinetuneModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) + model = load_from_nemo(MegatronT5FinetuneModel, cfg, trainer, modify_confg_fn=_modify_config) else: validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint) - model = load_from_checkpoint_dir( - MegatronT5FinetuneModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config - ) + model = load_from_checkpoint_dir(MegatronT5FinetuneModel, cfg, trainer, modify_confg_fn=_modify_config) model.freeze() trainer.validate(model) From 424beda2c6cf45ed9768445e6b7e2e5f441b76cc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 7 Nov 2022 21:12:31 +0000 Subject: [PATCH 12/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py index 5e37492b4f06..e78d34adee65 100644 --- a/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py @@ -22,11 +22,7 @@ from nemo.collections.nlp.models.language_modeling.megatron_finetune_model import MegatronT5FinetuneModel from nemo.collections.nlp.models.language_modeling.megatron_glue_model import MegatronT5GLUEModel from nemo.collections.nlp.models.language_modeling.megatron_t0_model import MegatronT0Model -from nemo.collections.nlp.parts.nlp_overrides import ( - GradScaler, - MegatronHalfPrecisionPlugin, - NLPDDPStrategy -) +from nemo.collections.nlp.parts.nlp_overrides import GradScaler, MegatronHalfPrecisionPlugin, NLPDDPStrategy from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import StatelessTimer, exp_manager From e81bc638000f276969cc9fa7a57f708c530dbadc Mon Sep 17 00:00:00 2001 From: 
MaximumEntropy Date: Wed, 9 Nov 2022 14:08:06 -0800 Subject: [PATCH 13/17] Fix validation reconfiguration Signed-off-by: MaximumEntropy --- .../language_modeling/megatron_finetune_model.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py index 3b644716cdf3..66d1f7a005b1 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py @@ -182,12 +182,13 @@ def training_step(self, batch, batch_idx): global_batch_size_per_gpu != self.cfg.data.train_ds.global_batch_size // parallel_state.get_data_parallel_world_size() ): + # NOTE: This should never really be called since `drop_last=True` is required for training datasets. app_state = AppState() _reconfigure_microbatch_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=global_batch_size_per_gpu * parallel_state.get_data_parallel_world_size(), - micro_batch_size=global_batch_size_per_gpu // get_num_microbatches_per_batch(), + micro_batch_size=global_batch_size_per_gpu // get_num_microbatches(), data_parallel_size=parallel_state.get_data_parallel_world_size(), ) # At this point batch is a list of dictionaries where eatch dict is a microbatch. @@ -258,12 +259,13 @@ def _reconfigure_and_process_inference_batch(self, batch): global_batch_size_per_gpu != self.cfg.data.validation_ds.global_batch_size // parallel_state.get_data_parallel_world_size() ): + # NOTE: This is reconfiguring to make sure there is no grad-acc for validation batches. app_state = AppState() _reconfigure_microbatch_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=global_batch_size_per_gpu * parallel_state.get_data_parallel_world_size(), - micro_batch_size=global_batch_size_per_gpu // get_num_microbatches(), + micro_batch_size=global_batch_size_per_gpu, data_parallel_size=parallel_state.get_data_parallel_world_size(), ) @@ -492,13 +494,11 @@ def test_epoch_end(self, outputs): def build_data_loader( self, dataset, - micro_batch_size, global_batch_size, shuffle, num_workers, pin_memory, drop_last, - check_validation_interval, ): """Buld dataloader given an input dataset.""" @@ -528,13 +528,11 @@ def build_data_loader( def setup_training_data(self): self._train_dl = self.build_data_loader( self._train_ds, - micro_batch_size=self.cfg.data.train_ds.micro_batch_size, global_batch_size=self.cfg.data.train_ds.global_batch_size, shuffle=self.cfg.data.train_ds.shuffle, num_workers=self.cfg.data.train_ds.num_workers, pin_memory=self.cfg.data.train_ds.pin_memory, drop_last=self.cfg.data.train_ds.drop_last, - check_validation_interval=True, ) def setup_eval_data(self, datasets, data_cfg): @@ -542,13 +540,11 @@ def setup_eval_data(self, datasets, data_cfg): for dataset in datasets: eval_dl = self.build_data_loader( dataset, - micro_batch_size=data_cfg.micro_batch_size, global_batch_size=data_cfg.global_batch_size, shuffle=data_cfg.shuffle, num_workers=data_cfg.num_workers, pin_memory=data_cfg.pin_memory, drop_last=data_cfg.drop_last, - check_validation_interval=False, ) dataloaders.append(eval_dl) return dataloaders From 616f7342b87955101d3eaab3cfdf3c634de1709a Mon Sep 17 00:00:00 2001 From: MaximumEntropy Date: Wed, 9 Nov 2022 14:08:40 -0800 Subject: [PATCH 14/17] Remove old comment Signed-off-by: MaximumEntropy --- 
.../nlp/models/language_modeling/megatron_finetune_model.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py index 66d1f7a005b1..7743f93f8227 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py @@ -191,9 +191,6 @@ def training_step(self, batch, batch_idx): micro_batch_size=global_batch_size_per_gpu // get_num_microbatches(), data_parallel_size=parallel_state.get_data_parallel_world_size(), ) - # At this point batch is a list of dictionaries where eatch dict is a microbatch. - # After the process_global_batch call, batch will be a single dictionary containing the global batch. - # This is required since the parent class expects a single global batch dictioanry. batch = self._process_global_batch(batch) return super().training_step(batch, batch_idx) From 1dab65cadb63b0d04b3958aa2a9a437572be4bb7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 9 Nov 2022 22:09:46 +0000 Subject: [PATCH 15/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../models/language_modeling/megatron_finetune_model.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py index 7743f93f8227..db872543d49d 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py @@ -489,13 +489,7 @@ def test_epoch_end(self, outputs): _ = self.inference_epoch_end(outputs, 'test', self.cfg.data.test_ds) def build_data_loader( - self, - dataset, - global_batch_size, - shuffle, - num_workers, - pin_memory, - drop_last, + self, dataset, global_batch_size, shuffle, num_workers, pin_memory, drop_last, ): """Buld dataloader given an input dataset.""" From 3b148eabdf1a9a74adfb720a84c807a7e4b93a08 Mon Sep 17 00:00:00 2001 From: MaximumEntropy Date: Thu, 10 Nov 2022 16:40:30 -0800 Subject: [PATCH 16/17] Fixes for test_ds Signed-off-by: MaximumEntropy --- .../megatron_finetune_model.py | 35 ++++++++++++++----- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py index db872543d49d..c24fc9ad17c2 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py @@ -147,7 +147,26 @@ def on_validation_epoch_start(self): ) return super().on_validation_epoch_start() + def on_test_epoch_start(self): + app_state = AppState() + _reconfigure_microbatch_calculator( + rank=app_state.global_rank, + rampup_batch_size=None, + global_batch_size=self.cfg.data.test_ds.global_batch_size, + micro_batch_size=self.cfg.data.test_ds.micro_batch_size, + data_parallel_size=parallel_state.get_data_parallel_world_size(), + ) + return super().on_test_epoch_start() + + def on_test_epoch_end(self): + self.on_inference_epoch_end(self.cfg.data.test_ds) + return super().on_test_epoch_end() + def on_validation_epoch_end(self): + 
self.on_inference_epoch_end(self.cfg.data.validation_ds) + return super().on_validation_epoch_end() + + def on_inference_epoch_end(self, ds): app_state = AppState() if hasattr(self, "_train_ds"): _reconfigure_microbatch_calculator( @@ -163,13 +182,11 @@ def on_validation_epoch_end(self): _reconfigure_microbatch_calculator( rank=app_state.global_rank, rampup_batch_size=None, - global_batch_size=self.cfg.data.validation_ds.global_batch_size, - micro_batch_size=self.cfg.data.validation_ds.micro_batch_size, + global_batch_size=ds.global_batch_size, + micro_batch_size=ds.micro_batch_size, data_parallel_size=parallel_state.get_data_parallel_world_size(), ) - return super().on_validation_epoch_end() - def on_train_epoch_start(self) -> None: # Same logic as validation epoch end, but this may be need if there is no validation sanity check to trigger validation_epoch_end() self.on_validation_epoch_end() @@ -249,12 +266,12 @@ def cast_for_metric(self, pred, label, metric_name, class_labels=None, labels_ar return pred, label - def _reconfigure_and_process_inference_batch(self, batch): + def _reconfigure_and_process_inference_batch(self, batch, ds_config): global_batch_size_per_gpu = batch['text_enc'].size(0) # This should happen only on the last batch of the dataset. if ( global_batch_size_per_gpu - != self.cfg.data.validation_ds.global_batch_size // parallel_state.get_data_parallel_world_size() + != ds_config.global_batch_size // parallel_state.get_data_parallel_world_size() ): # NOTE: This is reconfiguring to make sure there is no grad-acc for validation batches. app_state = AppState() @@ -273,7 +290,7 @@ def inference_step(self, batch, batch_idx, mode, dataloader_idx=0): # Regular finetuning datasets will return a list of dicts for each microbatch. But T0 datasets will return a single dict for the global batch. batch_has_lang_information = isinstance(batch, list) and len(batch[0]) == 7 - processed_batch = self._reconfigure_and_process_inference_batch(batch) + processed_batch = self._reconfigure_and_process_inference_batch(batch, self.cfg.data.validation_ds if mode == 'validation' else self.cfg.data.test_ds) # Call parent validation step to get the loss. # NOTE: There could be extra keys in the processed_batch dictionary such as "langs" for XNLI, this will be ignored in the parent class. 
@@ -301,8 +318,8 @@ def inference_step(self, batch, batch_idx, mode, dataloader_idx=0): pred=pred, label=label, metric_name=self.val_metric_name if mode == 'validation' else self.test_metric_name, - class_labels=self.cfg.data.validation_ds.metric.get('class_labels', None), - labels_are_strings=self.cfg.data.validation_ds.metric.get('labels_are_strings', False), + class_labels=self.cfg.data.validation_ds.metric.get('class_labels', None) if mode == 'validation' else self.cfg.data.test_ds.metric.get('class_labels', None), + labels_are_strings=self.cfg.data.validation_ds.metric.get('labels_are_strings', False) if mode == 'validation' else self.cfg.data.test_ds.metric.get('labels_are_strings', False), ) if batch_has_lang_information: _ = metric(pred, label, category) From 4816e2d84f557d3d9611f3f7e2f2037b887ef702 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 11 Nov 2022 00:42:59 +0000 Subject: [PATCH 17/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../megatron_finetune_model.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py index c24fc9ad17c2..c49d7b50580a 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py @@ -269,10 +269,7 @@ def cast_for_metric(self, pred, label, metric_name, class_labels=None, labels_ar def _reconfigure_and_process_inference_batch(self, batch, ds_config): global_batch_size_per_gpu = batch['text_enc'].size(0) # This should happen only on the last batch of the dataset. - if ( - global_batch_size_per_gpu - != ds_config.global_batch_size // parallel_state.get_data_parallel_world_size() - ): + if global_batch_size_per_gpu != ds_config.global_batch_size // parallel_state.get_data_parallel_world_size(): # NOTE: This is reconfiguring to make sure there is no grad-acc for validation batches. app_state = AppState() _reconfigure_microbatch_calculator( @@ -290,7 +287,9 @@ def inference_step(self, batch, batch_idx, mode, dataloader_idx=0): # Regular finetuning datasets will return a list of dicts for each microbatch. But T0 datasets will return a single dict for the global batch. batch_has_lang_information = isinstance(batch, list) and len(batch[0]) == 7 - processed_batch = self._reconfigure_and_process_inference_batch(batch, self.cfg.data.validation_ds if mode == 'validation' else self.cfg.data.test_ds) + processed_batch = self._reconfigure_and_process_inference_batch( + batch, self.cfg.data.validation_ds if mode == 'validation' else self.cfg.data.test_ds + ) # Call parent validation step to get the loss. # NOTE: There could be extra keys in the processed_batch dictionary such as "langs" for XNLI, this will be ignored in the parent class. 
@@ -318,8 +317,12 @@ def inference_step(self, batch, batch_idx, mode, dataloader_idx=0): pred=pred, label=label, metric_name=self.val_metric_name if mode == 'validation' else self.test_metric_name, - class_labels=self.cfg.data.validation_ds.metric.get('class_labels', None) if mode == 'validation' else self.cfg.data.test_ds.metric.get('class_labels', None), - labels_are_strings=self.cfg.data.validation_ds.metric.get('labels_are_strings', False) if mode == 'validation' else self.cfg.data.test_ds.metric.get('labels_are_strings', False), + class_labels=self.cfg.data.validation_ds.metric.get('class_labels', None) + if mode == 'validation' + else self.cfg.data.test_ds.metric.get('class_labels', None), + labels_are_strings=self.cfg.data.validation_ds.metric.get('labels_are_strings', False) + if mode == 'validation' + else self.cfg.data.test_ds.metric.get('labels_are_strings', False), ) if batch_has_lang_information: _ = metric(pred, label, category)
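Taken together, the last few patches settle on one pattern for inference: before a validation or test batch is run, the apex microbatch calculator is reconfigured so that the whole per-GPU batch is consumed as a single microbatch (no gradient accumulation at eval time), and the split-specific config (`validation_ds` vs `test_ds`) is passed in explicitly instead of being hard-coded to the validation split. The sketch below is a minimal, self-contained illustration of that arithmetic only; `MicrobatchPlan`, `plan_inference_microbatches`, and `metric_kwargs` are hypothetical names used for illustration and are not part of NeMo or apex.

```python
from dataclasses import dataclass


@dataclass
class MicrobatchPlan:
    """How one inference step is split across microbatches (hypothetical helper)."""
    global_batch_size: int   # effective global batch size for this step
    micro_batch_size: int    # per-GPU microbatch size
    num_microbatches: int    # gradient-accumulation steps; always 1 for inference here


def plan_inference_microbatches(batch_size_per_gpu: int, data_parallel_size: int) -> MicrobatchPlan:
    """Mirror the values `_reconfigure_and_process_inference_batch` hands to the
    microbatch calculator: treat whatever landed on this GPU (possibly a short
    final batch when drop_last=False) as one microbatch, so no gradient
    accumulation happens during validation or test."""
    global_batch_size = batch_size_per_gpu * data_parallel_size
    micro_batch_size = batch_size_per_gpu  # the whole per-GPU batch at once
    num_microbatches = global_batch_size // (micro_batch_size * data_parallel_size)
    assert num_microbatches == 1
    return MicrobatchPlan(global_batch_size, micro_batch_size, num_microbatches)


def metric_kwargs(cfg_data, mode: str) -> dict:
    """Select per-split metric settings the same way the mode-dependent lookups
    in `inference_step` do (validation_ds vs test_ds). Assumes an OmegaConf-style
    config with a `metric` sub-config on each split."""
    ds = cfg_data.validation_ds if mode == "validation" else cfg_data.test_ds
    return {
        "class_labels": ds.metric.get("class_labels", None),
        "labels_are_strings": ds.metric.get("labels_are_strings", False),
    }


if __name__ == "__main__":
    # Example: a short final validation batch of 3 samples per GPU with
    # data-parallel size 4, even though the configured global batch size was 32.
    print(plan_inference_microbatches(batch_size_per_gpu=3, data_parallel_size=4))
    # MicrobatchPlan(global_batch_size=12, micro_batch_size=3, num_microbatches=1)
```

Keeping `micro_batch_size` equal to the per-GPU batch size means a ragged last batch still maps to exactly one forward pass per GPU, which is what the NOTE added in `_reconfigure_and_process_inference_batch` ("no grad-acc for validation batches") is pointing at.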