From 56fd8a053542512958b9fc0f0f12829ed8863658 Mon Sep 17 00:00:00 2001
From: MaximumEntropy
Date: Thu, 5 May 2022 18:16:32 -0700
Subject: [PATCH 1/2] Check implicit grad acc in GLUE dataset building

Signed-off-by: MaximumEntropy
---
 .../language_modeling/megatron_glue_model.py  | 22 +++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/nemo/collections/nlp/models/language_modeling/megatron_glue_model.py b/nemo/collections/nlp/models/language_modeling/megatron_glue_model.py
index f089fef32514..cae1ad17afb8 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_glue_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_glue_model.py
@@ -21,6 +21,13 @@
 from nemo.collections.nlp.models.language_modeling.megatron_finetune_model import MegatronT5FinetuneModel
 from nemo.utils import logging
 
+try:
+    from apex.transformer import parallel_state
+
+    HAVE_APEX = True
+except (ImportError, ModuleNotFoundError):
+    HAVE_APEX = False
+
 __all__ = ['MegatronT5GLUEModel']
 
 
@@ -30,7 +37,14 @@ class MegatronT5GLUEModel(MegatronT5FinetuneModel):
     def __init__(self, cfg: DictConfig, trainer: Trainer):
         super().__init__(cfg, trainer=trainer)
 
-    def _build_dataset(self, data_cfg):
+    def _build_dataset(self, data_cfg, check_implicit_grad_acc=False):
+        if (
+            check_implicit_grad_acc
+            and data_cfg.global_batch_size > data_cfg.micro_batch_size * parallel_state.get_data_parallel_world_size()
+        ):
+            raise ValueError(
+                f'You are trying to use "implicit gradient accumulation" of {data_cfg.global_batch_size // (data_cfg.micro_batch_size * parallel_state.get_data_parallel_world_size())} in your validation/test datasets. This is not supported. Please set global_batch_size equal to micro_batch_size * data_parallel_world_size.'
+            )
         if data_cfg.task_name == 'xnli':
             dataset = TextToTextXNLIDataset(
                 data_cfg.file_path,
@@ -52,17 +66,17 @@ def build_train_valid_test_datasets(self, stage):
         logging.info('Building GLUE/XNLI datasets.')
         if stage != 'test':
             # Wrap this in a list since the general finetuning parent class supports multi-validation.
-            self._validation_ds = [self._build_dataset(self.cfg.data.validation_ds)]
+            self._validation_ds = [self._build_dataset(self.cfg.data.validation_ds, check_implicit_grad_acc=True)]
             logging.info(f'Length of val dataset: {len(self._validation_ds)}')
 
         if stage != 'validate':
             if hasattr(self.cfg.data, 'test_ds'):
                 # Wrap this in a list since the general finetuning parent class supports multi-validation.
-                self._test_ds = [self._build_dataset(self.cfg.data.test_ds)]
+                self._test_ds = [self._build_dataset(self.cfg.data.test_ds, check_implicit_grad_acc=True)]
                 logging.info(f'Length of test dataset: {len(self._test_ds)}')
 
         if stage == 'validate' or stage == 'test':
             return
-        self._train_ds = self._build_dataset(self.cfg.data.train_ds)
+        self._train_ds = self._build_dataset(self.cfg.data.train_ds, check_implicit_grad_acc=False)
         logging.info(f'Length of train dataset: {len(self._train_ds)}')
         logging.info(f'Finished building GLUE/XNLI datasets.')

From 0706ced37b0e4dd24e8f9d47536437c27d08e85a Mon Sep 17 00:00:00 2001
From: MaximumEntropy
Date: Thu, 5 May 2022 21:10:53 -0700
Subject: [PATCH 2/2] Fix jenkins test for GLUE/XNLI

Signed-off-by: MaximumEntropy
---
 Jenkinsfile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index ea84290171e8..381dbed2d475 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -2832,7 +2832,7 @@ pipeline {
         model.data.train_ds.task_name=rte \
         model.data.train_ds.global_batch_size=4 \
         model.data.train_ds.micro_batch_size=2 \
-        model.data.validation_ds.global_batch_size=4 \
+        model.data.validation_ds.global_batch_size=2 \
         model.data.validation_ds.micro_batch_size=2 \
         model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \
         model.data.validation_ds.task_name=rte \
@@ -2860,9 +2860,9 @@ pipeline {
         model.pipeline_model_parallel_split_rank=0 \
         model.data.train_ds.global_batch_size=4 \
         model.data.train_ds.micro_batch_size=2 \
-        model.data.validation_ds.global_batch_size=4 \
+        model.data.validation_ds.global_batch_size=2 \
         model.data.validation_ds.micro_batch_size=2 \
-        model.data.test_ds.global_batch_size=4 \
+        model.data.test_ds.global_batch_size=2 \
         model.data.test_ds.micro_batch_size=2 \
         model.data.train_ds.task_name=rte \
         model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \