From bfa1df6ff4983c8a1ff7e4134a0ea30a46a63f17 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Fri, 8 Dec 2023 08:29:02 -0800 Subject: [PATCH 01/14] Handle float limit_val_batches Signed-off-by: Abhishree --- .../language_modeling/megatron_base_model.py | 15 +++++++++++++-- .../language_modeling/megatron_gpt_model.py | 6 +++--- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 5321a307b2c4..e0a4b7281013 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -14,6 +14,7 @@ import gc import itertools +import math import os import re from dataclasses import fields @@ -46,7 +47,7 @@ from nemo.utils.get_rank import is_global_rank_zero try: - from apex.transformer.pipeline_parallel.utils import get_num_microbatches + from apex.transformer.pipeline_parallel.utils import get_micro_batch_size, get_num_microbatches HAVE_APEX = True @@ -322,9 +323,19 @@ def _reconfigure_val_batches(self): """ Reconfigure trainer.limit_val_batches for pretraining """ + # Override limit_val_batches to be a multiple of num microbatches and so there are limit_val_batches//num_micro_batches num of global batches if isinstance(self.trainer.limit_val_batches, int): - # Override limit_val_batches to be a multiple of num microbatches and so there are limit_val_batches//num_micro_batches num of global batches self.trainer.limit_val_batches *= get_num_microbatches() + else: + assert isinstance(self.trainer.limit_val_batches, float) + if self._validation_ds is not None: + val_length = len(self._validation_ds) + if not math.isinf(val_length): + mb_times_dp = get_micro_batch_size() * parallel_state.get_data_parallel_world_size() + total_val_microbatches = val_length // mb_times_dp + limit_val_batches = int(self.trainer.limit_val_batches * total_val_microbatches) + self.trainer.limit_val_batches = limit_val_batches - limit_val_batches % get_num_microbatches() + # Override num sanity steps to be a multiple of num of microbatches self.trainer.num_sanity_val_steps *= get_num_microbatches() diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 752696ac8faa..3337e14e6578 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1172,11 +1172,9 @@ def loss_func(self, loss_mask, num_valid_tokens_in_ub, output_tensor): return loss def build_train_valid_test_datasets(self): - # Override limit_val_batches to be a multiple of num microbatches to prevent val_step from exiting in between a step - self._reconfigure_val_batches() - logging.info('Building GPT datasets.') if self.trainer.limit_val_batches > 1.0 and isinstance(self.trainer.limit_val_batches, float): raise ValueError("limit_val_batches must be an integer or float less than or equal to 1.0.") + logging.info('Building GPT datasets.') global_batch_size = self.cfg.global_batch_size max_train_steps = self.trainer.max_steps eval_iters = (max_train_steps // self.trainer.val_check_interval + 1) * self.trainer.limit_val_batches @@ -1240,6 +1238,8 @@ def build_train_valid_test_datasets(self): if self._test_ds is not None: logging.info(f'Length of test dataset: {len(self._test_ds)}') logging.info(f'Finished building GPT datasets.') + # 
Override limit_val_batches to be a multiple of num microbatches to prevent val_step from exiting in between a step + self._reconfigure_val_batches() return self._train_ds, self._validation_ds, self._test_ds From d859e677c548a721ebf2e2d787616dcaec0c0fa5 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Wed, 27 Dec 2023 10:37:33 -0800 Subject: [PATCH 02/14] Rectify reconfiguration of float limit_val_batches Signed-off-by: Abhishree --- .../models/language_modeling/megatron_base_model.py | 11 +++++------ .../models/language_modeling/megatron_gpt_model.py | 10 ++-------- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index e0a4b7281013..7d899e108506 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -28,6 +28,7 @@ from pytorch_lightning.plugins.precision import MixedPrecisionPlugin from pytorch_lightning.trainer.connectors.logger_connector.fx_validator import _FxValidator from pytorch_lightning.trainer.trainer import Trainer +from pytorch_lightning.utilities.exceptions import MisconfigurationException from nemo.collections.nlp.models.nlp_model import NLPModel from nemo.collections.nlp.modules.common.megatron.attention import HAVE_FLASH_ATTENTION @@ -329,12 +330,10 @@ def _reconfigure_val_batches(self): else: assert isinstance(self.trainer.limit_val_batches, float) if self._validation_ds is not None: - val_length = len(self._validation_ds) - if not math.isinf(val_length): - mb_times_dp = get_micro_batch_size() * parallel_state.get_data_parallel_world_size() - total_val_microbatches = val_length // mb_times_dp - limit_val_batches = int(self.trainer.limit_val_batches * total_val_microbatches) - self.trainer.limit_val_batches = limit_val_batches - limit_val_batches % get_num_microbatches() + # limit_val_batches is already incorporated in the calculation of eval_iters which is used to compute the samples in dataloader. 
+ # Hence not necessary to scale num_val_batches with the float value of limit_val_batches + num_val_batches = len(self._validation_dl) + self.trainer.limit_val_batches = num_val_batches * get_num_microbatches() # Override num sanity steps to be a multiple of num of microbatches self.trainer.num_sanity_val_steps *= get_num_microbatches() diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 3337e14e6578..c03413810749 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1192,12 +1192,6 @@ def build_train_valid_test_datasets(self): test_iters * global_batch_size, ] - if self.trainer.limit_val_batches <= 1.0 and isinstance(self.trainer.limit_val_batches, float): - train_valid_test_num_samples[ - 1 - ] = 1 # This is to make sure we only have one epoch on every validation iteration - - mock_dataset = self.cfg.data.get("mock_dataset", False) kwargs = { "is_built_on_rank": is_dataset_built_on_rank, "random_seed": self.cfg.seed, @@ -1238,8 +1232,6 @@ def build_train_valid_test_datasets(self): if self._test_ds is not None: logging.info(f'Length of test dataset: {len(self._test_ds)}') logging.info(f'Finished building GPT datasets.') - # Override limit_val_batches to be a multiple of num microbatches to prevent val_step from exiting in between a step - self._reconfigure_val_batches() return self._train_ds, self._validation_ds, self._test_ds @@ -1325,6 +1317,8 @@ def setup(self, stage=None): self.setup_training_data(self.cfg.data) self.setup_validation_data(self.cfg.data) self.setup_test_data(self.cfg.data) + # Override limit_val_batches to be a multiple of num microbatches to prevent val_step from exiting in between a step + self._reconfigure_val_batches() if stage == 'fit': self.initialize_last_rank_embeddings() From e39a211300ad7a110d6b4da89de2f4cf3f69f949 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Wed, 27 Dec 2023 11:37:27 -0800 Subject: [PATCH 03/14] Remove unused imports Signed-off-by: Abhishree --- .../nlp/models/language_modeling/megatron_base_model.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 7d899e108506..b5a2b35672e8 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -14,7 +14,6 @@ import gc import itertools -import math import os import re from dataclasses import fields @@ -28,7 +27,6 @@ from pytorch_lightning.plugins.precision import MixedPrecisionPlugin from pytorch_lightning.trainer.connectors.logger_connector.fx_validator import _FxValidator from pytorch_lightning.trainer.trainer import Trainer -from pytorch_lightning.utilities.exceptions import MisconfigurationException from nemo.collections.nlp.models.nlp_model import NLPModel from nemo.collections.nlp.modules.common.megatron.attention import HAVE_FLASH_ATTENTION @@ -48,7 +46,7 @@ from nemo.utils.get_rank import is_global_rank_zero try: - from apex.transformer.pipeline_parallel.utils import get_micro_batch_size, get_num_microbatches + from apex.transformer.pipeline_parallel.utils import get_num_microbatches HAVE_APEX = True From a1be0651bb7de0e7af3be18c665a82b21af7eca4 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Wed, 27 Dec 2023 18:20:22 -0800 
Subject: [PATCH 04/14] Scale len(val_dataloader) with float limit_val_batches Signed-off-by: Abhishree --- .../language_modeling/megatron_base_model.py | 16 ++++++++++++---- .../language_modeling/megatron_gpt_model.py | 3 ++- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index b5a2b35672e8..9772599fbbdd 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -27,6 +27,7 @@ from pytorch_lightning.plugins.precision import MixedPrecisionPlugin from pytorch_lightning.trainer.connectors.logger_connector.fx_validator import _FxValidator from pytorch_lightning.trainer.trainer import Trainer +from pytorch_lightning.utilities.exceptions import MisconfigurationException from nemo.collections.nlp.models.nlp_model import NLPModel from nemo.collections.nlp.modules.common.megatron.attention import HAVE_FLASH_ATTENTION @@ -327,10 +328,17 @@ def _reconfigure_val_batches(self): self.trainer.limit_val_batches *= get_num_microbatches() else: assert isinstance(self.trainer.limit_val_batches, float) - if self._validation_ds is not None: - # limit_val_batches is already incorporated in the calculation of eval_iters which is used to compute the samples in dataloader. - # Hence not necessary to scale num_val_batches with the float value of limit_val_batches - num_val_batches = len(self._validation_dl) + if self._validation_ds is not None and len(self._validation_dl) != float("inf"): + num_val_batches = int(len(self._validation_dl) * self.trainer.limit_val_batches) + if num_val_batches == 0 and self.trainer.limit_val_batches > 0.0: + min_percentage = 1.0 / len(self._validation_dl) + raise MisconfigurationException( + f"You requested to check {self.trainer.limit_val_batches} of the val_dataloader but" + f" {self.trainer.limit_val_batches} * {len(self._validation_dl)} < 1. Please increase the" + f" `limit_val_batches` argument. 
Try at least" + f" `limit_val_batches={min_percentage}`" + ) + self.trainer.limit_val_batches = num_val_batches * get_num_microbatches() # Override num sanity steps to be a multiple of num of microbatches diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index c03413810749..30d5f3d69603 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1177,7 +1177,8 @@ def build_train_valid_test_datasets(self): logging.info('Building GPT datasets.') global_batch_size = self.cfg.global_batch_size max_train_steps = self.trainer.max_steps - eval_iters = (max_train_steps // self.trainer.val_check_interval + 1) * self.trainer.limit_val_batches + eval_iters = (max_train_steps // self.trainer.val_check_interval + 1) * self.trainer.limit_val_batches if isinstance(self.trainer.limit_val_batches, int) \ + else (max_train_steps // self.trainer.val_check_interval + 1) test_iters = self.trainer.limit_test_batches # Add extra FIM tokens to tokenizer From 0c7fd6e351b256d1f1ba2fbd2ce520c31b19ffa2 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Wed, 14 Feb 2024 15:05:59 -0800 Subject: [PATCH 05/14] Return len(dataloader) in microbatches Signed-off-by: Abhishree --- .../language_modeling/megatron/data_samplers.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py b/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py index f977846477b0..6818f99d0e4f 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py @@ -81,9 +81,12 @@ def __len__(self): num_available_samples: int = self.total_samples - self.consumed_samples if self.global_batch_size is not None: if self.drop_last: - return num_available_samples // self.global_batch_size + num_global_batches = num_available_samples // self.global_batch_size else: - return (num_available_samples + self.global_batch_size - 1) // self.global_batch_size + num_global_batches = (num_available_samples + self.global_batch_size - 1) // self.global_batch_size + # return len of dataloader in terms of micro batches to avoid discrepancy between len of dataloader and + # num of batches fetched (as training step fetches in terms of micro batches) + return num_global_batches * (self.global_batch_size // self.micro_batch_times_data_parallel_size) else: return (num_available_samples - 1) // self.micro_batch_times_data_parallel_size + 1 @@ -162,9 +165,12 @@ def __len__(self): num_available_samples = active_total_samples - self.consumed_samples % active_total_samples if self.global_batch_size is not None: if self.drop_last: - return num_available_samples // self.global_batch_size + num_global_batches = num_available_samples // self.global_batch_size else: - return (num_available_samples + self.global_batch_size - 1) // self.global_batch_size + num_global_batches = (num_available_samples + self.global_batch_size - 1) // self.global_batch_size + # return len of dataloader in terms of micro batches to avoid discrepancy between len of dataloader and + # num of batches fetched (as training step fetches in terms of micro batches) + return num_global_batches * (self.global_batch_size // self.micro_batch_times_data_parallel_size) else: if self.drop_last: return num_available_samples // 
self.micro_batch_times_data_parallel_size From 85e22e7508b4ffc9ea7ce7302dec1f5284f7557b Mon Sep 17 00:00:00 2001 From: Abhishree Date: Wed, 14 Feb 2024 16:51:09 -0800 Subject: [PATCH 06/14] Add back resetting of num val samples Signed-off-by: Abhishree --- .../nlp/models/language_modeling/megatron_gpt_model.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 30d5f3d69603..4404baa4cf55 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1193,6 +1193,11 @@ def build_train_valid_test_datasets(self): test_iters * global_batch_size, ] + if self.trainer.limit_val_batches <= 1.0 and isinstance(self.trainer.limit_val_batches, float): + train_valid_test_num_samples[ + 1 + ] = 1 # This is to make sure we only have one epoch on every validation iteration + kwargs = { "is_built_on_rank": is_dataset_built_on_rank, "random_seed": self.cfg.seed, From 5ad42ece2f09e55b5bbbe53d8c07a513d0b52e99 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Wed, 14 Feb 2024 18:36:34 -0800 Subject: [PATCH 07/14] Fix to ensure float limit_val_batches is multiple of num_micro_batches Signed-off-by: Abhishree --- .../models/language_modeling/megatron_base_model.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 9772599fbbdd..5004b94a190c 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -329,8 +329,9 @@ def _reconfigure_val_batches(self): else: assert isinstance(self.trainer.limit_val_batches, float) if self._validation_ds is not None and len(self._validation_dl) != float("inf"): - num_val_batches = int(len(self._validation_dl) * self.trainer.limit_val_batches) - if num_val_batches == 0 and self.trainer.limit_val_batches > 0.0: + # len(self._validation_dl) returns len as num of microbatches + limit_val_micro_batches = int(len(self._validation_dl) * self.trainer.limit_val_batches) + if limit_val_micro_batches == 0 and self.trainer.limit_val_batches > 0.0: min_percentage = 1.0 / len(self._validation_dl) raise MisconfigurationException( f"You requested to check {self.trainer.limit_val_batches} of the val_dataloader but" @@ -338,8 +339,11 @@ def _reconfigure_val_batches(self): f" `limit_val_batches` argument. 
Try at least" f" `limit_val_batches={min_percentage}`" ) - - self.trainer.limit_val_batches = num_val_batches * get_num_microbatches() + # Make sure trainer.limit_val_batches is a multiple of num of microbatches + if limit_val_micro_batches < get_num_microbatches(): + self.trainer.limit_val_batches = get_num_microbatches() + else: + self.trainer.limit_val_batches = limit_val_micro_batches - limit_val_micro_batches % get_num_microbatches() # Override num sanity steps to be a multiple of num of microbatches self.trainer.num_sanity_val_steps *= get_num_microbatches() From c76cf6dfb235eb1b089a7b3257711a3699317151 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Thu, 15 Feb 2024 17:41:30 -0800 Subject: [PATCH 08/14] Remove forcing eval samples to 1 for float limit_val_batches Signed-off-by: Abhishree --- .../nlp/models/language_modeling/megatron_gpt_model.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 4404baa4cf55..30d5f3d69603 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1193,11 +1193,6 @@ def build_train_valid_test_datasets(self): test_iters * global_batch_size, ] - if self.trainer.limit_val_batches <= 1.0 and isinstance(self.trainer.limit_val_batches, float): - train_valid_test_num_samples[ - 1 - ] = 1 # This is to make sure we only have one epoch on every validation iteration - kwargs = { "is_built_on_rank": is_dataset_built_on_rank, "random_seed": self.cfg.seed, From b34e70899c8b15088dbb53b8445ebb6d98d6002b Mon Sep 17 00:00:00 2001 From: Abhishree Date: Fri, 16 Feb 2024 15:13:10 -0800 Subject: [PATCH 09/14] Fix bug wrt 0 limiot_val_batches Signed-off-by: Abhishree --- .../nlp/models/language_modeling/megatron_base_model.py | 2 ++ .../nlp/models/language_modeling/megatron_gpt_model.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 5004b94a190c..545aa60e1b9b 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -328,6 +328,8 @@ def _reconfigure_val_batches(self): self.trainer.limit_val_batches *= get_num_microbatches() else: assert isinstance(self.trainer.limit_val_batches, float) + # Don't reconfigure if limit_val_batches is 0.0 + if self.trainer.limit_val_batches == 0.0: return if self._validation_ds is not None and len(self._validation_dl) != float("inf"): # len(self._validation_dl) returns len as num of microbatches limit_val_micro_batches = int(len(self._validation_dl) * self.trainer.limit_val_batches) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 30d5f3d69603..e0e1882118d1 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1177,7 +1177,8 @@ def build_train_valid_test_datasets(self): logging.info('Building GPT datasets.') global_batch_size = self.cfg.global_batch_size max_train_steps = self.trainer.max_steps - eval_iters = (max_train_steps // self.trainer.val_check_interval + 1) * self.trainer.limit_val_batches if 
isinstance(self.trainer.limit_val_batches, int) \ + # if limit_val_batches is 0, don't use it for computing eval samples, as it can cause error in building the dataset with 0 samples + eval_iters = (max_train_steps // self.trainer.val_check_interval + 1) * self.trainer.limit_val_batches if isinstance(self.trainer.limit_val_batches, int) and self.trainer.limit_val_batches > 0 \ else (max_train_steps // self.trainer.val_check_interval + 1) test_iters = self.trainer.limit_test_batches From 61e23fa158d0eee432f63456253bdac8594b4f13 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Mon, 19 Feb 2024 19:57:57 -0800 Subject: [PATCH 10/14] Add missing mock_dataset line Signed-off-by: Abhishree --- .../nlp/models/language_modeling/megatron_gpt_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index e0e1882118d1..955bd94b93d5 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1194,6 +1194,7 @@ def build_train_valid_test_datasets(self): test_iters * global_batch_size, ] + mock_dataset = self.cfg.data.get("mock_dataset", False) kwargs = { "is_built_on_rank": is_dataset_built_on_rank, "random_seed": self.cfg.seed, From 35c4c7d77d2ce1f05e7ac25218b53d265b15670b Mon Sep 17 00:00:00 2001 From: Abhishree Date: Tue, 20 Feb 2024 19:45:34 -0800 Subject: [PATCH 11/14] Avoid ensuring limit_val_batches is a mutliple of microbatches for 1.0 Signed-off-by: Abhishree --- .../language_modeling/megatron_base_model.py | 32 +++++++++++-------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 545aa60e1b9b..ed37ae32fbdb 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -330,22 +330,26 @@ def _reconfigure_val_batches(self): assert isinstance(self.trainer.limit_val_batches, float) # Don't reconfigure if limit_val_batches is 0.0 if self.trainer.limit_val_batches == 0.0: return + # len(self._validation_dl) returns len as num of microbatches + val_len_in_micro_batches = len(self._validation_dl) if self._validation_ds is not None and len(self._validation_dl) != float("inf"): - # len(self._validation_dl) returns len as num of microbatches - limit_val_micro_batches = int(len(self._validation_dl) * self.trainer.limit_val_batches) - if limit_val_micro_batches == 0 and self.trainer.limit_val_batches > 0.0: - min_percentage = 1.0 / len(self._validation_dl) - raise MisconfigurationException( - f"You requested to check {self.trainer.limit_val_batches} of the val_dataloader but" - f" {self.trainer.limit_val_batches} * {len(self._validation_dl)} < 1. Please increase the" - f" `limit_val_batches` argument. 
Try at least" - f" `limit_val_batches={min_percentage}`" - ) - # Make sure trainer.limit_val_batches is a multiple of num of microbatches - if limit_val_micro_batches < get_num_microbatches(): - self.trainer.limit_val_batches = get_num_microbatches() + if self.trainer.limit_val_batches == 1.0: + self.trainer.limit_val_batches = val_len_in_micro_batches else: - self.trainer.limit_val_batches = limit_val_micro_batches - limit_val_micro_batches % get_num_microbatches() + limit_val_micro_batches = int(val_len_in_micro_batches * self.trainer.limit_val_batches) + if limit_val_micro_batches == 0 and self.trainer.limit_val_batches > 0.0: + min_percentage = 1.0 / len(self._validation_dl) + raise MisconfigurationException( + f"You requested to check {self.trainer.limit_val_batches} of the val_dataloader but" + f" {self.trainer.limit_val_batches} * {len(self._validation_dl)} < 1. Please increase the" + f" `limit_val_batches` argument. Try at least" + f" `limit_val_batches={min_percentage}`" + ) + # Make sure trainer.limit_val_batches is a multiple of num of microbatches + if limit_val_micro_batches < get_num_microbatches(): + self.trainer.limit_val_batches = get_num_microbatches() + else: + self.trainer.limit_val_batches = limit_val_micro_batches - limit_val_micro_batches % get_num_microbatches() # Override num sanity steps to be a multiple of num of microbatches self.trainer.num_sanity_val_steps *= get_num_microbatches() From 70fdf348a2ec3d61de75b28a0897b93cef5c2c26 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 21 Feb 2024 03:47:33 +0000 Subject: [PATCH 12/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../nlp/models/language_modeling/megatron_base_model.py | 9 ++++++--- .../nlp/models/language_modeling/megatron_gpt_model.py | 7 +++++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index ed37ae32fbdb..6a2ea80ec764 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -329,7 +329,8 @@ def _reconfigure_val_batches(self): else: assert isinstance(self.trainer.limit_val_batches, float) # Don't reconfigure if limit_val_batches is 0.0 - if self.trainer.limit_val_batches == 0.0: return + if self.trainer.limit_val_batches == 0.0: + return # len(self._validation_dl) returns len as num of microbatches val_len_in_micro_batches = len(self._validation_dl) if self._validation_ds is not None and len(self._validation_dl) != float("inf"): @@ -344,12 +345,14 @@ def _reconfigure_val_batches(self): f" {self.trainer.limit_val_batches} * {len(self._validation_dl)} < 1. Please increase the" f" `limit_val_batches` argument. 
Try at least" f" `limit_val_batches={min_percentage}`" - ) + ) # Make sure trainer.limit_val_batches is a multiple of num of microbatches if limit_val_micro_batches < get_num_microbatches(): self.trainer.limit_val_batches = get_num_microbatches() else: - self.trainer.limit_val_batches = limit_val_micro_batches - limit_val_micro_batches % get_num_microbatches() + self.trainer.limit_val_batches = ( + limit_val_micro_batches - limit_val_micro_batches % get_num_microbatches() + ) # Override num sanity steps to be a multiple of num of microbatches self.trainer.num_sanity_val_steps *= get_num_microbatches() diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 955bd94b93d5..92644e9f865f 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1178,8 +1178,11 @@ def build_train_valid_test_datasets(self): global_batch_size = self.cfg.global_batch_size max_train_steps = self.trainer.max_steps # if limit_val_batches is 0, don't use it for computing eval samples, as it can cause error in building the dataset with 0 samples - eval_iters = (max_train_steps // self.trainer.val_check_interval + 1) * self.trainer.limit_val_batches if isinstance(self.trainer.limit_val_batches, int) and self.trainer.limit_val_batches > 0 \ - else (max_train_steps // self.trainer.val_check_interval + 1) + eval_iters = ( + (max_train_steps // self.trainer.val_check_interval + 1) * self.trainer.limit_val_batches + if isinstance(self.trainer.limit_val_batches, int) and self.trainer.limit_val_batches > 0 + else (max_train_steps // self.trainer.val_check_interval + 1) + ) test_iters = self.trainer.limit_test_batches # Add extra FIM tokens to tokenizer From 49f6148fa208bb7471ca7fe5c2827efb25da9c8e Mon Sep 17 00:00:00 2001 From: Jan Baczek Date: Thu, 22 Feb 2024 19:01:05 +0100 Subject: [PATCH 13/14] Restore the hack forcing number of validation and test epochs to 1 Signed-off-by: Jan Baczek --- .../language_modeling/megatron_gpt_model.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 92644e9f865f..950ce534e9bc 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1177,13 +1177,6 @@ def build_train_valid_test_datasets(self): logging.info('Building GPT datasets.') global_batch_size = self.cfg.global_batch_size max_train_steps = self.trainer.max_steps - # if limit_val_batches is 0, don't use it for computing eval samples, as it can cause error in building the dataset with 0 samples - eval_iters = ( - (max_train_steps // self.trainer.val_check_interval + 1) * self.trainer.limit_val_batches - if isinstance(self.trainer.limit_val_batches, int) and self.trainer.limit_val_batches > 0 - else (max_train_steps // self.trainer.val_check_interval + 1) - ) - test_iters = self.trainer.limit_test_batches # Add extra FIM tokens to tokenizer if self.cfg.data.get('add_fim', False) and self.cfg.tokenizer.library == 'megatron': @@ -1191,11 +1184,12 @@ def build_train_valid_test_datasets(self): fim_tokens = [fim_tokens.prefix, fim_tokens.middle, fim_tokens.suffix, fim_tokens.pad, fim_tokens.eod] self.tokenizer.add_special_tokens({'additional_special_tokens': 
fim_tokens}) - train_valid_test_num_samples = [ - max_train_steps * global_batch_size, - eval_iters * global_batch_size, - test_iters * global_batch_size, - ] + # The line below exploits a quirk in mcore dataset construction, to make number of epochs for validation and test equal to 1 + # The mcore dataset implementation uses the number N we provide via train_valid_test_num_samples to derive parameter E such that + # E = argmin_e e * N_d >= N, or equivalently E = ceildiv(N, N_d) + # Where N_d is the total number of samples in a dataset (files), and N is the requested number of samples (provided for every split in the list below). + # Setting N = 1 we force E to be 1 as well + train_valid_test_num_samples = [max_train_steps * global_batch_size, 1, 1] mock_dataset = self.cfg.data.get("mock_dataset", False) kwargs = { From 90931f1d022744345f4ebefe3fcc912f16723211 Mon Sep 17 00:00:00 2001 From: Jan Baczek Date: Fri, 23 Feb 2024 15:03:09 +0100 Subject: [PATCH 14/14] Change limit_val_batches to 1.0 for GPT pretraining test. The integer value is covered in other tests Signed-off-by: Jan Baczek --- Jenkinsfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 5d81a57c04c9..4f9220da1fc6 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -3485,7 +3485,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' trainer.accelerator=gpu \ trainer.log_every_n_steps=1 \ trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ + trainer.limit_val_batches=1.0 \ trainer.accumulate_grad_batches=1 \ trainer.max_steps=3 \ trainer.precision=16 \ @@ -3520,7 +3520,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' trainer.accelerator=gpu \ trainer.log_every_n_steps=1 \ trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ + trainer.limit_val_batches=1.0 \ trainer.accumulate_grad_batches=1 \ trainer.max_steps=6 \ trainer.precision=16 \
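
For reference, a minimal standalone sketch of the limit_val_batches reconfiguration that the series converges to in PATCH 11/12. This is illustrative only: plain arguments (val_len_in_microbatches, num_microbatches) stand in for the trainer, dataloader, and apex state that the real _reconfigure_val_batches reads, ValueError stands in for Lightning's MisconfigurationException, and the guard for an unsized/infinite validation dataloader is omitted.

def reconfigure_limit_val_batches(limit_val_batches, val_len_in_microbatches, num_microbatches):
    # Integer limits count global batches, so scale them up to the equivalent
    # number of microbatches.
    if isinstance(limit_val_batches, int):
        return limit_val_batches * num_microbatches

    # Float limits: 0.0 disables validation and is left untouched; 1.0 means the
    # full dataloader, whose __len__ is already expressed in microbatches (PATCH 05).
    if limit_val_batches == 0.0:
        return limit_val_batches
    if limit_val_batches == 1.0:
        return val_len_in_microbatches

    limit_val_micro_batches = int(val_len_in_microbatches * limit_val_batches)
    if limit_val_micro_batches == 0 and limit_val_batches > 0.0:
        min_percentage = 1.0 / val_len_in_microbatches
        raise ValueError(
            f"limit_val_batches={limit_val_batches} selects less than one batch of the "
            f"val_dataloader; try at least limit_val_batches={min_percentage}"
        )

    # Round down to a multiple of the number of microbatches so a validation step
    # never stops part-way through a global batch.
    if limit_val_micro_batches < num_microbatches:
        return num_microbatches
    return limit_val_micro_batches - limit_val_micro_batches % num_microbatches

# Example: 100 validation microbatches, 4 microbatches per global batch,
# limit_val_batches=0.1 -> int(100 * 0.1) = 10 -> rounded down to 8 microbatches.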
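
The rounding above relies on PATCH 05, which makes the Megatron batch samplers report their length in microbatches rather than global batches. A sketch of that length computation, with the sampler state passed in as plain arguments mirroring the attributes used in the patch:

def sampler_len_in_microbatches(total_samples, consumed_samples, global_batch_size,
                                micro_batch_times_data_parallel_size, drop_last=True):
    num_available_samples = total_samples - consumed_samples
    if global_batch_size is not None:
        if drop_last:
            num_global_batches = num_available_samples // global_batch_size
        else:
            num_global_batches = (num_available_samples + global_batch_size - 1) // global_batch_size
        # Report the length in microbatches so len(dataloader) matches the number of
        # batches a training/validation step actually fetches.
        return num_global_batches * (global_batch_size // micro_batch_times_data_parallel_size)
    if drop_last:
        return num_available_samples // micro_batch_times_data_parallel_size
    return (num_available_samples - 1) // micro_batch_times_data_parallel_size + 1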