From bfa1df6ff4983c8a1ff7e4134a0ea30a46a63f17 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Fri, 8 Dec 2023 08:29:02 -0800 Subject: [PATCH 01/14] Handle float limit_val_batches Signed-off-by: Abhishree --- .../language_modeling/megatron_base_model.py | 15 +++++++++++++-- .../language_modeling/megatron_gpt_model.py | 6 +++--- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 5321a307b2c4..e0a4b7281013 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -14,6 +14,7 @@ import gc import itertools +import math import os import re from dataclasses import fields @@ -46,7 +47,7 @@ from nemo.utils.get_rank import is_global_rank_zero try: - from apex.transformer.pipeline_parallel.utils import get_num_microbatches + from apex.transformer.pipeline_parallel.utils import get_micro_batch_size, get_num_microbatches HAVE_APEX = True @@ -322,9 +323,19 @@ def _reconfigure_val_batches(self): """ Reconfigure trainer.limit_val_batches for pretraining """ + # Override limit_val_batches to be a multiple of num microbatches and so there are limit_val_batches//num_micro_batches num of global batches if isinstance(self.trainer.limit_val_batches, int): - # Override limit_val_batches to be a multiple of num microbatches and so there are limit_val_batches//num_micro_batches num of global batches self.trainer.limit_val_batches *= get_num_microbatches() + else: + assert isinstance(self.trainer.limit_val_batches, float) + if self._validation_ds is not None: + val_length = len(self._validation_ds) + if not math.isinf(val_length): + mb_times_dp = get_micro_batch_size() * parallel_state.get_data_parallel_world_size() + total_val_microbatches = val_length // mb_times_dp + limit_val_batches = int(self.trainer.limit_val_batches * total_val_microbatches) + self.trainer.limit_val_batches = limit_val_batches - limit_val_batches % get_num_microbatches() + # Override num sanity steps to be a multiple of num of microbatches self.trainer.num_sanity_val_steps *= get_num_microbatches() diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 752696ac8faa..3337e14e6578 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1172,11 +1172,9 @@ def loss_func(self, loss_mask, num_valid_tokens_in_ub, output_tensor): return loss def build_train_valid_test_datasets(self): - # Override limit_val_batches to be a multiple of num microbatches to prevent val_step from exiting in between a step - self._reconfigure_val_batches() - logging.info('Building GPT datasets.') if self.trainer.limit_val_batches > 1.0 and isinstance(self.trainer.limit_val_batches, float): raise ValueError("limit_val_batches must be an integer or float less than or equal to 1.0.") + logging.info('Building GPT datasets.') global_batch_size = self.cfg.global_batch_size max_train_steps = self.trainer.max_steps eval_iters = (max_train_steps // self.trainer.val_check_interval + 1) * self.trainer.limit_val_batches @@ -1240,6 +1238,8 @@ def build_train_valid_test_datasets(self): if self._test_ds is not None: logging.info(f'Length of test dataset: {len(self._test_ds)}') logging.info(f'Finished building GPT datasets.') + # 
Override limit_val_batches to be a multiple of num microbatches to prevent val_step from exiting in between a step + self._reconfigure_val_batches() return self._train_ds, self._validation_ds, self._test_ds From d859e677c548a721ebf2e2d787616dcaec0c0fa5 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Wed, 27 Dec 2023 10:37:33 -0800 Subject: [PATCH 02/14] Rectify reconfiguration of float limit_val_batches Signed-off-by: Abhishree --- .../models/language_modeling/megatron_base_model.py | 11 +++++------ .../models/language_modeling/megatron_gpt_model.py | 10 ++-------- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index e0a4b7281013..7d899e108506 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -28,6 +28,7 @@ from pytorch_lightning.plugins.precision import MixedPrecisionPlugin from pytorch_lightning.trainer.connectors.logger_connector.fx_validator import _FxValidator from pytorch_lightning.trainer.trainer import Trainer +from pytorch_lightning.utilities.exceptions import MisconfigurationException from nemo.collections.nlp.models.nlp_model import NLPModel from nemo.collections.nlp.modules.common.megatron.attention import HAVE_FLASH_ATTENTION @@ -329,12 +330,10 @@ def _reconfigure_val_batches(self): else: assert isinstance(self.trainer.limit_val_batches, float) if self._validation_ds is not None: - val_length = len(self._validation_ds) - if not math.isinf(val_length): - mb_times_dp = get_micro_batch_size() * parallel_state.get_data_parallel_world_size() - total_val_microbatches = val_length // mb_times_dp - limit_val_batches = int(self.trainer.limit_val_batches * total_val_microbatches) - self.trainer.limit_val_batches = limit_val_batches - limit_val_batches % get_num_microbatches() + # limit_val_batches is already incorporated in the calculation of eval_iters which is used to compute the samples in dataloader. 
+ # Hence not necessary to scale num_val_batches with the float value of limit_val_batches + num_val_batches = len(self._validation_dl) + self.trainer.limit_val_batches = num_val_batches * get_num_microbatches() # Override num sanity steps to be a multiple of num of microbatches self.trainer.num_sanity_val_steps *= get_num_microbatches() diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 3337e14e6578..c03413810749 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1192,12 +1192,6 @@ def build_train_valid_test_datasets(self): test_iters * global_batch_size, ] - if self.trainer.limit_val_batches <= 1.0 and isinstance(self.trainer.limit_val_batches, float): - train_valid_test_num_samples[ - 1 - ] = 1 # This is to make sure we only have one epoch on every validation iteration - - mock_dataset = self.cfg.data.get("mock_dataset", False) kwargs = { "is_built_on_rank": is_dataset_built_on_rank, "random_seed": self.cfg.seed, @@ -1238,8 +1232,6 @@ def build_train_valid_test_datasets(self): if self._test_ds is not None: logging.info(f'Length of test dataset: {len(self._test_ds)}') logging.info(f'Finished building GPT datasets.') - # Override limit_val_batches to be a multiple of num microbatches to prevent val_step from exiting in between a step - self._reconfigure_val_batches() return self._train_ds, self._validation_ds, self._test_ds @@ -1325,6 +1317,8 @@ def setup(self, stage=None): self.setup_training_data(self.cfg.data) self.setup_validation_data(self.cfg.data) self.setup_test_data(self.cfg.data) + # Override limit_val_batches to be a multiple of num microbatches to prevent val_step from exiting in between a step + self._reconfigure_val_batches() if stage == 'fit': self.initialize_last_rank_embeddings() From e39a211300ad7a110d6b4da89de2f4cf3f69f949 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Wed, 27 Dec 2023 11:37:27 -0800 Subject: [PATCH 03/14] Remove unused imports Signed-off-by: Abhishree --- .../nlp/models/language_modeling/megatron_base_model.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 7d899e108506..b5a2b35672e8 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -14,7 +14,6 @@ import gc import itertools -import math import os import re from dataclasses import fields @@ -28,7 +27,6 @@ from pytorch_lightning.plugins.precision import MixedPrecisionPlugin from pytorch_lightning.trainer.connectors.logger_connector.fx_validator import _FxValidator from pytorch_lightning.trainer.trainer import Trainer -from pytorch_lightning.utilities.exceptions import MisconfigurationException from nemo.collections.nlp.models.nlp_model import NLPModel from nemo.collections.nlp.modules.common.megatron.attention import HAVE_FLASH_ATTENTION @@ -48,7 +46,7 @@ from nemo.utils.get_rank import is_global_rank_zero try: - from apex.transformer.pipeline_parallel.utils import get_micro_batch_size, get_num_microbatches + from apex.transformer.pipeline_parallel.utils import get_num_microbatches HAVE_APEX = True From a1be0651bb7de0e7af3be18c665a82b21af7eca4 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Wed, 27 Dec 2023 18:20:22 -0800 
Subject: [PATCH 04/14] Scale len(val_dataloader) with float limit_val_batches Signed-off-by: Abhishree --- .../language_modeling/megatron_base_model.py | 16 ++++++++++++---- .../language_modeling/megatron_gpt_model.py | 3 ++- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index b5a2b35672e8..9772599fbbdd 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -27,6 +27,7 @@ from pytorch_lightning.plugins.precision import MixedPrecisionPlugin from pytorch_lightning.trainer.connectors.logger_connector.fx_validator import _FxValidator from pytorch_lightning.trainer.trainer import Trainer +from pytorch_lightning.utilities.exceptions import MisconfigurationException from nemo.collections.nlp.models.nlp_model import NLPModel from nemo.collections.nlp.modules.common.megatron.attention import HAVE_FLASH_ATTENTION @@ -327,10 +328,17 @@ def _reconfigure_val_batches(self): self.trainer.limit_val_batches *= get_num_microbatches() else: assert isinstance(self.trainer.limit_val_batches, float) - if self._validation_ds is not None: - # limit_val_batches is already incorporated in the calculation of eval_iters which is used to compute the samples in dataloader. - # Hence not necessary to scale num_val_batches with the float value of limit_val_batches - num_val_batches = len(self._validation_dl) + if self._validation_ds is not None and len(self._validation_dl) != float("inf"): + num_val_batches = int(len(self._validation_dl) * self.trainer.limit_val_batches) + if num_val_batches == 0 and self.trainer.limit_val_batches > 0.0: + min_percentage = 1.0 / len(self._validation_dl) + raise MisconfigurationException( + f"You requested to check {self.trainer.limit_val_batches} of the val_dataloader but" + f" {self.trainer.limit_val_batches} * {len(self._validation_dl)} < 1. Please increase the" + f" `limit_val_batches` argument. 
Try at least" + f" `limit_val_batches={min_percentage}`" + ) + self.trainer.limit_val_batches = num_val_batches * get_num_microbatches() # Override num sanity steps to be a multiple of num of microbatches diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index c03413810749..30d5f3d69603 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1177,7 +1177,8 @@ def build_train_valid_test_datasets(self): logging.info('Building GPT datasets.') global_batch_size = self.cfg.global_batch_size max_train_steps = self.trainer.max_steps - eval_iters = (max_train_steps // self.trainer.val_check_interval + 1) * self.trainer.limit_val_batches + eval_iters = (max_train_steps // self.trainer.val_check_interval + 1) * self.trainer.limit_val_batches if isinstance(self.trainer.limit_val_batches, int) \ + else (max_train_steps // self.trainer.val_check_interval + 1) test_iters = self.trainer.limit_test_batches # Add extra FIM tokens to tokenizer From 0c7fd6e351b256d1f1ba2fbd2ce520c31b19ffa2 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Wed, 14 Feb 2024 15:05:59 -0800 Subject: [PATCH 05/14] Return len(dataloader) in microbatches Signed-off-by: Abhishree --- .../language_modeling/megatron/data_samplers.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py b/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py index f977846477b0..6818f99d0e4f 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/data_samplers.py @@ -81,9 +81,12 @@ def __len__(self): num_available_samples: int = self.total_samples - self.consumed_samples if self.global_batch_size is not None: if self.drop_last: - return num_available_samples // self.global_batch_size + num_global_batches = num_available_samples // self.global_batch_size else: - return (num_available_samples + self.global_batch_size - 1) // self.global_batch_size + num_global_batches = (num_available_samples + self.global_batch_size - 1) // self.global_batch_size + # return len of dataloader in terms of micro batches to avoid discrepancy between len of dataloader and + # num of batches fetched (as training step fetches in terms of micro batches) + return num_global_batches * (self.global_batch_size // self.micro_batch_times_data_parallel_size) else: return (num_available_samples - 1) // self.micro_batch_times_data_parallel_size + 1 @@ -162,9 +165,12 @@ def __len__(self): num_available_samples = active_total_samples - self.consumed_samples % active_total_samples if self.global_batch_size is not None: if self.drop_last: - return num_available_samples // self.global_batch_size + num_global_batches = num_available_samples // self.global_batch_size else: - return (num_available_samples + self.global_batch_size - 1) // self.global_batch_size + num_global_batches = (num_available_samples + self.global_batch_size - 1) // self.global_batch_size + # return len of dataloader in terms of micro batches to avoid discrepancy between len of dataloader and + # num of batches fetched (as training step fetches in terms of micro batches) + return num_global_batches * (self.global_batch_size // self.micro_batch_times_data_parallel_size) else: if self.drop_last: return num_available_samples // 
self.micro_batch_times_data_parallel_size From 85e22e7508b4ffc9ea7ce7302dec1f5284f7557b Mon Sep 17 00:00:00 2001 From: Abhishree Date: Wed, 14 Feb 2024 16:51:09 -0800 Subject: [PATCH 06/14] Add back resetting of num val samples Signed-off-by: Abhishree --- .../nlp/models/language_modeling/megatron_gpt_model.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 30d5f3d69603..4404baa4cf55 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1193,6 +1193,11 @@ def build_train_valid_test_datasets(self): test_iters * global_batch_size, ] + if self.trainer.limit_val_batches <= 1.0 and isinstance(self.trainer.limit_val_batches, float): + train_valid_test_num_samples[ + 1 + ] = 1 # This is to make sure we only have one epoch on every validation iteration + kwargs = { "is_built_on_rank": is_dataset_built_on_rank, "random_seed": self.cfg.seed, From 5ad42ece2f09e55b5bbbe53d8c07a513d0b52e99 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Wed, 14 Feb 2024 18:36:34 -0800 Subject: [PATCH 07/14] Fix to ensure float limit_val_batches is multiple of num_micro_batches Signed-off-by: Abhishree --- .../models/language_modeling/megatron_base_model.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 9772599fbbdd..5004b94a190c 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -329,8 +329,9 @@ def _reconfigure_val_batches(self): else: assert isinstance(self.trainer.limit_val_batches, float) if self._validation_ds is not None and len(self._validation_dl) != float("inf"): - num_val_batches = int(len(self._validation_dl) * self.trainer.limit_val_batches) - if num_val_batches == 0 and self.trainer.limit_val_batches > 0.0: + # len(self._validation_dl) returns len as num of microbatches + limit_val_micro_batches = int(len(self._validation_dl) * self.trainer.limit_val_batches) + if limit_val_micro_batches == 0 and self.trainer.limit_val_batches > 0.0: min_percentage = 1.0 / len(self._validation_dl) raise MisconfigurationException( f"You requested to check {self.trainer.limit_val_batches} of the val_dataloader but" @@ -338,8 +339,11 @@ def _reconfigure_val_batches(self): f" `limit_val_batches` argument. 
Try at least" f" `limit_val_batches={min_percentage}`" ) - - self.trainer.limit_val_batches = num_val_batches * get_num_microbatches() + # Make sure trainer.limit_val_batches is a multiple of num of microbatches + if limit_val_micro_batches < get_num_microbatches(): + self.trainer.limit_val_batches = get_num_microbatches() + else: + self.trainer.limit_val_batches = limit_val_micro_batches - limit_val_micro_batches % get_num_microbatches() # Override num sanity steps to be a multiple of num of microbatches self.trainer.num_sanity_val_steps *= get_num_microbatches() From c76cf6dfb235eb1b089a7b3257711a3699317151 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Thu, 15 Feb 2024 17:41:30 -0800 Subject: [PATCH 08/14] Remove forcing eval samples to 1 for float limit_val_batches Signed-off-by: Abhishree --- .../nlp/models/language_modeling/megatron_gpt_model.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 4404baa4cf55..30d5f3d69603 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1193,11 +1193,6 @@ def build_train_valid_test_datasets(self): test_iters * global_batch_size, ] - if self.trainer.limit_val_batches <= 1.0 and isinstance(self.trainer.limit_val_batches, float): - train_valid_test_num_samples[ - 1 - ] = 1 # This is to make sure we only have one epoch on every validation iteration - kwargs = { "is_built_on_rank": is_dataset_built_on_rank, "random_seed": self.cfg.seed, From b34e70899c8b15088dbb53b8445ebb6d98d6002b Mon Sep 17 00:00:00 2001 From: Abhishree Date: Fri, 16 Feb 2024 15:13:10 -0800 Subject: [PATCH 09/14] Fix bug wrt 0 limiot_val_batches Signed-off-by: Abhishree --- .../nlp/models/language_modeling/megatron_base_model.py | 2 ++ .../nlp/models/language_modeling/megatron_gpt_model.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 5004b94a190c..545aa60e1b9b 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -328,6 +328,8 @@ def _reconfigure_val_batches(self): self.trainer.limit_val_batches *= get_num_microbatches() else: assert isinstance(self.trainer.limit_val_batches, float) + # Don't reconfigure if limit_val_batches is 0.0 + if self.trainer.limit_val_batches == 0.0: return if self._validation_ds is not None and len(self._validation_dl) != float("inf"): # len(self._validation_dl) returns len as num of microbatches limit_val_micro_batches = int(len(self._validation_dl) * self.trainer.limit_val_batches) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 30d5f3d69603..e0e1882118d1 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1177,7 +1177,8 @@ def build_train_valid_test_datasets(self): logging.info('Building GPT datasets.') global_batch_size = self.cfg.global_batch_size max_train_steps = self.trainer.max_steps - eval_iters = (max_train_steps // self.trainer.val_check_interval + 1) * self.trainer.limit_val_batches if 
isinstance(self.trainer.limit_val_batches, int) \ + # if limit_val_batches is 0, don't use it for computing eval samples, as it can cause error in building the dataset with 0 samples + eval_iters = (max_train_steps // self.trainer.val_check_interval + 1) * self.trainer.limit_val_batches if isinstance(self.trainer.limit_val_batches, int) and self.trainer.limit_val_batches > 0 \ else (max_train_steps // self.trainer.val_check_interval + 1) test_iters = self.trainer.limit_test_batches From 61e23fa158d0eee432f63456253bdac8594b4f13 Mon Sep 17 00:00:00 2001 From: Abhishree Date: Mon, 19 Feb 2024 19:57:57 -0800 Subject: [PATCH 10/14] Add missing mock_dataset line Signed-off-by: Abhishree --- .../nlp/models/language_modeling/megatron_gpt_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index e0e1882118d1..955bd94b93d5 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1194,6 +1194,7 @@ def build_train_valid_test_datasets(self): test_iters * global_batch_size, ] + mock_dataset = self.cfg.data.get("mock_dataset", False) kwargs = { "is_built_on_rank": is_dataset_built_on_rank, "random_seed": self.cfg.seed, From 35c4c7d77d2ce1f05e7ac25218b53d265b15670b Mon Sep 17 00:00:00 2001 From: Abhishree Date: Tue, 20 Feb 2024 19:45:34 -0800 Subject: [PATCH 11/14] Avoid ensuring limit_val_batches is a mutliple of microbatches for 1.0 Signed-off-by: Abhishree --- .../language_modeling/megatron_base_model.py | 32 +++++++++++-------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 545aa60e1b9b..ed37ae32fbdb 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -330,22 +330,26 @@ def _reconfigure_val_batches(self): assert isinstance(self.trainer.limit_val_batches, float) # Don't reconfigure if limit_val_batches is 0.0 if self.trainer.limit_val_batches == 0.0: return + # len(self._validation_dl) returns len as num of microbatches + val_len_in_micro_batches = len(self._validation_dl) if self._validation_ds is not None and len(self._validation_dl) != float("inf"): - # len(self._validation_dl) returns len as num of microbatches - limit_val_micro_batches = int(len(self._validation_dl) * self.trainer.limit_val_batches) - if limit_val_micro_batches == 0 and self.trainer.limit_val_batches > 0.0: - min_percentage = 1.0 / len(self._validation_dl) - raise MisconfigurationException( - f"You requested to check {self.trainer.limit_val_batches} of the val_dataloader but" - f" {self.trainer.limit_val_batches} * {len(self._validation_dl)} < 1. Please increase the" - f" `limit_val_batches` argument. 
Try at least" - f" `limit_val_batches={min_percentage}`" - ) - # Make sure trainer.limit_val_batches is a multiple of num of microbatches - if limit_val_micro_batches < get_num_microbatches(): - self.trainer.limit_val_batches = get_num_microbatches() + if self.trainer.limit_val_batches == 1.0: + self.trainer.limit_val_batches = val_len_in_micro_batches else: - self.trainer.limit_val_batches = limit_val_micro_batches - limit_val_micro_batches % get_num_microbatches() + limit_val_micro_batches = int(val_len_in_micro_batches * self.trainer.limit_val_batches) + if limit_val_micro_batches == 0 and self.trainer.limit_val_batches > 0.0: + min_percentage = 1.0 / len(self._validation_dl) + raise MisconfigurationException( + f"You requested to check {self.trainer.limit_val_batches} of the val_dataloader but" + f" {self.trainer.limit_val_batches} * {len(self._validation_dl)} < 1. Please increase the" + f" `limit_val_batches` argument. Try at least" + f" `limit_val_batches={min_percentage}`" + ) + # Make sure trainer.limit_val_batches is a multiple of num of microbatches + if limit_val_micro_batches < get_num_microbatches(): + self.trainer.limit_val_batches = get_num_microbatches() + else: + self.trainer.limit_val_batches = limit_val_micro_batches - limit_val_micro_batches % get_num_microbatches() # Override num sanity steps to be a multiple of num of microbatches self.trainer.num_sanity_val_steps *= get_num_microbatches() From 70fdf348a2ec3d61de75b28a0897b93cef5c2c26 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 21 Feb 2024 03:47:33 +0000 Subject: [PATCH 12/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../nlp/models/language_modeling/megatron_base_model.py | 9 ++++++--- .../nlp/models/language_modeling/megatron_gpt_model.py | 7 +++++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index ed37ae32fbdb..6a2ea80ec764 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -329,7 +329,8 @@ def _reconfigure_val_batches(self): else: assert isinstance(self.trainer.limit_val_batches, float) # Don't reconfigure if limit_val_batches is 0.0 - if self.trainer.limit_val_batches == 0.0: return + if self.trainer.limit_val_batches == 0.0: + return # len(self._validation_dl) returns len as num of microbatches val_len_in_micro_batches = len(self._validation_dl) if self._validation_ds is not None and len(self._validation_dl) != float("inf"): @@ -344,12 +345,14 @@ def _reconfigure_val_batches(self): f" {self.trainer.limit_val_batches} * {len(self._validation_dl)} < 1. Please increase the" f" `limit_val_batches` argument. 
Try at least" f" `limit_val_batches={min_percentage}`" - ) + ) # Make sure trainer.limit_val_batches is a multiple of num of microbatches if limit_val_micro_batches < get_num_microbatches(): self.trainer.limit_val_batches = get_num_microbatches() else: - self.trainer.limit_val_batches = limit_val_micro_batches - limit_val_micro_batches % get_num_microbatches() + self.trainer.limit_val_batches = ( + limit_val_micro_batches - limit_val_micro_batches % get_num_microbatches() + ) # Override num sanity steps to be a multiple of num of microbatches self.trainer.num_sanity_val_steps *= get_num_microbatches() diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 955bd94b93d5..92644e9f865f 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1178,8 +1178,11 @@ def build_train_valid_test_datasets(self): global_batch_size = self.cfg.global_batch_size max_train_steps = self.trainer.max_steps # if limit_val_batches is 0, don't use it for computing eval samples, as it can cause error in building the dataset with 0 samples - eval_iters = (max_train_steps // self.trainer.val_check_interval + 1) * self.trainer.limit_val_batches if isinstance(self.trainer.limit_val_batches, int) and self.trainer.limit_val_batches > 0 \ - else (max_train_steps // self.trainer.val_check_interval + 1) + eval_iters = ( + (max_train_steps // self.trainer.val_check_interval + 1) * self.trainer.limit_val_batches + if isinstance(self.trainer.limit_val_batches, int) and self.trainer.limit_val_batches > 0 + else (max_train_steps // self.trainer.val_check_interval + 1) + ) test_iters = self.trainer.limit_test_batches # Add extra FIM tokens to tokenizer From 49f6148fa208bb7471ca7fe5c2827efb25da9c8e Mon Sep 17 00:00:00 2001 From: Jan Baczek Date: Thu, 22 Feb 2024 19:01:05 +0100 Subject: [PATCH 13/14] Restore the hack forcing number of validation and test epochs to 1 Signed-off-by: Jan Baczek --- .../language_modeling/megatron_gpt_model.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 92644e9f865f..950ce534e9bc 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1177,13 +1177,6 @@ def build_train_valid_test_datasets(self): logging.info('Building GPT datasets.') global_batch_size = self.cfg.global_batch_size max_train_steps = self.trainer.max_steps - # if limit_val_batches is 0, don't use it for computing eval samples, as it can cause error in building the dataset with 0 samples - eval_iters = ( - (max_train_steps // self.trainer.val_check_interval + 1) * self.trainer.limit_val_batches - if isinstance(self.trainer.limit_val_batches, int) and self.trainer.limit_val_batches > 0 - else (max_train_steps // self.trainer.val_check_interval + 1) - ) - test_iters = self.trainer.limit_test_batches # Add extra FIM tokens to tokenizer if self.cfg.data.get('add_fim', False) and self.cfg.tokenizer.library == 'megatron': @@ -1191,11 +1184,12 @@ def build_train_valid_test_datasets(self): fim_tokens = [fim_tokens.prefix, fim_tokens.middle, fim_tokens.suffix, fim_tokens.pad, fim_tokens.eod] self.tokenizer.add_special_tokens({'additional_special_tokens': 
fim_tokens}) - train_valid_test_num_samples = [ - max_train_steps * global_batch_size, - eval_iters * global_batch_size, - test_iters * global_batch_size, - ] + # The line below exploits a quirk in mcore dataset construction, to make number of epochs for validation and test equal to 1 + # The mcore dataset implementation uses the number N we provide via train_valid_test_num_samples to derive parameter E such that + # E = argmin_e e * N_d >= N, or equivalently E = ceildiv(N, N_d) + # Where N_d is the total number of samples in a dataset (files), and N is the requested number of samples (provided for every split in the list below). + # Setting N = 1 we force E to be 1 as well + train_valid_test_num_samples = [max_train_steps * global_batch_size, 1, 1] mock_dataset = self.cfg.data.get("mock_dataset", False) kwargs = { From 90931f1d022744345f4ebefe3fcc912f16723211 Mon Sep 17 00:00:00 2001 From: Jan Baczek Date: Fri, 23 Feb 2024 15:03:09 +0100 Subject: [PATCH 14/14] Change limit_val_batches to 1.0 for GPT pretraining test. The integer value is covered in other tests Signed-off-by: Jan Baczek --- Jenkinsfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 5d81a57c04c9..4f9220da1fc6 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -3485,7 +3485,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' trainer.accelerator=gpu \ trainer.log_every_n_steps=1 \ trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ + trainer.limit_val_batches=1.0 \ trainer.accumulate_grad_batches=1 \ trainer.max_steps=3 \ trainer.precision=16 \ @@ -3520,7 +3520,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' trainer.accelerator=gpu \ trainer.log_every_n_steps=1 \ trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ + trainer.limit_val_batches=1.0 \ trainer.accumulate_grad_batches=1 \ trainer.max_steps=6 \ trainer.precision=16 \
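
For reference, a minimal standalone sketch of the limit_val_batches reconfiguration that the series converges to in PATCH 11/12. This is illustrative only: plain arguments (val_len_in_microbatches, num_microbatches) stand in for the trainer, dataloader, and apex state that the real _reconfigure_val_batches reads, ValueError stands in for Lightning's MisconfigurationException, and the guard for an unsized/infinite validation dataloader is omitted.

def reconfigure_limit_val_batches(limit_val_batches, val_len_in_microbatches, num_microbatches):
    # Integer limits count global batches, so scale them up to the equivalent
    # number of microbatches.
    if isinstance(limit_val_batches, int):
        return limit_val_batches * num_microbatches

    # Float limits: 0.0 disables validation and is left untouched; 1.0 means the
    # full dataloader, whose __len__ is already expressed in microbatches (PATCH 05).
    if limit_val_batches == 0.0:
        return limit_val_batches
    if limit_val_batches == 1.0:
        return val_len_in_microbatches

    limit_val_micro_batches = int(val_len_in_microbatches * limit_val_batches)
    if limit_val_micro_batches == 0 and limit_val_batches > 0.0:
        min_percentage = 1.0 / val_len_in_microbatches
        raise ValueError(
            f"limit_val_batches={limit_val_batches} selects less than one batch of the "
            f"val_dataloader; try at least limit_val_batches={min_percentage}"
        )

    # Round down to a multiple of the number of microbatches so a validation step
    # never stops part-way through a global batch.
    if limit_val_micro_batches < num_microbatches:
        return num_microbatches
    return limit_val_micro_batches - limit_val_micro_batches % num_microbatches

# Example: 100 validation microbatches, 4 microbatches per global batch,
# limit_val_batches=0.1 -> int(100 * 0.1) = 10 -> rounded down to 8 microbatches.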
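
The rounding above relies on PATCH 05, which makes the Megatron batch samplers report their length in microbatches rather than global batches. A sketch of that length computation, with the sampler state passed in as plain arguments mirroring the attributes used in the patch:

def sampler_len_in_microbatches(total_samples, consumed_samples, global_batch_size,
                                micro_batch_times_data_parallel_size, drop_last=True):
    num_available_samples = total_samples - consumed_samples
    if global_batch_size is not None:
        if drop_last:
            num_global_batches = num_available_samples // global_batch_size
        else:
            num_global_batches = (num_available_samples + global_batch_size - 1) // global_batch_size
        # Report the length in microbatches so len(dataloader) matches the number of
        # batches a training/validation step actually fetches.
        return num_global_batches * (global_batch_size // micro_batch_times_data_parallel_size)
    if drop_last:
        return num_available_samples // micro_batch_times_data_parallel_size
    return (num_available_samples - 1) // micro_batch_times_data_parallel_size + 1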