From 56fd8a053542512958b9fc0f0f12829ed8863658 Mon Sep 17 00:00:00 2001
From: MaximumEntropy
Date: Thu, 5 May 2022 18:16:32 -0700
Subject: [PATCH 1/2] Check implicit grad acc in GLUE dataset building

Signed-off-by: MaximumEntropy
---
 .../language_modeling/megatron_glue_model.py  | 22 +++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/nemo/collections/nlp/models/language_modeling/megatron_glue_model.py b/nemo/collections/nlp/models/language_modeling/megatron_glue_model.py
index f089fef32514..cae1ad17afb8 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_glue_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_glue_model.py
@@ -21,6 +21,13 @@
 from nemo.collections.nlp.models.language_modeling.megatron_finetune_model import MegatronT5FinetuneModel
 from nemo.utils import logging
 
+try:
+    from apex.transformer import parallel_state
+
+    HAVE_APEX = True
+except (ImportError, ModuleNotFoundError):
+    HAVE_APEX = False
+
 __all__ = ['MegatronT5GLUEModel']
 
 
@@ -30,7 +37,14 @@ class MegatronT5GLUEModel(MegatronT5FinetuneModel):
     def __init__(self, cfg: DictConfig, trainer: Trainer):
         super().__init__(cfg, trainer=trainer)
 
-    def _build_dataset(self, data_cfg):
+    def _build_dataset(self, data_cfg, check_implicit_grad_acc=False):
+        if (
+            check_implicit_grad_acc
+            and data_cfg.global_batch_size > data_cfg.micro_batch_size * parallel_state.get_data_parallel_world_size()
+        ):
+            raise ValueError(
+                f'You are trying to use "implicit gradient accumulation" of {data_cfg.global_batch_size // (data_cfg.micro_batch_size * parallel_state.get_data_parallel_world_size())} in your validation/test datasets. This is not supported. Please set global_batch_size equal to micro_batch_size * data_parallel_world_size.'
+            )
         if data_cfg.task_name == 'xnli':
             dataset = TextToTextXNLIDataset(
                 data_cfg.file_path,
@@ -52,17 +66,17 @@ def build_train_valid_test_datasets(self, stage):
         logging.info('Building GLUE/XNLI datasets.')
         if stage != 'test':
             # Wrap this in a list since the general finetuning parent class supports multi-validation.
-            self._validation_ds = [self._build_dataset(self.cfg.data.validation_ds)]
+            self._validation_ds = [self._build_dataset(self.cfg.data.validation_ds, check_implicit_grad_acc=True)]
             logging.info(f'Length of val dataset: {len(self._validation_ds)}')
 
         if stage != 'validate':
             if hasattr(self.cfg.data, 'test_ds'):
                 # Wrap this in a list since the general finetuning parent class supports multi-validation.
-                self._test_ds = [self._build_dataset(self.cfg.data.test_ds)]
+                self._test_ds = [self._build_dataset(self.cfg.data.test_ds, check_implicit_grad_acc=True)]
                 logging.info(f'Length of test dataset: {len(self._test_ds)}')
 
         if stage == 'validate' or stage == 'test':
             return
-        self._train_ds = self._build_dataset(self.cfg.data.train_ds)
+        self._train_ds = self._build_dataset(self.cfg.data.train_ds, check_implicit_grad_acc=False)
         logging.info(f'Length of train dataset: {len(self._train_ds)}')
         logging.info(f'Finished building GLUE/XNLI datasets.')

From 0706ced37b0e4dd24e8f9d47536437c27d08e85a Mon Sep 17 00:00:00 2001
From: MaximumEntropy
Date: Thu, 5 May 2022 21:10:53 -0700
Subject: [PATCH 2/2] Fix jenkins test for GLUE/XNLI

Signed-off-by: MaximumEntropy
---
 Jenkinsfile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index ea84290171e8..381dbed2d475 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -2832,7 +2832,7 @@ pipeline {
         model.data.train_ds.task_name=rte \
         model.data.train_ds.global_batch_size=4 \
         model.data.train_ds.micro_batch_size=2 \
-        model.data.validation_ds.global_batch_size=4 \
+        model.data.validation_ds.global_batch_size=2 \
         model.data.validation_ds.micro_batch_size=2 \
         model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \
         model.data.validation_ds.task_name=rte \
@@ -2860,9 +2860,9 @@ pipeline {
         model.pipeline_model_parallel_split_rank=0 \
         model.data.train_ds.global_batch_size=4 \
         model.data.train_ds.micro_batch_size=2 \
-        model.data.validation_ds.global_batch_size=4 \
+        model.data.validation_ds.global_batch_size=2 \
         model.data.validation_ds.micro_batch_size=2 \
-        model.data.test_ds.global_batch_size=4 \
+        model.data.test_ds.global_batch_size=2 \
         model.data.test_ds.micro_batch_size=2 \
         model.data.train_ds.task_name=rte \
         model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \