diff --git a/Jenkinsfile b/Jenkinsfile
index e2c73a9f9d04..5554ff68a2eb 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -2832,7 +2832,7 @@ pipeline {
 model.data.train_ds.task_name=rte \
 model.data.train_ds.global_batch_size=4 \
 model.data.train_ds.micro_batch_size=2 \
-model.data.validation_ds.global_batch_size=4 \
+model.data.validation_ds.global_batch_size=2 \
 model.data.validation_ds.micro_batch_size=2 \
 model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \
 model.data.validation_ds.task_name=rte \
@@ -2860,9 +2860,9 @@ pipeline {
 model.pipeline_model_parallel_split_rank=0 \
 model.data.train_ds.global_batch_size=4 \
 model.data.train_ds.micro_batch_size=2 \
-model.data.validation_ds.global_batch_size=4 \
+model.data.validation_ds.global_batch_size=2 \
 model.data.validation_ds.micro_batch_size=2 \
-model.data.test_ds.global_batch_size=4 \
+model.data.test_ds.global_batch_size=2 \
 model.data.test_ds.micro_batch_size=2 \
 model.data.train_ds.task_name=rte \
 model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_glue_model.py b/nemo/collections/nlp/models/language_modeling/megatron_glue_model.py
index f089fef32514..cae1ad17afb8 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_glue_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_glue_model.py
@@ -21,6 +21,13 @@
 from nemo.collections.nlp.models.language_modeling.megatron_finetune_model import MegatronT5FinetuneModel
 from nemo.utils import logging
 
+try:
+    from apex.transformer import parallel_state
+
+    HAVE_APEX = True
+except (ImportError, ModuleNotFoundError):
+    HAVE_APEX = False
+
 __all__ = ['MegatronT5GLUEModel']
 
 
@@ -30,7 +37,14 @@ class MegatronT5GLUEModel(MegatronT5FinetuneModel):
     def __init__(self, cfg: DictConfig, trainer: Trainer):
         super().__init__(cfg, trainer=trainer)
 
-    def _build_dataset(self, data_cfg):
+    def _build_dataset(self, data_cfg, check_implict_grad_acc=False):
+        if (
+            check_implict_grad_acc
+            and data_cfg.global_batch_size > data_cfg.micro_batch_size * parallel_state.get_data_parallel_world_size()
+        ):
+            raise ValueError(
+                f'You are trying to use "implicit gradient accumulation" of {data_cfg.global_batch_size // (data_cfg.micro_batch_size * parallel_state.get_data_parallel_world_size())} in your validation/test datasets. This is not supported. Please set global_batch_size equal to micro_batch_size * data_parallel_world_size.'
+            )
         if data_cfg.task_name == 'xnli':
             dataset = TextToTextXNLIDataset(
                 data_cfg.file_path,
@@ -52,17 +66,17 @@ def build_train_valid_test_datasets(self, stage):
         logging.info('Building GLUE/XNLI datasets.')
         if stage != 'test':
             # Wrap this in a list since the general finetuning parent class supports multi-validation.
-            self._validation_ds = [self._build_dataset(self.cfg.data.validation_ds)]
+            self._validation_ds = [self._build_dataset(self.cfg.data.validation_ds, check_implict_grad_acc=True)]
             logging.info(f'Length of val dataset: {len(self._validation_ds)}')
 
         if stage != 'validate':
             if hasattr(self.cfg.data, 'test_ds'):
                 # Wrap this in a list since the general finetuning parent class supports multi-validation.
-                self._test_ds = [self._build_dataset(self.cfg.data.test_ds)]
+                self._test_ds = [self._build_dataset(self.cfg.data.test_ds, check_implict_grad_acc=True)]
                 logging.info(f'Length of test dataset: {len(self._test_ds)}')
 
         if stage == 'validate' or stage == 'test':
             return
-        self._train_ds = self._build_dataset(self.cfg.data.train_ds)
+        self._train_ds = self._build_dataset(self.cfg.data.train_ds, check_implict_grad_acc=False)
         logging.info(f'Length of train dataset: {len(self._train_ds)}')
         logging.info(f'Finished building GLUE/XNLI datasets.')
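For reference, the sketch below restates the guard this diff adds to _build_dataset, with the data-parallel world size passed in explicitly rather than read from apex's parallel_state. The names check_batch_config and dp_world_size are hypothetical, introduced only for illustration; the real check lives in MegatronT5GLUEModel and calls parallel_state.get_data_parallel_world_size(). It also illustrates why the Jenkinsfile batch sizes change: assuming the CI runner's GPUs are fully consumed by model parallelism (data-parallel size 1), a validation global_batch_size of 4 with micro_batch_size 2 implies an accumulation factor of 2 and now raises.

# Minimal, self-contained sketch of the new validation/test batch-size guard.
def check_batch_config(global_batch_size: int, micro_batch_size: int, dp_world_size: int) -> None:
    # Any global batch larger than micro_batch_size * dp_world_size can only be
    # realized via gradient accumulation, which makes no sense for eval loops.
    if global_batch_size > micro_batch_size * dp_world_size:
        accumulation = global_batch_size // (micro_batch_size * dp_world_size)
        raise ValueError(
            f'You are trying to use "implicit gradient accumulation" of {accumulation} '
            'in your validation/test datasets. This is not supported. Please set '
            'global_batch_size equal to micro_batch_size * data_parallel_world_size.'
        )

# With a data-parallel world size of 1, the updated Jenkinsfile value passes
# while the old one fails:
check_batch_config(global_batch_size=2, micro_batch_size=2, dp_world_size=1)      # ok: 2 == 2 * 1
try:
    check_batch_config(global_batch_size=4, micro_batch_size=2, dp_world_size=1)  # old value: 4 > 2 * 1
except ValueError as err:
    print(err)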