Jenkinsfile: 6 changes (3 additions, 3 deletions)
@@ -2832,7 +2832,7 @@ pipeline {
 model.data.train_ds.task_name=rte \
 model.data.train_ds.global_batch_size=4 \
 model.data.train_ds.micro_batch_size=2 \
-model.data.validation_ds.global_batch_size=4 \
+model.data.validation_ds.global_batch_size=2 \
 model.data.validation_ds.micro_batch_size=2 \
 model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \
 model.data.validation_ds.task_name=rte \
@@ -2860,9 +2860,9 @@ pipeline {
 model.pipeline_model_parallel_split_rank=0 \
 model.data.train_ds.global_batch_size=4 \
 model.data.train_ds.micro_batch_size=2 \
-model.data.validation_ds.global_batch_size=4 \
+model.data.validation_ds.global_batch_size=2 \
 model.data.validation_ds.micro_batch_size=2 \
-model.data.test_ds.global_batch_size=4 \
+model.data.test_ds.global_batch_size=2 \
 model.data.test_ds.micro_batch_size=2 \
 model.data.train_ds.task_name=rte \
 model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \
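The eval batch sizes in these CI commands drop from 4 to 2 because the MegatronT5GLUEModel change below rejects implicit gradient accumulation on validation and test data. Assuming these CI jobs run with a data parallel world size of 1 (a guess about the CI setup, not stated in the diff), the new constraint works out as:

    global_batch_size <= micro_batch_size * data_parallel_world_size
    old: 4 > 2 * 1   -> implied accumulation factor of 4 // 2 = 2, now a ValueError
    new: 2 == 2 * 1  -> no accumulation, accepted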
@@ -21,6 +21,13 @@
 from nemo.collections.nlp.models.language_modeling.megatron_finetune_model import MegatronT5FinetuneModel
 from nemo.utils import logging
 
+try:
+    from apex.transformer import parallel_state
+
+    HAVE_APEX = True
+except (ImportError, ModuleNotFoundError):
+    HAVE_APEX = False
+
 __all__ = ['MegatronT5GLUEModel']
 
 
@@ -30,7 +37,14 @@ class MegatronT5GLUEModel(MegatronT5FinetuneModel):
     def __init__(self, cfg: DictConfig, trainer: Trainer):
         super().__init__(cfg, trainer=trainer)
 
-    def _build_dataset(self, data_cfg):
+    def _build_dataset(self, data_cfg, check_implict_grad_acc=False):
+        if (
+            check_implict_grad_acc
+            and data_cfg.global_batch_size > data_cfg.micro_batch_size * parallel_state.get_data_parallel_world_size()
+        ):
+            raise ValueError(
+                f'You are trying to use "implicit gradient accumulation" of {data_cfg.global_batch_size // (data_cfg.micro_batch_size * parallel_state.get_data_parallel_world_size())} in your validation/test datasets. This is not supported. Please set global_batch_size equal to micro_batch_size * data_parallel_world_size.'
+            )
         if data_cfg.task_name == 'xnli':
             dataset = TextToTextXNLIDataset(
                 data_cfg.file_path,
@@ -52,17 +66,17 @@ def build_train_valid_test_datasets(self, stage):
         logging.info('Building GLUE/XNLI datasets.')
         if stage != 'test':
             # Wrap this in a list since the general finetuning parent class supports multi-validation.
-            self._validation_ds = [self._build_dataset(self.cfg.data.validation_ds)]
+            self._validation_ds = [self._build_dataset(self.cfg.data.validation_ds, check_implict_grad_acc=True)]
             logging.info(f'Length of val dataset: {len(self._validation_ds)}')
 
         if stage != 'validate':
             if hasattr(self.cfg.data, 'test_ds'):
                 # Wrap this in a list since the general finetuning parent class supports multi-validation.
-                self._test_ds = [self._build_dataset(self.cfg.data.test_ds)]
+                self._test_ds = [self._build_dataset(self.cfg.data.test_ds, check_implict_grad_acc=True)]
                 logging.info(f'Length of test dataset: {len(self._test_ds)}')
 
         if stage == 'validate' or stage == 'test':
             return
-        self._train_ds = self._build_dataset(self.cfg.data.train_ds)
+        self._train_ds = self._build_dataset(self.cfg.data.train_ds, check_implict_grad_acc=False)
         logging.info(f'Length of train dataset: {len(self._train_ds)}')
         logging.info(f'Finished building GLUE/XNLI datasets.')
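To see the new guard in isolation, here is a minimal, runnable sketch of the same check with the apex parallel_state call replaced by a plain integer argument, so it runs without apex installed. BatchConfig and check_no_implicit_grad_acc are illustrative names for this sketch, not part of NeMo.

from dataclasses import dataclass

@dataclass
class BatchConfig:
    # Stand-in for the two batch-size fields read from data_cfg.
    global_batch_size: int
    micro_batch_size: int

def check_no_implicit_grad_acc(cfg: BatchConfig, data_parallel_world_size: int) -> None:
    # Mirrors the guard added to _build_dataset: for validation/test data, the
    # whole global batch must be consumed in one step across all data-parallel ranks.
    per_step = cfg.micro_batch_size * data_parallel_world_size
    if cfg.global_batch_size > per_step:
        raise ValueError(
            f'Implicit gradient accumulation of {cfg.global_batch_size // per_step} '
            'is not supported for validation/test datasets. Set global_batch_size '
            'equal to micro_batch_size * data_parallel_world_size.'
        )

# The new Jenkinsfile values pass; the old ones would have triggered the error.
check_no_implicit_grad_acc(BatchConfig(global_batch_size=2, micro_batch_size=2), data_parallel_world_size=1)
try:
    check_no_implicit_grad_acc(BatchConfig(global_batch_size=4, micro_batch_size=2), data_parallel_world_size=1)
except ValueError as e:
    print(e)  # implicit accumulation factor of 2 is rejected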