From dcf6c7d95d021e923b545af83636c5d6bba20386 Mon Sep 17 00:00:00 2001 From: MaximumEntropy Date: Thu, 3 Nov 2022 21:13:07 -0700 Subject: [PATCH 01/17] Initial refactor Signed-off-by: MaximumEntropy --- .../conf/megatron_t5_config.yaml | 4 +- ...megatron_t5_config_finetune_glue_mnli.yaml | 8 +- .../megatron_t5_seq2seq_finetune.py | 110 +++++++++++++++--- .../megatron_finetune_model.py | 1 + 4 files changed, 100 insertions(+), 23 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config.yaml index 2a8c2912e395..ad6024f199e2 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_config.yaml @@ -6,7 +6,7 @@ name: megatron_t5 restore_from_path: null # used when starting from a .nemo file trainer: - devices: 1 + devices: 2 num_nodes: 1 accelerator: gpu precision: 16 @@ -47,7 +47,7 @@ model: micro_batch_size: 4 global_batch_size: 8 # will use more micro batches to reach global batch size tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 + pipeline_model_parallel_size: 2 resume_from_checkpoint: null # manually set the checkpoint file to load from pipeline_model_parallel_split_rank: 0 # rank at which decoder starts. diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml index ac68b57e0216..58793630a38d 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml @@ -28,7 +28,7 @@ exp_manager: resume_ignore_no_checkpoint: True create_checkpoint_callback: True checkpoint_callback_params: - monitor: validation_${model.data.validation_ds.metric.name} + monitor: validation_exact_string_match save_top_k: 10 mode: max always_save_nemo: False # TODO: add support @@ -37,7 +37,11 @@ exp_manager: save_best_model: True model: - restore_from_path: ??? # Path to a trained T5 .nemo file + restore_from_path: null # Path to a trained T5 .nemo file + pretrained_checkpoint: + checkpoint_dir: null # Path to a folder that contains a .ckpt file + checkpoint_name: null # Name of the .ckpt file within the checkpoint_dir. + hparams_file: null # Path to a .yaml file that contains the hyperparameters of the checkpoint. tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 pipeline_model_parallel_split_rank: 0 diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py index 22883657736f..5c1a7e67b872 100644 --- a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os +import tempfile from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks.timer import Timer @@ -29,9 +31,83 @@ PipelineMixedPrecisionPlugin, ) from nemo.core.config import hydra_runner -from nemo.utils import logging +from nemo.utils import logging, AppState from nemo.utils.exp_manager import StatelessTimer, exp_manager +from nemo.utils.model_utils import inject_model_parallel_rank +def _modify_config(t5_cfg, cfg, add_cfg_to_tree=False): + OmegaConf.set_struct(t5_cfg, True) + with open_dict(t5_cfg): + t5_cfg.megatron_amp_O2 = cfg.model.get('megatron_amp_O2', False) + if hasattr(t5_cfg, 'encoder') and hasattr(t5_cfg, 'decoder'): + t5_cfg.encoder.masked_softmax_fusion = False + t5_cfg.decoder.masked_softmax_fusion = False + t5_cfg.encoder.hidden_dropout = cfg.model.get('hidden_dropout', 0.1) + t5_cfg.decoder.hidden_dropout = cfg.model.get('hidden_dropout', 0.1) + if hasattr(t5_cfg.encoder, 'ffn_dropout'): + t5_cfg.encoder.ffn_dropout = cfg.model.get('ffn_dropout', 0.1) + if hasattr(t5_cfg.decoder, 'ffn_dropout'): + t5_cfg.decoder.ffn_dropout = cfg.model.get('ffn_dropout', 0.1) + else: + t5_cfg.hidden_dropout = cfg.model.get('hidden_dropout', 0.1) + t5_cfg.attention_dropout = cfg.model.get('attention_dropout', 0.1) + t5_cfg.masked_softmax_fusion = False + t5_cfg.data = cfg.model.data + t5_cfg.precision = cfg.trainer.precision + t5_cfg.optim = cfg.model.optim + t5_cfg.micro_batch_size = cfg.model.data.train_ds.micro_batch_size + t5_cfg.global_batch_size = cfg.model.data.train_ds.global_batch_size + # XNLI has eval languages in the yaml config. + if hasattr(cfg.model, 'eval_languages'): + t5_cfg.eval_languages = cfg.model.eval_languages + + # This is needed when modifying a hparam file directly to load `.ckpt` files. + # This is not needed to modify the cfg in `.nemo` files. 
+ if add_cfg_to_tree: + t5_cfg.cfg = cfg + + return t5_cfg + +def load_from_nemo(cls, restore_from_path, trainer, t5_cfg): + model = cls.restore_from( + restore_path=cfg.model.restore_from_path, + trainer=trainer, + override_config_path=t5_cfg, + save_restore_connector=NLPSaveRestoreConnector(), + ) + return model + +def load_from_checkpoint_dir(cls, cfg, trainer): + app_state = AppState() + if cfg.model.tensor_model_parallel_size > 1 or cfg.model.pipeline_model_parallel_size > 1: + app_state.model_parallel_size = cfg.model.tensor_model_parallel_size * cfg.pipeline_model_parallel_size + app_state.tensor_model_parallel_size = cfg.model.tensor_model_parallel_size + app_state.pipeline_model_parallel_size = cfg.model.pipeline_model_parallel_size + ( + app_state.tensor_model_parallel_rank, + app_state.pipeline_model_parallel_rank, + app_state.model_parallel_size, + app_state.data_parallel_size, + app_state.pipeline_model_parallel_split_rank, + app_state.virtual_pipeline_model_parallel_rank, + ) = fake_initialize_model_parallel( + world_size=app_state.model_parallel_size, + rank=trainer.global_rank, + tensor_model_parallel_size_=cfg.model.tensor_model_parallel_size, + pipeline_model_parallel_size_=cfg.model.pipeline_model_parallel_size, + pipeline_model_parallel_split_rank_=cfg.model.pipeline_model_parallel_split_rank, + ) + checkpoint_path = inject_model_parallel_rank(os.path.join(cfg.model.pretrained_checkpoint.checkpoint_dir, cfg.model.pretrained_checkpoint.checkpoint_name)) + hparams_file = OmegaConf.load(cfg.model.pretrained_checkpoint.hparams_file) + t5_cfg = _modify_config(hparams_file.cfg, cfg, add_cfg_to_tree=True) + with tempfile.NamedTemporaryFile(suffix='.yaml') as f: + OmegaConf.save(config=t5_cfg, f=f.name) + model = cls.load_from_checkpoint( + checkpoint_path=checkpoint_path, + trainer=trainer, + hparams_file=f.name, + ) + return model @hydra_runner(config_path="conf", config_name="megatron_t5_config_finetune_glue_mnli") def main(cfg) -> None: @@ -78,6 +154,7 @@ def main(cfg) -> None: if isinstance(callback, Timer): trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time,) + ''' # Get the T5 Base configuration. t5_cfg = MegatronT5FinetuneModel.restore_from( restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True @@ -108,28 +185,23 @@ def main(cfg) -> None: # XNLI has eval languages in the yaml config. 
if hasattr(cfg.model, 'eval_languages'): t5_cfg.eval_languages = cfg.model.eval_languages + ''' if hasattr(cfg.model.data.train_ds, 'task_name'): - model = MegatronT5GLUEModel.restore_from( - restore_path=cfg.model.restore_from_path, - trainer=trainer, - override_config_path=t5_cfg, - save_restore_connector=NLPSaveRestoreConnector(), - ) + if cfg.model.restore_from_path: + model = load_from_nemo(MegatronT5GLUEModel, cfg.model.restore_from_path, trainer, t5_cfg) + else: + model = load_from_checkpoint_dir(MegatronT5GLUEModel, cfg, trainer) elif hasattr(cfg.model.data.train_ds, 'file_names'): - model = MegatronT0Model.restore_from( - restore_path=cfg.model.restore_from_path, - trainer=trainer, - override_config_path=t5_cfg, - save_restore_connector=NLPSaveRestoreConnector(), - ) + if cfg.model.restore_from_path: + model = load_from_nemo(MegatronT0Model, cfg.model.restore_from_path, trainer, t5_cfg) + else: + model = load_from_checkpoint_dir(MegatronT0Model, cfg, trainer, t5_cfg) else: - model = MegatronT5FinetuneModel.restore_from( - restore_path=cfg.model.restore_from_path, - trainer=trainer, - override_config_path=t5_cfg, - save_restore_connector=NLPSaveRestoreConnector(), - ) + if cfg.model.restore_from_path: + model = load_from_nemo(MegatronT5FinetuneModel, cfg.model.restore_from_path, trainer, t5_cfg) + else: + model = load_from_checkpoint_dir(MegatronT5FinetuneModel, cfg, trainer, t5_cfg) trainer.fit(model) trainer.validate(model) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py index e29cea2264b6..dd4419a7c087 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py @@ -42,6 +42,7 @@ class MegatronT5FinetuneModel(MegatronT5Model): def __init__(self, cfg: DictConfig, trainer: Trainer): super().__init__(cfg, trainer=trainer) + import ipdb; ipdb.set_trace() self.val_metric, self.val_metric_name = self.setup_metric(self.cfg.data.validation_ds) self.val_metric = torch.nn.ModuleList(self.val_metric) if hasattr(self.cfg.data, "test_ds"): From f2a4fa4171f20ce0a2abe982b75443825da4681b Mon Sep 17 00:00:00 2001 From: MaximumEntropy Date: Fri, 4 Nov 2022 13:41:19 -0700 Subject: [PATCH 02/17] Resolve config before passing to load_from_checkpoint Signed-off-by: MaximumEntropy --- examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py | 3 ++- .../nlp/models/language_modeling/megatron_finetune_model.py | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py index 5c1a7e67b872..887a30871164 100644 --- a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py @@ -64,7 +64,8 @@ def _modify_config(t5_cfg, cfg, add_cfg_to_tree=False): # This is needed when modifying a hparam file directly to load `.ckpt` files. # This is not needed to modify the cfg in `.nemo` files. 
if add_cfg_to_tree: - t5_cfg.cfg = cfg + OmegaConf.resolve(t5_cfg) + t5_cfg.cfg = t5_cfg return t5_cfg diff --git a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py index dd4419a7c087..e29cea2264b6 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py @@ -42,7 +42,6 @@ class MegatronT5FinetuneModel(MegatronT5Model): def __init__(self, cfg: DictConfig, trainer: Trainer): super().__init__(cfg, trainer=trainer) - import ipdb; ipdb.set_trace() self.val_metric, self.val_metric_name = self.setup_metric(self.cfg.data.validation_ds) self.val_metric = torch.nn.ModuleList(self.val_metric) if hasattr(self.cfg.data, "test_ds"): From 09230c996691ef8d889b9293c909206f8039bbff Mon Sep 17 00:00:00 2001 From: MaximumEntropy Date: Fri, 4 Nov 2022 14:15:12 -0700 Subject: [PATCH 03/17] Fixes for model parallel and nemo restore Signed-off-by: MaximumEntropy --- .../megatron_t5_seq2seq_finetune.py | 58 +++++++------------ 1 file changed, 20 insertions(+), 38 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py index 887a30871164..b7549cddace5 100644 --- a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py @@ -20,6 +20,7 @@ from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector +from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel from nemo.collections.nlp.models.language_modeling.megatron_finetune_model import MegatronT5FinetuneModel from nemo.collections.nlp.models.language_modeling.megatron_glue_model import MegatronT5GLUEModel from nemo.collections.nlp.models.language_modeling.megatron_t0_model import MegatronT0Model @@ -36,6 +37,10 @@ from nemo.utils.model_utils import inject_model_parallel_rank def _modify_config(t5_cfg, cfg, add_cfg_to_tree=False): + """ + This function modifies the original t5 pre-training config (t5_cfg) with attributes from the finetuning config (cfg). + The `add_cfg_to_tree` arg adds `cfg` to the top of the yaml tree which is needed for all `hparams.yaml` files when passed as an arg to `load_from_checkpoint()`. 
+ """ OmegaConf.set_struct(t5_cfg, True) with open_dict(t5_cfg): t5_cfg.megatron_amp_O2 = cfg.model.get('megatron_amp_O2', False) @@ -69,7 +74,8 @@ def _modify_config(t5_cfg, cfg, add_cfg_to_tree=False): return t5_cfg -def load_from_nemo(cls, restore_from_path, trainer, t5_cfg): +def load_from_nemo(cls, cfg, trainer, t5_cfg): + t5_cfg = _modify_config(t5_cfg, cfg, add_cfg_to_tree=False) model = cls.restore_from( restore_path=cfg.model.restore_from_path, trainer=trainer, @@ -81,7 +87,7 @@ def load_from_nemo(cls, restore_from_path, trainer, t5_cfg): def load_from_checkpoint_dir(cls, cfg, trainer): app_state = AppState() if cfg.model.tensor_model_parallel_size > 1 or cfg.model.pipeline_model_parallel_size > 1: - app_state.model_parallel_size = cfg.model.tensor_model_parallel_size * cfg.pipeline_model_parallel_size + app_state.model_parallel_size = cfg.model.tensor_model_parallel_size * cfg.model.pipeline_model_parallel_size app_state.tensor_model_parallel_size = cfg.model.tensor_model_parallel_size app_state.pipeline_model_parallel_size = cfg.model.pipeline_model_parallel_size ( @@ -155,52 +161,28 @@ def main(cfg) -> None: if isinstance(callback, Timer): trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time,) - ''' - # Get the T5 Base configuration. - t5_cfg = MegatronT5FinetuneModel.restore_from( - restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True - ) - - # Override the T5 configuration with the one from the config file. - OmegaConf.set_struct(t5_cfg, True) - with open_dict(t5_cfg): - t5_cfg.megatron_amp_O2 = cfg.model.get('megatron_amp_O2', False) - if hasattr(t5_cfg, 'encoder') and hasattr(t5_cfg, 'decoder'): - t5_cfg.encoder.masked_softmax_fusion = False - t5_cfg.decoder.masked_softmax_fusion = False - t5_cfg.encoder.hidden_dropout = cfg.model.get('hidden_dropout', 0.1) - t5_cfg.decoder.hidden_dropout = cfg.model.get('hidden_dropout', 0.1) - if hasattr(t5_cfg.encoder, 'ffn_dropout'): - t5_cfg.encoder.ffn_dropout = cfg.model.get('ffn_dropout', 0.1) - if hasattr(t5_cfg.decoder, 'ffn_dropout'): - t5_cfg.decoder.ffn_dropout = cfg.model.get('ffn_dropout', 0.1) - else: - t5_cfg.hidden_dropout = cfg.model.get('hidden_dropout', 0.1) - t5_cfg.attention_dropout = cfg.model.get('attention_dropout', 0.1) - t5_cfg.masked_softmax_fusion = False - t5_cfg.data = cfg.model.data - t5_cfg.precision = cfg.trainer.precision - t5_cfg.optim = cfg.model.optim - t5_cfg.micro_batch_size = cfg.model.data.train_ds.micro_batch_size - t5_cfg.global_batch_size = cfg.model.data.train_ds.global_batch_size - # XNLI has eval languages in the yaml config. 
- if hasattr(cfg.model, 'eval_languages'): - t5_cfg.eval_languages = cfg.model.eval_languages - ''' - if hasattr(cfg.model.data.train_ds, 'task_name'): if cfg.model.restore_from_path: - model = load_from_nemo(MegatronT5GLUEModel, cfg.model.restore_from_path, trainer, t5_cfg) + t5_cfg = MegatronT5GLUEModel.restore_from( + restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True + ) + model = load_from_nemo(MegatronT5GLUEModel, cfg, trainer, t5_cfg) else: model = load_from_checkpoint_dir(MegatronT5GLUEModel, cfg, trainer) elif hasattr(cfg.model.data.train_ds, 'file_names'): if cfg.model.restore_from_path: - model = load_from_nemo(MegatronT0Model, cfg.model.restore_from_path, trainer, t5_cfg) + t5_cfg = MegatronT0Model.restore_from( + restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True + ) + model = load_from_nemo(MegatronT0Model, cfg, trainer, t5_cfg) else: model = load_from_checkpoint_dir(MegatronT0Model, cfg, trainer, t5_cfg) else: if cfg.model.restore_from_path: - model = load_from_nemo(MegatronT5FinetuneModel, cfg.model.restore_from_path, trainer, t5_cfg) + t5_cfg = MegatronT5FinetuneModel.restore_from( + restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True + ) + model = load_from_nemo(MegatronT5FinetuneModel, cfg, trainer, t5_cfg) else: model = load_from_checkpoint_dir(MegatronT5FinetuneModel, cfg, trainer, t5_cfg) From 6ba13758770e01ae317e63d4d8200f30005264ce Mon Sep 17 00:00:00 2001 From: MaximumEntropy Date: Mon, 7 Nov 2022 09:39:39 -0800 Subject: [PATCH 04/17] Fixes for eval Signed-off-by: MaximumEntropy --- .../conf/megatron_t0_config.yaml | 8 +- .../megatron_t5_config_finetune_eval.yaml | 6 +- ...megatron_t5_config_finetune_glue_eval.yaml | 9 +- ...megatron_t5_config_finetune_glue_xnli.yaml | 8 +- .../conf/megatron_t5_finetune.yaml | 6 +- .../megatron_t5_seq2seq_eval.py | 111 ++++++++++-------- .../megatron_t5_seq2seq_finetune.py | 31 +++-- 7 files changed, 113 insertions(+), 66 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_t0_config.yaml b/examples/nlp/language_modeling/conf/megatron_t0_config.yaml index 04503cac769f..503dc17d2acc 100644 --- a/examples/nlp/language_modeling/conf/megatron_t0_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t0_config.yaml @@ -36,7 +36,11 @@ exp_manager: save_best_model: True model: - restore_from_path: ??? # Path to a trained T5 or LM-adapted T5 .nemo file + restore_from_path: null # Path to a trained T5 .nemo file + pretrained_checkpoint: + checkpoint_dir: null # Path to a folder that contains a .ckpt file + checkpoint_name: null # Name of the .ckpt file within the checkpoint_dir. + hparams_file: null # Path to a .yaml file that contains the hyperparameters of the checkpoint. 
tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 pipeline_model_parallel_split_rank: 0 @@ -82,7 +86,7 @@ model: num_classes: null replace_bos_with_pad: ${data.train_ds.replace_bos_with_pad} add_bos_to_input: ${data.train_ds.add_bos_to_input} - add_eos_to_input: ${data.train_ds.replace_bos_with_pad} + add_eos_to_input: ${data.train_ds.add_eos_to_input} seed: 1234 optim: diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_eval.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_eval.yaml index 8be471a78dde..bc1a7420df48 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_eval.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_eval.yaml @@ -17,7 +17,11 @@ exp_manager: create_checkpoint_callback: False model: - restore_from_path: ??? # Path to a finetuned T5 .nemo file + restore_from_path: null # Path to a trained T5 .nemo file + pretrained_checkpoint: + checkpoint_dir: null # Path to a folder that contains a .ckpt file + checkpoint_name: null # Name of the .ckpt file within the checkpoint_dir. + hparams_file: null # Path to a .yaml file that contains the hyperparameters of the checkpoint. gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) megatron_amp_O2: False # Enable O2 optimization for megatron amp diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml index 87ce5ac03eb5..024ad5f66ae9 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_eval.yaml @@ -17,9 +17,16 @@ exp_manager: create_checkpoint_callback: False model: - restore_from_path: ??? # Path to a finetuned T5 .nemo file + restore_from_path: null # Path to a trained T5 .nemo file + pretrained_checkpoint: + checkpoint_dir: null # Path to a folder that contains a .ckpt file + checkpoint_name: null # Name of the .ckpt file within the checkpoint_dir. + hparams_file: null # Path to a .yaml file that contains the hyperparameters of the checkpoint. gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) megatron_amp_O2: False # Enable O2 optimization for megatron amp + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + pipeline_model_parallel_split_rank: 0 data: validation_ds: diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml index 1b08bc37246e..486a6da14135 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_xnli.yaml @@ -37,9 +37,13 @@ exp_manager: save_best_model: True model: - restore_from_path: ??? + restore_from_path: null # Path to a trained T5 .nemo file + pretrained_checkpoint: + checkpoint_dir: null # Path to a folder that contains a .ckpt file + checkpoint_name: null # Name of the .ckpt file within the checkpoint_dir. + hparams_file: null # Path to a .yaml file that contains the hyperparameters of the checkpoint. 
tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 2 + pipeline_model_parallel_size: 1 pipeline_model_parallel_split_rank: 1 gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) resume_from_checkpoint: null diff --git a/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml b/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml index 9a5cf15cfe74..8c383aad9c78 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_finetune.yaml @@ -36,7 +36,11 @@ exp_manager: save_best_model: True model: - restore_from_path: ??? # Path to a trained T5 .nemo file + restore_from_path: null # Path to a trained T5 .nemo file + pretrained_checkpoint: + checkpoint_dir: null # Path to a folder that contains a .ckpt file + checkpoint_name: null # Name of the .ckpt file within the checkpoint_dir. + hparams_file: null # Path to a .yaml file that contains the hyperparameters of the checkpoint. tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 pipeline_model_parallel_split_rank: 0 diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py index 25fd84d800d4..c4512fd157d6 100644 --- a/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py @@ -30,6 +30,45 @@ from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import StatelessTimer, exp_manager +from megatron_t5_seq2seq_finetune import load_from_nemo, load_from_checkpoint_dir, validate_checkpoint_loading_args + +def _modify_config(t5_cfg, cfg, add_cfg_to_tree=False): + """ + This function modifies the original t5 pre-training config (t5_cfg) with attributes from the finetuning config (cfg). + The `add_cfg_to_tree` arg adds `cfg` to the top of the yaml tree which is needed for all `hparams.yaml` files when passed as an arg to `load_from_checkpoint()`. + """ + OmegaConf.set_struct(t5_cfg, True) + with open_dict(t5_cfg): + t5_cfg.precision = cfg.trainer.precision + # Overwrite data configs + if cfg.model.data.validation_ds.get('src_file_name', None) is not None: + logging.info( + 'Found validation_ds.src_file_name in the config file. Overriding the finetuned model config file with the values from the new config file.' + ) + t5_cfg.data.validation_ds.src_file_name = cfg.model.data.validation_ds.src_file_name + if cfg.model.data.validation_ds.get('tgt_file_name', None) is not None: + logging.info( + 'Found validation_ds.tgt_file_name in the config file. Overriding the finetuned model config file with the values from the new config file.' + ) + t5_cfg.data.validation_ds.tgt_file_name = cfg.model.data.validation_ds.tgt_file_name + + if "write_predictions_to_file" in cfg.model.data.validation_ds: + t5_cfg.data.validation_ds.write_predictions_to_file = ( + cfg.model.data.validation_ds.write_predictions_to_file + ) + if "output_file_path_prefix" in cfg.model.data.validation_ds: + t5_cfg.data.validation_ds.output_file_path_prefix = cfg.model.data.validation_ds.output_file_path_prefix + + t5_cfg.data.validation_ds.micro_batch_size = cfg.model.data.validation_ds.micro_batch_size + t5_cfg.data.validation_ds.global_batch_size = cfg.model.data.validation_ds.global_batch_size + + # This is needed when modifying a hparam file directly to load `.ckpt` files. + # This is not needed to modify the cfg in `.nemo` files. 
+ if add_cfg_to_tree: + OmegaConf.resolve(t5_cfg) + t5_cfg.cfg = t5_cfg + + return t5_cfg @hydra_runner(config_path="conf", config_name="megatron_t5_config_finetune_glue_eval") @@ -69,59 +108,33 @@ def main(cfg) -> None: if isinstance(callback, Timer): trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time,) - t5_cfg = MegatronT5GLUEModel.restore_from( - restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True - ) - - # Override the T5 configuration with the one from the config file. - # NOTE: Only data can be overriden here since this the file being restored here should already correspond to a GLUE/XNLI finetuned model. - OmegaConf.set_struct(t5_cfg, True) - with open_dict(t5_cfg): - t5_cfg.precision = cfg.trainer.precision - # Overwrite data configs - if cfg.model.data.validation_ds.get('src_file_name', None) is not None: - logging.info( - 'Found validation_ds.src_file_name in the config file. Overriding the finetuned model config file with the values from the new config file.' - ) - t5_cfg.data.validation_ds.src_file_name = cfg.model.data.validation_ds.src_file_name - if cfg.model.data.validation_ds.get('tgt_file_name', None) is not None: - logging.info( - 'Found validation_ds.tgt_file_name in the config file. Overriding the finetuned model config file with the values from the new config file.' - ) - t5_cfg.data.validation_ds.tgt_file_name = cfg.model.data.validation_ds.tgt_file_name - - if "write_predictions_to_file" in cfg.model.data.validation_ds: - t5_cfg.data.validation_ds.write_predictions_to_file = ( - cfg.model.data.validation_ds.write_predictions_to_file - ) - if "output_file_path_prefix" in cfg.model.data.validation_ds: - t5_cfg.data.validation_ds.output_file_path_prefix = cfg.model.data.validation_ds.output_file_path_prefix - t5_cfg.data.validation_ds.src_file_name = cfg.model.data.validation_ds.src_file_name - - t5_cfg.data.validation_ds.micro_batch_size = cfg.model.data.validation_ds.micro_batch_size - t5_cfg.data.validation_ds.global_batch_size = cfg.model.data.validation_ds.global_batch_size - - if hasattr(cfg.model.data.validation_ds, 'task_name'): - model = MegatronT5GLUEModel.restore_from( - restore_path=cfg.model.restore_from_path, - trainer=trainer, - override_config_path=t5_cfg, - save_restore_connector=NLPSaveRestoreConnector(), + if hasattr(cfg.model.data.validation_ds, 'task_name'): + if cfg.model.restore_from_path: + t5_cfg = MegatronT5GLUEModel.restore_from( + restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True ) - elif hasattr(cfg.model.data.validation_ds, 'file_names'): - model = MegatronT0Model.restore_from( - restore_path=cfg.model.restore_from_path, - trainer=trainer, - override_config_path=t5_cfg, - save_restore_connector=NLPSaveRestoreConnector(), + model = load_from_nemo(MegatronT5GLUEModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) + else: + validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint) + model = load_from_checkpoint_dir(MegatronT5GLUEModel, cfg, trainer, modify_confg_fn=_modify_config) + elif hasattr(cfg.model.data.validation_ds, 'file_names'): + if cfg.model.restore_from_path: + t5_cfg = MegatronT0Model.restore_from( + restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True ) + model = load_from_nemo(MegatronT0Model, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) else: - model = MegatronT5FinetuneModel.restore_from( - restore_path=cfg.model.restore_from_path, - trainer=trainer, - override_config_path=t5_cfg, - 
save_restore_connector=NLPSaveRestoreConnector(), + validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint) + model = load_from_checkpoint_dir(MegatronT0Model, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) + else: + if cfg.model.restore_from_path: + t5_cfg = MegatronT5FinetuneModel.restore_from( + restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True ) + model = load_from_nemo(MegatronT5FinetuneModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) + else: + validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint) + model = load_from_checkpoint_dir(MegatronT5FinetuneModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) model.freeze() trainer.validate(model) diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py index b7549cddace5..d89b1fa28473 100644 --- a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py @@ -74,8 +74,8 @@ def _modify_config(t5_cfg, cfg, add_cfg_to_tree=False): return t5_cfg -def load_from_nemo(cls, cfg, trainer, t5_cfg): - t5_cfg = _modify_config(t5_cfg, cfg, add_cfg_to_tree=False) +def load_from_nemo(cls, cfg, trainer, t5_cfg, modify_confg_fn): + t5_cfg = modify_confg_fn(t5_cfg, cfg, add_cfg_to_tree=False) model = cls.restore_from( restore_path=cfg.model.restore_from_path, trainer=trainer, @@ -84,7 +84,7 @@ def load_from_nemo(cls, cfg, trainer, t5_cfg): ) return model -def load_from_checkpoint_dir(cls, cfg, trainer): +def load_from_checkpoint_dir(cls, cfg, trainer, modify_confg_fn): app_state = AppState() if cfg.model.tensor_model_parallel_size > 1 or cfg.model.pipeline_model_parallel_size > 1: app_state.model_parallel_size = cfg.model.tensor_model_parallel_size * cfg.model.pipeline_model_parallel_size @@ -106,7 +106,7 @@ def load_from_checkpoint_dir(cls, cfg, trainer): ) checkpoint_path = inject_model_parallel_rank(os.path.join(cfg.model.pretrained_checkpoint.checkpoint_dir, cfg.model.pretrained_checkpoint.checkpoint_name)) hparams_file = OmegaConf.load(cfg.model.pretrained_checkpoint.hparams_file) - t5_cfg = _modify_config(hparams_file.cfg, cfg, add_cfg_to_tree=True) + t5_cfg = modify_confg_fn(hparams_file.cfg, cfg, add_cfg_to_tree=True) with tempfile.NamedTemporaryFile(suffix='.yaml') as f: OmegaConf.save(config=t5_cfg, f=f.name) model = cls.load_from_checkpoint( @@ -116,6 +116,14 @@ def load_from_checkpoint_dir(cls, cfg, trainer): ) return model +def validate_checkpoint_loading_args(cfg): + if cfg.checkpoint_dir is None or not os.path.isdir(cfg.checkpoint_dir): + raise ValueError(f'Checkpoint directory {cfg.checkpoint_dir} does not exist or is not a directory.') + if cfg.checkpoint_name is None: + raise ValueError(f'Checkpoint name {cfg.checkpoint_name} is not valid.') + if cfg.hparams_file is None or not os.path.isfile(cfg.hparams_file): + raise ValueError(f'Hparams file {cfg.hparams_file} does not exist or is not a file.') + @hydra_runner(config_path="conf", config_name="megatron_t5_config_finetune_glue_mnli") def main(cfg) -> None: logging.info("\n\n************** Experiment configuration ***********") @@ -166,25 +174,28 @@ def main(cfg) -> None: t5_cfg = MegatronT5GLUEModel.restore_from( restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True ) - model = load_from_nemo(MegatronT5GLUEModel, cfg, trainer, t5_cfg) + model = load_from_nemo(MegatronT5GLUEModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) 
else: - model = load_from_checkpoint_dir(MegatronT5GLUEModel, cfg, trainer) + validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint) + model = load_from_checkpoint_dir(MegatronT5GLUEModel, cfg, trainer, modify_confg_fn=_modify_config) elif hasattr(cfg.model.data.train_ds, 'file_names'): if cfg.model.restore_from_path: t5_cfg = MegatronT0Model.restore_from( restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True ) - model = load_from_nemo(MegatronT0Model, cfg, trainer, t5_cfg) + model = load_from_nemo(MegatronT0Model, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) else: - model = load_from_checkpoint_dir(MegatronT0Model, cfg, trainer, t5_cfg) + validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint) + model = load_from_checkpoint_dir(MegatronT0Model, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) else: if cfg.model.restore_from_path: t5_cfg = MegatronT5FinetuneModel.restore_from( restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True ) - model = load_from_nemo(MegatronT5FinetuneModel, cfg, trainer, t5_cfg) + model = load_from_nemo(MegatronT5FinetuneModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) else: - model = load_from_checkpoint_dir(MegatronT5FinetuneModel, cfg, trainer, t5_cfg) + validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint) + model = load_from_checkpoint_dir(MegatronT5FinetuneModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) trainer.fit(model) trainer.validate(model) From e5204c6b8106b417dca36821d31ee645474e2d51 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 7 Nov 2022 17:45:21 +0000 Subject: [PATCH 05/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../megatron_t5_seq2seq_eval.py | 13 +++++--- .../megatron_t5_seq2seq_finetune.py | 32 +++++++++++-------- 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py index c4512fd157d6..ef2ec8a65cbd 100644 --- a/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from megatron_t5_seq2seq_finetune import load_from_checkpoint_dir, load_from_nemo, validate_checkpoint_loading_args from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks.timer import Timer @@ -30,7 +31,7 @@ from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import StatelessTimer, exp_manager -from megatron_t5_seq2seq_finetune import load_from_nemo, load_from_checkpoint_dir, validate_checkpoint_loading_args + def _modify_config(t5_cfg, cfg, add_cfg_to_tree=False): """ @@ -111,7 +112,7 @@ def main(cfg) -> None: if hasattr(cfg.model.data.validation_ds, 'task_name'): if cfg.model.restore_from_path: t5_cfg = MegatronT5GLUEModel.restore_from( - restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True + restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True ) model = load_from_nemo(MegatronT5GLUEModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) else: @@ -120,7 +121,7 @@ def main(cfg) -> None: elif hasattr(cfg.model.data.validation_ds, 'file_names'): if cfg.model.restore_from_path: t5_cfg = MegatronT0Model.restore_from( - restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True + restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True ) model = load_from_nemo(MegatronT0Model, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) else: @@ -129,12 +130,14 @@ def main(cfg) -> None: else: if cfg.model.restore_from_path: t5_cfg = MegatronT5FinetuneModel.restore_from( - restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True + restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True ) model = load_from_nemo(MegatronT5FinetuneModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) else: validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint) - model = load_from_checkpoint_dir(MegatronT5FinetuneModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) + model = load_from_checkpoint_dir( + MegatronT5FinetuneModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config + ) model.freeze() trainer.validate(model) diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py index d89b1fa28473..84b78739f673 100644 --- a/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py @@ -14,16 +14,17 @@ import os import tempfile + from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks.timer import Timer from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector -from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel from nemo.collections.nlp.models.language_modeling.megatron_finetune_model import MegatronT5FinetuneModel from nemo.collections.nlp.models.language_modeling.megatron_glue_model import MegatronT5GLUEModel from nemo.collections.nlp.models.language_modeling.megatron_t0_model import MegatronT0Model +from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel from nemo.collections.nlp.parts.nlp_overrides import ( GradScaler, MegatronHalfPrecisionPlugin, @@ -32,10 +33,11 @@ PipelineMixedPrecisionPlugin, ) 
from nemo.core.config import hydra_runner -from nemo.utils import logging, AppState +from nemo.utils import AppState, logging from nemo.utils.exp_manager import StatelessTimer, exp_manager from nemo.utils.model_utils import inject_model_parallel_rank + def _modify_config(t5_cfg, cfg, add_cfg_to_tree=False): """ This function modifies the original t5 pre-training config (t5_cfg) with attributes from the finetuning config (cfg). @@ -65,7 +67,7 @@ def _modify_config(t5_cfg, cfg, add_cfg_to_tree=False): # XNLI has eval languages in the yaml config. if hasattr(cfg.model, 'eval_languages'): t5_cfg.eval_languages = cfg.model.eval_languages - + # This is needed when modifying a hparam file directly to load `.ckpt` files. # This is not needed to modify the cfg in `.nemo` files. if add_cfg_to_tree: @@ -74,6 +76,7 @@ def _modify_config(t5_cfg, cfg, add_cfg_to_tree=False): return t5_cfg + def load_from_nemo(cls, cfg, trainer, t5_cfg, modify_confg_fn): t5_cfg = modify_confg_fn(t5_cfg, cfg, add_cfg_to_tree=False) model = cls.restore_from( @@ -84,6 +87,7 @@ def load_from_nemo(cls, cfg, trainer, t5_cfg, modify_confg_fn): ) return model + def load_from_checkpoint_dir(cls, cfg, trainer, modify_confg_fn): app_state = AppState() if cfg.model.tensor_model_parallel_size > 1 or cfg.model.pipeline_model_parallel_size > 1: @@ -104,18 +108,17 @@ def load_from_checkpoint_dir(cls, cfg, trainer, modify_confg_fn): pipeline_model_parallel_size_=cfg.model.pipeline_model_parallel_size, pipeline_model_parallel_split_rank_=cfg.model.pipeline_model_parallel_split_rank, ) - checkpoint_path = inject_model_parallel_rank(os.path.join(cfg.model.pretrained_checkpoint.checkpoint_dir, cfg.model.pretrained_checkpoint.checkpoint_name)) + checkpoint_path = inject_model_parallel_rank( + os.path.join(cfg.model.pretrained_checkpoint.checkpoint_dir, cfg.model.pretrained_checkpoint.checkpoint_name) + ) hparams_file = OmegaConf.load(cfg.model.pretrained_checkpoint.hparams_file) t5_cfg = modify_confg_fn(hparams_file.cfg, cfg, add_cfg_to_tree=True) with tempfile.NamedTemporaryFile(suffix='.yaml') as f: OmegaConf.save(config=t5_cfg, f=f.name) - model = cls.load_from_checkpoint( - checkpoint_path=checkpoint_path, - trainer=trainer, - hparams_file=f.name, - ) + model = cls.load_from_checkpoint(checkpoint_path=checkpoint_path, trainer=trainer, hparams_file=f.name,) return model + def validate_checkpoint_loading_args(cfg): if cfg.checkpoint_dir is None or not os.path.isdir(cfg.checkpoint_dir): raise ValueError(f'Checkpoint directory {cfg.checkpoint_dir} does not exist or is not a directory.') @@ -124,6 +127,7 @@ def validate_checkpoint_loading_args(cfg): if cfg.hparams_file is None or not os.path.isfile(cfg.hparams_file): raise ValueError(f'Hparams file {cfg.hparams_file} does not exist or is not a file.') + @hydra_runner(config_path="conf", config_name="megatron_t5_config_finetune_glue_mnli") def main(cfg) -> None: logging.info("\n\n************** Experiment configuration ***********") @@ -172,7 +176,7 @@ def main(cfg) -> None: if hasattr(cfg.model.data.train_ds, 'task_name'): if cfg.model.restore_from_path: t5_cfg = MegatronT5GLUEModel.restore_from( - restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True + restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True ) model = load_from_nemo(MegatronT5GLUEModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) else: @@ -181,7 +185,7 @@ def main(cfg) -> None: elif hasattr(cfg.model.data.train_ds, 'file_names'): if cfg.model.restore_from_path: t5_cfg = 
MegatronT0Model.restore_from( - restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True + restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True ) model = load_from_nemo(MegatronT0Model, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) else: @@ -190,12 +194,14 @@ def main(cfg) -> None: else: if cfg.model.restore_from_path: t5_cfg = MegatronT5FinetuneModel.restore_from( - restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True + restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True ) model = load_from_nemo(MegatronT5FinetuneModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) else: validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint) - model = load_from_checkpoint_dir(MegatronT5FinetuneModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) + model = load_from_checkpoint_dir( + MegatronT5FinetuneModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config + ) trainer.fit(model) trainer.validate(model) From 711d59da4e82473f07a8ed03566cb03629cfb569 Mon Sep 17 00:00:00 2001 From: MaximumEntropy Date: Mon, 7 Nov 2022 09:46:46 -0800 Subject: [PATCH 06/17] Revert config changes Signed-off-by: MaximumEntropy --- examples/nlp/language_modeling/conf/megatron_t5_config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config.yaml index ad6024f199e2..2a8c2912e395 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_config.yaml @@ -6,7 +6,7 @@ name: megatron_t5 restore_from_path: null # used when starting from a .nemo file trainer: - devices: 2 + devices: 1 num_nodes: 1 accelerator: gpu precision: 16 @@ -47,7 +47,7 @@ model: micro_batch_size: 4 global_batch_size: 8 # will use more micro batches to reach global batch size tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 2 + pipeline_model_parallel_size: 1 resume_from_checkpoint: null # manually set the checkpoint file to load from pipeline_model_parallel_split_rank: 0 # rank at which decoder starts. 
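
The patches up to this point replace the previously mandatory `model.restore_from_path: ???` with an optional `model.pretrained_checkpoint` block so finetuning can start from a raw `.ckpt` produced during pre-training instead of a packaged `.nemo` file. As a rough sketch only (not part of the patches themselves; every path and the checkpoint name below are placeholders), a finetuning config that exercises the new code path might look like:

    model:
      restore_from_path: null                                 # leave null so main() takes the .ckpt branch
      pretrained_checkpoint:
        checkpoint_dir: /results/megatron_t5/checkpoints      # placeholder: folder containing the .ckpt file(s)
        checkpoint_name: megatron_t5--last.ckpt               # placeholder: rank is injected into this name at load time
        hparams_file: /results/megatron_t5/hparams.yaml       # placeholder: hparams.yaml saved during pre-training
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      pipeline_model_parallel_split_rank: 0

With `restore_from_path` left null, `main()` calls `validate_checkpoint_loading_args()` and then `load_from_checkpoint_dir()`, which injects the tensor/pipeline-parallel rank into `checkpoint_dir/checkpoint_name` via `inject_model_parallel_rank()`, rewrites the loaded `hparams.yaml` through `_modify_config(..., add_cfg_to_tree=True)`, and passes the resulting temporary YAML to `cls.load_from_checkpoint()`.
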
From d5468e9d5badb8bbc25563dac0385d0a5222c66b Mon Sep 17 00:00:00 2001 From: MaximumEntropy Date: Mon, 7 Nov 2022 12:44:34 -0800 Subject: [PATCH 07/17] Refactor Signed-off-by: MaximumEntropy --- ...megatron_t5_config_finetune_glue_mnli.yaml | 2 +- .../megatron_finetune_model.py | 68 ++++--------------- 2 files changed, 13 insertions(+), 57 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml index 58793630a38d..ff61c5fde20c 100644 --- a/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml +++ b/examples/nlp/language_modeling/conf/megatron_t5_config_finetune_glue_mnli.yaml @@ -28,7 +28,7 @@ exp_manager: resume_ignore_no_checkpoint: True create_checkpoint_callback: True checkpoint_callback_params: - monitor: validation_exact_string_match + monitor: validation_${model.data.validation_ds.metric.name} save_top_k: 10 mode: max always_save_nemo: False # TODO: add support diff --git a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py index 4da8ab57f367..91455f21e477 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py @@ -132,22 +132,9 @@ def setup(self, stage=None): self.setup_training_data() def _process_global_batch(self, global_batch): - """Process a list of microbatches into a global batch.""" - # If there is no language information in the global batch (ex: English MNLI), we can use the parent global batch processor as is. - if 'lang' not in global_batch[0]: - return self._process_global_batch_without_megatron_batch_sampler(global_batch) - - # For validation data (XNLI), we need to process the global batch and and then deal with language info separately. - else: - assert all(['lang' in micro_batch for micro_batch in global_batch]) - langs_list = [] - processed_global_batch = self._process_global_batch_without_megatron_batch_sampler( - [{k: v for k, v in micro_batch.items() if k != 'lang'} for micro_batch in global_batch] - ) - for micro_batch in global_batch: - langs_list.extend(micro_batch['lang']) - processed_global_batch['lang'] = langs_list - return processed_global_batch + """Optionally processes a global batch.""" + # TODO: maybe remove this now that we've refactored data batch sizes. + return global_batch def on_validation_epoch_start(self): app_state = AppState() @@ -189,17 +176,15 @@ def on_train_epoch_start(self) -> None: return super().on_train_epoch_start() def training_step(self, batch, batch_idx): - micro_batch_size = batch[0]['text_enc'].size(0) + global_batch_size_per_gpu = batc['text_enc'].size(0) # This should happen only on the last batch of the dataset. 
- if micro_batch_size != self.cfg.data.train_ds.micro_batch_size: + if global_batch_size_per_gpu != self.cfg.data.train_ds.global_batch_size // parallel_state.get_data_parallel_world_size(): app_state = AppState() _reconfigure_microbatch_calculator( rank=app_state.global_rank, rampup_batch_size=None, - global_batch_size=micro_batch_size - * parallel_state.get_data_parallel_world_size() - * get_num_microbatches(), - micro_batch_size=micro_batch_size, + global_batch_size=global_batch_size_per_gpu * parallel_state.get_data_parallel_world_size(), + micro_batch_size=global_batch_size_per_gpu // get_num_microbatches_per_batch(), data_parallel_size=parallel_state.get_data_parallel_world_size(), ) # At this point batch is a list of dictionaries where eatch dict is a microbatch. @@ -264,17 +249,15 @@ def cast_for_metric(self, pred, label, metric_name, class_labels=None, labels_ar return pred, label def _reconfigure_and_process_inference_batch(self, batch): - micro_batch_size = batch[0]['text_enc'].size(0) + global_batch_size_per_gpu = batch['text_enc'].size(0) # This should happen only on the last batch of the dataset. - if micro_batch_size != self.cfg.data.validation_ds.micro_batch_size: + if global_batch_size_per_gpu != self.cfg.data.validation_ds.global_batch_size // parallel_state.get_data_parallel_world_size(): app_state = AppState() _reconfigure_microbatch_calculator( rank=app_state.global_rank, rampup_batch_size=None, - global_batch_size=micro_batch_size - * parallel_state.get_data_parallel_world_size() - * get_num_microbatches(), - micro_batch_size=micro_batch_size, + global_batch_size=global_batch_size_per_gpu * parallel_state.get_data_parallel_world_size(), + micro_batch_size=global_batch_size_per_gpu // get_num_microbatches(), data_parallel_size=parallel_state.get_data_parallel_world_size(), ) @@ -282,7 +265,6 @@ def _reconfigure_and_process_inference_batch(self, batch): # After the process_global_batch call, processed_batch will be a single dictionary containing the global batch. # This is required since the parent class expects a single global batch dictioanry. processed_batch = self._process_global_batch(batch) - return processed_batch def inference_step(self, batch, batch_idx, mode, dataloader_idx=0): @@ -525,20 +507,6 @@ def build_data_loader( sampler = torch.utils.data.distributed.DistributedSampler( dataset, num_replicas=world_size, rank=rank, shuffle=shuffle ) - # This check makes sure the val_check_interval is less than the number of global batches. - # Normally, PTL would do this check and properly account for gradient accumulation. - # But now, it is implicit in the apex fwd/bwd functions and so we need to check for this somewhere. - # The consequence of not doing this is that training loop will never run validation. - # NOTE: Prog bar is also broken as a result of this. 
- global_batch_size_per_gpu = micro_batch_size * get_num_microbatches() - if ( - self.trainer.val_check_interval > (sampler.num_samples // global_batch_size_per_gpu) - and check_validation_interval - ): - raise ValueError( - f"trainer.val_check_interval {self.trainer.val_check_interval} is > number of global batches {sampler.num_samples // global_batch_size}" - ) - if isinstance(dataset, ConcatMapDataset): collate_fn = dataset.datasets[0].collate_fn else: @@ -548,7 +516,7 @@ def build_data_loader( dataset, collate_fn=collate_fn, sampler=sampler, - batch_size=micro_batch_size, + batch_size=global_batch_size // parallel_state.get_data_parallel_world_size(), num_workers=num_workers, pin_memory=pin_memory, drop_last=drop_last, @@ -689,15 +657,3 @@ def build_train_valid_test_datasets(self, stage): return self._train_ds = self._build_train_dataset(self.cfg.data.train_ds) logging.info(f'Finished building datasets ...') - - def on_train_start(self) -> None: - """PTL hook used to override DataFetcher with GlobalBatchDataFetcher """ - self.trainer.fit_loop._data_fetcher = GlobalBatchDataFetcher() - - def on_validation_start(self) -> None: - """PTL hook used to override DataFetcher with GlobalBatchDataFetcher """ - self.trainer.fit_loop.epoch_loop.val_loop._data_fetcher = GlobalBatchDataFetcher() - self.trainer.validate_loop._data_fetcher = GlobalBatchDataFetcher() - - def on_test_start(self) -> None: - self.trainer.test_loop._data_fetcher = GlobalBatchDataFetcher() From 9e86322e21dc107c7e495bb51a3dbe847f9b2ea5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 7 Nov 2022 20:46:31 +0000 Subject: [PATCH 08/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../language_modeling/megatron_finetune_model.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py index 91455f21e477..a6bf63a8b741 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py @@ -178,7 +178,10 @@ def on_train_epoch_start(self) -> None: def training_step(self, batch, batch_idx): global_batch_size_per_gpu = batc['text_enc'].size(0) # This should happen only on the last batch of the dataset. - if global_batch_size_per_gpu != self.cfg.data.train_ds.global_batch_size // parallel_state.get_data_parallel_world_size(): + if ( + global_batch_size_per_gpu + != self.cfg.data.train_ds.global_batch_size // parallel_state.get_data_parallel_world_size() + ): app_state = AppState() _reconfigure_microbatch_calculator( rank=app_state.global_rank, @@ -251,7 +254,10 @@ def cast_for_metric(self, pred, label, metric_name, class_labels=None, labels_ar def _reconfigure_and_process_inference_batch(self, batch): global_batch_size_per_gpu = batch['text_enc'].size(0) # This should happen only on the last batch of the dataset. 
- if global_batch_size_per_gpu != self.cfg.data.validation_ds.global_batch_size // parallel_state.get_data_parallel_world_size(): + if ( + global_batch_size_per_gpu + != self.cfg.data.validation_ds.global_batch_size // parallel_state.get_data_parallel_world_size() + ): app_state = AppState() _reconfigure_microbatch_calculator( rank=app_state.global_rank, From 723d125071d2a8a6051226cd8df4007b3c1c8641 Mon Sep 17 00:00:00 2001 From: MaximumEntropy Date: Mon, 7 Nov 2022 12:48:08 -0800 Subject: [PATCH 09/17] Fix typo Signed-off-by: MaximumEntropy --- .../nlp/models/language_modeling/megatron_finetune_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py index 91455f21e477..e50e3de73535 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py @@ -176,7 +176,7 @@ def on_train_epoch_start(self) -> None: return super().on_train_epoch_start() def training_step(self, batch, batch_idx): - global_batch_size_per_gpu = batc['text_enc'].size(0) + global_batch_size_per_gpu = batch['text_enc'].size(0) # This should happen only on the last batch of the dataset. if global_batch_size_per_gpu != self.cfg.data.train_ds.global_batch_size // parallel_state.get_data_parallel_world_size(): app_state = AppState() From 939efdb68fccbe3334f10e587c46526b5eb6ae12 Mon Sep 17 00:00:00 2001 From: MaximumEntropy Date: Mon, 7 Nov 2022 12:52:47 -0800 Subject: [PATCH 10/17] Remove comments Signed-off-by: MaximumEntropy --- .../nlp/models/language_modeling/megatron_finetune_model.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py index 7cc6903b06bd..3b644716cdf3 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py @@ -267,9 +267,6 @@ def _reconfigure_and_process_inference_batch(self, batch): data_parallel_size=parallel_state.get_data_parallel_world_size(), ) - # At this point processed_batch is a list of dictionaries where eatch dict is a microbatch. - # After the process_global_batch call, processed_batch will be a single dictionary containing the global batch. - # This is required since the parent class expects a single global batch dictioanry. 
processed_batch = self._process_global_batch(batch) return processed_batch From 61c6fdaf5978721c5168ed248386630fadb48525 Mon Sep 17 00:00:00 2001 From: MaximumEntropy Date: Mon, 7 Nov 2022 13:09:36 -0800 Subject: [PATCH 11/17] Minor Signed-off-by: MaximumEntropy --- .../nlp/language_modeling/megatron_t5_seq2seq_eval.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py index ef2ec8a65cbd..5e37492b4f06 100644 --- a/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py @@ -25,8 +25,7 @@ from nemo.collections.nlp.parts.nlp_overrides import ( GradScaler, MegatronHalfPrecisionPlugin, - NLPDDPStrategy, - NLPSaveRestoreConnector, + NLPDDPStrategy ) from nemo.core.config import hydra_runner from nemo.utils import logging @@ -126,18 +125,16 @@ def main(cfg) -> None: model = load_from_nemo(MegatronT0Model, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) else: validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint) - model = load_from_checkpoint_dir(MegatronT0Model, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) + model = load_from_checkpoint_dir(MegatronT0Model, cfg, trainer, modify_confg_fn=_modify_config) else: if cfg.model.restore_from_path: t5_cfg = MegatronT5FinetuneModel.restore_from( restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True ) - model = load_from_nemo(MegatronT5FinetuneModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config) + model = load_from_nemo(MegatronT5FinetuneModel, cfg, trainer, modify_confg_fn=_modify_config) else: validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint) - model = load_from_checkpoint_dir( - MegatronT5FinetuneModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config - ) + model = load_from_checkpoint_dir(MegatronT5FinetuneModel, cfg, trainer, modify_confg_fn=_modify_config) model.freeze() trainer.validate(model) From 424beda2c6cf45ed9768445e6b7e2e5f441b76cc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 7 Nov 2022 21:12:31 +0000 Subject: [PATCH 12/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py b/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py index 5e37492b4f06..e78d34adee65 100644 --- a/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py +++ b/examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py @@ -22,11 +22,7 @@ from nemo.collections.nlp.models.language_modeling.megatron_finetune_model import MegatronT5FinetuneModel from nemo.collections.nlp.models.language_modeling.megatron_glue_model import MegatronT5GLUEModel from nemo.collections.nlp.models.language_modeling.megatron_t0_model import MegatronT0Model -from nemo.collections.nlp.parts.nlp_overrides import ( - GradScaler, - MegatronHalfPrecisionPlugin, - NLPDDPStrategy -) +from nemo.collections.nlp.parts.nlp_overrides import GradScaler, MegatronHalfPrecisionPlugin, NLPDDPStrategy from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import StatelessTimer, exp_manager From e81bc638000f276969cc9fa7a57f708c530dbadc Mon Sep 17 00:00:00 2001 From: 
MaximumEntropy Date: Wed, 9 Nov 2022 14:08:06 -0800 Subject: [PATCH 13/17] Fix validation reconfiguration Signed-off-by: MaximumEntropy --- .../language_modeling/megatron_finetune_model.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py index 3b644716cdf3..66d1f7a005b1 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py @@ -182,12 +182,13 @@ def training_step(self, batch, batch_idx): global_batch_size_per_gpu != self.cfg.data.train_ds.global_batch_size // parallel_state.get_data_parallel_world_size() ): + # NOTE: This should never really be called since `drop_last=True` is required for training datasets. app_state = AppState() _reconfigure_microbatch_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=global_batch_size_per_gpu * parallel_state.get_data_parallel_world_size(), - micro_batch_size=global_batch_size_per_gpu // get_num_microbatches_per_batch(), + micro_batch_size=global_batch_size_per_gpu // get_num_microbatches(), data_parallel_size=parallel_state.get_data_parallel_world_size(), ) # At this point batch is a list of dictionaries where eatch dict is a microbatch. @@ -258,12 +259,13 @@ def _reconfigure_and_process_inference_batch(self, batch): global_batch_size_per_gpu != self.cfg.data.validation_ds.global_batch_size // parallel_state.get_data_parallel_world_size() ): + # NOTE: This is reconfiguring to make sure there is no grad-acc for validation batches. app_state = AppState() _reconfigure_microbatch_calculator( rank=app_state.global_rank, rampup_batch_size=None, global_batch_size=global_batch_size_per_gpu * parallel_state.get_data_parallel_world_size(), - micro_batch_size=global_batch_size_per_gpu // get_num_microbatches(), + micro_batch_size=global_batch_size_per_gpu, data_parallel_size=parallel_state.get_data_parallel_world_size(), ) @@ -492,13 +494,11 @@ def test_epoch_end(self, outputs): def build_data_loader( self, dataset, - micro_batch_size, global_batch_size, shuffle, num_workers, pin_memory, drop_last, - check_validation_interval, ): """Buld dataloader given an input dataset.""" @@ -528,13 +528,11 @@ def build_data_loader( def setup_training_data(self): self._train_dl = self.build_data_loader( self._train_ds, - micro_batch_size=self.cfg.data.train_ds.micro_batch_size, global_batch_size=self.cfg.data.train_ds.global_batch_size, shuffle=self.cfg.data.train_ds.shuffle, num_workers=self.cfg.data.train_ds.num_workers, pin_memory=self.cfg.data.train_ds.pin_memory, drop_last=self.cfg.data.train_ds.drop_last, - check_validation_interval=True, ) def setup_eval_data(self, datasets, data_cfg): @@ -542,13 +540,11 @@ def setup_eval_data(self, datasets, data_cfg): for dataset in datasets: eval_dl = self.build_data_loader( dataset, - micro_batch_size=data_cfg.micro_batch_size, global_batch_size=data_cfg.global_batch_size, shuffle=data_cfg.shuffle, num_workers=data_cfg.num_workers, pin_memory=data_cfg.pin_memory, drop_last=data_cfg.drop_last, - check_validation_interval=False, ) dataloaders.append(eval_dl) return dataloaders From 616f7342b87955101d3eaab3cfdf3c634de1709a Mon Sep 17 00:00:00 2001 From: MaximumEntropy Date: Wed, 9 Nov 2022 14:08:40 -0800 Subject: [PATCH 14/17] Remove old comment Signed-off-by: MaximumEntropy --- 
.../nlp/models/language_modeling/megatron_finetune_model.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py index 66d1f7a005b1..7743f93f8227 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py @@ -191,9 +191,6 @@ def training_step(self, batch, batch_idx): micro_batch_size=global_batch_size_per_gpu // get_num_microbatches(), data_parallel_size=parallel_state.get_data_parallel_world_size(), ) - # At this point batch is a list of dictionaries where eatch dict is a microbatch. - # After the process_global_batch call, batch will be a single dictionary containing the global batch. - # This is required since the parent class expects a single global batch dictioanry. batch = self._process_global_batch(batch) return super().training_step(batch, batch_idx) From 1dab65cadb63b0d04b3958aa2a9a437572be4bb7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 9 Nov 2022 22:09:46 +0000 Subject: [PATCH 15/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../models/language_modeling/megatron_finetune_model.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py index 7743f93f8227..db872543d49d 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py @@ -489,13 +489,7 @@ def test_epoch_end(self, outputs): _ = self.inference_epoch_end(outputs, 'test', self.cfg.data.test_ds) def build_data_loader( - self, - dataset, - global_batch_size, - shuffle, - num_workers, - pin_memory, - drop_last, + self, dataset, global_batch_size, shuffle, num_workers, pin_memory, drop_last, ): """Buld dataloader given an input dataset.""" From 3b148eabdf1a9a74adfb720a84c807a7e4b93a08 Mon Sep 17 00:00:00 2001 From: MaximumEntropy Date: Thu, 10 Nov 2022 16:40:30 -0800 Subject: [PATCH 16/17] Fixes for test_ds Signed-off-by: MaximumEntropy --- .../megatron_finetune_model.py | 35 ++++++++++++++----- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py index db872543d49d..c24fc9ad17c2 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py @@ -147,7 +147,26 @@ def on_validation_epoch_start(self): ) return super().on_validation_epoch_start() + def on_test_epoch_start(self): + app_state = AppState() + _reconfigure_microbatch_calculator( + rank=app_state.global_rank, + rampup_batch_size=None, + global_batch_size=self.cfg.data.test_ds.global_batch_size, + micro_batch_size=self.cfg.data.test_ds.micro_batch_size, + data_parallel_size=parallel_state.get_data_parallel_world_size(), + ) + return super().on_test_epoch_start() + + def on_test_epoch_end(self): + self.on_inference_epoch_end(self.cfg.data.test_ds) + return super().on_test_epoch_end() + def on_validation_epoch_end(self): + 
self.on_inference_epoch_end(self.cfg.data.validation_ds) + return super().on_validation_epoch_end() + + def on_inference_epoch_end(self, ds): app_state = AppState() if hasattr(self, "_train_ds"): _reconfigure_microbatch_calculator( @@ -163,13 +182,11 @@ def on_validation_epoch_end(self): _reconfigure_microbatch_calculator( rank=app_state.global_rank, rampup_batch_size=None, - global_batch_size=self.cfg.data.validation_ds.global_batch_size, - micro_batch_size=self.cfg.data.validation_ds.micro_batch_size, + global_batch_size=ds.global_batch_size, + micro_batch_size=ds.micro_batch_size, data_parallel_size=parallel_state.get_data_parallel_world_size(), ) - return super().on_validation_epoch_end() - def on_train_epoch_start(self) -> None: # Same logic as validation epoch end, but this may be need if there is no validation sanity check to trigger validation_epoch_end() self.on_validation_epoch_end() @@ -249,12 +266,12 @@ def cast_for_metric(self, pred, label, metric_name, class_labels=None, labels_ar return pred, label - def _reconfigure_and_process_inference_batch(self, batch): + def _reconfigure_and_process_inference_batch(self, batch, ds_config): global_batch_size_per_gpu = batch['text_enc'].size(0) # This should happen only on the last batch of the dataset. if ( global_batch_size_per_gpu - != self.cfg.data.validation_ds.global_batch_size // parallel_state.get_data_parallel_world_size() + != ds_config.global_batch_size // parallel_state.get_data_parallel_world_size() ): # NOTE: This is reconfiguring to make sure there is no grad-acc for validation batches. app_state = AppState() @@ -273,7 +290,7 @@ def inference_step(self, batch, batch_idx, mode, dataloader_idx=0): # Regular finetuning datasets will return a list of dicts for each microbatch. But T0 datasets will return a single dict for the global batch. batch_has_lang_information = isinstance(batch, list) and len(batch[0]) == 7 - processed_batch = self._reconfigure_and_process_inference_batch(batch) + processed_batch = self._reconfigure_and_process_inference_batch(batch, self.cfg.data.validation_ds if mode == 'validation' else self.cfg.data.test_ds) # Call parent validation step to get the loss. # NOTE: There could be extra keys in the processed_batch dictionary such as "langs" for XNLI, this will be ignored in the parent class. 
@@ -301,8 +318,8 @@ def inference_step(self, batch, batch_idx, mode, dataloader_idx=0): pred=pred, label=label, metric_name=self.val_metric_name if mode == 'validation' else self.test_metric_name, - class_labels=self.cfg.data.validation_ds.metric.get('class_labels', None), - labels_are_strings=self.cfg.data.validation_ds.metric.get('labels_are_strings', False), + class_labels=self.cfg.data.validation_ds.metric.get('class_labels', None) if mode == 'validation' else self.cfg.data.test_ds.metric.get('class_labels', None), + labels_are_strings=self.cfg.data.validation_ds.metric.get('labels_are_strings', False) if mode == 'validation' else self.cfg.data.test_ds.metric.get('labels_are_strings', False), ) if batch_has_lang_information: _ = metric(pred, label, category) From 4816e2d84f557d3d9611f3f7e2f2037b887ef702 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 11 Nov 2022 00:42:59 +0000 Subject: [PATCH 17/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../megatron_finetune_model.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py index c24fc9ad17c2..c49d7b50580a 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_finetune_model.py @@ -269,10 +269,7 @@ def cast_for_metric(self, pred, label, metric_name, class_labels=None, labels_ar def _reconfigure_and_process_inference_batch(self, batch, ds_config): global_batch_size_per_gpu = batch['text_enc'].size(0) # This should happen only on the last batch of the dataset. - if ( - global_batch_size_per_gpu - != ds_config.global_batch_size // parallel_state.get_data_parallel_world_size() - ): + if global_batch_size_per_gpu != ds_config.global_batch_size // parallel_state.get_data_parallel_world_size(): # NOTE: This is reconfiguring to make sure there is no grad-acc for validation batches. app_state = AppState() _reconfigure_microbatch_calculator( @@ -290,7 +287,9 @@ def inference_step(self, batch, batch_idx, mode, dataloader_idx=0): # Regular finetuning datasets will return a list of dicts for each microbatch. But T0 datasets will return a single dict for the global batch. batch_has_lang_information = isinstance(batch, list) and len(batch[0]) == 7 - processed_batch = self._reconfigure_and_process_inference_batch(batch, self.cfg.data.validation_ds if mode == 'validation' else self.cfg.data.test_ds) + processed_batch = self._reconfigure_and_process_inference_batch( + batch, self.cfg.data.validation_ds if mode == 'validation' else self.cfg.data.test_ds + ) # Call parent validation step to get the loss. # NOTE: There could be extra keys in the processed_batch dictionary such as "langs" for XNLI, this will be ignored in the parent class. 
@@ -318,8 +317,12 @@ def inference_step(self, batch, batch_idx, mode, dataloader_idx=0): pred=pred, label=label, metric_name=self.val_metric_name if mode == 'validation' else self.test_metric_name, - class_labels=self.cfg.data.validation_ds.metric.get('class_labels', None) if mode == 'validation' else self.cfg.data.test_ds.metric.get('class_labels', None), - labels_are_strings=self.cfg.data.validation_ds.metric.get('labels_are_strings', False) if mode == 'validation' else self.cfg.data.test_ds.metric.get('labels_are_strings', False), + class_labels=self.cfg.data.validation_ds.metric.get('class_labels', None) + if mode == 'validation' + else self.cfg.data.test_ds.metric.get('class_labels', None), + labels_are_strings=self.cfg.data.validation_ds.metric.get('labels_are_strings', False) + if mode == 'validation' + else self.cfg.data.test_ds.metric.get('labels_are_strings', False), ) if batch_has_lang_information: _ = metric(pred, label, category)
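Taken together, the last few patches settle on one pattern for inference: before a validation or test batch is run, the apex microbatch calculator is reconfigured so that the whole per-GPU batch is consumed as a single microbatch (no gradient accumulation at eval time), and the split-specific config (`validation_ds` vs `test_ds`) is passed in explicitly instead of being hard-coded to the validation split. The sketch below is a minimal, self-contained illustration of that arithmetic only; `MicrobatchPlan`, `plan_inference_microbatches`, and `metric_kwargs` are hypothetical names used for illustration and are not part of NeMo or apex.

```python
from dataclasses import dataclass


@dataclass
class MicrobatchPlan:
    """How one inference step is split across microbatches (hypothetical helper)."""
    global_batch_size: int   # effective global batch size for this step
    micro_batch_size: int    # per-GPU microbatch size
    num_microbatches: int    # gradient-accumulation steps; always 1 for inference here


def plan_inference_microbatches(batch_size_per_gpu: int, data_parallel_size: int) -> MicrobatchPlan:
    """Mirror the values `_reconfigure_and_process_inference_batch` hands to the
    microbatch calculator: treat whatever landed on this GPU (possibly a short
    final batch when drop_last=False) as one microbatch, so no gradient
    accumulation happens during validation or test."""
    global_batch_size = batch_size_per_gpu * data_parallel_size
    micro_batch_size = batch_size_per_gpu  # the whole per-GPU batch at once
    num_microbatches = global_batch_size // (micro_batch_size * data_parallel_size)
    assert num_microbatches == 1
    return MicrobatchPlan(global_batch_size, micro_batch_size, num_microbatches)


def metric_kwargs(cfg_data, mode: str) -> dict:
    """Select per-split metric settings the same way the mode-dependent lookups
    in `inference_step` do (validation_ds vs test_ds). Assumes an OmegaConf-style
    config with a `metric` sub-config on each split."""
    ds = cfg_data.validation_ds if mode == "validation" else cfg_data.test_ds
    return {
        "class_labels": ds.metric.get("class_labels", None),
        "labels_are_strings": ds.metric.get("labels_are_strings", False),
    }


if __name__ == "__main__":
    # Example: a short final validation batch of 3 samples per GPU with
    # data-parallel size 4, even though the configured global batch size was 32.
    print(plan_inference_microbatches(batch_size_per_gpu=3, data_parallel_size=4))
    # MicrobatchPlan(global_batch_size=12, micro_batch_size=3, num_microbatches=1)
```

Keeping `micro_batch_size` equal to the per-GPU batch size means a ragged last batch still maps to exactly one forward pass per GPU, which is what the NOTE added in `_reconfigure_and_process_inference_batch` ("no grad-acc for validation batches") is pointing at.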