Merged
27 commits
dcf6c7d
Initial refactor
MaximumEntropy Nov 4, 2022
f2a4fa4
Resolve config before passing to load_from_checkpoint
MaximumEntropy Nov 4, 2022
09230c9
Fixes for model parallel and nemo restore
MaximumEntropy Nov 4, 2022
ac04895
Merge branch 'main' of github.com:NVIDIA/NeMo into t5_finetuning_kt
MaximumEntropy Nov 4, 2022
6ba1375
Fixes for eval
MaximumEntropy Nov 7, 2022
c7e5706
Merge branch 'main' into t5_finetuning_kt
MaximumEntropy Nov 7, 2022
e5204c6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 7, 2022
711d59d
Revert config changes
MaximumEntropy Nov 7, 2022
3127493
Merge branch 't5_finetuning_kt' of github.com:NVIDIA/NeMo into t5_fin…
MaximumEntropy Nov 7, 2022
d5468e9
Refactor
MaximumEntropy Nov 7, 2022
9e86322
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 7, 2022
723d125
Fix typo
MaximumEntropy Nov 7, 2022
5973a61
Merge branch 't5_finetuning_kt' of github.com:NVIDIA/NeMo into t5_fin…
MaximumEntropy Nov 7, 2022
939efdb
Remove comments
MaximumEntropy Nov 7, 2022
61c6fda
Minor
MaximumEntropy Nov 7, 2022
c42e366
Merge branch 'main' into t5_finetuning_kt
MaximumEntropy Nov 7, 2022
424beda
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 7, 2022
e81bc63
Fix validation reconfiguration
MaximumEntropy Nov 9, 2022
38dbe58
Merge branch 't5_finetuning_kt' of github.com:NVIDIA/NeMo into t5_fin…
MaximumEntropy Nov 9, 2022
616f734
Remove old comment
MaximumEntropy Nov 9, 2022
a5edd9e
Merge branch 'main' into t5_finetuning_kt
MaximumEntropy Nov 9, 2022
1dab65c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 9, 2022
3b148ea
Fixes for test_ds
MaximumEntropy Nov 11, 2022
4816e2d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 11, 2022
a330bb3
Merge branch 'main' into t5_finetuning_kt
MaximumEntropy Nov 11, 2022
d06a5a8
Merge branch 'main' into t5_finetuning_kt
MaximumEntropy Nov 15, 2022
002bcaa
Merge branch 'main' into t5_finetuning_kt
MaximumEntropy Nov 15, 2022
8 changes: 6 additions & 2 deletions examples/nlp/language_modeling/conf/megatron_t0_config.yaml
@@ -36,7 +36,11 @@ exp_manager:
save_best_model: True

model:
restore_from_path: ??? # Path to a trained T5 or LM-adapted T5 .nemo file
restore_from_path: null # Path to a trained T5 .nemo file
pretrained_checkpoint:
checkpoint_dir: null # Path to a folder that contains a .ckpt file
checkpoint_name: null # Name of the .ckpt file within the checkpoint_dir.
hparams_file: null # Path to a .yaml file that contains the hyperparameters of the checkpoint.
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
pipeline_model_parallel_split_rank: 0
@@ -82,7 +86,7 @@ model:
num_classes: null
replace_bos_with_pad: ${data.train_ds.replace_bos_with_pad}
add_bos_to_input: ${data.train_ds.add_bos_to_input}
add_eos_to_input: ${data.train_ds.replace_bos_with_pad}
add_eos_to_input: ${data.train_ds.add_eos_to_input}
seed: 1234

optim:
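
The change above, repeated in the other finetuning/eval configs below, makes `restore_from_path` optional (`null` instead of the mandatory `???`) and adds a `pretrained_checkpoint` block so models can also be loaded from raw PyTorch Lightning `.ckpt` files. The eval/finetune scripts call `validate_checkpoint_loading_args` on this block before using the `.ckpt` path. A minimal sketch of what such a check could look like (the real helper lives in `megatron_t5_seq2seq_finetune.py`; the exact checks and messages here are illustrative assumptions):

```python
import os


def validate_checkpoint_loading_args(cfg):
    # cfg is the model.pretrained_checkpoint block from the YAML above.
    if cfg.checkpoint_dir is None or not os.path.isdir(cfg.checkpoint_dir):
        raise ValueError(f'checkpoint_dir must point to an existing directory, got {cfg.checkpoint_dir}')
    if cfg.checkpoint_name is None:
        raise ValueError('checkpoint_name must name a .ckpt file inside checkpoint_dir')
    if cfg.hparams_file is None or not os.path.isfile(cfg.hparams_file):
        raise ValueError(f'hparams_file must point to an existing .yaml file, got {cfg.hparams_file}')
```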
(another modified config file; filename not shown)
@@ -17,7 +17,11 @@ exp_manager:
create_checkpoint_callback: False

model:
restore_from_path: ??? # Path to a finetuned T5 .nemo file
restore_from_path: null # Path to a trained T5 .nemo file
pretrained_checkpoint:
checkpoint_dir: null # Path to a folder that contains a .ckpt file
checkpoint_name: null # Name of the .ckpt file within the checkpoint_dir.
hparams_file: null # Path to a .yaml file that contains the hyperparameters of the checkpoint.
gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
megatron_amp_O2: False # Enable O2 optimization for megatron amp

(another modified config file; filename not shown)
@@ -17,9 +17,16 @@ exp_manager:
create_checkpoint_callback: False

model:
restore_from_path: ??? # Path to a finetuned T5 .nemo file
restore_from_path: null # Path to a trained T5 .nemo file
pretrained_checkpoint:
checkpoint_dir: null # Path to a folder that contains a .ckpt file
checkpoint_name: null # Name of the .ckpt file within the checkpoint_dir.
hparams_file: null # Path to a .yaml file that contains the hyperparameters of the checkpoint.
gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
megatron_amp_O2: False # Enable O2 optimization for megatron amp
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
pipeline_model_parallel_split_rank: 0

data:
validation_ds:
(another modified config file; filename not shown)
@@ -37,7 +37,11 @@ exp_manager:
save_best_model: True

model:
restore_from_path: ??? # Path to a trained T5 .nemo file
restore_from_path: null # Path to a trained T5 .nemo file
pretrained_checkpoint:
checkpoint_dir: null # Path to a folder that contains a .ckpt file
checkpoint_name: null # Name of the .ckpt file within the checkpoint_dir.
hparams_file: null # Path to a .yaml file that contains the hyperparameters of the checkpoint.
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
pipeline_model_parallel_split_rank: 0
(another modified config file; filename not shown)
@@ -37,9 +37,13 @@ exp_manager:
save_best_model: True

model:
restore_from_path: ???
restore_from_path: null # Path to a trained T5 .nemo file
pretrained_checkpoint:
checkpoint_dir: null # Path to a folder that contains a .ckpt file
checkpoint_name: null # Name of the .ckpt file within the checkpoint_dir.
hparams_file: null # Path to a .yaml file that contains the hyperparameters of the checkpoint.
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 2
pipeline_model_parallel_size: 1
pipeline_model_parallel_split_rank: 1
gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
resume_from_checkpoint: null
(another modified config file; filename not shown)
@@ -36,7 +36,11 @@ exp_manager:
save_best_model: True

model:
restore_from_path: ??? # Path to a trained T5 .nemo file
restore_from_path: null # Path to a trained T5 .nemo file
pretrained_checkpoint:
checkpoint_dir: null # Path to a folder that contains a .ckpt file
checkpoint_name: null # Name of the .ckpt file within the checkpoint_dir.
hparams_file: null # Path to a .yaml file that contains the hyperparameters of the checkpoint.
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
pipeline_model_parallel_split_rank: 0
119 changes: 64 additions & 55 deletions examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from megatron_t5_seq2seq_finetune import load_from_checkpoint_dir, load_from_nemo, validate_checkpoint_loading_args
from omegaconf.omegaconf import OmegaConf, open_dict
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks.timer import Timer
@@ -21,17 +22,51 @@
from nemo.collections.nlp.models.language_modeling.megatron_finetune_model import MegatronT5FinetuneModel
from nemo.collections.nlp.models.language_modeling.megatron_glue_model import MegatronT5GLUEModel
from nemo.collections.nlp.models.language_modeling.megatron_t0_model import MegatronT0Model
from nemo.collections.nlp.parts.nlp_overrides import (
GradScaler,
MegatronHalfPrecisionPlugin,
NLPDDPStrategy,
NLPSaveRestoreConnector,
)
from nemo.collections.nlp.parts.nlp_overrides import GradScaler, MegatronHalfPrecisionPlugin, NLPDDPStrategy
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.exp_manager import StatelessTimer, exp_manager


def _modify_config(t5_cfg, cfg, add_cfg_to_tree=False):
"""
This function modifies the original t5 pre-training config (t5_cfg) with attributes from the finetuning config (cfg).
The `add_cfg_to_tree` arg adds `cfg` to the top of the yaml tree which is needed for all `hparams.yaml` files when passed as an arg to `load_from_checkpoint()`.
"""
OmegaConf.set_struct(t5_cfg, True)
with open_dict(t5_cfg):
t5_cfg.precision = cfg.trainer.precision
# Overwrite data configs
if cfg.model.data.validation_ds.get('src_file_name', None) is not None:
logging.info(
'Found validation_ds.src_file_name in the config file. Overriding the finetuned model config file with the values from the new config file.'
)
t5_cfg.data.validation_ds.src_file_name = cfg.model.data.validation_ds.src_file_name
if cfg.model.data.validation_ds.get('tgt_file_name', None) is not None:
logging.info(
'Found validation_ds.tgt_file_name in the config file. Overriding the finetuned model config file with the values from the new config file.'
)
t5_cfg.data.validation_ds.tgt_file_name = cfg.model.data.validation_ds.tgt_file_name

if "write_predictions_to_file" in cfg.model.data.validation_ds:
t5_cfg.data.validation_ds.write_predictions_to_file = (
cfg.model.data.validation_ds.write_predictions_to_file
)
if "output_file_path_prefix" in cfg.model.data.validation_ds:
t5_cfg.data.validation_ds.output_file_path_prefix = cfg.model.data.validation_ds.output_file_path_prefix

t5_cfg.data.validation_ds.micro_batch_size = cfg.model.data.validation_ds.micro_batch_size
t5_cfg.data.validation_ds.global_batch_size = cfg.model.data.validation_ds.global_batch_size

# This is needed when modifying a hparam file directly to load `.ckpt` files.
# This is not needed to modify the cfg in `.nemo` files.
if add_cfg_to_tree:
OmegaConf.resolve(t5_cfg)
t5_cfg.cfg = t5_cfg

return t5_cfg
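
The `add_cfg_to_tree` flag appears to exist because `load_from_checkpoint()` reads hyperparameters from an `hparams.yaml`-style file where the model config sits under a top-level `cfg` key; assigning the resolved config to its own `cfg` attribute reproduces that layout. A rough sketch of how the `.ckpt` loading path can use this, loosely following the `load_from_checkpoint_dir` helper imported from `megatron_t5_seq2seq_finetune.py` (model-parallel rank handling is omitted and the snippet is illustrative, not the exact merged implementation):

```python
import os
import tempfile

from omegaconf import OmegaConf


def load_from_checkpoint_dir_sketch(cls, cfg, trainer, modify_confg_fn):
    # Build the full path to the .ckpt file from the new pretrained_checkpoint config block.
    checkpoint_path = os.path.join(
        cfg.model.pretrained_checkpoint.checkpoint_dir, cfg.model.pretrained_checkpoint.checkpoint_name
    )
    # Load the saved hparams and apply the same overrides used for .nemo restores,
    # but with add_cfg_to_tree=True so the result matches the hparams.yaml layout.
    hparams = OmegaConf.load(cfg.model.pretrained_checkpoint.hparams_file)
    t5_cfg = modify_confg_fn(hparams.cfg, cfg, add_cfg_to_tree=True)
    # Write the modified config to a temporary hparams file and hand it to Lightning.
    with tempfile.NamedTemporaryFile(suffix='.yaml') as f:
        OmegaConf.save(config=t5_cfg, f=f.name)
        model = cls.load_from_checkpoint(checkpoint_path=checkpoint_path, trainer=trainer, hparams_file=f.name)
    return model
```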


@hydra_runner(config_path="conf", config_name="megatron_t5_config_finetune_glue_eval")
def main(cfg) -> None:
logging.info("\n\n************** Experiment configuration ***********")
@@ -69,59 +104,33 @@ def main(cfg) -> None:
if isinstance(callback, Timer):
trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time,)

t5_cfg = MegatronT5GLUEModel.restore_from(
restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True
)

# Override the T5 configuration with the one from the config file.
# NOTE: Only data can be overriden here since this the file being restored here should already correspond to a GLUE/XNLI finetuned model.
OmegaConf.set_struct(t5_cfg, True)
with open_dict(t5_cfg):
t5_cfg.precision = cfg.trainer.precision
# Overwrite data configs
if cfg.model.data.validation_ds.get('src_file_name', None) is not None:
logging.info(
'Found validation_ds.src_file_name in the config file. Overriding the finetuned model config file with the values from the new config file.'
if hasattr(cfg.model.data.validation_ds, 'task_name'):
if cfg.model.restore_from_path:
t5_cfg = MegatronT5GLUEModel.restore_from(
restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True
)
t5_cfg.data.validation_ds.src_file_name = cfg.model.data.validation_ds.src_file_name
if cfg.model.data.validation_ds.get('tgt_file_name', None) is not None:
logging.info(
'Found validation_ds.tgt_file_name in the config file. Overriding the finetuned model config file with the values from the new config file.'
)
t5_cfg.data.validation_ds.tgt_file_name = cfg.model.data.validation_ds.tgt_file_name

if "write_predictions_to_file" in cfg.model.data.validation_ds:
t5_cfg.data.validation_ds.write_predictions_to_file = (
cfg.model.data.validation_ds.write_predictions_to_file
)
if "output_file_path_prefix" in cfg.model.data.validation_ds:
t5_cfg.data.validation_ds.output_file_path_prefix = cfg.model.data.validation_ds.output_file_path_prefix
t5_cfg.data.validation_ds.src_file_name = cfg.model.data.validation_ds.src_file_name

t5_cfg.data.validation_ds.micro_batch_size = cfg.model.data.validation_ds.micro_batch_size
t5_cfg.data.validation_ds.global_batch_size = cfg.model.data.validation_ds.global_batch_size

if hasattr(cfg.model.data.validation_ds, 'task_name'):
model = MegatronT5GLUEModel.restore_from(
restore_path=cfg.model.restore_from_path,
trainer=trainer,
override_config_path=t5_cfg,
save_restore_connector=NLPSaveRestoreConnector(),
)
elif hasattr(cfg.model.data.validation_ds, 'file_names'):
model = MegatronT0Model.restore_from(
restore_path=cfg.model.restore_from_path,
trainer=trainer,
override_config_path=t5_cfg,
save_restore_connector=NLPSaveRestoreConnector(),
model = load_from_nemo(MegatronT5GLUEModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config)
else:
validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint)
model = load_from_checkpoint_dir(MegatronT5GLUEModel, cfg, trainer, modify_confg_fn=_modify_config)
elif hasattr(cfg.model.data.validation_ds, 'file_names'):
if cfg.model.restore_from_path:
t5_cfg = MegatronT0Model.restore_from(
restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True
)
model = load_from_nemo(MegatronT0Model, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config)
else:
model = MegatronT5FinetuneModel.restore_from(
restore_path=cfg.model.restore_from_path,
trainer=trainer,
override_config_path=t5_cfg,
save_restore_connector=NLPSaveRestoreConnector(),
validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint)
model = load_from_checkpoint_dir(MegatronT0Model, cfg, trainer, modify_confg_fn=_modify_config)
else:
if cfg.model.restore_from_path:
t5_cfg = MegatronT5FinetuneModel.restore_from(
restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True
)
model = load_from_nemo(MegatronT5FinetuneModel, cfg, trainer, modify_confg_fn=_modify_config)
else:
validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint)
model = load_from_checkpoint_dir(MegatronT5FinetuneModel, cfg, trainer, modify_confg_fn=_modify_config)

model.freeze()
trainer.validate(model)
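
For the `.nemo` path, each branch above first pulls the stored config with `restore_from(..., return_config=True)`, applies `_modify_config`, and then delegates to the shared `load_from_nemo` helper imported from `megatron_t5_seq2seq_finetune.py`. A plausible sketch of that helper, assuming it wraps the `restore_from` call with an override config and `NLPSaveRestoreConnector` as the pre-refactor inline code did (not guaranteed to match the merged code line for line):

```python
from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector


def load_from_nemo_sketch(cls, cfg, trainer, t5_cfg, modify_confg_fn):
    # Apply the eval-time data overrides to the config stored inside the .nemo file.
    t5_cfg = modify_confg_fn(t5_cfg, cfg, add_cfg_to_tree=False)
    # Restore the model weights from the .nemo archive using the overridden config.
    model = cls.restore_from(
        restore_path=cfg.model.restore_from_path,
        trainer=trainer,
        override_config_path=t5_cfg,
        save_restore_connector=NLPSaveRestoreConnector(),
    )
    return model
```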