diff --git a/examples/nlp/language_modeling/conf/megatron_model_base_config.yaml b/examples/nlp/language_modeling/conf/megatron_model_base_config.yaml
index f68b9ecf87b2..1602cda23731 100644
--- a/examples/nlp/language_modeling/conf/megatron_model_base_config.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_model_base_config.yaml
@@ -31,5 +31,6 @@ onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
 fp32_residual_connection: False # Use FP32 for residual connections.
 activations_checkpoint_method: null # 'uniform', 'block'
 activations_checkpoint_num_layers: 1
+activations_checkpoint_granularity: null
 megatron_legacy: False # Whether to use the legacy Megatron model. This affects the way q,k,v is partitioned from the mixed q,k,v layer in ParallelAttention. This needs to be True for models converted from HF.
 normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to se this to True.
diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_decoders.py b/nemo/collections/nlp/modules/common/megatron/megatron_decoders.py
index 63d14cfe84d1..901d55ef4511 100644
--- a/nemo/collections/nlp/modules/common/megatron/megatron_decoders.py
+++ b/nemo/collections/nlp/modules/common/megatron/megatron_decoders.py
@@ -119,6 +119,7 @@ def get_decoder_model(
             fp32_residual_connection=fp32_residual_connection,
             activations_checkpoint_method=activations_checkpoint_method,
             activations_checkpoint_num_layers=activations_checkpoint_num_layers,
+            activations_checkpoint_granularity=activations_checkpoint_granularity,
             layernorm_epsilon=layernorm_epsilon,
             bias_activation_fusion=bias_activation_fusion,
             bias_dropout_add_fusion=bias_dropout_add_fusion,
diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_encoders.py b/nemo/collections/nlp/modules/common/megatron/megatron_encoders.py
index 1917979fc66a..6b6a44c036e9 100644
--- a/nemo/collections/nlp/modules/common/megatron/megatron_encoders.py
+++ b/nemo/collections/nlp/modules/common/megatron/megatron_encoders.py
@@ -121,6 +121,7 @@ def get_encoder_model(
             fp32_residual_connection=fp32_residual_connection,
             activations_checkpoint_method=activations_checkpoint_method,
             activations_checkpoint_num_layers=activations_checkpoint_num_layers,
+            activations_checkpoint_granularity=activations_checkpoint_granularity,
             layernorm_epsilon=layernorm_epsilon,
             bias_activation_fusion=bias_activation_fusion,
             bias_dropout_add_fusion=bias_dropout_add_fusion,
@@ -198,6 +199,7 @@ def get_encoder_model(
             fp32_residual_connection=fp32_residual_connection,
             activations_checkpoint_method=activations_checkpoint_method,
             activations_checkpoint_num_layers=activations_checkpoint_num_layers,
+            activations_checkpoint_granularity=activations_checkpoint_granularity,
             layernorm_epsilon=layernorm_epsilon,
             bias_activation_fusion=bias_activation_fusion,
             bias_dropout_add_fusion=bias_dropout_add_fusion,
diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_transformer_decoder.py b/nemo/collections/nlp/modules/common/megatron/megatron_transformer_decoder.py
index 5104855c860d..530eeffaf466 100644
--- a/nemo/collections/nlp/modules/common/megatron/megatron_transformer_decoder.py
+++ b/nemo/collections/nlp/modules/common/megatron/megatron_transformer_decoder.py
@@ -65,6 +65,7 @@ def __init__(
         fp32_residual_connection=False,
         activations_checkpoint_method=None,
         activations_checkpoint_num_layers=1,
+        activations_checkpoint_granularity=None,
         layernorm_epsilon=1e-5,
         bias_activation_fusion=True,
         bias_dropout_add_fusion=True,
@@ -119,6 +120,7 @@ def __init__(
             fp32_residual_connection=fp32_residual_connection,
             activations_checkpoint_method=activations_checkpoint_method,
             activations_checkpoint_num_layers=activations_checkpoint_num_layers,
+            activations_checkpoint_granularity=activations_checkpoint_granularity,
             layernorm_epsilon=layernorm_epsilon,
             hidden_dropout=hidden_dropout,
             attention_dropout=attention_dropout,
diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_transformer_encoder.py b/nemo/collections/nlp/modules/common/megatron/megatron_transformer_encoder.py
index b48d89cd9644..4b1799680d54 100644
--- a/nemo/collections/nlp/modules/common/megatron/megatron_transformer_encoder.py
+++ b/nemo/collections/nlp/modules/common/megatron/megatron_transformer_encoder.py
@@ -62,6 +62,7 @@ def __init__(
         fp32_residual_connection=False,
         activations_checkpoint_method=None,
         activations_checkpoint_num_layers=1,
+        activations_checkpoint_granularity=None,
         layernorm_epsilon=1e-5,
         bias_activation_fusion=True,
         bias_dropout_add_fusion=True,
@@ -117,6 +118,7 @@ def __init__(
             fp32_residual_connection=fp32_residual_connection,
             activations_checkpoint_method=activations_checkpoint_method,
             activations_checkpoint_num_layers=activations_checkpoint_num_layers,
+            activations_checkpoint_granularity=activations_checkpoint_granularity,
             layernorm_epsilon=layernorm_epsilon,
             hidden_dropout=hidden_dropout,
             attention_dropout=attention_dropout,
diff --git a/nemo/collections/nlp/modules/common/megatron/token_level_encoder_decoder.py b/nemo/collections/nlp/modules/common/megatron/token_level_encoder_decoder.py
index 573cbab7fc4c..02d9ef718f6e 100644
--- a/nemo/collections/nlp/modules/common/megatron/token_level_encoder_decoder.py
+++ b/nemo/collections/nlp/modules/common/megatron/token_level_encoder_decoder.py
@@ -179,6 +179,7 @@ def __init__(
                 fp32_residual_connection=encoder_cfg.get('fp32_residual_connection', False),
                 activations_checkpoint_method=encoder_cfg.get('activations_checkpoint_method', None),
                 activations_checkpoint_num_layers=encoder_cfg.get('activations_checkpoint_num_layers', 1),
+                activations_checkpoint_granularity=encoder_cfg.get('activations_checkpoint_granularity', None),
                 layernorm_epsilon=encoder_cfg.get('layernorm_epsilon', 1e-5),
                 bias_activation_fusion=encoder_cfg.get('bias_activation_fusion', True),
                 bias_dropout_add_fusion=encoder_cfg.get('bias_dropout_add_fusion', True),
@@ -279,11 +280,12 @@ def __init__(
                 use_cpu_initialization=use_cpu_initialization,
                 hidden_dropout=decoder_cfg.get('hidden_dropout', 0.1),
                 attention_dropout=decoder_cfg.get('attention_dropout', 0.1),
-                ffn_dropout=encoder_cfg.get('ffn_dropout', 0.0),
+                ffn_dropout=decoder_cfg.get('ffn_dropout', 0.0),
                 precision=precision,
                 fp32_residual_connection=decoder_cfg.get('fp32_residual_connection', False),
                 activations_checkpoint_method=decoder_cfg.get('activations_checkpoint_method', None),
                 activations_checkpoint_num_layers=decoder_cfg.get('activations_checkpoint_num_layers', 1),
+                activations_checkpoint_granularity=decoder_cfg.get('activations_checkpoint_granularity', None),
                 layernorm_epsilon=decoder_cfg.get('layernorm_epsilon', 1e-5),
                 bias_activation_fusion=decoder_cfg.get('bias_activation_fusion', True),
                 bias_dropout_add_fusion=decoder_cfg.get('bias_dropout_add_fusion', True),
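Note: the new activations_checkpoint_granularity key defaults to null, so existing configs keep their previous behaviour. As a minimal sketch (not part of this diff), it would sit alongside the existing activation-checkpointing options roughly as shown below; 'selective' is an assumed example value here, since the set of accepted values is defined by the underlying ParallelTransformer implementation rather than by this change.

# Hypothetical excerpt of a model config (not from this diff).
# 'selective' is an assumed example value; null keeps the old behaviour.
activations_checkpoint_method: null # 'uniform', 'block'
activations_checkpoint_num_layers: 1
activations_checkpoint_granularity: selective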