@@ -31,5 +31,6 @@ onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
fp32_residual_connection: False # Use FP32 for residual connections.
activations_checkpoint_method: null # 'uniform', 'block'
activations_checkpoint_num_layers: 1
+activations_checkpoint_granularity: null
megatron_legacy: False # Whether to use the legacy Megatron model. This affects the way q,k,v is partitioned from the mixed q,k,v layer in ParallelAttention. This needs to be True for models converted from HF.
normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to set this to True.
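Taken together, the three checkpointing keys above control how activation checkpointing is applied. Below is a minimal, assumed sketch (not part of this PR) of setting them in an OmegaConf-style config; the 'full' value for the new activations_checkpoint_granularity key is an assumption about what it accepts, so check the model code before relying on it.

# Sketch only: how the activation checkpointing keys shown above might be set together.
from omegaconf import OmegaConf

encoder_cfg = OmegaConf.create(
    {
        "activations_checkpoint_method": "block",      # 'uniform' or 'block', per the comment above
        "activations_checkpoint_num_layers": 1,
        "activations_checkpoint_granularity": "full",  # new key added by this PR; value is an assumption
    }
)

# Same access pattern the call sites in this diff use: fall back to None when the key is unset.
print(encoder_cfg.get("activations_checkpoint_granularity", None))  # -> full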

@@ -119,6 +119,7 @@ def get_decoder_model(
fp32_residual_connection=fp32_residual_connection,
activations_checkpoint_method=activations_checkpoint_method,
activations_checkpoint_num_layers=activations_checkpoint_num_layers,
+activations_checkpoint_granularity=activations_checkpoint_granularity,
layernorm_epsilon=layernorm_epsilon,
bias_activation_fusion=bias_activation_fusion,
bias_dropout_add_fusion=bias_dropout_add_fusion,

@@ -121,6 +121,7 @@ def get_encoder_model(
fp32_residual_connection=fp32_residual_connection,
activations_checkpoint_method=activations_checkpoint_method,
activations_checkpoint_num_layers=activations_checkpoint_num_layers,
+activations_checkpoint_granularity=activations_checkpoint_granularity,
layernorm_epsilon=layernorm_epsilon,
bias_activation_fusion=bias_activation_fusion,
bias_dropout_add_fusion=bias_dropout_add_fusion,
@@ -198,6 +199,7 @@ def get_encoder_model(
fp32_residual_connection=fp32_residual_connection,
activations_checkpoint_method=activations_checkpoint_method,
activations_checkpoint_num_layers=activations_checkpoint_num_layers,
+activations_checkpoint_granularity=activations_checkpoint_granularity,
layernorm_epsilon=layernorm_epsilon,
bias_activation_fusion=bias_activation_fusion,
bias_dropout_add_fusion=bias_dropout_add_fusion,

@@ -65,6 +65,7 @@ def __init__(
fp32_residual_connection=False,
activations_checkpoint_method=None,
activations_checkpoint_num_layers=1,
+activations_checkpoint_granularity=None,
layernorm_epsilon=1e-5,
bias_activation_fusion=True,
bias_dropout_add_fusion=True,
@@ -119,6 +120,7 @@ def __init__(
fp32_residual_connection=fp32_residual_connection,
activations_checkpoint_method=activations_checkpoint_method,
activations_checkpoint_num_layers=activations_checkpoint_num_layers,
+activations_checkpoint_granularity=activations_checkpoint_granularity,
layernorm_epsilon=layernorm_epsilon,
hidden_dropout=hidden_dropout,
attention_dropout=attention_dropout,

@@ -62,6 +62,7 @@ def __init__(
fp32_residual_connection=False,
activations_checkpoint_method=None,
activations_checkpoint_num_layers=1,
+activations_checkpoint_granularity=None,
layernorm_epsilon=1e-5,
bias_activation_fusion=True,
bias_dropout_add_fusion=True,
@@ -117,6 +118,7 @@ def __init__(
fp32_residual_connection=fp32_residual_connection,
activations_checkpoint_method=activations_checkpoint_method,
activations_checkpoint_num_layers=activations_checkpoint_num_layers,
+activations_checkpoint_granularity=activations_checkpoint_granularity,
layernorm_epsilon=layernorm_epsilon,
hidden_dropout=hidden_dropout,
attention_dropout=attention_dropout,

@@ -179,6 +179,7 @@ def __init__(
fp32_residual_connection=encoder_cfg.get('fp32_residual_connection', False),
activations_checkpoint_method=encoder_cfg.get('activations_checkpoint_method', None),
activations_checkpoint_num_layers=encoder_cfg.get('activations_checkpoint_num_layers', 1),
+activations_checkpoint_granularity=encoder_cfg.get('activations_checkpoint_granularity', None),
layernorm_epsilon=encoder_cfg.get('layernorm_epsilon', 1e-5),
bias_activation_fusion=encoder_cfg.get('bias_activation_fusion', True),
bias_dropout_add_fusion=encoder_cfg.get('bias_dropout_add_fusion', True),
@@ -279,11 +280,12 @@ def __init__(
use_cpu_initialization=use_cpu_initialization,
hidden_dropout=decoder_cfg.get('hidden_dropout', 0.1),
attention_dropout=decoder_cfg.get('attention_dropout', 0.1),
-ffn_dropout=encoder_cfg.get('ffn_dropout', 0.0),
+ffn_dropout=decoder_cfg.get('ffn_dropout', 0.0),
precision=precision,
fp32_residual_connection=decoder_cfg.get('fp32_residual_connection', False),
activations_checkpoint_method=decoder_cfg.get('activations_checkpoint_method', None),
activations_checkpoint_num_layers=decoder_cfg.get('activations_checkpoint_num_layers', 1),
+activations_checkpoint_granularity=decoder_cfg.get('activations_checkpoint_granularity', None),
layernorm_epsilon=decoder_cfg.get('layernorm_epsilon', 1e-5),
bias_activation_fusion=decoder_cfg.get('bias_activation_fusion', True),
bias_dropout_add_fusion=decoder_cfg.get('bias_dropout_add_fusion', True),
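A hedged reading of the pattern above: the new constructor keyword defaults to None and every call site reads it with cfg.get('activations_checkpoint_granularity', None), so configs written before this change keep working unchanged. A tiny sketch of that fallback, again assuming an OmegaConf config:

# Sketch only: an older config that predates the new key.
from omegaconf import OmegaConf

old_decoder_cfg = OmegaConf.create({"activations_checkpoint_num_layers": 1})

# The .get(..., None) pattern used at the call sites in this diff returns None for a
# missing key, matching the new constructor default, so checkpointing granularity
# simply stays disabled for older configs.
print(old_decoder_cfg.get("activations_checkpoint_granularity", None))  # -> None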