diff --git a/deepspeed/module_inject/__init__.py b/deepspeed/module_inject/__init__.py
index e69de29bb2d1..c029ca8b7296 100755
--- a/deepspeed/module_inject/__init__.py
+++ b/deepspeed/module_inject/__init__.py
@@ -0,0 +1 @@
+from .replace_module import replace_transformer_layer
diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py
index 5274d3c77f84..6b25c96a6d14 100755
--- a/deepspeed/module_inject/replace_module.py
+++ b/deepspeed/module_inject/replace_module.py
@@ -7,10 +7,10 @@ def replace_transformer_layer(orig_layer_impl,
                               model,
                               micro_batch_size,
                               bert_config,
-                              seed,
-                              max_seq_length,
-                              preln=False,
+                              seed=-1,
+                              preln=True,
                               fp16=True,
+                              training=True,
                               huggingface=False,
                               local_rank=-1):
     """ Replace bert-style transformer layers with DeepSpeed's transformer layer
@@ -21,9 +21,9 @@ def replace_transformer_layer(orig_layer_impl,
         micro_batch_size (int): micro batch size per gpu used during training/eval
         bert_config (dict): model config containing hidden size, attention heads, etc.
         seed (int): random seed value
-        max_seq_length (int): max sequence length for training
         preln (bool): does the original layer implementation do pre or post layer norm?
         fp16 (bool): fp16 or fp32
+        training (bool): select between training (True) or inference (False) mode
         huggingface (bool): huggingface implementation is unique (supports both encoder/decoder modes)
 
     Returns:
@@ -32,7 +32,6 @@ def replace_transformer_layer(orig_layer_impl,
     def replace_fn(child):
         transformer_config = deepspeed.DeepSpeedTransformerConfig(
             batch_size=micro_batch_size,
-            max_seq_length=max_seq_length,
             hidden_size=bert_config.hidden_size,
             heads=bert_config.num_attention_heads,
             attn_dropout_ratio=bert_config.attention_probs_dropout_prob,
@@ -43,7 +42,8 @@ def replace_fn(child):
             fp16=fp16,
             pre_layer_norm=preln,
             huggingface=huggingface,
-            local_rank=local_rank)
+            local_rank=local_rank,
+            training=training)
         new_module = deepspeed.DeepSpeedTransformerLayer(transformer_config)
 
         # copy relevant state from child -> new module
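
For context, here is a minimal usage sketch of the updated API (not part of the patch). The HuggingFace model construction, the example argument values, and the `transformers` import path are illustrative assumptions that may differ by library version; the DeepSpeed transformer kernels also require a CUDA device.

# Usage sketch, assuming a HuggingFace BERT model (illustrative only).
from transformers import BertConfig, BertModel
from transformers.models.bert.modeling_bert import BertLayer  # path varies by transformers version

from deepspeed.module_inject import replace_transformer_layer

config = BertConfig()
model = BertModel(config)

# After this change: max_seq_length is no longer passed, seed and preln have
# defaults, and the new `training` flag selects training vs. inference kernels.
model = replace_transformer_layer(BertLayer,
                                  model,
                                  micro_batch_size=8,
                                  bert_config=config,
                                  preln=False,      # HuggingFace BERT uses post-layer-norm
                                  fp16=True,
                                  training=False,   # inference mode
                                  huggingface=True)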