From 67143647c0916e697c8490ef82813573474a8583 Mon Sep 17 00:00:00 2001
From: Reza Yazdani
Date: Wed, 24 Feb 2021 06:08:59 +0000
Subject: [PATCH 1/2] fixing the module-inject api

---
 deepspeed/module_inject/__init__.py       | 1 +
 deepspeed/module_inject/replace_module.py | 7 ++-----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/deepspeed/module_inject/__init__.py b/deepspeed/module_inject/__init__.py
index e69de29bb2d1..c029ca8b7296 100755
--- a/deepspeed/module_inject/__init__.py
+++ b/deepspeed/module_inject/__init__.py
@@ -0,0 +1 @@
+from .replace_module import replace_transformer_layer
diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py
index 5274d3c77f84..3640a53da80f 100755
--- a/deepspeed/module_inject/replace_module.py
+++ b/deepspeed/module_inject/replace_module.py
@@ -7,9 +7,8 @@ def replace_transformer_layer(orig_layer_impl,
                               model,
                               micro_batch_size,
                               bert_config,
-                              seed,
-                              max_seq_length,
-                              preln=False,
+                              seed=-1,
+                              preln=True,
                               fp16=True,
                               huggingface=False,
                               local_rank=-1):
@@ -21,7 +20,6 @@ def replace_transformer_layer(orig_layer_impl,
         micro_batch_size (int): micro batch size per gpu used during training/eval
         bert_config (dict): model config containing hidden size, attention heads, etc.
         seed (int): random seed value
-        max_seq_length (int): max sequence length for training
         preln (bool): does the original layer implementation do pre or post layer norm?
         fp16 (bool): fp16 or fp32
         huggingface (bool): huggingface implementation is unique (supports both encoder/decoder modes)
@@ -32,7 +30,6 @@ def replace_transformer_layer(orig_layer_impl,
     def replace_fn(child):
         transformer_config = deepspeed.DeepSpeedTransformerConfig(
             batch_size=micro_batch_size,
-            max_seq_length=max_seq_length,
             hidden_size=bert_config.hidden_size,
             heads=bert_config.num_attention_heads,
             attn_dropout_ratio=bert_config.attention_probs_dropout_prob,

From 9c1d94c7175f8731827afbe3150241132dc5250d Mon Sep 17 00:00:00 2001
From: Reza Yazdani
Date: Wed, 24 Feb 2021 06:13:37 +0000
Subject: [PATCH 2/2] add training mode flag for injection

---
 deepspeed/module_inject/replace_module.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py
index 3640a53da80f..6b25c96a6d14 100755
--- a/deepspeed/module_inject/replace_module.py
+++ b/deepspeed/module_inject/replace_module.py
@@ -10,6 +10,7 @@ def replace_transformer_layer(orig_layer_impl,
                               seed=-1,
                               preln=True,
                               fp16=True,
+                              training=True,
                               huggingface=False,
                               local_rank=-1):
     """ Replace bert-style transformer layers with DeepSpeed's transformer layer
@@ -22,6 +23,7 @@ def replace_transformer_layer(orig_layer_impl,
         seed (int): random seed value
         preln (bool): does the original layer implementation do pre or post layer norm?
         fp16 (bool): fp16 or fp32
+        training (bool): select between training (True) or inference (False) mode
         huggingface (bool): huggingface implementation is unique (supports both encoder/decoder modes)

     Returns:
@@ -40,7 +42,8 @@ def replace_fn(child):
             fp16=fp16,
             pre_layer_norm=preln,
             huggingface=huggingface,
-            local_rank=local_rank)
+            local_rank=local_rank,
+            training=training)
         new_module = deepspeed.DeepSpeedTransformerLayer(transformer_config)

         # copy relevant state from child -> new module
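
For reference, a minimal sketch of how the updated entry point might be called once both patches apply. The HuggingFace model, the config values, and the BertLayer import path are assumptions for illustration, not taken from this PR:

    import deepspeed
    from deepspeed.module_inject import replace_transformer_layer
    from transformers import BertConfig, BertForSequenceClassification
    # Assumed transformers 4.x layout; on 3.x the path is transformers.modeling_bert.
    from transformers.models.bert.modeling_bert import BertLayer

    # A small HuggingFace BERT model to inject into (illustrative sizes).
    config = BertConfig(hidden_size=768, num_attention_heads=12, num_hidden_layers=12)
    model = BertForSequenceClassification(config)

    # Swap every BertLayer for DeepSpeed's fused transformer layer.
    # Per these patches, max_seq_length is no longer passed, seed and preln
    # now have defaults, and the new training flag selects between
    # training (True) and inference (False) mode.
    model = replace_transformer_layer(orig_layer_impl=BertLayer,
                                      model=model,
                                      micro_batch_size=8,
                                      bert_config=config,
                                      preln=False,      # HF BERT uses post-LayerNorm
                                      fp16=True,
                                      training=False,   # inference mode
                                      huggingface=True)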