From d1452c9d6c48f7586b1d1b734f89751c1585d25e Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Thu, 6 Jan 2022 11:26:52 -0800 Subject: [PATCH 1/3] Improve reproducibility (#154) --- HelloDeepSpeed/README.md | 2 +- HelloDeepSpeed/train_bert.py | 4 ++++ HelloDeepSpeed/train_bert_ds.py | 4 ++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/HelloDeepSpeed/README.md b/HelloDeepSpeed/README.md index 9bc7adac0..bbc8761de 100644 --- a/HelloDeepSpeed/README.md +++ b/HelloDeepSpeed/README.md @@ -31,7 +31,7 @@ One of the most important parts of training ML models is for the experiments to * Save all the hyperparameters associated with the experiment (be it taken from a config or parsed from the command line) -* Seed your random generators +* Seed your random generators. Some useful tips can be found [here](https://pytorch.org/docs/stable/notes/randomness.html?highlight=reproducibility). * Specify all the packages and their versions. This can be a `requirements.txt` file, a conda `env.yaml` file or a `pyproject.toml` file. If you want complete reproducibility, you can also include a `Dockerfile` to specify the environment to run the experiment in. diff --git a/HelloDeepSpeed/train_bert.py b/HelloDeepSpeed/train_bert.py index 14d61f00c..45e536298 100644 --- a/HelloDeepSpeed/train_bert.py +++ b/HelloDeepSpeed/train_bert.py @@ -6,6 +6,7 @@ from functools import partial from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, TypeVar, Union +import random import datasets import fire import loguru @@ -788,4 +789,7 @@ def train( if __name__ == "__main__": + torch.manual_seed(42) + np.random.seed(0) + random.seed(0) fire.Fire(train) diff --git a/HelloDeepSpeed/train_bert_ds.py b/HelloDeepSpeed/train_bert_ds.py index 421d03daf..dfb5f272a 100644 --- a/HelloDeepSpeed/train_bert_ds.py +++ b/HelloDeepSpeed/train_bert_ds.py @@ -10,6 +10,7 @@ from functools import partial from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, TypeVar, Union +import random import datasets import fire import loguru @@ -802,4 +803,7 @@ def train( if __name__ == "__main__": + torch.manual_seed(42) + np.random.seed(0) + random.seed(0) fire.Fire(train) From ec3858410621ec1d8964635b5735ca80fec1bc1d Mon Sep 17 00:00:00 2001 From: Zhewei Yao Date: Thu, 20 Jan 2022 09:36:01 -0800 Subject: [PATCH 2/3] fix cifar10 moe example with new moe api (#156) --- cifar/cifar10_deepspeed.py | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/cifar/cifar10_deepspeed.py b/cifar/cifar10_deepspeed.py index d51d27b80..ed509b8e1 100755 --- a/cifar/cifar10_deepspeed.py +++ b/cifar/cifar10_deepspeed.py @@ -211,26 +211,11 @@ def forward(self, x): def create_moe_param_groups(model): - from deepspeed.moe.utils import is_moe_param - - params_with_weight_decay = {'params': [], 'name': 'weight_decay_params'} - moe_params_with_weight_decay = { - 'params': [], - 'moe': True, - 'name': 'weight_decay_moe_params' - } - - for module_ in model.modules(): - moe_params_with_weight_decay['params'].extend([ - p for n, p in list(module_._parameters.items()) - if p is not None and is_moe_param(p) - ]) - params_with_weight_decay['params'].extend([ - p for n, p in list(module_._parameters.items()) - if p is not None and not is_moe_param(p) - ]) - - return params_with_weight_decay, moe_params_with_weight_decay + from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer + + parameters = {'params': model.parameters(), 'name': 'parameters'} + + return split_params_into_different_moe_groups_for_optimizer(parameters) parameters = filter(lambda p: p.requires_grad, net.parameters()) From 41f4fdfcb38822c64d74e43e15a068c6a5bb56c8 Mon Sep 17 00:00:00 2001 From: Reza Yazdani <44502768+RezaYazdaniAminabadi@users.noreply.github.com> Date: Thu, 20 Jan 2022 09:36:45 -0800 Subject: [PATCH 3/3] fix the inference tests based on the new changes on DeepSpeed (#157) Co-authored-by: Jeff Rasley --- inference/huggingface/gpt-neo.py | 3 ++- inference/huggingface/run_generation.py | 3 ++- inference/huggingface/test-gpt.sh | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/inference/huggingface/gpt-neo.py b/inference/huggingface/gpt-neo.py index 25499c595..4c4f5707e 100644 --- a/inference/huggingface/gpt-neo.py +++ b/inference/huggingface/gpt-neo.py @@ -22,6 +22,7 @@ generator.model = deepspeed.init_inference(generator.model, mp_size=world_size, dtype=torch.float, - replace_method='auto') + replace_method='auto', + replace_with_kernel_inject=True) string = generator("DeepSpeed is", do_sample=True, min_length=50) print(string) diff --git a/inference/huggingface/run_generation.py b/inference/huggingface/run_generation.py index 0bef0f499..a609bd1c5 100644 --- a/inference/huggingface/run_generation.py +++ b/inference/huggingface/run_generation.py @@ -261,7 +261,8 @@ def main(): model = deepspeed.init_inference(model, mp_size=1, dtype=(torch.half if args.fp16 else torch.float), - injection_policy=injection_policy) + injection_policy=injection_policy, + replace_with_kernel_inject=True) model = model.module args.length = adjust_length_to_model(args.length, max_sequence_length=model.config.max_position_embeddings) diff --git a/inference/huggingface/test-gpt.sh b/inference/huggingface/test-gpt.sh index fe2bcd710..445f2dc2f 100644 --- a/inference/huggingface/test-gpt.sh +++ b/inference/huggingface/test-gpt.sh @@ -1,4 +1,4 @@ -deepspeed --num_gpus 1 run_generation.py \ +deepspeed --num_nodes 1 --num_gpus 1 run_generation.py \ --model_type=gpt2 \ --model_name_or_path=gpt2-xl \ --sample_input single_query.txt \