diff --git a/HelloDeepSpeed/README.md b/HelloDeepSpeed/README.md
index 9bc7adac0..bbc8761de 100644
--- a/HelloDeepSpeed/README.md
+++ b/HelloDeepSpeed/README.md
@@ -31,7 +31,7 @@ One of the most important parts of training ML models is for the experiments to
 * Save all the hyperparameters associated with the experiment (be it taken
   from a config or parsed from the command line)
-* Seed your random generators
+* Seed your random generators. Some useful tips can be found [here](https://pytorch.org/docs/stable/notes/randomness.html?highlight=reproducibility).
 * Specify all the packages and their versions. This can be a
   `requirements.txt` file, a conda `env.yaml` file or a `pyproject.toml`
   file. If you want complete reproducibility, you can also include a
   `Dockerfile` to specify the environment to run the experiment in.
diff --git a/HelloDeepSpeed/train_bert.py b/HelloDeepSpeed/train_bert.py
index 14d61f00c..45e536298 100644
--- a/HelloDeepSpeed/train_bert.py
+++ b/HelloDeepSpeed/train_bert.py
@@ -6,6 +6,7 @@
 from functools import partial
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, TypeVar, Union

+import random
 import datasets
 import fire
 import loguru
@@ -788,4 +789,7 @@ def train(


 if __name__ == "__main__":
+    torch.manual_seed(42)
+    np.random.seed(0)
+    random.seed(0)
     fire.Fire(train)
diff --git a/HelloDeepSpeed/train_bert_ds.py b/HelloDeepSpeed/train_bert_ds.py
index 421d03daf..dfb5f272a 100644
--- a/HelloDeepSpeed/train_bert_ds.py
+++ b/HelloDeepSpeed/train_bert_ds.py
@@ -10,6 +10,7 @@
 from functools import partial
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, TypeVar, Union

+import random
 import datasets
 import fire
 import loguru
@@ -802,4 +803,7 @@ def train(


 if __name__ == "__main__":
+    torch.manual_seed(42)
+    np.random.seed(0)
+    random.seed(0)
     fire.Fire(train)
diff --git a/cifar/cifar10_deepspeed.py b/cifar/cifar10_deepspeed.py
index d51d27b80..ed509b8e1 100755
--- a/cifar/cifar10_deepspeed.py
+++ b/cifar/cifar10_deepspeed.py
@@ -211,26 +211,11 @@ def forward(self, x):


 def create_moe_param_groups(model):
-    from deepspeed.moe.utils import is_moe_param
-
-    params_with_weight_decay = {'params': [], 'name': 'weight_decay_params'}
-    moe_params_with_weight_decay = {
-        'params': [],
-        'moe': True,
-        'name': 'weight_decay_moe_params'
-    }
-
-    for module_ in model.modules():
-        moe_params_with_weight_decay['params'].extend([
-            p for n, p in list(module_._parameters.items())
-            if p is not None and is_moe_param(p)
-        ])
-        params_with_weight_decay['params'].extend([
-            p for n, p in list(module_._parameters.items())
-            if p is not None and not is_moe_param(p)
-        ])
-
-    return params_with_weight_decay, moe_params_with_weight_decay
+    from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer
+
+    parameters = {'params': model.parameters(), 'name': 'parameters'}
+
+    return split_params_into_different_moe_groups_for_optimizer(parameters)


 parameters = filter(lambda p: p.requires_grad, net.parameters())
diff --git a/inference/huggingface/gpt-neo.py b/inference/huggingface/gpt-neo.py
index 25499c595..4c4f5707e 100644
--- a/inference/huggingface/gpt-neo.py
+++ b/inference/huggingface/gpt-neo.py
@@ -22,6 +22,7 @@
 generator.model = deepspeed.init_inference(generator.model,
                                            mp_size=world_size,
                                            dtype=torch.float,
-                                           replace_method='auto')
+                                           replace_method='auto',
+                                           replace_with_kernel_inject=True)
 string = generator("DeepSpeed is", do_sample=True, min_length=50)
 print(string)
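A note on the inference change above: `replace_with_kernel_inject=True` asks `deepspeed.init_inference` to swap DeepSpeed's fused inference kernels into the supported Transformer blocks, rather than only re-partitioning them across GPUs. A minimal sketch of the patched gpt-neo.py call path follows; the `EleutherAI/gpt-neo-2.7B` checkpoint and `mp_size=1` are illustrative assumptions, not taken from the patch.

    # Sketch only: mirrors the patched gpt-neo.py call with kernel injection on.
    import torch
    import deepspeed
    from transformers import pipeline

    # Assumed checkpoint; the example script may load a different one.
    generator = pipeline("text-generation", model="EleutherAI/gpt-neo-2.7B")
    generator.model = deepspeed.init_inference(
        generator.model,
        mp_size=1,                        # assumed single-GPU world size
        dtype=torch.float,
        replace_method="auto",            # auto-detect injectable modules
        replace_with_kernel_inject=True,  # use fused kernels, not just sharding
    )
    print(generator("DeepSpeed is", do_sample=True, min_length=50))

As in test-gpt.sh further down, such a script is typically started through the DeepSpeed launcher (e.g. `deepspeed --num_gpus 1 gpt-neo.py`) so the distributed environment is in place before `init_inference` runs.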
diff --git a/inference/huggingface/run_generation.py b/inference/huggingface/run_generation.py
index 0bef0f499..a609bd1c5 100644
--- a/inference/huggingface/run_generation.py
+++ b/inference/huggingface/run_generation.py
@@ -261,7 +261,8 @@ def main():
     model = deepspeed.init_inference(model,
                                      mp_size=1,
                                      dtype=(torch.half if args.fp16 else torch.float),
-                                     injection_policy=injection_policy)
+                                     injection_policy=injection_policy,
+                                     replace_with_kernel_inject=True)
     model = model.module
     args.length = adjust_length_to_model(args.length,
                                          max_sequence_length=model.config.max_position_embeddings)
diff --git a/inference/huggingface/test-gpt.sh b/inference/huggingface/test-gpt.sh
index fe2bcd710..445f2dc2f 100644
--- a/inference/huggingface/test-gpt.sh
+++ b/inference/huggingface/test-gpt.sh
@@ -1,4 +1,4 @@
-deepspeed --num_gpus 1 run_generation.py \
+deepspeed --num_nodes 1 --num_gpus 1 run_generation.py \
     --model_type=gpt2 \
     --model_name_or_path=gpt2-xl \
     --sample_input single_query.txt \
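The `create_moe_param_groups` rewrite in cifar10_deepspeed.py above delegates the dense/expert split to DeepSpeed's own helper, which takes one seed parameter group and returns a list of groups with the MoE (expert) parameters separated out. A sketch of how those groups can be consumed follows; the toy model and Adam settings are assumptions for illustration, and the CIFAR example itself passes the groups to `deepspeed.initialize` as `model_parameters` instead.

    # Sketch only: shows the rewritten helper feeding an optimizer.
    import torch
    import torch.nn as nn
    from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer

    def create_moe_param_groups(model):
        # One seed group holding every parameter; the helper splits expert
        # (MoE) parameters into their own group(s) so DeepSpeed can treat
        # them separately from the dense parameters.
        parameters = {'params': model.parameters(), 'name': 'parameters'}
        return split_params_into_different_moe_groups_for_optimizer(parameters)

    net = nn.Sequential(nn.Linear(32, 64), nn.ReLU(), nn.Linear(64, 10))  # toy stand-in model
    param_groups = create_moe_param_groups(net)
    optimizer = torch.optim.Adam(param_groups, lr=1e-3)  # assumed optimizer and lr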