From 2f3b219bf6228a9048dee8c8692af3e5eccbaaad Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Wed, 26 Feb 2025 10:47:59 -0800 Subject: [PATCH 1/2] Improve inference tutorial docs Signed-off-by: Logan Adams --- docs/_tutorials/inference-tutorial.md | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/docs/_tutorials/inference-tutorial.md b/docs/_tutorials/inference-tutorial.md index 1d5899204f53..d52d17bbb8e0 100644 --- a/docs/_tutorials/inference-tutorial.md +++ b/docs/_tutorials/inference-tutorial.md @@ -21,18 +21,22 @@ if args.pre_load_checkpoint: model = model_class.from_pretrained(args.model_name_or_path) else: model = model_class() + +# create the tokenizer +tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) ... import deepspeed # Initialize the DeepSpeed-Inference engine ds_engine = deepspeed.init_inference(model, - tensor_parallel={"tp_size": 2}, - dtype=torch.half, - checkpoint=None if args.pre_load_checkpoint else args.checkpoint_json, - replace_with_kernel_inject=True) + tensor_parallel={"tp_size": 2}, + dtype=torch.half, + checkpoint=None if args.pre_load_checkpoint else args.checkpoint_json, + replace_with_kernel_inject=True) model = ds_engine.module -output = model('Input String') +pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) +output = pipe('Input String') ``` To run inference with only model-parallelism for the models that we don't support kernels, you can pass an injection policy that shows the two specific linear layers on a Transformer Encoder/Decoder layer: 1) the attention output GeMM and 2) layer output GeMM. We need these part of the layer to add the required all-reduce communication between GPUs to merge the partial results across model-parallel ranks. 
Below, we bring an example that shows how you can use deepspeed-inference with a T5 model: From ff1391045a677525d6bd862657fe169848ca9b37 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Wed, 26 Feb 2025 11:58:13 -0800 Subject: [PATCH 2/2] PR feedback, generalize the example more Signed-off-by: Logan Adams --- docs/_tutorials/inference-tutorial.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/_tutorials/inference-tutorial.md b/docs/_tutorials/inference-tutorial.md index d52d17bbb8e0..ddf287f24b96 100644 --- a/docs/_tutorials/inference-tutorial.md +++ b/docs/_tutorials/inference-tutorial.md @@ -30,7 +30,7 @@ import deepspeed # Initialize the DeepSpeed-Inference engine ds_engine = deepspeed.init_inference(model, - tensor_parallel={"tp_size": 2}, + tensor_parallel={"tp_size": world_size}, dtype=torch.half, checkpoint=None if args.pre_load_checkpoint else args.checkpoint_json, replace_with_kernel_inject=True)