From c3f3435549829022acbc2bb12c4d349ad0e2a03c Mon Sep 17 00:00:00 2001 From: yuanheng-zhao Date: Thu, 21 Sep 2023 16:23:40 +0800 Subject: [PATCH 01/12] add Colossal-Inference serving example w/ TorchServe --- .../serving/Colossal_Inference_Handler.py | 210 ++++++++++++++++++ examples/inference/serving/README.md | 83 +++++++ examples/inference/serving/config.properties | 10 + examples/inference/serving/download_model.py | 50 +++++ examples/inference/serving/model-config.yaml | 16 ++ examples/inference/serving/sample_text.txt | 1 + 6 files changed, 370 insertions(+) create mode 100644 examples/inference/serving/Colossal_Inference_Handler.py create mode 100644 examples/inference/serving/README.md create mode 100644 examples/inference/serving/config.properties create mode 100644 examples/inference/serving/download_model.py create mode 100644 examples/inference/serving/model-config.yaml create mode 100644 examples/inference/serving/sample_text.txt diff --git a/examples/inference/serving/Colossal_Inference_Handler.py b/examples/inference/serving/Colossal_Inference_Handler.py new file mode 100644 index 000000000000..46a47d7cecda --- /dev/null +++ b/examples/inference/serving/Colossal_Inference_Handler.py @@ -0,0 +1,210 @@ +import logging +import os +import random +import socket +import zipfile +from abc import ABC + +import torch +import transformers +from transformers import AutoTokenizer, BloomForCausalLM, BloomTokenizerFast, LlamaForCausalLM +from ts.torch_handler.base_handler import BaseHandler + +import colossalai +from colossalai.inference.tensor_parallel.engine import TPInferEngine +from colossalai.shardformer import ShardConfig + +logger = logging.getLogger(__name__) +logger.info("Transformers version %s", transformers.__version__) +logger.info("ColossalAI version %s", colossalai.__version__) + + +# from colossalai.testing +# assins a random port, for demo use only +def free_port() -> int: + """Get a free port on localhost. + + Returns: + int: A free port on localhost. + """ + while True: + port = random.randint(20000, 65000) + try: + with socket.socket() as sock: + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + sock.bind(("localhost", port)) + return port + except OSError: + continue + + +class ColossalInferenceHandler(BaseHandler, ABC): + """ + Transformers handler class for testing + """ + + def __init__(self): + super(ColossalInferenceHandler, self).__init__() + self.infer_engine = None + self.max_batch_size = None + self.max_input_len = None + self.max_output_len = None + self.tokenizer = None + self.initialized = False + + def initialize(self, ctx): + """Expected behaviour: the sharded Bloom/Llama model is loaded. + + Args: + ctx (context): It is a JSON Object containing information + pertaining to the model artefacts parameters. 
+        """
+        if ctx is None or not hasattr(ctx, "model_yaml_config"):
+            logger.error("Context ctx and model-config are not appropriately passed in.")
+
+        self.manifest = ctx.manifest
+        gpu_id = ctx.system_properties.get("gpu_id", -1)
+        model_dir = ctx.system_properties.get("model_dir")
+
+        # Inference configs are collected together in model yaml config for handler use
+        inference_config = ctx.model_yaml_config["handler"]
+        logger.info(inference_config)
+        inference_config["model_type"]
+        self.tp_size = inference_config.get("tp_size", 1)
+        self.max_batch_size = inference_config.get("max_batch_size", 4)
+        self.max_input_len = inference_config.get("max_input_len", 1024)
+        self.max_output_len = inference_config.get("max_output_len", 128)
+
+        self.device = torch.device("cuda:" + str(gpu_id) if torch.cuda.is_available() and gpu_id >= 0 else "cpu")
+        logger.info(f"Device set to {self.device}")
+        logger.info(f"torch.cuda.device_count() {torch.cuda.device_count()}")
+
+        logger.info(f"Unpacking from model_dir {model_dir}")
+        model_dir_path = os.path.join(model_dir, "model")
+        with zipfile.ZipFile(model_dir + "/model.zip", "r") as zip_ref:
+            zip_ref.extractall(model_dir_path)
+        logger.info(f"Loading {inference_config['model_type']} pretrain model and tokenizer")
+        if inference_config["model_type"] == "bloom":
+            self.model = BloomForCausalLM.from_pretrained(
+                model_dir_path,
+            )
+            self.tokenizer = BloomTokenizerFast.from_pretrained(model_dir_path, return_tensors="pt")
+        elif inference_config["model_type"] == "llama":
+            self.model = LlamaForCausalLM.from_pretrained(
+                model_dir_path,
+            )
+            self.tokenizer = AutoTokenizer.from_pretrained(model_dir_path, return_tensors="pt")
+        else:
+            logger.warning(f"Model type {inference_config['model_type']} not supported yet.")
+
+        logger.info("Transformer model from path %s loaded successfully", model_dir)
+
+        # NOTE world_size, rank, host, port here are used to launch colossalai dist environment
+        # This world_size is different from the world size of TorchServe
+        world_size = int(os.getenv("WORLD_SIZE", self.tp_size))
+        assert world_size == 1, "Colossal-Inference with tensor parallel is not supported on TorchServe for now"
+        rank = int(os.getenv("RANK", gpu_id))
+        local_rank = int(os.getenv("LOCAL_RANK", gpu_id))
+        host = os.getenv("MASTER_ADDR", "localhost")
+        port = os.getenv("MASTER_PORT", free_port())  # use a random free port
+
+        logger.info(
+            f" world_size {world_size}" f" local_rank {local_rank}" f" rank {rank}" f" host {host}" f" port {port}"
+        )
+
+        torch.cuda.set_device(self.device)
+        self.model.half()
+        self.model.cuda()
+        self.model.eval()
+
+        colossalai.launch(config={}, rank=rank, world_size=world_size, host=host, port=port, backend="nccl")
+        logger.info("Initializing TPInferEngine ...")
+        shard_config = ShardConfig(enable_tensor_parallelism=True if self.tp_size > 1 else False, inference_only=True)
+        self.infer_engine = TPInferEngine(
+            self.model, shard_config, self.max_batch_size, self.max_input_len, self.max_output_len
+        )
+        logger.info("TPInferEngine initialized successfully")
+
+        self.model = self.infer_engine.model
+        self.initialized = True
+
+    def preprocess(self, requests):
+        """Basic text preprocessing, based on the user's choice of application mode.
+        Args:
+            requests (str): The Input data in the form of text is passed on to the preprocess
+            function.
+        Returns:
+            list : The preprocess function returns a list of Tensor for the size of the word tokens.
+ """ + logger.info("Pre-processing requests") + input_ids_batch = None + attention_mask_batch = None + for idx, data in enumerate(requests): + input_text = data.get("data") + if input_text is None: + input_text = data.get("body") + if isinstance(input_text, (bytes, bytearray)): + input_text = input_text.decode("utf-8") + + logger.info("Received text: '%s'", input_text) + + inputs = self.tokenizer.encode_plus( + input_text, + max_length=self.max_input_len, + padding=True, + add_special_tokens=True, + return_tensors="pt", + truncation=True, + ) + + input_ids = inputs["input_ids"].to(self.device) + attention_mask = inputs["attention_mask"].to(self.device) + # making a batch out of the recieved requests + # attention masks are passed for cases where input tokens are padded. + if input_ids.shape is not None: + if input_ids_batch is None: + input_ids_batch = input_ids + attention_mask_batch = attention_mask + else: + input_ids_batch = torch.cat((input_ids_batch, input_ids), 0) + attention_mask_batch = torch.cat((attention_mask_batch, attention_mask), 0) + return (input_ids_batch, attention_mask_batch) + + def inference(self, input_batch): + """Predict the class (or classes) of the received text using the + serialized transformers checkpoint. + Args: + input_batch (list): List of Text Tensors from the pre-process function is passed here + Returns: + list : It returns a list of the predicted value for the input text + """ + input_ids_batch, attention_mask_batch = input_batch + inferences = [] + + # mode: text_generation + input_ids_batch = input_ids_batch.to(self.device) + outputs = self.infer_engine.generate( + dict(input_ids=input_ids_batch, attention_mask=attention_mask_batch), + do_sample=True, + top_p=0.95, + top_k=60, + ) + + for i, _ in enumerate(outputs): + inferences.append(self.tokenizer.decode(outputs[i], skip_special_tokens=True)) + + # For testing only + logger.info( + f"Generated text: {inferences}", + ) + + return inferences + + def postprocess(self, inference_output): + """Post Process Function converts the predicted response into Torchserve readable format. + Args: + inference_output (list): It contains the predicted response of the input text. + Returns: + (list): Returns a list of the Predictions and Explanations. + """ + return inference_output diff --git a/examples/inference/serving/README.md b/examples/inference/serving/README.md new file mode 100644 index 000000000000..830a2a429efe --- /dev/null +++ b/examples/inference/serving/README.md @@ -0,0 +1,83 @@ +# Colossal-Inference with TorchServe + +## Overview + +This demo is used for testing and demonstrating the usage of Colossal Inference from `colossalai.inference` with deployment with TorchServe. It imports inference modules from colossalai and is based on +https://github.com/hpcaitech/ColossalAI/tree/d151dcab740eaae784333c93d85100c3641bd115. For now, single-gpu inference serving is supported. + +## Conda Environment for testing +Records to create a conda env to test locally as follows. We might want to use docker or configure env on cloud platform later. + +```bash +# use python 3.8 or 3.9 +conda create -n infer python=3.9 + +# prevent installing cuda stuff to root or somewhere weird +module unload cuda + +# use torch 1.13+cuda11.6 for inference +pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116 + +# conda cuda toolkit (e.g. 
nvcc, etc)
+conda install -c "nvidia/label/cuda-11.6.2" cuda-toolkit
+
+# install colossalai with PyTorch extensions
+cd 
+pip install -r requirements/requirements.txt
+pip install -r requirements/requirements-test.txt
+CUDA_EXT=1 pip install -e .
+
+# install torchserve
+cd 
+python ./ts_scripts/install_dependencies.py --cuda=cu116
+pip install torchserve torch-model-archiver torch-workflow-archiver
+```
+
+## Steps to deploy a model
+
+### 1. Download/prepare a model
+To use on cloud platform, we will zip the downloaded model.
+```bash
+# download snapshots
+huggingface-cli login
+python download_model.py --model_name bigscience/bloom-560m -o 
+
+# zip the model repo
+cd /models--bigscience--bloom-560m/snapshots/
+zip -r //model.zip *
+```
+
+> **_NOTE:_** The torch archiver and server will use the `/tmp/` folder. Depending on the limit of disk quota, using torch-model-archiver might cause OSError "Disk quota exceeded". To prevent the OSError, set the tmp dir environment variables as follows:
+`export TMPDIR=/tmp` and `export TEMP=/tmp`,
+or use relatively small models (as we did) for local testing.
+
+### 2. Archive the model
+With torch archiver, we will pack the model file (.zip) as well as handler file (.py) together into a .mar file. These files will then be unpacked by TorchServe during serving. Relevant model configs and inference configs can be set in `model-config.yaml`.
+```bash
+cd Language/ColossalInfer
+# create a folder under the current directory to store the packed model created by torch archiver
+mkdir model_store
+torch-model-archiver --model-name bloom --version 0.1 --handler Colossal_Inference_Handler.py --config-file model-config.yaml --extra-files /model.zip --export-path ./model_store/
+```
+
+### 3. Launch serving
+
+Modify `load_models` in config.properties to select the model(s) in the `model_store` directory to be deployed. By default we use `load_models=all` to load and deploy all the models (.mar) we have.
+
+```bash
+torchserve --start --ncs --ts-config config.properties
+```
+Inference, management, and metrics addresses and other TorchServe settings can be configured in `config.properties`.
+
+TorchServe will create a folder `logs/` under the current directory to store ts, model, and metrics logs.
+
+### 4.
Run inference + +```bash +# check inference status +curl http://0.0.0.0:8084/ping + +curl -X POST http://localhost:8084/predictions/bloom -T sample_text.txt +``` + +To stop TorchServe, run `torchserve --stop` diff --git a/examples/inference/serving/config.properties b/examples/inference/serving/config.properties new file mode 100644 index 000000000000..7f2b882a11a7 --- /dev/null +++ b/examples/inference/serving/config.properties @@ -0,0 +1,10 @@ +inference_address=http://0.0.0.0:8084 +management_address=http://0.0.0.0:8085 +metrics_address=http://0.0.0.0:8086 +enable_envvars_config=true +install_py_dep_per_model=true +number_of_gpu=1 +load_models=all +max_response_size=655350000 +default_response_timeout=6000 +model_store=./model_store diff --git a/examples/inference/serving/download_model.py b/examples/inference/serving/download_model.py new file mode 100644 index 000000000000..41ff80617c26 --- /dev/null +++ b/examples/inference/serving/download_model.py @@ -0,0 +1,50 @@ +# CREDITS: These functions are from https://github.com/pytorch/serve/blob/2bf505bae3046b0f7d0900727ec36e611bb5dca3/examples/large_models/utils/Download_model.py +import argparse +import os + +from huggingface_hub import HfApi, snapshot_download + + +def dir_path(path_str): + if os.path.isdir(path_str): + return path_str + elif input(f"{path_str} does not exist, create directory? [y/n]").lower() == "y": + os.makedirs(path_str) + return path_str + else: + raise NotADirectoryError(path_str) + + +class HFModelNotFoundError(Exception): + def __init__(self, model_str): + super().__init__(f"HuggingFace model not found: '{model_str}'") + + +def hf_model(model_str): + api = HfApi() + models = [m.modelId for m in api.list_models()] + if model_str in models: + return model_str + else: + raise HFModelNotFoundError(model_str) + + +parser = argparse.ArgumentParser() +parser.add_argument( + "--model_path", + "-o", + type=dir_path, + default="model", + help="Output directory for downloaded model files", +) +parser.add_argument("--model_name", "-m", type=hf_model, required=True, help="HuggingFace model name") +parser.add_argument("--revision", "-r", type=str, default="main", help="Revision") +args = parser.parse_args() + +snapshot_path = snapshot_download( + repo_id=args.model_name, + revision=args.revision, + cache_dir=args.model_path, + use_auth_token=True, +) +print(f"Files for '{args.model_name}' is downloaded to '{snapshot_path}'") diff --git a/examples/inference/serving/model-config.yaml b/examples/inference/serving/model-config.yaml new file mode 100644 index 000000000000..0f86d424beee --- /dev/null +++ b/examples/inference/serving/model-config.yaml @@ -0,0 +1,16 @@ +# TS frontend parameters settings +minWorkers: 1 # minimum number of workers of a model +maxWorkers: 1 # maximum number of workers of a model +batchSize: 8 # batch size of a model +maxBatchDelay: 100 # maximum delay of a batch (ms) +responseTimeout: 120 # timeout of a specific model's response (*in sec) +deviceType: "gpu" +# deviceIds: [0, 1] # seting CUDA_VISIBLE_DEVICES + +handler: + mode: "text_generation" + model_type: "bloom" + tp_size: 1 + max_batch_size: 8 + max_input_len: 1024 + max_output_len: 128 diff --git a/examples/inference/serving/sample_text.txt b/examples/inference/serving/sample_text.txt new file mode 100644 index 000000000000..18d8729f21b4 --- /dev/null +++ b/examples/inference/serving/sample_text.txt @@ -0,0 +1 @@ +Introduce some landmarks in Beijing From 6434e3e291f187cb0823d43bcf616e7bc02949ab Mon Sep 17 00:00:00 2001 From: yuanheng-zhao Date: 
Fri, 22 Sep 2023 10:08:36 +0800 Subject: [PATCH 02/12] add dockerfile --- examples/inference/serving/docker/Dockerfile | 40 ++++++++++++++++++++ examples/inference/serving/docker/build.sh | 1 + 2 files changed, 41 insertions(+) create mode 100644 examples/inference/serving/docker/Dockerfile create mode 100755 examples/inference/serving/docker/build.sh diff --git a/examples/inference/serving/docker/Dockerfile b/examples/inference/serving/docker/Dockerfile new file mode 100644 index 000000000000..65c96942a070 --- /dev/null +++ b/examples/inference/serving/docker/Dockerfile @@ -0,0 +1,40 @@ +# FROM hpcaitech/cuda-conda:11.6 +FROM hpcaitech/pytorch-cuda:1.13.0-11.6.0 + +# enable passwordless ssh +RUN mkdir ~/.ssh && \ + printf "Host * \n ForwardAgent yes\nHost *\n StrictHostKeyChecking no" > ~/.ssh/config && \ + ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa && \ + cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys + +# enable RDMA support +RUN apt-get update && \ + apt-get install -y infiniband-diags perftest ibverbs-providers libibumad3 libibverbs1 libnl-3-200 libnl-route-3-200 librdmacm1 && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# install ninja +RUN apt-get update && \ + apt-get install -y --no-install-recommends ninja-build && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# install colossalai +ARG VERSION=main +RUN git clone -b ${VERSION} https://github.com/hpcaitech/ColossalAI.git \ + && cd ./ColossalAI \ + && git checkout 3e05c07bb8921f2a8f9736b6f6673d4e9f1697d0 \ + && CUDA_EXT=1 pip install -v --no-cache-dir . + +# install titans +RUN pip install --no-cache-dir titans + +# install triton +RUN pip install --no-cache-dir triton==2.0.0.dev20221202 + +# install torchserve +ARG VERSION=master +RUN git clone -b ${VERSION} https://github.com/pytorch/serve.git \ + && cd ./serve \ + && python ./ts_scripts/install_dependencies.py --cuda=cu116 \ + && pip install torchserve torch-model-archiver torch-workflow-archiver diff --git a/examples/inference/serving/docker/build.sh b/examples/inference/serving/docker/build.sh new file mode 100755 index 000000000000..f48f4d8b518b --- /dev/null +++ b/examples/inference/serving/docker/build.sh @@ -0,0 +1 @@ +docker build -t hpcaitech/colossal-inference-serve:0.0.1 . 
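For reference, building and launching the image added in this patch might look like the following sketch (the tag matches `build.sh` above; the container name, bind-mount path, and working directory are illustrative placeholders, and a later commit in this series adds a similar snippet to the README):

```bash
# build the serving image from the directory containing the Dockerfile
docker build -t hpcaitech/colossal-inference-serve:0.0.1 ./examples/inference/serving/docker

# start an interactive container with GPU access; mount a host directory
# holding the downloaded/zipped model so it is visible inside the container
# (container name and paths below are placeholders)
docker run --rm -it --gpus all \
    --name colossal-infer-serve \
    -v /path/to/model/dir:/data/scratch \
    -w /data/scratch \
    hpcaitech/colossal-inference-serve:0.0.1 \
    /bin/bash
```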
From 085097c3de5f666187dd297f1f08963291af3d2c Mon Sep 17 00:00:00 2001 From: yuanheng-zhao Date: Tue, 26 Sep 2023 09:15:24 +0800 Subject: [PATCH 03/12] fix dockerfile --- examples/inference/serving/docker/Dockerfile | 21 +++++++++++++++----- examples/inference/serving/docker/build.sh | 2 +- examples/inference/serving/requirements.txt | 5 +++++ 3 files changed, 22 insertions(+), 6 deletions(-) create mode 100644 examples/inference/serving/requirements.txt diff --git a/examples/inference/serving/docker/Dockerfile b/examples/inference/serving/docker/Dockerfile index 65c96942a070..ea693c371aa4 100644 --- a/examples/inference/serving/docker/Dockerfile +++ b/examples/inference/serving/docker/Dockerfile @@ -1,4 +1,3 @@ -# FROM hpcaitech/cuda-conda:11.6 FROM hpcaitech/pytorch-cuda:1.13.0-11.6.0 # enable passwordless ssh @@ -7,12 +6,21 @@ RUN mkdir ~/.ssh && \ ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa && \ cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys -# enable RDMA support +# Download and extract OpenJDK 17 +ENV JAVA_HOME /opt/openjdk-17 RUN apt-get update && \ - apt-get install -y infiniband-diags perftest ibverbs-providers libibumad3 libibverbs1 libnl-3-200 libnl-route-3-200 librdmacm1 && \ - apt-get clean && \ + apt-get install -y wget && \ + wget -q https://download.java.net/openjdk/jdk17/ri/openjdk-17+35_linux-x64_bin.tar.gz -O /tmp/openjdk.tar.gz && \ + mkdir -p $JAVA_HOME && \ + tar xzf /tmp/openjdk.tar.gz -C $JAVA_HOME --strip-components=1 && \ + rm /tmp/openjdk.tar.gz && \ + apt-get purge -y --auto-remove wget && \ rm -rf /var/lib/apt/lists/* +ENV PATH $JAVA_HOME/bin:$PATH +RUN export JAVA_HOME +RUN java -version + # install ninja RUN apt-get update && \ apt-get install -y --no-install-recommends ninja-build && \ @@ -23,7 +31,6 @@ RUN apt-get update && \ ARG VERSION=main RUN git clone -b ${VERSION} https://github.com/hpcaitech/ColossalAI.git \ && cd ./ColossalAI \ - && git checkout 3e05c07bb8921f2a8f9736b6f6673d4e9f1697d0 \ && CUDA_EXT=1 pip install -v --no-cache-dir . # install titans @@ -38,3 +45,7 @@ RUN git clone -b ${VERSION} https://github.com/pytorch/serve.git \ && cd ./serve \ && python ./ts_scripts/install_dependencies.py --cuda=cu116 \ && pip install torchserve torch-model-archiver torch-workflow-archiver + +# install requirements +RUN cd ./ColossalAI/examples/inference/serving \ + pip install -r requirements.txt diff --git a/examples/inference/serving/docker/build.sh b/examples/inference/serving/docker/build.sh index f48f4d8b518b..1fcafbd99274 100755 --- a/examples/inference/serving/docker/build.sh +++ b/examples/inference/serving/docker/build.sh @@ -1 +1 @@ -docker build -t hpcaitech/colossal-inference-serve:0.0.1 . +docker build -t colossal-infer-ts:0.0.1 . 
diff --git a/examples/inference/serving/requirements.txt b/examples/inference/serving/requirements.txt new file mode 100644 index 000000000000..3d83192b3896 --- /dev/null +++ b/examples/inference/serving/requirements.txt @@ -0,0 +1,5 @@ +torch==1.13 +torchserve +transformers +triton==2.0.0.dev20221202 +colossalai From 9a3a405d8669aba66346c60e021651317a2d07ea Mon Sep 17 00:00:00 2001 From: yuanheng-zhao Date: Tue, 26 Sep 2023 13:32:40 +0800 Subject: [PATCH 04/12] fix dockerfile: fix commit hash, install curl --- examples/inference/serving/docker/Dockerfile | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/examples/inference/serving/docker/Dockerfile b/examples/inference/serving/docker/Dockerfile index ea693c371aa4..7ec8a23c7ca9 100644 --- a/examples/inference/serving/docker/Dockerfile +++ b/examples/inference/serving/docker/Dockerfile @@ -6,6 +6,12 @@ RUN mkdir ~/.ssh && \ ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa && \ cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys +# install curl +RUN apt-get update && \ + apt-get -y install curl && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + # Download and extract OpenJDK 17 ENV JAVA_HOME /opt/openjdk-17 RUN apt-get update && \ @@ -31,11 +37,15 @@ RUN apt-get update && \ ARG VERSION=main RUN git clone -b ${VERSION} https://github.com/hpcaitech/ColossalAI.git \ && cd ./ColossalAI \ + && git checkout 3e05c07bb8921f2a8f9736b6f6673d4e9f1697d0 \ && CUDA_EXT=1 pip install -v --no-cache-dir . # install titans RUN pip install --no-cache-dir titans +# install transformers +RUN pip install --no-cache-dir transformers + # install triton RUN pip install --no-cache-dir triton==2.0.0.dev20221202 @@ -45,7 +55,3 @@ RUN git clone -b ${VERSION} https://github.com/pytorch/serve.git \ && cd ./serve \ && python ./ts_scripts/install_dependencies.py --cuda=cu116 \ && pip install torchserve torch-model-archiver torch-workflow-archiver - -# install requirements -RUN cd ./ColossalAI/examples/inference/serving \ - pip install -r requirements.txt From 9d2ee3200696b3f53201468a5f0cbd33eb064e91 Mon Sep 17 00:00:00 2001 From: yuanheng-zhao Date: Tue, 26 Sep 2023 13:39:40 +0800 Subject: [PATCH 05/12] refactor file structure --- examples/inference/serving/test_ci.sh | 0 .../serving/{ => torch_serve}/Colossal_Inference_Handler.py | 0 examples/inference/serving/{ => torch_serve}/README.md | 0 examples/inference/serving/{ => torch_serve}/config.properties | 0 examples/inference/serving/{ => torch_serve}/docker/Dockerfile | 0 examples/inference/serving/{ => torch_serve}/docker/build.sh | 0 examples/inference/serving/{ => torch_serve}/model-config.yaml | 0 examples/inference/serving/{ => torch_serve}/sample_text.txt | 0 8 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 examples/inference/serving/test_ci.sh rename examples/inference/serving/{ => torch_serve}/Colossal_Inference_Handler.py (100%) rename examples/inference/serving/{ => torch_serve}/README.md (100%) rename examples/inference/serving/{ => torch_serve}/config.properties (100%) rename examples/inference/serving/{ => torch_serve}/docker/Dockerfile (100%) rename examples/inference/serving/{ => torch_serve}/docker/build.sh (100%) rename examples/inference/serving/{ => torch_serve}/model-config.yaml (100%) rename examples/inference/serving/{ => torch_serve}/sample_text.txt (100%) diff --git a/examples/inference/serving/test_ci.sh b/examples/inference/serving/test_ci.sh new file mode 100644 index 000000000000..e69de29bb2d1 diff --git 
a/examples/inference/serving/Colossal_Inference_Handler.py b/examples/inference/serving/torch_serve/Colossal_Inference_Handler.py similarity index 100% rename from examples/inference/serving/Colossal_Inference_Handler.py rename to examples/inference/serving/torch_serve/Colossal_Inference_Handler.py diff --git a/examples/inference/serving/README.md b/examples/inference/serving/torch_serve/README.md similarity index 100% rename from examples/inference/serving/README.md rename to examples/inference/serving/torch_serve/README.md diff --git a/examples/inference/serving/config.properties b/examples/inference/serving/torch_serve/config.properties similarity index 100% rename from examples/inference/serving/config.properties rename to examples/inference/serving/torch_serve/config.properties diff --git a/examples/inference/serving/docker/Dockerfile b/examples/inference/serving/torch_serve/docker/Dockerfile similarity index 100% rename from examples/inference/serving/docker/Dockerfile rename to examples/inference/serving/torch_serve/docker/Dockerfile diff --git a/examples/inference/serving/docker/build.sh b/examples/inference/serving/torch_serve/docker/build.sh similarity index 100% rename from examples/inference/serving/docker/build.sh rename to examples/inference/serving/torch_serve/docker/build.sh diff --git a/examples/inference/serving/model-config.yaml b/examples/inference/serving/torch_serve/model-config.yaml similarity index 100% rename from examples/inference/serving/model-config.yaml rename to examples/inference/serving/torch_serve/model-config.yaml diff --git a/examples/inference/serving/sample_text.txt b/examples/inference/serving/torch_serve/sample_text.txt similarity index 100% rename from examples/inference/serving/sample_text.txt rename to examples/inference/serving/torch_serve/sample_text.txt From 76a1bb6117c57233bbd71048ed467a53ae191a6c Mon Sep 17 00:00:00 2001 From: yuanheng-zhao Date: Tue, 26 Sep 2023 13:49:45 +0800 Subject: [PATCH 06/12] revise readme --- examples/inference/serving/torch_serve/README.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/examples/inference/serving/torch_serve/README.md b/examples/inference/serving/torch_serve/README.md index 830a2a429efe..6ef40d739c4c 100644 --- a/examples/inference/serving/torch_serve/README.md +++ b/examples/inference/serving/torch_serve/README.md @@ -3,18 +3,17 @@ ## Overview This demo is used for testing and demonstrating the usage of Colossal Inference from `colossalai.inference` with deployment with TorchServe. It imports inference modules from colossalai and is based on -https://github.com/hpcaitech/ColossalAI/tree/d151dcab740eaae784333c93d85100c3641bd115. For now, single-gpu inference serving is supported. +https://github.com/hpcaitech/ColossalAI/tree/3e05c07bb8921f2a8f9736b6f6673d4e9f1697d0. For now, single-gpu inference serving is supported. ## Conda Environment for testing Records to create a conda env to test locally as follows. We might want to use docker or configure env on cloud platform later. +*NOTE*: It requires the installation of jdk and the set of `JAVA_HOME`. 
We recommend to install open-jdk-17 (Please refer to https://openjdk.org/projects/jdk/17/) + ```bash # use python 3.8 or 3.9 conda create -n infer python=3.9 -# prevent installing cuda stuff to root or somewhere weird -module unload cuda - # use torch 1.13+cuda11.6 for inference pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116 @@ -54,7 +53,7 @@ or use relatively small models (as we did) for local testing. ### 2. Archive the model With torch archiver, we will pack the model file (.zip) as well as handler file (.py) together into a .mar file. And then in serving process these files will be unpacked by TorchServe. Revelant model configs and inference configs can be set in `model-config.yaml`. ```bash -cd Language/ColossalInfer +cd ./ColossalAI/examples/inference/serving/torch_serve # create a folder under the current directory to store the packed model created by torch archiver mkdir model_store torch-model-archiver --model-name bloom --version 0.1 --handler Colossal_Inference_Handler.py --config-file model-config.yaml --extra-files /model.zip --export-path ./model_store/ From 6b178bae75a26510ad20893861cf241dbf6a86ff Mon Sep 17 00:00:00 2001 From: yuanheng-zhao Date: Tue, 26 Sep 2023 13:50:36 +0800 Subject: [PATCH 07/12] trivial --- .../inference/serving/torch_serve/Colossal_Inference_Handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inference/serving/torch_serve/Colossal_Inference_Handler.py b/examples/inference/serving/torch_serve/Colossal_Inference_Handler.py index 46a47d7cecda..466967e1a96a 100644 --- a/examples/inference/serving/torch_serve/Colossal_Inference_Handler.py +++ b/examples/inference/serving/torch_serve/Colossal_Inference_Handler.py @@ -131,7 +131,7 @@ def initialize(self, ctx): def preprocess(self, requests): """Basic text preprocessing, based on the user's chocie of application mode. Args: - requests (str): The Input data in the form of text is passed on to the preprocess + requests: The Input data in the form of text is passed on to the preprocess function. Returns: list : The preprocess function returns a list of Tensor for the size of the word tokens. From 7acd42fac53b5754f7eb65167c5c30d2cd080551 Mon Sep 17 00:00:00 2001 From: yuanheng-zhao Date: Tue, 26 Sep 2023 14:38:52 +0800 Subject: [PATCH 08/12] trivial: dockerfile format --- .../serving/torch_serve/docker/Dockerfile | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/inference/serving/torch_serve/docker/Dockerfile b/examples/inference/serving/torch_serve/docker/Dockerfile index 7ec8a23c7ca9..6d780a84747f 100644 --- a/examples/inference/serving/torch_serve/docker/Dockerfile +++ b/examples/inference/serving/torch_serve/docker/Dockerfile @@ -35,10 +35,10 @@ RUN apt-get update && \ # install colossalai ARG VERSION=main -RUN git clone -b ${VERSION} https://github.com/hpcaitech/ColossalAI.git \ - && cd ./ColossalAI \ - && git checkout 3e05c07bb8921f2a8f9736b6f6673d4e9f1697d0 \ - && CUDA_EXT=1 pip install -v --no-cache-dir . +RUN git clone -b ${VERSION} https://github.com/hpcaitech/ColossalAI.git && \ + cd ./ColossalAI && \ + git checkout 3e05c07bb8921f2a8f9736b6f6673d4e9f1697d0 && \ + CUDA_EXT=1 pip install -v --no-cache-dir . 
# install titans RUN pip install --no-cache-dir titans @@ -51,7 +51,7 @@ RUN pip install --no-cache-dir triton==2.0.0.dev20221202 # install torchserve ARG VERSION=master -RUN git clone -b ${VERSION} https://github.com/pytorch/serve.git \ - && cd ./serve \ - && python ./ts_scripts/install_dependencies.py --cuda=cu116 \ - && pip install torchserve torch-model-archiver torch-workflow-archiver +RUN git clone -b ${VERSION} https://github.com/pytorch/serve.git && \ + cd ./serve && \ + python ./ts_scripts/install_dependencies.py --cuda=cu116 && \ + pip install torchserve torch-model-archiver torch-workflow-archiver From 375bbe0d7e18b72dc49380f9284c05a988760b4e Mon Sep 17 00:00:00 2001 From: ocd_with_naming Date: Tue, 26 Sep 2023 22:27:35 +0800 Subject: [PATCH 09/12] clean dir; revise readme --- examples/inference/serving/download_model.py | 50 ------------------- .../inference/serving/torch_serve/README.md | 33 ++++++++++-- .../serving/torch_serve/docker/build.sh | 1 - 3 files changed, 30 insertions(+), 54 deletions(-) delete mode 100644 examples/inference/serving/download_model.py delete mode 100755 examples/inference/serving/torch_serve/docker/build.sh diff --git a/examples/inference/serving/download_model.py b/examples/inference/serving/download_model.py deleted file mode 100644 index 41ff80617c26..000000000000 --- a/examples/inference/serving/download_model.py +++ /dev/null @@ -1,50 +0,0 @@ -# CREDITS: These functions are from https://github.com/pytorch/serve/blob/2bf505bae3046b0f7d0900727ec36e611bb5dca3/examples/large_models/utils/Download_model.py -import argparse -import os - -from huggingface_hub import HfApi, snapshot_download - - -def dir_path(path_str): - if os.path.isdir(path_str): - return path_str - elif input(f"{path_str} does not exist, create directory? [y/n]").lower() == "y": - os.makedirs(path_str) - return path_str - else: - raise NotADirectoryError(path_str) - - -class HFModelNotFoundError(Exception): - def __init__(self, model_str): - super().__init__(f"HuggingFace model not found: '{model_str}'") - - -def hf_model(model_str): - api = HfApi() - models = [m.modelId for m in api.list_models()] - if model_str in models: - return model_str - else: - raise HFModelNotFoundError(model_str) - - -parser = argparse.ArgumentParser() -parser.add_argument( - "--model_path", - "-o", - type=dir_path, - default="model", - help="Output directory for downloaded model files", -) -parser.add_argument("--model_name", "-m", type=hf_model, required=True, help="HuggingFace model name") -parser.add_argument("--revision", "-r", type=str, default="main", help="Revision") -args = parser.parse_args() - -snapshot_path = snapshot_download( - repo_id=args.model_name, - revision=args.revision, - cache_dir=args.model_path, - use_auth_token=True, -) -print(f"Files for '{args.model_name}' is downloaded to '{snapshot_path}'") diff --git a/examples/inference/serving/torch_serve/README.md b/examples/inference/serving/torch_serve/README.md index 6ef40d739c4c..6bd145bc30ae 100644 --- a/examples/inference/serving/torch_serve/README.md +++ b/examples/inference/serving/torch_serve/README.md @@ -5,7 +5,8 @@ This demo is used for testing and demonstrating the usage of Colossal Inference from `colossalai.inference` with deployment with TorchServe. It imports inference modules from colossalai and is based on https://github.com/hpcaitech/ColossalAI/tree/3e05c07bb8921f2a8f9736b6f6673d4e9f1697d0. For now, single-gpu inference serving is supported. 
-## Conda Environment for testing
+## Environment for testing
+### Option #1: Use Conda Env
 Records to create a conda env to test locally as follows. We might want to use docker or configure env on cloud platform later.

 *NOTE*: It requires the installation of jdk and the set of `JAVA_HOME`. We recommend to install open-jdk-17 (Please refer to https://openjdk.org/projects/jdk/17/)
@@ -27,17 +28,43 @@ pip install -r requirements/requirements-test.txt
 CUDA_EXT=1 pip install -e .

 # install torchserve
-cd 
+cd 
 python ./ts_scripts/install_dependencies.py --cuda=cu116
 pip install torchserve torch-model-archiver torch-workflow-archiver
 ```

+### Option #2: Use Docker
+To use the Colossal-Inference serving Docker image, you can build it using the provided [Dockerfile](./docker/Dockerfile).
+
+```bash
+# build from dockerfile
+cd ColossalAI/examples/inference/serving/torch_serve/docker
+docker build -t hpcaitech/colossal-infer-ts:0.2.0 .
+```
+
+Once you have the image ready, you can launch a container with the following command:
+
+```bash
+cd ColossalAI/examples/inference/serving/torch_serve
+
+# run the docker container
+docker run --rm \
+    -it --gpus all \
+    --name \
+    -v :/data/scratch \
+    -w \
+    hpcaitech/colossal-infer-ts:0.2.0 \
+    /bin/bash
+```
+
 ## Steps to deploy a model

 ### 1. Download/prepare a model
-To use on cloud platform, we will zip the downloaded model.
+We will download a bloom model and then zip the downloaded model. You could download the model from [HuggingFace](https://huggingface.co/models) manually, or you might want to refer to this script [download_model.py](https://github.com/pytorch/serve/blob/c3ca2599b4d36d2b61302064b02eab1b65e1908d/examples/large_models/utils/Download_model.py) provided by the pytorch-serve team to help you download a snapshot of the model.
+
 ```bash
 # download snapshots
+cd /examples/large_models/utils/
 huggingface-cli login
 python download_model.py --model_name bigscience/bloom-560m -o 
diff --git a/examples/inference/serving/torch_serve/docker/build.sh b/examples/inference/serving/torch_serve/docker/build.sh
deleted file mode 100755
index 1fcafbd99274..000000000000
--- a/examples/inference/serving/torch_serve/docker/build.sh
+++ /dev/null
@@ -1 +0,0 @@
-docker build -t colossal-infer-ts:0.0.1 .

From 3b3e3643de9a538ef6a51b3dcf2e50b1c102e99c Mon Sep 17 00:00:00 2001
From: yuanheng-zhao 
Date: Wed, 27 Sep 2023 18:17:27 +0800
Subject: [PATCH 10/12] fix comments: fix imports and configs

---
 .../torch_serve/Colossal_Inference_Handler.py | 53 +++++++------------
 1 file changed, 19 insertions(+), 34 deletions(-)

diff --git a/examples/inference/serving/torch_serve/Colossal_Inference_Handler.py b/examples/inference/serving/torch_serve/Colossal_Inference_Handler.py
index 466967e1a96a..9896eb0d7d50 100644
--- a/examples/inference/serving/torch_serve/Colossal_Inference_Handler.py
+++ b/examples/inference/serving/torch_serve/Colossal_Inference_Handler.py
@@ -13,31 +13,13 @@
 import colossalai
 from colossalai.inference.tensor_parallel.engine import TPInferEngine
 from colossalai.shardformer import ShardConfig
+from colossalai.testing import free_port

 logger = logging.getLogger(__name__)
 logger.info("Transformers version %s", transformers.__version__)
 logger.info("ColossalAI version %s", colossalai.__version__)


-# from colossalai.testing
-# assins a random port, for demo use only
-def free_port() -> int:
-    """Get a free port on localhost.
-
-    Returns:
-        int: A free port on localhost.
- """ - while True: - port = random.randint(20000, 65000) - try: - with socket.socket() as sock: - sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - sock.bind(("localhost", port)) - return port - except OSError: - continue - - class ColossalInferenceHandler(BaseHandler, ABC): """ Transformers handler class for testing @@ -68,34 +50,35 @@ def initialize(self, ctx): # Inference configs are collected together in model yaml config for handler use inference_config = ctx.model_yaml_config["handler"] - logger.info(inference_config) - inference_config["model_type"] - self.tp_size = inference_config.get("tp_size", 1) - self.max_batch_size = inference_config.get("max_batch_size", 4) - self.max_input_len = inference_config.get("max_input_len", 1024) - self.max_output_len = inference_config.get("max_output_len", 128) + self.inference_config = inference_config + logger.info(self.inference_config) + + self.tp_size = self.inference_config.get("tp_size", 1) + self.max_batch_size = self.inference_config.get("max_batch_size", 4) + self.max_input_len = self.inference_config.get("max_input_len", 1024) + self.max_output_len = self.inference_config.get("max_output_len", 128) self.device = torch.device("cuda:" + str(gpu_id) if torch.cuda.is_available() and gpu_id >= 0 else "cpu") logger.info(f"Device set to {self.device}") logger.info(f"torch.cuda.device_count() {torch.cuda.device_count()}") - logger.info(f"Unpacking from model_dir {model_dir}") + # Unpacking from model_dir model_dir_path = os.path.join(model_dir, "model") with zipfile.ZipFile(model_dir + "/model.zip", "r") as zip_ref: zip_ref.extractall(model_dir_path) - logger.info(f"Loading {inference_config['model_type']} pretrain model and tokenizer") - if inference_config["model_type"] == "bloom": + logger.info(f"Loading {self.inference_config['model_type']} pretrain model and tokenizer") + if self.inference_config["model_type"] == "bloom": self.model = BloomForCausalLM.from_pretrained( model_dir_path, ) self.tokenizer = BloomTokenizerFast.from_pretrained(model_dir_path, return_tensors="pt") - elif inference_config["model_type"] == "llama": + elif self.inference_config["model_type"] == "llama": self.model = LlamaForCausalLM.from_pretrained( model_dir_path, ) self.tokenizer = AutoTokenizer.from_pretrained(model_dir_path, return_tensors="pt") else: - logger.warning(f"Model type {inference_config['model_type']} not supported yet.") + logger.warning(f"Model type {self.inference_config['model_type']} not supported yet.") logger.info("Transformer model from path %s loaded successfully", model_dir) @@ -181,13 +164,15 @@ def inference(self, input_batch): input_ids_batch, attention_mask_batch = input_batch inferences = [] - # mode: text_generation + do_sample = self.inference_config.get("do_sample", True) + top_p = self.inference_config.get("top_p", 0.95 if do_sample else 1.0 ) + top_k = self.inference_config.get("top_k", 60 if do_sample else 50) input_ids_batch = input_ids_batch.to(self.device) outputs = self.infer_engine.generate( dict(input_ids=input_ids_batch, attention_mask=attention_mask_batch), - do_sample=True, - top_p=0.95, - top_k=60, + do_sample=do_sample, + top_p=top_p, + top_k=top_k, ) for i, _ in enumerate(outputs): From 6ffd19dd581c6f945dd18a5e78863885b3e4c213 Mon Sep 17 00:00:00 2001 From: yuanheng-zhao Date: Wed, 27 Sep 2023 18:30:49 +0800 Subject: [PATCH 11/12] fix formats --- .../serving/torch_serve/Colossal_Inference_Handler.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git 
a/examples/inference/serving/torch_serve/Colossal_Inference_Handler.py b/examples/inference/serving/torch_serve/Colossal_Inference_Handler.py index 9896eb0d7d50..c0d30501efea 100644 --- a/examples/inference/serving/torch_serve/Colossal_Inference_Handler.py +++ b/examples/inference/serving/torch_serve/Colossal_Inference_Handler.py @@ -1,7 +1,5 @@ import logging import os -import random -import socket import zipfile from abc import ABC @@ -52,7 +50,7 @@ def initialize(self, ctx): inference_config = ctx.model_yaml_config["handler"] self.inference_config = inference_config logger.info(self.inference_config) - + self.tp_size = self.inference_config.get("tp_size", 1) self.max_batch_size = self.inference_config.get("max_batch_size", 4) self.max_input_len = self.inference_config.get("max_input_len", 1024) @@ -62,7 +60,7 @@ def initialize(self, ctx): logger.info(f"Device set to {self.device}") logger.info(f"torch.cuda.device_count() {torch.cuda.device_count()}") - # Unpacking from model_dir + # Unpacking from model_dir model_dir_path = os.path.join(model_dir, "model") with zipfile.ZipFile(model_dir + "/model.zip", "r") as zip_ref: zip_ref.extractall(model_dir_path) @@ -165,7 +163,7 @@ def inference(self, input_batch): inferences = [] do_sample = self.inference_config.get("do_sample", True) - top_p = self.inference_config.get("top_p", 0.95 if do_sample else 1.0 ) + top_p = self.inference_config.get("top_p", 0.95 if do_sample else 1.0) top_k = self.inference_config.get("top_k", 60 if do_sample else 50) input_ids_batch = input_ids_batch.to(self.device) outputs = self.infer_engine.generate( From 09469565aeceac6fe673f76481cc2b85d58bda50 Mon Sep 17 00:00:00 2001 From: yuanheng-zhao Date: Mon, 2 Oct 2023 17:14:28 +0800 Subject: [PATCH 12/12] remove unused requirements --- examples/inference/serving/requirements.txt | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 examples/inference/serving/requirements.txt diff --git a/examples/inference/serving/requirements.txt b/examples/inference/serving/requirements.txt deleted file mode 100644 index 3d83192b3896..000000000000 --- a/examples/inference/serving/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -torch==1.13 -torchserve -transformers -triton==2.0.0.dev20221202 -colossalai
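A closing note on configuration: after PATCH 10, the handler's `inference()` reads optional `do_sample`, `top_p`, and `top_k` keys from the `handler` section of `model-config.yaml`, falling back to the defaults shown in that diff when the keys are absent. The series does not update `model-config.yaml` itself, so the following handler section is only a sketch of how those keys could be set explicitly (the sampling values are illustrative, not part of these patches):

```yaml
handler:
  mode: "text_generation"
  model_type: "bloom"
  tp_size: 1
  max_batch_size: 8
  max_input_len: 1024
  max_output_len: 128
  # optional sampling controls read by inference() since PATCH 10
  do_sample: true   # set to false for greedy-style decoding (defaults then become top_p=1.0, top_k=50)
  top_p: 0.95       # nucleus sampling threshold, used when do_sample is true
  top_k: 60         # top-k cutoff, used when do_sample is true
```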