From c3f3435549829022acbc2bb12c4d349ad0e2a03c Mon Sep 17 00:00:00 2001 From: yuanheng-zhao Date: Thu, 21 Sep 2023 16:23:40 +0800 Subject: [PATCH 01/12] add Colossal-Inference serving example w/ TorchServe --- .../serving/Colossal_Inference_Handler.py | 210 ++++++++++++++++++ examples/inference/serving/README.md | 83 +++++++ examples/inference/serving/config.properties | 10 + examples/inference/serving/download_model.py | 50 +++++ examples/inference/serving/model-config.yaml | 16 ++ examples/inference/serving/sample_text.txt | 1 + 6 files changed, 370 insertions(+) create mode 100644 examples/inference/serving/Colossal_Inference_Handler.py create mode 100644 examples/inference/serving/README.md create mode 100644 examples/inference/serving/config.properties create mode 100644 examples/inference/serving/download_model.py create mode 100644 examples/inference/serving/model-config.yaml create mode 100644 examples/inference/serving/sample_text.txt diff --git a/examples/inference/serving/Colossal_Inference_Handler.py b/examples/inference/serving/Colossal_Inference_Handler.py new file mode 100644 index 000000000000..46a47d7cecda --- /dev/null +++ b/examples/inference/serving/Colossal_Inference_Handler.py @@ -0,0 +1,210 @@ +import logging +import os +import random +import socket +import zipfile +from abc import ABC + +import torch +import transformers +from transformers import AutoTokenizer, BloomForCausalLM, BloomTokenizerFast, LlamaForCausalLM +from ts.torch_handler.base_handler import BaseHandler + +import colossalai +from colossalai.inference.tensor_parallel.engine import TPInferEngine +from colossalai.shardformer import ShardConfig + +logger = logging.getLogger(__name__) +logger.info("Transformers version %s", transformers.__version__) +logger.info("ColossalAI version %s", colossalai.__version__) + + +# from colossalai.testing +# assins a random port, for demo use only +def free_port() -> int: + """Get a free port on localhost. + + Returns: + int: A free port on localhost. + """ + while True: + port = random.randint(20000, 65000) + try: + with socket.socket() as sock: + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + sock.bind(("localhost", port)) + return port + except OSError: + continue + + +class ColossalInferenceHandler(BaseHandler, ABC): + """ + Transformers handler class for testing + """ + + def __init__(self): + super(ColossalInferenceHandler, self).__init__() + self.infer_engine = None + self.max_batch_size = None + self.max_input_len = None + self.max_output_len = None + self.tokenizer = None + self.initialized = False + + def initialize(self, ctx): + """Expected behaviour: the sharded Bloom/Llama model is loaded. + + Args: + ctx (context): It is a JSON Object containing information + pertaining to the model artefacts parameters. 
+        """
+        if ctx is None or not hasattr(ctx, "model_yaml_config"):
+            logger.error("Context ctx and model-config are not appropriately passed in.")
+
+        self.manifest = ctx.manifest
+        gpu_id = ctx.system_properties.get("gpu_id", -1)
+        model_dir = ctx.system_properties.get("model_dir")
+
+        # Inference configs are collected together in model yaml config for handler use
+        inference_config = ctx.model_yaml_config["handler"]
+        logger.info(inference_config)
+        inference_config["model_type"]
+        self.tp_size = inference_config.get("tp_size", 1)
+        self.max_batch_size = inference_config.get("max_batch_size", 4)
+        self.max_input_len = inference_config.get("max_input_len", 1024)
+        self.max_output_len = inference_config.get("max_output_len", 128)
+
+        self.device = torch.device("cuda:" + str(gpu_id) if torch.cuda.is_available() and gpu_id >= 0 else "cpu")
+        logger.info(f"Device set to {self.device}")
+        logger.info(f"torch.cuda.device_count() {torch.cuda.device_count()}")
+
+        logger.info(f"Unpacking from model_dir {model_dir}")
+        model_dir_path = os.path.join(model_dir, "model")
+        with zipfile.ZipFile(model_dir + "/model.zip", "r") as zip_ref:
+            zip_ref.extractall(model_dir_path)
+        logger.info(f"Loading {inference_config['model_type']} pretrain model and tokenizer")
+        if inference_config["model_type"] == "bloom":
+            self.model = BloomForCausalLM.from_pretrained(
+                model_dir_path,
+            )
+            self.tokenizer = BloomTokenizerFast.from_pretrained(model_dir_path, return_tensors="pt")
+        elif inference_config["model_type"] == "llama":
+            self.model = LlamaForCausalLM.from_pretrained(
+                model_dir_path,
+            )
+            self.tokenizer = AutoTokenizer.from_pretrained(model_dir_path, return_tensors="pt")
+        else:
+            logger.warning(f"Model type {inference_config['model_type']} not supported yet.")
+
+        logger.info("Transformer model from path %s loaded successfully", model_dir)
+
+        # NOTE world_size, rank, host, port here are used to launch colossalai dist environment
+        # This world_size is different from the world size of TorchServe
+        world_size = int(os.getenv("WORLD_SIZE", self.tp_size))
+        assert world_size == 1, "Colossal-Inference with tensor parallel is not supported on TorchServe for now"
+        rank = int(os.getenv("RANK", gpu_id))
+        local_rank = int(os.getenv("LOCAL_RANK", gpu_id))
+        host = os.getenv("MASTER_ADDR", "localhost")
+        port = os.getenv("MASTER_PORT", free_port())  # use a random free port
+
+        logger.info(
+            f" world_size {world_size}" f" local_rank {local_rank}" f" rank {rank}" f" host {host}" f" port {port}"
+        )
+
+        torch.cuda.set_device(self.device)
+        self.model.half()
+        self.model.cuda()
+        self.model.eval()
+
+        colossalai.launch(config={}, rank=rank, world_size=world_size, host=host, port=port, backend="nccl")
+        logger.info("Initializing TPInferEngine ...")
+        shard_config = ShardConfig(enable_tensor_parallelism=True if self.tp_size > 1 else False, inference_only=True)
+        self.infer_engine = TPInferEngine(
+            self.model, shard_config, self.max_batch_size, self.max_input_len, self.max_output_len
+        )
+        logger.info("TPInferEngine initialized successfully")
+
+        self.model = self.infer_engine.model
+        self.initialized = True
+
+    def preprocess(self, requests):
+        """Basic text preprocessing, based on the user's choice of application mode.
+        Args:
+            requests (str): The Input data in the form of text is passed on to the preprocess
+            function.
+        Returns:
+            list : The preprocess function returns a list of Tensor for the size of the word tokens.
+ """ + logger.info("Pre-processing requests") + input_ids_batch = None + attention_mask_batch = None + for idx, data in enumerate(requests): + input_text = data.get("data") + if input_text is None: + input_text = data.get("body") + if isinstance(input_text, (bytes, bytearray)): + input_text = input_text.decode("utf-8") + + logger.info("Received text: '%s'", input_text) + + inputs = self.tokenizer.encode_plus( + input_text, + max_length=self.max_input_len, + padding=True, + add_special_tokens=True, + return_tensors="pt", + truncation=True, + ) + + input_ids = inputs["input_ids"].to(self.device) + attention_mask = inputs["attention_mask"].to(self.device) + # making a batch out of the recieved requests + # attention masks are passed for cases where input tokens are padded. + if input_ids.shape is not None: + if input_ids_batch is None: + input_ids_batch = input_ids + attention_mask_batch = attention_mask + else: + input_ids_batch = torch.cat((input_ids_batch, input_ids), 0) + attention_mask_batch = torch.cat((attention_mask_batch, attention_mask), 0) + return (input_ids_batch, attention_mask_batch) + + def inference(self, input_batch): + """Predict the class (or classes) of the received text using the + serialized transformers checkpoint. + Args: + input_batch (list): List of Text Tensors from the pre-process function is passed here + Returns: + list : It returns a list of the predicted value for the input text + """ + input_ids_batch, attention_mask_batch = input_batch + inferences = [] + + # mode: text_generation + input_ids_batch = input_ids_batch.to(self.device) + outputs = self.infer_engine.generate( + dict(input_ids=input_ids_batch, attention_mask=attention_mask_batch), + do_sample=True, + top_p=0.95, + top_k=60, + ) + + for i, _ in enumerate(outputs): + inferences.append(self.tokenizer.decode(outputs[i], skip_special_tokens=True)) + + # For testing only + logger.info( + f"Generated text: {inferences}", + ) + + return inferences + + def postprocess(self, inference_output): + """Post Process Function converts the predicted response into Torchserve readable format. + Args: + inference_output (list): It contains the predicted response of the input text. + Returns: + (list): Returns a list of the Predictions and Explanations. + """ + return inference_output diff --git a/examples/inference/serving/README.md b/examples/inference/serving/README.md new file mode 100644 index 000000000000..830a2a429efe --- /dev/null +++ b/examples/inference/serving/README.md @@ -0,0 +1,83 @@ +# Colossal-Inference with TorchServe + +## Overview + +This demo is used for testing and demonstrating the usage of Colossal Inference from `colossalai.inference` with deployment with TorchServe. It imports inference modules from colossalai and is based on +https://github.com/hpcaitech/ColossalAI/tree/d151dcab740eaae784333c93d85100c3641bd115. For now, single-gpu inference serving is supported. + +## Conda Environment for testing +Records to create a conda env to test locally as follows. We might want to use docker or configure env on cloud platform later. + +```bash +# use python 3.8 or 3.9 +conda create -n infer python=3.9 + +# prevent installing cuda stuff to root or somewhere weird +module unload cuda + +# use torch 1.13+cuda11.6 for inference +pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116 + +# conda cuda toolkit (e.g. 
nvcc, etc)
+conda install -c "nvidia/label/cuda-11.6.2" cuda-toolkit
+
+# install colossalai with PyTorch extensions
+cd 
+pip install -r requirements/requirements.txt
+pip install -r requirements/requirements-test.txt
+CUDA_EXT=1 pip install -e .
+
+# install torchserve
+cd 
+python ./ts_scripts/install_dependencies.py --cuda=cu116
+pip install torchserve torch-model-archiver torch-workflow-archiver
+```
+
+## Steps to deploy a model
+
+### 1. Download/prepare a model
+To use on cloud platform, we will zip the downloaded model.
+```bash
+# download snapshots
+huggingface-cli login
+python download_model.py --model_name bigscience/bloom-560m -o 
+
+# zip the model repo
+cd /models--bigscience--bloom-560m/snapshots/
+zip -r //model.zip *
+```
+
+> **_NOTE:_** The torch archiver and server will use the `/tmp/` folder. Depending on the limit of disk quota, using torch-model-archiver might cause OSError "Disk quota exceeded". To prevent the OSError, set the tmp dir environment variables as follows:
+`export TMPDIR=/tmp` and `export TEMP=/tmp`,
+or use relatively small models (as we did) for local testing.
+
+### 2. Archive the model
+With torch archiver, we will pack the model file (.zip) as well as handler file (.py) together into a .mar file. These files will then be unpacked by TorchServe during serving. Relevant model configs and inference configs can be set in `model-config.yaml`.
+```bash
+cd Language/ColossalInfer
+# create a folder under the current directory to store the packed model created by torch archiver
+mkdir model_store
+torch-model-archiver --model-name bloom --version 0.1 --handler Colossal_Inference_Handler.py --config-file model-config.yaml --extra-files /model.zip --export-path ./model_store/
+```
+
+### 3. Launch serving
+
+Modify `load_models` in config.properties to select the model(s) in the `model_store` directory to be deployed. By default we use `load_models=all` to load and deploy all the models (.mar) we have.
+
+```bash
+torchserve --start --ncs --ts-config config.properties
+```
+Inference, management, and metrics addresses and other TorchServe settings can be configured in `config.properties`.
+
+TorchServe will create a folder `logs/` under the current directory to store ts, model, and metrics logs.
+
+### 4.
Run inference + +```bash +# check inference status +curl http://0.0.0.0:8084/ping + +curl -X POST http://localhost:8084/predictions/bloom -T sample_text.txt +``` + +To stop TorchServe, run `torchserve --stop` diff --git a/examples/inference/serving/config.properties b/examples/inference/serving/config.properties new file mode 100644 index 000000000000..7f2b882a11a7 --- /dev/null +++ b/examples/inference/serving/config.properties @@ -0,0 +1,10 @@ +inference_address=http://0.0.0.0:8084 +management_address=http://0.0.0.0:8085 +metrics_address=http://0.0.0.0:8086 +enable_envvars_config=true +install_py_dep_per_model=true +number_of_gpu=1 +load_models=all +max_response_size=655350000 +default_response_timeout=6000 +model_store=./model_store diff --git a/examples/inference/serving/download_model.py b/examples/inference/serving/download_model.py new file mode 100644 index 000000000000..41ff80617c26 --- /dev/null +++ b/examples/inference/serving/download_model.py @@ -0,0 +1,50 @@ +# CREDITS: These functions are from https://github.com/pytorch/serve/blob/2bf505bae3046b0f7d0900727ec36e611bb5dca3/examples/large_models/utils/Download_model.py +import argparse +import os + +from huggingface_hub import HfApi, snapshot_download + + +def dir_path(path_str): + if os.path.isdir(path_str): + return path_str + elif input(f"{path_str} does not exist, create directory? [y/n]").lower() == "y": + os.makedirs(path_str) + return path_str + else: + raise NotADirectoryError(path_str) + + +class HFModelNotFoundError(Exception): + def __init__(self, model_str): + super().__init__(f"HuggingFace model not found: '{model_str}'") + + +def hf_model(model_str): + api = HfApi() + models = [m.modelId for m in api.list_models()] + if model_str in models: + return model_str + else: + raise HFModelNotFoundError(model_str) + + +parser = argparse.ArgumentParser() +parser.add_argument( + "--model_path", + "-o", + type=dir_path, + default="model", + help="Output directory for downloaded model files", +) +parser.add_argument("--model_name", "-m", type=hf_model, required=True, help="HuggingFace model name") +parser.add_argument("--revision", "-r", type=str, default="main", help="Revision") +args = parser.parse_args() + +snapshot_path = snapshot_download( + repo_id=args.model_name, + revision=args.revision, + cache_dir=args.model_path, + use_auth_token=True, +) +print(f"Files for '{args.model_name}' is downloaded to '{snapshot_path}'") diff --git a/examples/inference/serving/model-config.yaml b/examples/inference/serving/model-config.yaml new file mode 100644 index 000000000000..0f86d424beee --- /dev/null +++ b/examples/inference/serving/model-config.yaml @@ -0,0 +1,16 @@ +# TS frontend parameters settings +minWorkers: 1 # minimum number of workers of a model +maxWorkers: 1 # maximum number of workers of a model +batchSize: 8 # batch size of a model +maxBatchDelay: 100 # maximum delay of a batch (ms) +responseTimeout: 120 # timeout of a specific model's response (*in sec) +deviceType: "gpu" +# deviceIds: [0, 1] # seting CUDA_VISIBLE_DEVICES + +handler: + mode: "text_generation" + model_type: "bloom" + tp_size: 1 + max_batch_size: 8 + max_input_len: 1024 + max_output_len: 128 diff --git a/examples/inference/serving/sample_text.txt b/examples/inference/serving/sample_text.txt new file mode 100644 index 000000000000..18d8729f21b4 --- /dev/null +++ b/examples/inference/serving/sample_text.txt @@ -0,0 +1 @@ +Introduce some landmarks in Beijing From 6434e3e291f187cb0823d43bcf616e7bc02949ab Mon Sep 17 00:00:00 2001 From: yuanheng-zhao Date: 
Fri, 22 Sep 2023 10:08:36 +0800 Subject: [PATCH 02/12] add dockerfile --- examples/inference/serving/docker/Dockerfile | 40 ++++++++++++++++++++ examples/inference/serving/docker/build.sh | 1 + 2 files changed, 41 insertions(+) create mode 100644 examples/inference/serving/docker/Dockerfile create mode 100755 examples/inference/serving/docker/build.sh diff --git a/examples/inference/serving/docker/Dockerfile b/examples/inference/serving/docker/Dockerfile new file mode 100644 index 000000000000..65c96942a070 --- /dev/null +++ b/examples/inference/serving/docker/Dockerfile @@ -0,0 +1,40 @@ +# FROM hpcaitech/cuda-conda:11.6 +FROM hpcaitech/pytorch-cuda:1.13.0-11.6.0 + +# enable passwordless ssh +RUN mkdir ~/.ssh && \ + printf "Host * \n ForwardAgent yes\nHost *\n StrictHostKeyChecking no" > ~/.ssh/config && \ + ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa && \ + cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys + +# enable RDMA support +RUN apt-get update && \ + apt-get install -y infiniband-diags perftest ibverbs-providers libibumad3 libibverbs1 libnl-3-200 libnl-route-3-200 librdmacm1 && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# install ninja +RUN apt-get update && \ + apt-get install -y --no-install-recommends ninja-build && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# install colossalai +ARG VERSION=main +RUN git clone -b ${VERSION} https://github.com/hpcaitech/ColossalAI.git \ + && cd ./ColossalAI \ + && git checkout 3e05c07bb8921f2a8f9736b6f6673d4e9f1697d0 \ + && CUDA_EXT=1 pip install -v --no-cache-dir . + +# install titans +RUN pip install --no-cache-dir titans + +# install triton +RUN pip install --no-cache-dir triton==2.0.0.dev20221202 + +# install torchserve +ARG VERSION=master +RUN git clone -b ${VERSION} https://github.com/pytorch/serve.git \ + && cd ./serve \ + && python ./ts_scripts/install_dependencies.py --cuda=cu116 \ + && pip install torchserve torch-model-archiver torch-workflow-archiver diff --git a/examples/inference/serving/docker/build.sh b/examples/inference/serving/docker/build.sh new file mode 100755 index 000000000000..f48f4d8b518b --- /dev/null +++ b/examples/inference/serving/docker/build.sh @@ -0,0 +1 @@ +docker build -t hpcaitech/colossal-inference-serve:0.0.1 . 
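For reference, building and launching the image added in this patch might look like the following sketch (the tag matches `build.sh` above; the container name, bind-mount path, and working directory are illustrative placeholders, and a later commit in this series adds a similar snippet to the README):

```bash
# build the serving image from the directory containing the Dockerfile
docker build -t hpcaitech/colossal-inference-serve:0.0.1 ./examples/inference/serving/docker

# start an interactive container with GPU access; mount a host directory
# holding the downloaded/zipped model so it is visible inside the container
# (container name and paths below are placeholders)
docker run --rm -it --gpus all \
    --name colossal-infer-serve \
    -v /path/to/model/dir:/data/scratch \
    -w /data/scratch \
    hpcaitech/colossal-inference-serve:0.0.1 \
    /bin/bash
```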
From 085097c3de5f666187dd297f1f08963291af3d2c Mon Sep 17 00:00:00 2001 From: yuanheng-zhao Date: Tue, 26 Sep 2023 09:15:24 +0800 Subject: [PATCH 03/12] fix dockerfile --- examples/inference/serving/docker/Dockerfile | 21 +++++++++++++++----- examples/inference/serving/docker/build.sh | 2 +- examples/inference/serving/requirements.txt | 5 +++++ 3 files changed, 22 insertions(+), 6 deletions(-) create mode 100644 examples/inference/serving/requirements.txt diff --git a/examples/inference/serving/docker/Dockerfile b/examples/inference/serving/docker/Dockerfile index 65c96942a070..ea693c371aa4 100644 --- a/examples/inference/serving/docker/Dockerfile +++ b/examples/inference/serving/docker/Dockerfile @@ -1,4 +1,3 @@ -# FROM hpcaitech/cuda-conda:11.6 FROM hpcaitech/pytorch-cuda:1.13.0-11.6.0 # enable passwordless ssh @@ -7,12 +6,21 @@ RUN mkdir ~/.ssh && \ ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa && \ cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys -# enable RDMA support +# Download and extract OpenJDK 17 +ENV JAVA_HOME /opt/openjdk-17 RUN apt-get update && \ - apt-get install -y infiniband-diags perftest ibverbs-providers libibumad3 libibverbs1 libnl-3-200 libnl-route-3-200 librdmacm1 && \ - apt-get clean && \ + apt-get install -y wget && \ + wget -q https://download.java.net/openjdk/jdk17/ri/openjdk-17+35_linux-x64_bin.tar.gz -O /tmp/openjdk.tar.gz && \ + mkdir -p $JAVA_HOME && \ + tar xzf /tmp/openjdk.tar.gz -C $JAVA_HOME --strip-components=1 && \ + rm /tmp/openjdk.tar.gz && \ + apt-get purge -y --auto-remove wget && \ rm -rf /var/lib/apt/lists/* +ENV PATH $JAVA_HOME/bin:$PATH +RUN export JAVA_HOME +RUN java -version + # install ninja RUN apt-get update && \ apt-get install -y --no-install-recommends ninja-build && \ @@ -23,7 +31,6 @@ RUN apt-get update && \ ARG VERSION=main RUN git clone -b ${VERSION} https://github.com/hpcaitech/ColossalAI.git \ && cd ./ColossalAI \ - && git checkout 3e05c07bb8921f2a8f9736b6f6673d4e9f1697d0 \ && CUDA_EXT=1 pip install -v --no-cache-dir . # install titans @@ -38,3 +45,7 @@ RUN git clone -b ${VERSION} https://github.com/pytorch/serve.git \ && cd ./serve \ && python ./ts_scripts/install_dependencies.py --cuda=cu116 \ && pip install torchserve torch-model-archiver torch-workflow-archiver + +# install requirements +RUN cd ./ColossalAI/examples/inference/serving \ + pip install -r requirements.txt diff --git a/examples/inference/serving/docker/build.sh b/examples/inference/serving/docker/build.sh index f48f4d8b518b..1fcafbd99274 100755 --- a/examples/inference/serving/docker/build.sh +++ b/examples/inference/serving/docker/build.sh @@ -1 +1 @@ -docker build -t hpcaitech/colossal-inference-serve:0.0.1 . +docker build -t colossal-infer-ts:0.0.1 . 
diff --git a/examples/inference/serving/requirements.txt b/examples/inference/serving/requirements.txt new file mode 100644 index 000000000000..3d83192b3896 --- /dev/null +++ b/examples/inference/serving/requirements.txt @@ -0,0 +1,5 @@ +torch==1.13 +torchserve +transformers +triton==2.0.0.dev20221202 +colossalai From 9a3a405d8669aba66346c60e021651317a2d07ea Mon Sep 17 00:00:00 2001 From: yuanheng-zhao Date: Tue, 26 Sep 2023 13:32:40 +0800 Subject: [PATCH 04/12] fix dockerfile: fix commit hash, install curl --- examples/inference/serving/docker/Dockerfile | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/examples/inference/serving/docker/Dockerfile b/examples/inference/serving/docker/Dockerfile index ea693c371aa4..7ec8a23c7ca9 100644 --- a/examples/inference/serving/docker/Dockerfile +++ b/examples/inference/serving/docker/Dockerfile @@ -6,6 +6,12 @@ RUN mkdir ~/.ssh && \ ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa && \ cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys +# install curl +RUN apt-get update && \ + apt-get -y install curl && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + # Download and extract OpenJDK 17 ENV JAVA_HOME /opt/openjdk-17 RUN apt-get update && \ @@ -31,11 +37,15 @@ RUN apt-get update && \ ARG VERSION=main RUN git clone -b ${VERSION} https://github.com/hpcaitech/ColossalAI.git \ && cd ./ColossalAI \ + && git checkout 3e05c07bb8921f2a8f9736b6f6673d4e9f1697d0 \ && CUDA_EXT=1 pip install -v --no-cache-dir . # install titans RUN pip install --no-cache-dir titans +# install transformers +RUN pip install --no-cache-dir transformers + # install triton RUN pip install --no-cache-dir triton==2.0.0.dev20221202 @@ -45,7 +55,3 @@ RUN git clone -b ${VERSION} https://github.com/pytorch/serve.git \ && cd ./serve \ && python ./ts_scripts/install_dependencies.py --cuda=cu116 \ && pip install torchserve torch-model-archiver torch-workflow-archiver - -# install requirements -RUN cd ./ColossalAI/examples/inference/serving \ - pip install -r requirements.txt From 9d2ee3200696b3f53201468a5f0cbd33eb064e91 Mon Sep 17 00:00:00 2001 From: yuanheng-zhao Date: Tue, 26 Sep 2023 13:39:40 +0800 Subject: [PATCH 05/12] refactor file structure --- examples/inference/serving/test_ci.sh | 0 .../serving/{ => torch_serve}/Colossal_Inference_Handler.py | 0 examples/inference/serving/{ => torch_serve}/README.md | 0 examples/inference/serving/{ => torch_serve}/config.properties | 0 examples/inference/serving/{ => torch_serve}/docker/Dockerfile | 0 examples/inference/serving/{ => torch_serve}/docker/build.sh | 0 examples/inference/serving/{ => torch_serve}/model-config.yaml | 0 examples/inference/serving/{ => torch_serve}/sample_text.txt | 0 8 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 examples/inference/serving/test_ci.sh rename examples/inference/serving/{ => torch_serve}/Colossal_Inference_Handler.py (100%) rename examples/inference/serving/{ => torch_serve}/README.md (100%) rename examples/inference/serving/{ => torch_serve}/config.properties (100%) rename examples/inference/serving/{ => torch_serve}/docker/Dockerfile (100%) rename examples/inference/serving/{ => torch_serve}/docker/build.sh (100%) rename examples/inference/serving/{ => torch_serve}/model-config.yaml (100%) rename examples/inference/serving/{ => torch_serve}/sample_text.txt (100%) diff --git a/examples/inference/serving/test_ci.sh b/examples/inference/serving/test_ci.sh new file mode 100644 index 000000000000..e69de29bb2d1 diff --git 
a/examples/inference/serving/Colossal_Inference_Handler.py b/examples/inference/serving/torch_serve/Colossal_Inference_Handler.py similarity index 100% rename from examples/inference/serving/Colossal_Inference_Handler.py rename to examples/inference/serving/torch_serve/Colossal_Inference_Handler.py diff --git a/examples/inference/serving/README.md b/examples/inference/serving/torch_serve/README.md similarity index 100% rename from examples/inference/serving/README.md rename to examples/inference/serving/torch_serve/README.md diff --git a/examples/inference/serving/config.properties b/examples/inference/serving/torch_serve/config.properties similarity index 100% rename from examples/inference/serving/config.properties rename to examples/inference/serving/torch_serve/config.properties diff --git a/examples/inference/serving/docker/Dockerfile b/examples/inference/serving/torch_serve/docker/Dockerfile similarity index 100% rename from examples/inference/serving/docker/Dockerfile rename to examples/inference/serving/torch_serve/docker/Dockerfile diff --git a/examples/inference/serving/docker/build.sh b/examples/inference/serving/torch_serve/docker/build.sh similarity index 100% rename from examples/inference/serving/docker/build.sh rename to examples/inference/serving/torch_serve/docker/build.sh diff --git a/examples/inference/serving/model-config.yaml b/examples/inference/serving/torch_serve/model-config.yaml similarity index 100% rename from examples/inference/serving/model-config.yaml rename to examples/inference/serving/torch_serve/model-config.yaml diff --git a/examples/inference/serving/sample_text.txt b/examples/inference/serving/torch_serve/sample_text.txt similarity index 100% rename from examples/inference/serving/sample_text.txt rename to examples/inference/serving/torch_serve/sample_text.txt From 76a1bb6117c57233bbd71048ed467a53ae191a6c Mon Sep 17 00:00:00 2001 From: yuanheng-zhao Date: Tue, 26 Sep 2023 13:49:45 +0800 Subject: [PATCH 06/12] revise readme --- examples/inference/serving/torch_serve/README.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/examples/inference/serving/torch_serve/README.md b/examples/inference/serving/torch_serve/README.md index 830a2a429efe..6ef40d739c4c 100644 --- a/examples/inference/serving/torch_serve/README.md +++ b/examples/inference/serving/torch_serve/README.md @@ -3,18 +3,17 @@ ## Overview This demo is used for testing and demonstrating the usage of Colossal Inference from `colossalai.inference` with deployment with TorchServe. It imports inference modules from colossalai and is based on -https://github.com/hpcaitech/ColossalAI/tree/d151dcab740eaae784333c93d85100c3641bd115. For now, single-gpu inference serving is supported. +https://github.com/hpcaitech/ColossalAI/tree/3e05c07bb8921f2a8f9736b6f6673d4e9f1697d0. For now, single-gpu inference serving is supported. ## Conda Environment for testing Records to create a conda env to test locally as follows. We might want to use docker or configure env on cloud platform later. +*NOTE*: It requires the installation of jdk and the set of `JAVA_HOME`. 
We recommend to install open-jdk-17 (Please refer to https://openjdk.org/projects/jdk/17/) + ```bash # use python 3.8 or 3.9 conda create -n infer python=3.9 -# prevent installing cuda stuff to root or somewhere weird -module unload cuda - # use torch 1.13+cuda11.6 for inference pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116 @@ -54,7 +53,7 @@ or use relatively small models (as we did) for local testing. ### 2. Archive the model With torch archiver, we will pack the model file (.zip) as well as handler file (.py) together into a .mar file. And then in serving process these files will be unpacked by TorchServe. Revelant model configs and inference configs can be set in `model-config.yaml`. ```bash -cd Language/ColossalInfer +cd ./ColossalAI/examples/inference/serving/torch_serve # create a folder under the current directory to store the packed model created by torch archiver mkdir model_store torch-model-archiver --model-name bloom --version 0.1 --handler Colossal_Inference_Handler.py --config-file model-config.yaml --extra-files /model.zip --export-path ./model_store/ From 6b178bae75a26510ad20893861cf241dbf6a86ff Mon Sep 17 00:00:00 2001 From: yuanheng-zhao Date: Tue, 26 Sep 2023 13:50:36 +0800 Subject: [PATCH 07/12] trivial --- .../inference/serving/torch_serve/Colossal_Inference_Handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inference/serving/torch_serve/Colossal_Inference_Handler.py b/examples/inference/serving/torch_serve/Colossal_Inference_Handler.py index 46a47d7cecda..466967e1a96a 100644 --- a/examples/inference/serving/torch_serve/Colossal_Inference_Handler.py +++ b/examples/inference/serving/torch_serve/Colossal_Inference_Handler.py @@ -131,7 +131,7 @@ def initialize(self, ctx): def preprocess(self, requests): """Basic text preprocessing, based on the user's chocie of application mode. Args: - requests (str): The Input data in the form of text is passed on to the preprocess + requests: The Input data in the form of text is passed on to the preprocess function. Returns: list : The preprocess function returns a list of Tensor for the size of the word tokens. From 7acd42fac53b5754f7eb65167c5c30d2cd080551 Mon Sep 17 00:00:00 2001 From: yuanheng-zhao Date: Tue, 26 Sep 2023 14:38:52 +0800 Subject: [PATCH 08/12] trivial: dockerfile format --- .../serving/torch_serve/docker/Dockerfile | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/inference/serving/torch_serve/docker/Dockerfile b/examples/inference/serving/torch_serve/docker/Dockerfile index 7ec8a23c7ca9..6d780a84747f 100644 --- a/examples/inference/serving/torch_serve/docker/Dockerfile +++ b/examples/inference/serving/torch_serve/docker/Dockerfile @@ -35,10 +35,10 @@ RUN apt-get update && \ # install colossalai ARG VERSION=main -RUN git clone -b ${VERSION} https://github.com/hpcaitech/ColossalAI.git \ - && cd ./ColossalAI \ - && git checkout 3e05c07bb8921f2a8f9736b6f6673d4e9f1697d0 \ - && CUDA_EXT=1 pip install -v --no-cache-dir . +RUN git clone -b ${VERSION} https://github.com/hpcaitech/ColossalAI.git && \ + cd ./ColossalAI && \ + git checkout 3e05c07bb8921f2a8f9736b6f6673d4e9f1697d0 && \ + CUDA_EXT=1 pip install -v --no-cache-dir . 
# install titans RUN pip install --no-cache-dir titans @@ -51,7 +51,7 @@ RUN pip install --no-cache-dir triton==2.0.0.dev20221202 # install torchserve ARG VERSION=master -RUN git clone -b ${VERSION} https://github.com/pytorch/serve.git \ - && cd ./serve \ - && python ./ts_scripts/install_dependencies.py --cuda=cu116 \ - && pip install torchserve torch-model-archiver torch-workflow-archiver +RUN git clone -b ${VERSION} https://github.com/pytorch/serve.git && \ + cd ./serve && \ + python ./ts_scripts/install_dependencies.py --cuda=cu116 && \ + pip install torchserve torch-model-archiver torch-workflow-archiver From 375bbe0d7e18b72dc49380f9284c05a988760b4e Mon Sep 17 00:00:00 2001 From: ocd_with_naming Date: Tue, 26 Sep 2023 22:27:35 +0800 Subject: [PATCH 09/12] clean dir; revise readme --- examples/inference/serving/download_model.py | 50 ------------------- .../inference/serving/torch_serve/README.md | 33 ++++++++++-- .../serving/torch_serve/docker/build.sh | 1 - 3 files changed, 30 insertions(+), 54 deletions(-) delete mode 100644 examples/inference/serving/download_model.py delete mode 100755 examples/inference/serving/torch_serve/docker/build.sh diff --git a/examples/inference/serving/download_model.py b/examples/inference/serving/download_model.py deleted file mode 100644 index 41ff80617c26..000000000000 --- a/examples/inference/serving/download_model.py +++ /dev/null @@ -1,50 +0,0 @@ -# CREDITS: These functions are from https://github.com/pytorch/serve/blob/2bf505bae3046b0f7d0900727ec36e611bb5dca3/examples/large_models/utils/Download_model.py -import argparse -import os - -from huggingface_hub import HfApi, snapshot_download - - -def dir_path(path_str): - if os.path.isdir(path_str): - return path_str - elif input(f"{path_str} does not exist, create directory? [y/n]").lower() == "y": - os.makedirs(path_str) - return path_str - else: - raise NotADirectoryError(path_str) - - -class HFModelNotFoundError(Exception): - def __init__(self, model_str): - super().__init__(f"HuggingFace model not found: '{model_str}'") - - -def hf_model(model_str): - api = HfApi() - models = [m.modelId for m in api.list_models()] - if model_str in models: - return model_str - else: - raise HFModelNotFoundError(model_str) - - -parser = argparse.ArgumentParser() -parser.add_argument( - "--model_path", - "-o", - type=dir_path, - default="model", - help="Output directory for downloaded model files", -) -parser.add_argument("--model_name", "-m", type=hf_model, required=True, help="HuggingFace model name") -parser.add_argument("--revision", "-r", type=str, default="main", help="Revision") -args = parser.parse_args() - -snapshot_path = snapshot_download( - repo_id=args.model_name, - revision=args.revision, - cache_dir=args.model_path, - use_auth_token=True, -) -print(f"Files for '{args.model_name}' is downloaded to '{snapshot_path}'") diff --git a/examples/inference/serving/torch_serve/README.md b/examples/inference/serving/torch_serve/README.md index 6ef40d739c4c..6bd145bc30ae 100644 --- a/examples/inference/serving/torch_serve/README.md +++ b/examples/inference/serving/torch_serve/README.md @@ -5,7 +5,8 @@ This demo is used for testing and demonstrating the usage of Colossal Inference from `colossalai.inference` with deployment with TorchServe. It imports inference modules from colossalai and is based on https://github.com/hpcaitech/ColossalAI/tree/3e05c07bb8921f2a8f9736b6f6673d4e9f1697d0. For now, single-gpu inference serving is supported. 
-## Conda Environment for testing
+## Environment for testing
+### Option #1: Use Conda Env
 Records to create a conda env to test locally as follows. We might want to use docker or configure env on cloud platform later.

 *NOTE*: It requires the installation of jdk and the set of `JAVA_HOME`. We recommend to install open-jdk-17 (Please refer to https://openjdk.org/projects/jdk/17/)
@@ -27,17 +28,43 @@ pip install -r requirements/requirements-test.txt
 CUDA_EXT=1 pip install -e .

 # install torchserve
-cd 
+cd 
 python ./ts_scripts/install_dependencies.py --cuda=cu116
 pip install torchserve torch-model-archiver torch-workflow-archiver
 ```

+### Option #2: Use Docker
+To use the Colossal-Inference serving Docker image, you can build it using the provided [Dockerfile](./docker/Dockerfile).
+
+```bash
+# build from dockerfile
+cd ColossalAI/examples/inference/serving/torch_serve/docker
+docker build -t hpcaitech/colossal-infer-ts:0.2.0 .
+```
+
+Once you have the image ready, you can launch a container with the following command:
+
+```bash
+cd ColossalAI/examples/inference/serving/torch_serve
+
+# run the docker container
+docker run --rm \
+    -it --gpus all \
+    --name \
+    -v :/data/scratch \
+    -w \
+    hpcaitech/colossal-infer-ts:0.2.0 \
+    /bin/bash
+```
+
 ## Steps to deploy a model

 ### 1. Download/prepare a model
-To use on cloud platform, we will zip the downloaded model.
+We will download a bloom model and then zip the downloaded model. You could download the model from [HuggingFace](https://huggingface.co/models) manually, or you might want to refer to this script [download_model.py](https://github.com/pytorch/serve/blob/c3ca2599b4d36d2b61302064b02eab1b65e1908d/examples/large_models/utils/Download_model.py) provided by the pytorch-serve team to help you download a snapshot of the model.
+
 ```bash
 # download snapshots
+cd /examples/large_models/utils/
 huggingface-cli login
 python download_model.py --model_name bigscience/bloom-560m -o 
diff --git a/examples/inference/serving/torch_serve/docker/build.sh b/examples/inference/serving/torch_serve/docker/build.sh
deleted file mode 100755
index 1fcafbd99274..000000000000
--- a/examples/inference/serving/torch_serve/docker/build.sh
+++ /dev/null
@@ -1 +0,0 @@
-docker build -t colossal-infer-ts:0.0.1 .

From 3b3e3643de9a538ef6a51b3dcf2e50b1c102e99c Mon Sep 17 00:00:00 2001
From: yuanheng-zhao 
Date: Wed, 27 Sep 2023 18:17:27 +0800
Subject: [PATCH 10/12] fix comments: fix imports and configs

---
 .../torch_serve/Colossal_Inference_Handler.py | 53 +++++++------------
 1 file changed, 19 insertions(+), 34 deletions(-)

diff --git a/examples/inference/serving/torch_serve/Colossal_Inference_Handler.py b/examples/inference/serving/torch_serve/Colossal_Inference_Handler.py
index 466967e1a96a..9896eb0d7d50 100644
--- a/examples/inference/serving/torch_serve/Colossal_Inference_Handler.py
+++ b/examples/inference/serving/torch_serve/Colossal_Inference_Handler.py
@@ -13,31 +13,13 @@
 import colossalai
 from colossalai.inference.tensor_parallel.engine import TPInferEngine
 from colossalai.shardformer import ShardConfig
+from colossalai.testing import free_port

 logger = logging.getLogger(__name__)
 logger.info("Transformers version %s", transformers.__version__)
 logger.info("ColossalAI version %s", colossalai.__version__)


-# from colossalai.testing
-# assins a random port, for demo use only
-def free_port() -> int:
-    """Get a free port on localhost.
-
-    Returns:
-        int: A free port on localhost.
- """ - while True: - port = random.randint(20000, 65000) - try: - with socket.socket() as sock: - sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - sock.bind(("localhost", port)) - return port - except OSError: - continue - - class ColossalInferenceHandler(BaseHandler, ABC): """ Transformers handler class for testing @@ -68,34 +50,35 @@ def initialize(self, ctx): # Inference configs are collected together in model yaml config for handler use inference_config = ctx.model_yaml_config["handler"] - logger.info(inference_config) - inference_config["model_type"] - self.tp_size = inference_config.get("tp_size", 1) - self.max_batch_size = inference_config.get("max_batch_size", 4) - self.max_input_len = inference_config.get("max_input_len", 1024) - self.max_output_len = inference_config.get("max_output_len", 128) + self.inference_config = inference_config + logger.info(self.inference_config) + + self.tp_size = self.inference_config.get("tp_size", 1) + self.max_batch_size = self.inference_config.get("max_batch_size", 4) + self.max_input_len = self.inference_config.get("max_input_len", 1024) + self.max_output_len = self.inference_config.get("max_output_len", 128) self.device = torch.device("cuda:" + str(gpu_id) if torch.cuda.is_available() and gpu_id >= 0 else "cpu") logger.info(f"Device set to {self.device}") logger.info(f"torch.cuda.device_count() {torch.cuda.device_count()}") - logger.info(f"Unpacking from model_dir {model_dir}") + # Unpacking from model_dir model_dir_path = os.path.join(model_dir, "model") with zipfile.ZipFile(model_dir + "/model.zip", "r") as zip_ref: zip_ref.extractall(model_dir_path) - logger.info(f"Loading {inference_config['model_type']} pretrain model and tokenizer") - if inference_config["model_type"] == "bloom": + logger.info(f"Loading {self.inference_config['model_type']} pretrain model and tokenizer") + if self.inference_config["model_type"] == "bloom": self.model = BloomForCausalLM.from_pretrained( model_dir_path, ) self.tokenizer = BloomTokenizerFast.from_pretrained(model_dir_path, return_tensors="pt") - elif inference_config["model_type"] == "llama": + elif self.inference_config["model_type"] == "llama": self.model = LlamaForCausalLM.from_pretrained( model_dir_path, ) self.tokenizer = AutoTokenizer.from_pretrained(model_dir_path, return_tensors="pt") else: - logger.warning(f"Model type {inference_config['model_type']} not supported yet.") + logger.warning(f"Model type {self.inference_config['model_type']} not supported yet.") logger.info("Transformer model from path %s loaded successfully", model_dir) @@ -181,13 +164,15 @@ def inference(self, input_batch): input_ids_batch, attention_mask_batch = input_batch inferences = [] - # mode: text_generation + do_sample = self.inference_config.get("do_sample", True) + top_p = self.inference_config.get("top_p", 0.95 if do_sample else 1.0 ) + top_k = self.inference_config.get("top_k", 60 if do_sample else 50) input_ids_batch = input_ids_batch.to(self.device) outputs = self.infer_engine.generate( dict(input_ids=input_ids_batch, attention_mask=attention_mask_batch), - do_sample=True, - top_p=0.95, - top_k=60, + do_sample=do_sample, + top_p=top_p, + top_k=top_k, ) for i, _ in enumerate(outputs): From 6ffd19dd581c6f945dd18a5e78863885b3e4c213 Mon Sep 17 00:00:00 2001 From: yuanheng-zhao Date: Wed, 27 Sep 2023 18:30:49 +0800 Subject: [PATCH 11/12] fix formats --- .../serving/torch_serve/Colossal_Inference_Handler.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git 
a/examples/inference/serving/torch_serve/Colossal_Inference_Handler.py b/examples/inference/serving/torch_serve/Colossal_Inference_Handler.py index 9896eb0d7d50..c0d30501efea 100644 --- a/examples/inference/serving/torch_serve/Colossal_Inference_Handler.py +++ b/examples/inference/serving/torch_serve/Colossal_Inference_Handler.py @@ -1,7 +1,5 @@ import logging import os -import random -import socket import zipfile from abc import ABC @@ -52,7 +50,7 @@ def initialize(self, ctx): inference_config = ctx.model_yaml_config["handler"] self.inference_config = inference_config logger.info(self.inference_config) - + self.tp_size = self.inference_config.get("tp_size", 1) self.max_batch_size = self.inference_config.get("max_batch_size", 4) self.max_input_len = self.inference_config.get("max_input_len", 1024) @@ -62,7 +60,7 @@ def initialize(self, ctx): logger.info(f"Device set to {self.device}") logger.info(f"torch.cuda.device_count() {torch.cuda.device_count()}") - # Unpacking from model_dir + # Unpacking from model_dir model_dir_path = os.path.join(model_dir, "model") with zipfile.ZipFile(model_dir + "/model.zip", "r") as zip_ref: zip_ref.extractall(model_dir_path) @@ -165,7 +163,7 @@ def inference(self, input_batch): inferences = [] do_sample = self.inference_config.get("do_sample", True) - top_p = self.inference_config.get("top_p", 0.95 if do_sample else 1.0 ) + top_p = self.inference_config.get("top_p", 0.95 if do_sample else 1.0) top_k = self.inference_config.get("top_k", 60 if do_sample else 50) input_ids_batch = input_ids_batch.to(self.device) outputs = self.infer_engine.generate( From 09469565aeceac6fe673f76481cc2b85d58bda50 Mon Sep 17 00:00:00 2001 From: yuanheng-zhao Date: Mon, 2 Oct 2023 17:14:28 +0800 Subject: [PATCH 12/12] remove unused requirements --- examples/inference/serving/requirements.txt | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 examples/inference/serving/requirements.txt diff --git a/examples/inference/serving/requirements.txt b/examples/inference/serving/requirements.txt deleted file mode 100644 index 3d83192b3896..000000000000 --- a/examples/inference/serving/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -torch==1.13 -torchserve -transformers -triton==2.0.0.dev20221202 -colossalai
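A closing note on configuration: after PATCH 10, the handler's `inference()` reads optional `do_sample`, `top_p`, and `top_k` keys from the `handler` section of `model-config.yaml`, falling back to the defaults shown in that diff when the keys are absent. The series does not update `model-config.yaml` itself, so the following handler section is only a sketch of how those keys could be set explicitly (the sampling values are illustrative, not part of these patches):

```yaml
handler:
  mode: "text_generation"
  model_type: "bloom"
  tp_size: 1
  max_batch_size: 8
  max_input_len: 1024
  max_output_len: 128
  # optional sampling controls read by inference() since PATCH 10
  do_sample: true   # set to false for greedy-style decoding (defaults then become top_p=1.0, top_k=50)
  top_p: 0.95       # nucleus sampling threshold, used when do_sample is true
  top_k: 60         # top-k cutoff, used when do_sample is true
```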