From 86ecbc141ccf85e23d2751646ff2975c80e025b5 Mon Sep 17 00:00:00 2001 From: yuanheng-zhao Date: Thu, 28 Sep 2023 10:57:12 +0800 Subject: [PATCH 01/12] fix imports --- colossalai/inference/tensor_parallel/modeling/__init__.py | 2 -- colossalai/kernel/triton/__init__.py | 5 +++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/colossalai/inference/tensor_parallel/modeling/__init__.py b/colossalai/inference/tensor_parallel/modeling/__init__.py index 279b54065eed..4662368b17b4 100644 --- a/colossalai/inference/tensor_parallel/modeling/__init__.py +++ b/colossalai/inference/tensor_parallel/modeling/__init__.py @@ -1,5 +1,3 @@ -import _utils - from .bloom import BloomInferenceForwards from .chatglm2 import ChatGLM2InferenceForwards from .llama import LlamaInferenceForwards diff --git a/colossalai/kernel/triton/__init__.py b/colossalai/kernel/triton/__init__.py index 87ea9cf6536e..a0f0313954b5 100644 --- a/colossalai/kernel/triton/__init__.py +++ b/colossalai/kernel/triton/__init__.py @@ -6,7 +6,8 @@ from .context_attention import bloom_context_attn_fwd, llama_context_attn_fwd from .copy_kv_cache_dest import copy_kv_cache_to_dest from .fused_layernorm import layer_norm - from .gptq_triton import gptq_fused_linear_triton + + # from .gptq_triton import gptq_fused_linear_triton from .rms_norm import rmsnorm_forward from .rotary_embedding_kernel import rotary_embedding_fwd from .softmax import softmax @@ -21,7 +22,7 @@ "copy_kv_cache_to_dest", "rotary_embedding_fwd", "token_attention_fwd", - "gptq_fused_linear_triton", + # "gptq_fused_linear_triton", ] except ImportError: From 2a2325a264f5c74ac764c8bb4fd38965432cc4f9 Mon Sep 17 00:00:00 2001 From: yuanheng-zhao Date: Thu, 28 Sep 2023 18:09:48 +0800 Subject: [PATCH 02/12] add ray-serve with Colossal-Infer tp --- .../ray_serve/Colossal_Inference_rayserve.py | 146 ++++++++++++++++++ .../serving/ray_serve/send_request.py | 28 ++++ 2 files changed, 174 insertions(+) create mode 100644 examples/inference/serving/ray_serve/Colossal_Inference_rayserve.py create mode 100644 examples/inference/serving/ray_serve/send_request.py diff --git a/examples/inference/serving/ray_serve/Colossal_Inference_rayserve.py b/examples/inference/serving/ray_serve/Colossal_Inference_rayserve.py new file mode 100644 index 000000000000..e15f0a5552ba --- /dev/null +++ b/examples/inference/serving/ray_serve/Colossal_Inference_rayserve.py @@ -0,0 +1,146 @@ +import logging +import os +from dataclasses import dataclass +from typing import Any, List, Union + +import ray +import ray.util.collective as collective +import starlette +import torch +from ray import serve +from transformers import BloomForCausalLM, BloomTokenizerFast + +import colossalai +from colossalai.inference.tensor_parallel.engine import TPInferEngine +from colossalai.shardformer import ShardConfig +from colossalai.testing import free_port + +ray_serve_logger = logging.getLogger("ray.serve") + + +def log_cuda_info(scope_name: str): + ray_serve_logger.info(f" {scope_name}: ray.get_gpu_ids(): {ray.get_gpu_ids()}") + ray_serve_logger.info( + f" {scope_name}: CUDA_VISIBLE_DEVICES: {os.getenv('CUDA_VISIBLE_DEVICES', 'NO DEVICES FOUND!')}" + ) + if torch.cuda.is_available(): + ray_serve_logger.info( + f" {scope_name}: cuda current_device: {torch.cuda.current_device()}, cuda device count: {torch.cuda.device_count()}" + ) + else: + ray_serve_logger.info(f" {scope_name}: cuda is not available!") + + +@ray.remote(num_gpus=1) +class Worker: + def __init__(self, model_path: str, tp_size: int, max_batch_size: int, 
max_input_len: int, max_output_len: int): + log_cuda_info("Worker.init") + self.tp_size = tp_size + self.model_path = model_path + self.max_batch_size = max_batch_size + self.max_input_len = max_input_len + self.max_output_len = max_output_len + + def setup(self, world_size, rank): + # initialize a ray collective group, otherwise colossalai distributed env won't be built successfully + collective.init_collective_group(world_size, rank, "nccl", "default") + # initialize and set distributed environment + available_port = free_port() # just grab a free port on localhost + colossalai.launch( + config={}, rank=rank, world_size=world_size, host="localhost", port=available_port, backend="nccl" + ) + ray_serve_logger.info(f"Worker with rank {rank} (world size {world_size}) setting up..") + log_cuda_info("Worker.setup") + + # Load model + self.tokenizer = BloomTokenizerFast.from_pretrained(self.model_path) + self.tokenizer.pad_token = self.tokenizer.eos_token + self.model = BloomForCausalLM.from_pretrained( + self.model_path, pad_token_id=self.tokenizer.eos_token_id, torch_dtype=torch.float16 + ) + + shard_config = ShardConfig(enable_tensor_parallelism=True if world_size > 1 else False, inference_only=True) + self.infer_engine = TPInferEngine( + self.model, shard_config, self.max_batch_size, self.max_input_len, self.max_output_len + ) + self.generate_kwargs = dict(max_new_tokens=self.max_output_len, do_sample=False) + + return True + + def generate(self, text: Union[str, List[str]]) -> str: + input_tokens = self.tokenizer.batch_encode_plus(text, return_tensors="pt", padding=True) + ray_serve_logger.info(f"text: {text},\ninput_tokens: {input_tokens}") + + model_output = self.infer_engine.generate(input_tokens, **self.generate_kwargs) + ray_serve_logger.info(f"model_output.shape: {model_output.shape}") + + text_output = [] + for i in range(len(model_output)): + text_output.append(self.tokenizer.decode(model_output[i])) + ray_serve_logger.info(f"output: {text_output}") + + return text_output + + +@serve.deployment(num_replicas=1, ray_actor_options={"num_gpus": 0}) +class Driver: + def __init__(self, config): + log_cuda_info("Driver:init") + model_path = config.model_path + tp_size = config.tp_size + + self.num_workers = tp_size + self.workers = [] + init_rets = [] + + for i in range(self.num_workers): + worker_name = "worker_idx_{}".format(i) + w = Worker.options(name=worker_name).remote( + model_path, self.num_workers, config.max_batch_size, config.max_input_len, config.max_output_len + ) + self.workers.append(w) + init_rets.append(w.setup.remote(self.num_workers, i)) + _options = { + "group_name": "default_driver", + "world_size": self.num_workers, + "ranks": [i for i in range(self.num_workers)], + "backend": "nccl", + } + collective.create_collective_group(self.workers, **_options) + _ = ray.get(init_rets) + + # set batch wait delay in seconds and maximum number of sequences in a batch + @serve.batch(batch_wait_timeout_s=0.8, max_batch_size=4) + async def batch_generate(self, requests: List[str]): + ray_serve_logger.info(f"Driver.batch_generate: requests length: {len(requests)}\n requests: {requests}") + results = ray.get([w.generate.remote(requests) for w in self.workers]) + text_res = results[0] # get any one of the copies + return text_res + + async def __call__(self, request: starlette.requests.Request) -> Any: + return await self.batch_generate(request.query_params["text"]) + + +@dataclass +class Config: + """temp config""" + + model_path: str + tp_size: int = 2 + max_batch_size: int = 4 + 
max_input_len: int = 128 + max_output_len: int = 32 + + +# *** add model path manually into the config*** +driver_config = Config(model_path="ADD MODEL PATH HRER") +app = Driver.bind(config=driver_config) + + +# 1. use the following cmd in CLI +# RAY_DEDUP_LOGS=0 serve run Colossal_Inference_rayserve:app + +# 2. or, uncomment the following lines +# handle: DeploymentHandle = serve.run(app) +# print(requests.post("http://localhost:8000/", json={"text": text}).json()) +# print(requests.get("http://localhost:8000/?text={}".format(text))) diff --git a/examples/inference/serving/ray_serve/send_request.py b/examples/inference/serving/ray_serve/send_request.py new file mode 100644 index 000000000000..a775e533efd6 --- /dev/null +++ b/examples/inference/serving/ray_serve/send_request.py @@ -0,0 +1,28 @@ +import ray +import requests + + +@ray.remote +def send_query(text): + # resp = requests.post("http://localhost:8000/", json={"text": text}) + resp = requests.get("http://localhost:8000/?text={}".format(text)) + return resp.text + + +test_sentences = [ + "The cat chased the mouse.", + "Sunshine brings joy and warmth.", + "Coding requires practice and patience.", + "Rainy days inspire cozy reading.", + "Laughter is contagious and heartwarming.", + "Hiking mountains builds strength and resilience.", + "Family bonds grow stronger with time.", + "Science unlocks mysteries of the universe.", + "Music soothes the soul and ignites passion.", + "Artistic expression knows no boundaries.", +] + +results = ray.get([send_query.remote(text) for text in test_sentences]) +print("Result returned:") +for res in results: + print(res) From 7ca5309393cfb8de76d89066ae233f5f6842c81f Mon Sep 17 00:00:00 2001 From: yuanheng-zhao Date: Fri, 29 Sep 2023 09:36:26 +0800 Subject: [PATCH 03/12] trivial: send requests script --- .../serving/ray_serve/send_request.py | 15 ++-------- .../serving/ray_serve/send_requests.py | 28 +++++++++++++++++++ 2 files changed, 30 insertions(+), 13 deletions(-) create mode 100644 examples/inference/serving/ray_serve/send_requests.py diff --git a/examples/inference/serving/ray_serve/send_request.py b/examples/inference/serving/ray_serve/send_request.py index a775e533efd6..903f8eda02e3 100644 --- a/examples/inference/serving/ray_serve/send_request.py +++ b/examples/inference/serving/ray_serve/send_request.py @@ -9,20 +9,9 @@ def send_query(text): return resp.text -test_sentences = [ - "The cat chased the mouse.", - "Sunshine brings joy and warmth.", - "Coding requires practice and patience.", - "Rainy days inspire cozy reading.", - "Laughter is contagious and heartwarming.", - "Hiking mountains builds strength and resilience.", - "Family bonds grow stronger with time.", - "Science unlocks mysteries of the universe.", - "Music soothes the soul and ignites passion.", - "Artistic expression knows no boundaries.", -] +test_sentence = "Introduce some landmarks in Beijing" -results = ray.get([send_query.remote(text) for text in test_sentences]) +results = ray.get(send_query.remote(test_sentence)) print("Result returned:") for res in results: print(res) diff --git a/examples/inference/serving/ray_serve/send_requests.py b/examples/inference/serving/ray_serve/send_requests.py new file mode 100644 index 000000000000..3e34b9c9a497 --- /dev/null +++ b/examples/inference/serving/ray_serve/send_requests.py @@ -0,0 +1,28 @@ +import ray +import requests + + +@ray.remote +def send_query(text): + # resp = requests.post("http://localhost:8000/", json={"text": text}) + resp = 
requests.get("http://localhost:8000/?text={}".format(text)) + return resp.text + + +test_sentences = [ + "Introduce some landmarks in Beijing", + "What is the weather today", + "Coding requires practice and patience", + "Rainy days inspire cozy reading", + "Laughter is contagious and heartwarming", + "Hiking mountains builds strength and resilience", + "Family bonds grow stronger with time", + "Science unlocks mysteries of the universe", + "Music soothes the soul and ignites passion", + "Artistic expression knows no boundaries", +] + +results = ray.get([send_query.remote(text) for text in test_sentences]) +print("Result returned:") +for res in results: + print(res) From 79d0db579e7950bf32fb90c35386a445c2a9ba6f Mon Sep 17 00:00:00 2001 From: yuanheng-zhao Date: Fri, 29 Sep 2023 14:18:21 +0800 Subject: [PATCH 04/12] add README --- .../inference/serving/ray_serve/README.md | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 examples/inference/serving/ray_serve/README.md diff --git a/examples/inference/serving/ray_serve/README.md b/examples/inference/serving/ray_serve/README.md new file mode 100644 index 000000000000..08489b705890 --- /dev/null +++ b/examples/inference/serving/ray_serve/README.md @@ -0,0 +1,55 @@ +# Colossal-Inference with Ray Serve + +This example is used for demonstrating and testing the deployment of Colossal Inference from `colossalai.inference` with [Ray Serve](https://docs.ray.io/en/latest/serve/index.html). It imports inference modules from colossalai and is based on https://github.com/hpcaitech/ColossalAI/tree/a22706337a57dd1c98b95739dd09d98bd55947a0. + +Single-gpu inference and multiple-gpu inference (i.e. tensor parallel) serving are supported. + +## Env Installation + +Conda +```bash +# create a new conda env with python 3.8 +conda create -n ray_test python=3.8.18 + +# use torch1.13+cuda11.6 +pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116 + +# install ray from wheels +pip install -U "ray[default,serve]" + +# install cuda toolkit (e.g. nvcc, etc) +conda install -c "nvidia/label/cuda-11.6.2" cuda-toolkit + +# install cuDNN, cuTENSOR, and NCCL +conda install -c conda-forge cupy cudnn cutensor nccl cuda-version=11.6 + +# install colossalai with PyTorch extensions +cd +CUDA_EXT=1 pip install -e . + +pip install transformers +``` + +## Launch Ray Serve and run the app +### Method #1. CLI command + +```bash + RAY_DEDUP_LOGS=0 serve run Colossal_Inference_rayserve:app +``` + +By default, Ray deduplicates logs across cluster. Here we set `RAY_DEDUP_LOGS=0`` to disable log deduplication, enabling each actor to log information in CLI. + +`serve run` runs an application from the specified import path. The formats should be `:`. + +Then we could send requests by running python script in another window: +```bash +python send_request.py +``` + +### Method #2. 
Run inside script + +Attach the following to the end of the file, and run the script via `python Colossal_Inference_rayserve.py` +```pyhton +handle: DeploymentHandle = serve.run(app) +print(requests.get("http://localhost:8000/?text={}".format(text))) +``` From 520741bab01463653b70349aea4d5227d3f83b78 Mon Sep 17 00:00:00 2001 From: yuanheng-zhao Date: Fri, 29 Sep 2023 14:31:36 +0800 Subject: [PATCH 05/12] fix worker port --- .../ray_serve/Colossal_Inference_rayserve.py | 24 +++++++------------ .../inference/serving/ray_serve/README.md | 9 +++++++ .../serving/ray_serve/send_request.py | 5 ++-- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/examples/inference/serving/ray_serve/Colossal_Inference_rayserve.py b/examples/inference/serving/ray_serve/Colossal_Inference_rayserve.py index e15f0a5552ba..1b9e38220304 100644 --- a/examples/inference/serving/ray_serve/Colossal_Inference_rayserve.py +++ b/examples/inference/serving/ray_serve/Colossal_Inference_rayserve.py @@ -41,14 +41,11 @@ def __init__(self, model_path: str, tp_size: int, max_batch_size: int, max_input self.max_input_len = max_input_len self.max_output_len = max_output_len - def setup(self, world_size, rank): + def setup(self, world_size, rank, port): # initialize a ray collective group, otherwise colossalai distributed env won't be built successfully collective.init_collective_group(world_size, rank, "nccl", "default") # initialize and set distributed environment - available_port = free_port() # just grab a free port on localhost - colossalai.launch( - config={}, rank=rank, world_size=world_size, host="localhost", port=available_port, backend="nccl" - ) + colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl") ray_serve_logger.info(f"Worker with rank {rank} (world size {world_size}) setting up..") log_cuda_info("Worker.setup") @@ -93,13 +90,17 @@ def __init__(self, config): self.workers = [] init_rets = [] + # Just grab a free port on localhost + # NOTE workers in this communication group listen to the same port + available_port = free_port() + for i in range(self.num_workers): worker_name = "worker_idx_{}".format(i) w = Worker.options(name=worker_name).remote( model_path, self.num_workers, config.max_batch_size, config.max_input_len, config.max_output_len ) self.workers.append(w) - init_rets.append(w.setup.remote(self.num_workers, i)) + init_rets.append(w.setup.remote(self.num_workers, i, available_port)) _options = { "group_name": "default_driver", "world_size": self.num_workers, @@ -133,14 +134,5 @@ class Config: # *** add model path manually into the config*** -driver_config = Config(model_path="ADD MODEL PATH HRER") +driver_config = Config(model_path="ADD_MODEL_PATH_HRER") app = Driver.bind(config=driver_config) - - -# 1. use the following cmd in CLI -# RAY_DEDUP_LOGS=0 serve run Colossal_Inference_rayserve:app - -# 2. 
or, uncomment the following lines -# handle: DeploymentHandle = serve.run(app) -# print(requests.post("http://localhost:8000/", json={"text": text}).json()) -# print(requests.get("http://localhost:8000/?text={}".format(text))) diff --git a/examples/inference/serving/ray_serve/README.md b/examples/inference/serving/ray_serve/README.md index 08489b705890..a5606e46d5e7 100644 --- a/examples/inference/serving/ray_serve/README.md +++ b/examples/inference/serving/ray_serve/README.md @@ -53,3 +53,12 @@ Attach the following to the end of the file, and run the script via `python Colo handle: DeploymentHandle = serve.run(app) print(requests.get("http://localhost:8000/?text={}".format(text))) ``` + + + +Use +```bash +ray stop +``` + +to kill any active Ray processes diff --git a/examples/inference/serving/ray_serve/send_request.py b/examples/inference/serving/ray_serve/send_request.py index 903f8eda02e3..70a15724ab6c 100644 --- a/examples/inference/serving/ray_serve/send_request.py +++ b/examples/inference/serving/ray_serve/send_request.py @@ -11,7 +11,6 @@ def send_query(text): test_sentence = "Introduce some landmarks in Beijing" -results = ray.get(send_query.remote(test_sentence)) +result = ray.get(send_query.remote(test_sentence)) print("Result returned:") -for res in results: - print(res) +print(result) From 31dc712ba12dda4ab152532427240e45dd78ed2b Mon Sep 17 00:00:00 2001 From: yuanheng-zhao Date: Fri, 29 Sep 2023 15:31:43 +0800 Subject: [PATCH 06/12] fix readme --- examples/inference/serving/ray_serve/README.md | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/examples/inference/serving/ray_serve/README.md b/examples/inference/serving/ray_serve/README.md index a5606e46d5e7..93ac5838f2c2 100644 --- a/examples/inference/serving/ray_serve/README.md +++ b/examples/inference/serving/ray_serve/README.md @@ -2,7 +2,7 @@ This example is used for demonstrating and testing the deployment of Colossal Inference from `colossalai.inference` with [Ray Serve](https://docs.ray.io/en/latest/serve/index.html). It imports inference modules from colossalai and is based on https://github.com/hpcaitech/ColossalAI/tree/a22706337a57dd1c98b95739dd09d98bd55947a0. -Single-gpu inference and multiple-gpu inference (i.e. tensor parallel) serving are supported. +Single-gpu inference as well as multiple-gpu inference (i.e. tensor parallel) serving are supported. ## Env Installation @@ -27,12 +27,15 @@ conda install -c conda-forge cupy cudnn cutensor nccl cuda-version=11.6 cd CUDA_EXT=1 pip install -e . +# install other dependencies +pip install triton==2.0.0.dev20221202 pip install transformers ``` ## Launch Ray Serve and run the app ### Method #1. CLI command +Under the current directory, we could launch the app by the following command: ```bash RAY_DEDUP_LOGS=0 serve run Colossal_Inference_rayserve:app ``` @@ -47,18 +50,19 @@ python send_request.py ``` ### Method #2. Run inside script - -Attach the following to the end of the file, and run the script via `python Colossal_Inference_rayserve.py` +We could also launch ray serve and run the app inside the script. +Attach the following to the end of the file, ```pyhton handle: DeploymentHandle = serve.run(app) print(requests.get("http://localhost:8000/?text={}".format(text))) ``` +and then run the script by `python Colossal_Inference_rayserve.py` +### Terminate Ray Serve +Ray serve and the application would terminate automatically as you choose the second method to run any job in the script. 
If you choose the first method (serve run), you might want to apply `ctrl+c` to shut down the application. -Use +To make sure all the active Ray processes are killed, run ```bash ray stop ``` - -to kill any active Ray processes From 5655b73ba31472a7995dcd0195b5911e031c2308 Mon Sep 17 00:00:00 2001 From: yuanheng-zhao Date: Fri, 29 Sep 2023 18:01:53 +0800 Subject: [PATCH 07/12] use app builder and autoscaling --- .../ray_serve/Colossal_Inference_rayserve.py | 24 +++++++++++++++---- .../inference/serving/ray_serve/README.md | 2 +- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/examples/inference/serving/ray_serve/Colossal_Inference_rayserve.py b/examples/inference/serving/ray_serve/Colossal_Inference_rayserve.py index 1b9e38220304..6a15687e9cf2 100644 --- a/examples/inference/serving/ray_serve/Colossal_Inference_rayserve.py +++ b/examples/inference/serving/ray_serve/Colossal_Inference_rayserve.py @@ -1,13 +1,14 @@ import logging import os from dataclasses import dataclass -from typing import Any, List, Union +from typing import Any, Dict, List, Union import ray import ray.util.collective as collective import starlette import torch from ray import serve +from ray.serve import Application from transformers import BloomForCausalLM, BloomTokenizerFast import colossalai @@ -79,7 +80,16 @@ def generate(self, text: Union[str, List[str]]) -> str: return text_output -@serve.deployment(num_replicas=1, ray_actor_options={"num_gpus": 0}) +@serve.deployment( + ray_actor_options={"num_cpus": 1, "num_gpus": 0}, + max_concurrent_queries=5, + autoscaling_config={ + "target_num_ongoing_requests_per_replica": 1, + "min_replicas": 1, + "initial_replicas": 1, + "max_replicas": 1, + }, +) class Driver: def __init__(self, config): log_cuda_info("Driver:init") @@ -133,6 +143,10 @@ class Config: max_output_len: int = 32 -# *** add model path manually into the config*** -driver_config = Config(model_path="ADD_MODEL_PATH_HRER") -app = Driver.bind(config=driver_config) +def app(args: Dict[str, str]) -> Application: + model_path = args.get("path", None) + if model_path is None: + raise ValueError("Model path not provided!") + + driver_config = Config(model_path=model_path) + return Driver.options(name="Colossal-Inference-Driver").bind(config=driver_config) diff --git a/examples/inference/serving/ray_serve/README.md b/examples/inference/serving/ray_serve/README.md index 93ac5838f2c2..901ea433d62c 100644 --- a/examples/inference/serving/ray_serve/README.md +++ b/examples/inference/serving/ray_serve/README.md @@ -37,7 +37,7 @@ pip install transformers Under the current directory, we could launch the app by the following command: ```bash - RAY_DEDUP_LOGS=0 serve run Colossal_Inference_rayserve:app + RAY_DEDUP_LOGS=0 serve run Colossal_Inference_rayserve:app path="PATH_TO_YOUR_MODEL_DIR" ``` By default, Ray deduplicates logs across cluster. Here we set `RAY_DEDUP_LOGS=0`` to disable log deduplication, enabling each actor to log information in CLI. 
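The `Driver` deployment configured above only benefits from `@serve.batch(batch_wait_timeout_s=0.8, max_batch_size=4)` and its `max_concurrent_queries` cap when several requests are in flight at once. Below is a minimal client-side sketch for exercising that dynamic batching; it is not part of the patches, uses plain threads instead of the Ray remote tasks in `send_requests.py`, and the prompts and port are assumptions that mirror the example scripts in this PR.

```python
# concurrent_client.py - a sketch for exercising the Driver's dynamic batching.
# Plain threads stand in for the Ray remote tasks used by send_requests.py.
from concurrent.futures import ThreadPoolExecutor

import requests

prompts = [
    "Introduce some landmarks in Beijing",
    "What is the weather today",
    "Coding requires practice and patience",
    "Rainy days inspire cozy reading",
]


def query(text: str) -> str:
    # Same GET endpoint the Driver's __call__ handler reads the `text` query param from.
    return requests.get("http://localhost:8000/", params={"text": text}).text


with ThreadPoolExecutor(max_workers=len(prompts)) as pool:
    for result in pool.map(query, prompts):
        print(result)
```

Sending the prompts concurrently lets the 0.8 s batch window collect them into a single `batch_generate` call instead of handling them one by one.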
From f3e80fe949530aba3be0aaf418275f1f9c859d83 Mon Sep 17 00:00:00 2001 From: yuanheng-zhao Date: Mon, 2 Oct 2023 10:01:57 +0800 Subject: [PATCH 08/12] trivial: input args --- .../ray_serve/Colossal_Inference_rayserve.py | 41 +++++++++---------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/examples/inference/serving/ray_serve/Colossal_Inference_rayserve.py b/examples/inference/serving/ray_serve/Colossal_Inference_rayserve.py index 6a15687e9cf2..973789e41b66 100644 --- a/examples/inference/serving/ray_serve/Colossal_Inference_rayserve.py +++ b/examples/inference/serving/ray_serve/Colossal_Inference_rayserve.py @@ -1,12 +1,12 @@ import logging import os -from dataclasses import dataclass -from typing import Any, Dict, List, Union +from typing import Any, List, Union import ray import ray.util.collective as collective import starlette import torch +from pydantic import BaseModel from ray import serve from ray.serve import Application from transformers import BloomForCausalLM, BloomTokenizerFast @@ -19,6 +19,16 @@ ray_serve_logger = logging.getLogger("ray.serve") +class GenConfigArgs(BaseModel): + """Config for generation""" + + path: str + tp_size: int = 2 + max_batch_size: int = 4 + max_input_len: int = 128 + max_output_len: int = 32 + + def log_cuda_info(scope_name: str): ray_serve_logger.info(f" {scope_name}: ray.get_gpu_ids(): {ray.get_gpu_ids()}") ray_serve_logger.info( @@ -91,9 +101,9 @@ def generate(self, text: Union[str, List[str]]) -> str: }, ) class Driver: - def __init__(self, config): + def __init__(self, config: GenConfigArgs): log_cuda_info("Driver:init") - model_path = config.model_path + model_path = config.path tp_size = config.tp_size self.num_workers = tp_size @@ -132,21 +142,10 @@ async def __call__(self, request: starlette.requests.Request) -> Any: return await self.batch_generate(request.query_params["text"]) -@dataclass -class Config: - """temp config""" - - model_path: str - tp_size: int = 2 - max_batch_size: int = 4 - max_input_len: int = 128 - max_output_len: int = 32 - - -def app(args: Dict[str, str]) -> Application: - model_path = args.get("path", None) - if model_path is None: - raise ValueError("Model path not provided!") +def app(args: GenConfigArgs) -> Application: + print(args) + if args.path is None or not os.path.exists(args.path): + raise ValueError("Model path not provided or invalid path!") - driver_config = Config(model_path=model_path) - return Driver.options(name="Colossal-Inference-Driver").bind(config=driver_config) + # driver_config = Config(model_path=model_path) + return Driver.options(name="Colossal-Inference-Driver").bind(config=args) From 5285049109ff510f2503f51e70b0ad98bbbf2a69 Mon Sep 17 00:00:00 2001 From: yuanheng-zhao Date: Mon, 2 Oct 2023 11:41:43 +0800 Subject: [PATCH 09/12] clean code; revise readme --- .../ray_serve/Colossal_Inference_rayserve.py | 1 - .../inference/serving/ray_serve/README.md | 46 +++++++++++++------ .../serving/ray_serve/send_request.py | 1 - .../serving/ray_serve/send_requests.py | 1 - 4 files changed, 32 insertions(+), 17 deletions(-) diff --git a/examples/inference/serving/ray_serve/Colossal_Inference_rayserve.py b/examples/inference/serving/ray_serve/Colossal_Inference_rayserve.py index 973789e41b66..b9dd0eee7179 100644 --- a/examples/inference/serving/ray_serve/Colossal_Inference_rayserve.py +++ b/examples/inference/serving/ray_serve/Colossal_Inference_rayserve.py @@ -147,5 +147,4 @@ def app(args: GenConfigArgs) -> Application: if args.path is None or not os.path.exists(args.path): raise 
ValueError("Model path not provided or invalid path!") - # driver_config = Config(model_path=model_path) return Driver.options(name="Colossal-Inference-Driver").bind(config=args) diff --git a/examples/inference/serving/ray_serve/README.md b/examples/inference/serving/ray_serve/README.md index 901ea433d62c..1d408238760b 100644 --- a/examples/inference/serving/ray_serve/README.md +++ b/examples/inference/serving/ray_serve/README.md @@ -4,9 +4,9 @@ This example is used for demonstrating and testing the deployment of Colossal In Single-gpu inference as well as multiple-gpu inference (i.e. tensor parallel) serving are supported. -## Env Installation +## Installation -Conda +### Conda Environment ```bash # create a new conda env with python 3.8 conda create -n ray_test python=3.8.18 @@ -37,12 +37,10 @@ pip install transformers Under the current directory, we could launch the app by the following command: ```bash - RAY_DEDUP_LOGS=0 serve run Colossal_Inference_rayserve:app path="PATH_TO_YOUR_MODEL_DIR" +RAY_DEDUP_LOGS=0 serve run Colossal_Inference_rayserve:app path="PATH_TO_YOUR_MODEL_DIR" ``` -By default, Ray deduplicates logs across cluster. Here we set `RAY_DEDUP_LOGS=0`` to disable log deduplication, enabling each actor to log information in CLI. - -`serve run` runs an application from the specified import path. The formats should be `:`. +By default, Ray deduplicates logs across cluster. Here we set `RAY_DEDUP_LOGS=0` to disable log deduplication, enabling each actor to log information in CLI. `serve run` runs an application from the specified import path. The formats should be `:`. Then we could send requests by running python script in another window: ```bash @@ -50,17 +48,37 @@ python send_request.py ``` ### Method #2. Run inside script -We could also launch ray serve and run the app inside the script. -Attach the following to the end of the file, -```pyhton -handle: DeploymentHandle = serve.run(app) -print(requests.get("http://localhost:8000/?text={}".format(text))) -``` -and then run the script by `python Colossal_Inference_rayserve.py` +We could also launch ray serve and run the app inside a single script by making some modifications: +To avoid ray handler from raising error in serializing pydantic objects, we'll replace the config class from `class GenConfigArgs(BaseModel)` to +```python +from dataclasses import dataclass +@dataclass +class GenConfigArgs: + # attributes remain unchanged +``` +Comment out the app builder +```python +# def app(args: GenConfigArgs) -> Application: +# ... +# return Driver.options(name="Colossal-Inference-Driver").bind(config=args) +``` +And attach the following lines to the end of the file, +```python +from ray.serve.handle import DeploymentHandle, DeploymentResponse + +app = Driver.bind(config=GenConfigArgs(path="")) +handle: DeploymentHandle = serve.run(app).options(use_new_handle_api=True) +response: DeploymentResponse = handle.batch_generate.remote(requests="Introduce some landmarks in Beijing") +print(response.result()) +``` +Then we could run the script +```python +python Colossal_Inference_rayserve.py +``` ### Terminate Ray Serve -Ray serve and the application would terminate automatically as you choose the second method to run any job in the script. If you choose the first method (serve run), you might want to apply `ctrl+c` to shut down the application. +Ray serve and the application would terminate automatically as you choose the second method to run any job in the script. 
If you choose the first method (serve run), you might want to apply `ctrl+c` to shut down the application, or use `serve shutdown` to shut down serve and deletes all applications on the ray cluster. To make sure all the active Ray processes are killed, run ```bash diff --git a/examples/inference/serving/ray_serve/send_request.py b/examples/inference/serving/ray_serve/send_request.py index 70a15724ab6c..3bab1764a1a5 100644 --- a/examples/inference/serving/ray_serve/send_request.py +++ b/examples/inference/serving/ray_serve/send_request.py @@ -4,7 +4,6 @@ @ray.remote def send_query(text): - # resp = requests.post("http://localhost:8000/", json={"text": text}) resp = requests.get("http://localhost:8000/?text={}".format(text)) return resp.text diff --git a/examples/inference/serving/ray_serve/send_requests.py b/examples/inference/serving/ray_serve/send_requests.py index 3e34b9c9a497..bee3b6b68c85 100644 --- a/examples/inference/serving/ray_serve/send_requests.py +++ b/examples/inference/serving/ray_serve/send_requests.py @@ -4,7 +4,6 @@ @ray.remote def send_query(text): - # resp = requests.post("http://localhost:8000/", json={"text": text}) resp = requests.get("http://localhost:8000/?text={}".format(text)) return resp.text From 5f6962ec376e01b6b50e040b13d9a740aea4ac0a Mon Sep 17 00:00:00 2001 From: yuanheng-zhao Date: Mon, 2 Oct 2023 11:44:20 +0800 Subject: [PATCH 10/12] testci (skip example test) --- examples/inference/serving/test_ci.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 examples/inference/serving/test_ci.sh diff --git a/examples/inference/serving/test_ci.sh b/examples/inference/serving/test_ci.sh new file mode 100644 index 000000000000..e69de29bb2d1 From 698d955628482a366caeda2fac011d9fabf2ab48 Mon Sep 17 00:00:00 2001 From: yuanheng-zhao Date: Mon, 2 Oct 2023 14:42:41 +0800 Subject: [PATCH 11/12] use auto model/tokenizer --- .../serving/ray_serve/Colossal_Inference_rayserve.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/inference/serving/ray_serve/Colossal_Inference_rayserve.py b/examples/inference/serving/ray_serve/Colossal_Inference_rayserve.py index b9dd0eee7179..51d520ebbcf6 100644 --- a/examples/inference/serving/ray_serve/Colossal_Inference_rayserve.py +++ b/examples/inference/serving/ray_serve/Colossal_Inference_rayserve.py @@ -9,7 +9,7 @@ from pydantic import BaseModel from ray import serve from ray.serve import Application -from transformers import BloomForCausalLM, BloomTokenizerFast +from transformers import AutoModelForCausalLM, AutoTokenizer import colossalai from colossalai.inference.tensor_parallel.engine import TPInferEngine @@ -61,10 +61,11 @@ def setup(self, world_size, rank, port): log_cuda_info("Worker.setup") # Load model - self.tokenizer = BloomTokenizerFast.from_pretrained(self.model_path) - self.tokenizer.pad_token = self.tokenizer.eos_token - self.model = BloomForCausalLM.from_pretrained( - self.model_path, pad_token_id=self.tokenizer.eos_token_id, torch_dtype=torch.float16 + self.tokenizer = AutoTokenizer.from_pretrained(self.model_path) + if self.tokenizer.pad_token is None: + self.tokenizer.pad_token = self.tokenizer.eos_token + self.model = AutoModelForCausalLM.from_pretrained( + self.model_path, pad_token_id=self.tokenizer.pad_token_id, torch_dtype=torch.float16 ) shard_config = ShardConfig(enable_tensor_parallelism=True if world_size > 1 else False, inference_only=True) From 0dd634d823e1ff90ef2f29605ce96299f1a1ab2c Mon Sep 17 00:00:00 2001 From: yuanheng-zhao Date: Mon, 2 Oct 2023 
15:28:47 +0800 Subject: [PATCH 12/12] revert imports fix (fixed in other PRs) --- colossalai/inference/tensor_parallel/modeling/__init__.py | 2 ++ colossalai/kernel/triton/__init__.py | 5 ++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/colossalai/inference/tensor_parallel/modeling/__init__.py b/colossalai/inference/tensor_parallel/modeling/__init__.py index 4662368b17b4..279b54065eed 100644 --- a/colossalai/inference/tensor_parallel/modeling/__init__.py +++ b/colossalai/inference/tensor_parallel/modeling/__init__.py @@ -1,3 +1,5 @@ +import _utils + from .bloom import BloomInferenceForwards from .chatglm2 import ChatGLM2InferenceForwards from .llama import LlamaInferenceForwards diff --git a/colossalai/kernel/triton/__init__.py b/colossalai/kernel/triton/__init__.py index a0f0313954b5..87ea9cf6536e 100644 --- a/colossalai/kernel/triton/__init__.py +++ b/colossalai/kernel/triton/__init__.py @@ -6,8 +6,7 @@ from .context_attention import bloom_context_attn_fwd, llama_context_attn_fwd from .copy_kv_cache_dest import copy_kv_cache_to_dest from .fused_layernorm import layer_norm - - # from .gptq_triton import gptq_fused_linear_triton + from .gptq_triton import gptq_fused_linear_triton from .rms_norm import rmsnorm_forward from .rotary_embedding_kernel import rotary_embedding_fwd from .softmax import softmax @@ -22,7 +21,7 @@ "copy_kv_cache_to_dest", "rotary_embedding_fwd", "token_attention_fwd", - # "gptq_fused_linear_triton", + "gptq_fused_linear_triton", ] except ImportError:
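The final hunk restores `gptq_fused_linear_triton` to the imports and the `__all__` list inside `colossalai/kernel/triton/__init__.py`'s `try` block, and the patch ends on the `except ImportError:` context line. For readers unfamiliar with that layout, here is a minimal, hedged sketch of the optional-dependency pattern involved; the flag name, message, and helper are illustrative and not copied from the actual file.

```python
# A generic sketch of guarding Triton-backed kernels behind an optional import.
try:
    import triton  # optional dependency; the kernel imports would follow here

    HAS_TRITON = True
except ImportError:
    HAS_TRITON = False
    print("Triton is not installed; Triton-backed kernels are unavailable.")


def pick_backend() -> str:
    """Illustrative helper: choose a code path based on the availability flag."""
    return "triton" if HAS_TRITON else "torch-fallback"


print(pick_backend())
```

This mirrors the structure visible in the hunk, where both the kernel imports and the `__all__` list sit above the `except ImportError:` line, so the package degrades gracefully when Triton is absent.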