From 1603523237b6f16d73956a29a4b45720376b3e84 Mon Sep 17 00:00:00 2001
From: Yuanheng
Date: Sun, 24 Mar 2024 17:21:55 +0800
Subject: [PATCH 1/8] revise grok-1 example

---
 examples/language/grok-1/inference.py     | 24 ++++++++++++++-----
 examples/language/grok-1/inference_tp.py  | 24 ++++++++++++++-----
 examples/language/grok-1/requirements.txt |  1 -
 .../language/grok-1/run_inference_fast.sh |  2 +-
 .../language/grok-1/run_inference_slow.sh |  2 +-
 examples/language/grok-1/utils.py         |  6 ++---
 6 files changed, 41 insertions(+), 18 deletions(-)

diff --git a/examples/language/grok-1/inference.py b/examples/language/grok-1/inference.py
index ca0ad0d4fe95..2c30013d3c1c 100644
--- a/examples/language/grok-1/inference.py
+++ b/examples/language/grok-1/inference.py
@@ -1,8 +1,7 @@
 import time
 
 import torch
-from sentencepiece import SentencePieceProcessor
-from transformers import AutoModelForCausalLM
+from transformers import AutoModelForCausalLM, LlamaTokenizerFast
 from utils import get_defualt_parser, inference, print_output
 
 if __name__ == "__main__":
@@ -16,11 +15,15 @@
         device_map="auto",
         torch_dtype=torch.bfloat16,
     )
-    sp = SentencePieceProcessor(model_file=args.tokenizer)
+    model.eval()
+    init_time = time.time() - start
+
+    tokenizer = LlamaTokenizerFast.from_pretrained("Xenova/grok-1-tokenizer")
+
     for text in args.text:
         output = inference(
             model,
-            sp,
+            tokenizer,
             text,
             max_new_tokens=args.max_new_tokens,
             do_sample=args.do_sample,
@@ -28,5 +31,14 @@
             top_k=args.top_k,
             top_p=args.top_p,
         )
-        print_output(text, sp.decode(output))
-    print(f"Overall time: {time.time() - start} seconds.")
+        print_output(text, tokenizer.decode(output))
+
+    overall_time = time.time() - start
+    gen_latency = overall_time - init_time
+    avg_gen_latency = gen_latency / len(args.text)
+    print(
+        f"Initializing time: {init_time} seconds.\n"
+        f"Overall time: {overall_time} seconds. \n"
+        f"Generation latency: {gen_latency} seconds. \n"
+        f"Average generation latency: {avg_gen_latency} seconds. \n"
+    )
diff --git a/examples/language/grok-1/inference_tp.py b/examples/language/grok-1/inference_tp.py
index 99de60e1f6be..58ca2742d944 100644
--- a/examples/language/grok-1/inference_tp.py
+++ b/examples/language/grok-1/inference_tp.py
@@ -2,8 +2,7 @@
 
 import torch
 from grok1_policy import Grok1ForCausalLMPolicy
-from sentencepiece import SentencePieceProcessor
-from transformers import AutoModelForCausalLM
+from transformers import AutoModelForCausalLM, LlamaTokenizerFast
 from utils import get_defualt_parser, inference, print_output
 
 import colossalai
@@ -33,11 +32,15 @@
         args.pretrained, trust_remote_code=True, torch_dtype=torch.bfloat16
     )
     model, *_ = booster.boost(model)
-    sp = SentencePieceProcessor(model_file=args.tokenizer)
+    model.eval()
+    init_time = time.time() - start
+
+    tokenizer = LlamaTokenizerFast.from_pretrained("Xenova/grok-1-tokenizer")
+
     for text in args.text:
         output = inference(
             model.unwrap(),
-            sp,
+            tokenizer,
             text,
             max_new_tokens=args.max_new_tokens,
             do_sample=args.do_sample,
@@ -46,5 +49,14 @@
             top_p=args.top_p,
         )
         if coordinator.is_master():
-            print_output(text, sp.decode(output))
-    coordinator.print_on_master(f"Overall time: {time.time() - start} seconds.")
+            print_output(text, tokenizer.decode(output))
+
+    overall_time = time.time() - start
+    gen_latency = overall_time - init_time
+    avg_gen_latency = gen_latency / len(args.text)
+    coordinator.print_on_master(
+        f"Initializing time: {init_time} seconds.\n"
+        f"Overall time: {overall_time} seconds. \n"
+        f"Generation latency: {gen_latency} seconds. \n"
\n" + f"Average generation latency: {avg_gen_latency} seconds. \n" + ) diff --git a/examples/language/grok-1/requirements.txt b/examples/language/grok-1/requirements.txt index 15d5ea53a15e..2d94924ce3b4 100644 --- a/examples/language/grok-1/requirements.txt +++ b/examples/language/grok-1/requirements.txt @@ -1,4 +1,3 @@ torch>=2.1.0,<2.2.0 colossalai>=0.3.6 -sentencepiece==0.1.99 transformers==4.35.0 diff --git a/examples/language/grok-1/run_inference_fast.sh b/examples/language/grok-1/run_inference_fast.sh index 0dc398c53e33..644a264bdf1c 100755 --- a/examples/language/grok-1/run_inference_fast.sh +++ b/examples/language/grok-1/run_inference_fast.sh @@ -5,7 +5,7 @@ TOKENIZER=${2:-"tokenizer.model"} torchrun --standalone --nproc_per_node 8 inference_tp.py --pretrained "$PRETRAINED" \ --tokenizer "$TOKENIZER" \ - --max_new_tokens 64 \ + --max_new_tokens 100 \ --text "The company's annual conference, featuring keynote speakers and exclusive product launches, will be held at the Los Angeles Convention Center from October 20th to October 23rd, 2021. Extract the date mentioned in the above sentence." \ "将以下句子翻译成英语。 我喜欢看电影和读书。" \ "All books have the same weight, 10 books weigh 5kg, what is the weight of 2 books?" diff --git a/examples/language/grok-1/run_inference_slow.sh b/examples/language/grok-1/run_inference_slow.sh index c64dd93b9e62..6322e8cb9754 100755 --- a/examples/language/grok-1/run_inference_slow.sh +++ b/examples/language/grok-1/run_inference_slow.sh @@ -5,7 +5,7 @@ TOKENIZER=${2:-"tokenizer.model"} python3 inference.py --pretrained "$PRETRAINED" \ --tokenizer "$TOKENIZER" \ - --max_new_tokens 64 \ + --max_new_tokens 100 \ --text "The company's annual conference, featuring keynote speakers and exclusive product launches, will be held at the Los Angeles Convention Center from October 20th to October 23rd, 2021. Extract the date mentioned in the above sentence." \ "将以下句子翻译成英语。 我喜欢看电影和读书。" \ "All books have the same weight, 10 books weigh 5kg, what is the weight of 2 books?" 
diff --git a/examples/language/grok-1/utils.py b/examples/language/grok-1/utils.py
index f113f852eff6..7663127a5515 100644
--- a/examples/language/grok-1/utils.py
+++ b/examples/language/grok-1/utils.py
@@ -20,9 +20,9 @@ def print_output(text, output):
 
 
 @torch.no_grad()
-def inference(model, sp, text, **generate_kwargs):
-    input_ids = sp.encode(text)
-    input_ids = torch.tensor([input_ids]).cuda()
+def inference(model, tokenizer, text, **generate_kwargs):
+    input_ids = tokenizer(text, return_tensors="pt").input_ids
+    input_ids = input_ids.cuda()
     attention_mask = torch.ones_like(input_ids)
     inputs = {
         "input_ids": input_ids,

From f45e6a240d16f8245e7a70e4f451a9ecd51df92f Mon Sep 17 00:00:00 2001
From: Yuanheng
Date: Sun, 24 Mar 2024 17:24:16 +0800
Subject: [PATCH 2/8] remove unused arg in scripts

---
 examples/language/grok-1/run_inference_fast.sh | 2 --
 examples/language/grok-1/run_inference_slow.sh | 2 --
 2 files changed, 4 deletions(-)

diff --git a/examples/language/grok-1/run_inference_fast.sh b/examples/language/grok-1/run_inference_fast.sh
index 644a264bdf1c..1ccd8383dc5e 100755
--- a/examples/language/grok-1/run_inference_fast.sh
+++ b/examples/language/grok-1/run_inference_fast.sh
@@ -1,10 +1,8 @@
 #!/usr/bin/env bash
 
 PRETRAINED=${1:-"hpcaitech/grok-1"}
-TOKENIZER=${2:-"tokenizer.model"}
 
 torchrun --standalone --nproc_per_node 8 inference_tp.py --pretrained "$PRETRAINED" \
-    --tokenizer "$TOKENIZER" \
     --max_new_tokens 100 \
     --text "The company's annual conference, featuring keynote speakers and exclusive product launches, will be held at the Los Angeles Convention Center from October 20th to October 23rd, 2021. Extract the date mentioned in the above sentence." \
     "将以下句子翻译成英语。 我喜欢看电影和读书。" \
     "All books have the same weight, 10 books weigh 5kg, what is the weight of 2 books?"
diff --git a/examples/language/grok-1/run_inference_slow.sh b/examples/language/grok-1/run_inference_slow.sh
index 6322e8cb9754..a857828925dd 100755
--- a/examples/language/grok-1/run_inference_slow.sh
+++ b/examples/language/grok-1/run_inference_slow.sh
@@ -1,10 +1,8 @@
 #!/usr/bin/env bash
 
 PRETRAINED=${1:-"hpcaitech/grok-1"}
-TOKENIZER=${2:-"tokenizer.model"}
 
 python3 inference.py --pretrained "$PRETRAINED" \
-    --tokenizer "$TOKENIZER" \
     --max_new_tokens 100 \
     --text "The company's annual conference, featuring keynote speakers and exclusive product launches, will be held at the Los Angeles Convention Center from October 20th to October 23rd, 2021. Extract the date mentioned in the above sentence." \
\ "将以下句子翻译成英语。 我喜欢看电影和读书。" \ From 78fe442eca074b9bed03b6bf35cbf3429800acf3 Mon Sep 17 00:00:00 2001 From: Yuanheng Date: Sun, 24 Mar 2024 17:28:52 +0800 Subject: [PATCH 3/8] prevent re-installing torch --- requirements/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 095617d76355..3c1b5c458fdc 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -8,7 +8,7 @@ click fabric contexttimer ninja -torch>=1.12 +torch>=1.12,<2.2.0 safetensors einops pydantic From f4bf6a8638c769fda6f6b80e6e3bb7758bee7429 Mon Sep 17 00:00:00 2001 From: Yuanheng Date: Sun, 24 Mar 2024 17:41:01 +0800 Subject: [PATCH 4/8] update readme --- examples/language/grok-1/README.md | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/examples/language/grok-1/README.md b/examples/language/grok-1/README.md index c523f941262d..d36db0da5baf 100644 --- a/examples/language/grok-1/README.md +++ b/examples/language/grok-1/README.md @@ -11,33 +11,23 @@ cd examples/language/grok-1 pip install -r requirements.txt ``` -## Tokenizer preparation - -You should download the tokenizer from the official grok-1 repository. - -```bash -wget https://github.com/xai-org/grok-1/raw/main/tokenizer.model -``` - ## Inference You need 8x A100 80GB or equivalent GPUs to run the inference. -We provide two scripts for inference. `run_inference_fast.sh` uses tensor parallelism provided by ColossalAI, and it is faster. `run_inference_slow.sh` uses auto device provided by transformers, and it is slower. +We provide two scripts for inference. `run_inference_fast.sh` uses tensor parallelism provided by ColossalAI, which is faster for generation, while `run_inference_slow.sh` uses auto device provided by transformers, which is relatively slower. -Command format: +Command example: ```bash -./run_inference_fast.sh -./run_inference_slow.sh +./run_inference_fast.sh +./run_inference_slow.sh ``` -`model_name_or_path` can be a local path or a model name from Hugging Face model hub. We provided weights on model hub, named `hpcaitech/grok-1`. - -Command example: - +`MODEL_NAME_OR_PATH` can be a model name from Hugging Face model hub or a local path to PyTorch-version model checkpoints. We provided weights on model hub, named `hpcaitech/grok-1`. And you could also download the weights in advance using `git`: ```bash -./run_inference_fast.sh hpcaitech/grok-1 tokenizer.model +git lfs install +git clone https://huggingface.co/hpcai-tech/grok-1 ``` -It will take 5-10 minutes to load checkpoints. Don't worry, it's not stuck. +It will take, depending on your Internet speed, several hours to tens of hours to download checkpoints (about 600G!), and 5-10 minutes to load checkpoints when it's ready to launch the inference. Don't worry, it's not stuck. 
From 7b5ee7e14302094dd5b3c4056729d41d2e8bbb1f Mon Sep 17 00:00:00 2001
From: Yuanheng
Date: Sun, 24 Mar 2024 17:48:35 +0800
Subject: [PATCH 5/8] revert modifying colossalai requirements

---
 requirements/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index 3c1b5c458fdc..095617d76355 100644
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -8,7 +8,7 @@ click
 fabric
 contexttimer
 ninja
-torch>=1.12,<2.2.0
+torch>=1.12
 safetensors
 einops
 pydantic

From fb8a6d3b7fd8a3240eb79fb389cbb1f5908a12cd Mon Sep 17 00:00:00 2001
From: Yuanheng
Date: Sun, 24 Mar 2024 18:15:25 +0800
Subject: [PATCH 6/8] add perf

---
 examples/language/grok-1/README.md | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/examples/language/grok-1/README.md b/examples/language/grok-1/README.md
index d36db0da5baf..da03be152033 100644
--- a/examples/language/grok-1/README.md
+++ b/examples/language/grok-1/README.md
@@ -1,6 +1,6 @@
 # Grok-1 Inference
 
-## Install
+## Installation
 
 ```bash
 # Make sure you install colossalai from the latest source code
@@ -31,3 +31,16 @@ git clone https://huggingface.co/hpcai-tech/grok-1
 ```
 
 Depending on your Internet speed, downloading the checkpoints (about 600G!) will take several hours to tens of hours, and loading them will take another 5-10 minutes once inference launches. Don't worry, it's not stuck.
+
+
+## Performance
+
+For a request with batch size set to 1 and maximum generation length set to 100:
+
+| Method                  | Initialization Duration (sec) | Average Generation Latency (sec) |
+|-------------------------|-------------------------------|----------------------------------|
+| ColossalAI              | 431.45                        | 14.92                            |
+| HuggingFace Auto-Device | 426.96                        | 48.38                            |
+| JAX                     | 147.61                        | 56.25                            |
+
+Tested on 8x80G NVIDIA H800.

From 00abba7a51935523727d9b7d006a1738e8d62b5c Mon Sep 17 00:00:00 2001
From: Yuanheng
Date: Sun, 24 Mar 2024 18:15:51 +0800
Subject: [PATCH 7/8] trivial: format latency stats to two decimal places

---
 examples/language/grok-1/inference.py    | 8 ++++----
 examples/language/grok-1/inference_tp.py | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/examples/language/grok-1/inference.py b/examples/language/grok-1/inference.py
index 2c30013d3c1c..8b47cf5f4281 100644
--- a/examples/language/grok-1/inference.py
+++ b/examples/language/grok-1/inference.py
@@ -37,8 +37,8 @@
     gen_latency = overall_time - init_time
     avg_gen_latency = gen_latency / len(args.text)
     print(
-        f"Initializing time: {init_time} seconds.\n"
-        f"Overall time: {overall_time} seconds. \n"
-        f"Generation latency: {gen_latency} seconds. \n"
-        f"Average generation latency: {avg_gen_latency} seconds. \n"
+        f"Initializing time: {init_time:.2f} seconds.\n"
+        f"Overall time: {overall_time:.2f} seconds. \n"
+        f"Generation latency: {gen_latency:.2f} seconds. \n"
+        f"Average generation latency: {avg_gen_latency:.2f} seconds. \n"
     )
diff --git a/examples/language/grok-1/inference_tp.py b/examples/language/grok-1/inference_tp.py
index 58ca2742d944..664e837a1235 100644
--- a/examples/language/grok-1/inference_tp.py
+++ b/examples/language/grok-1/inference_tp.py
@@ -55,8 +55,8 @@
     gen_latency = overall_time - init_time
     avg_gen_latency = gen_latency / len(args.text)
     coordinator.print_on_master(
-        f"Initializing time: {init_time} seconds.\n"
-        f"Overall time: {overall_time} seconds. \n"
-        f"Generation latency: {gen_latency} seconds. \n"
-        f"Average generation latency: {avg_gen_latency} seconds. \n"
\n" + f"Initializing time: {init_time:.2f} seconds.\n" + f"Overall time: {overall_time:.2f} seconds. \n" + f"Generation latency: {gen_latency:.2f} seconds. \n" + f"Average generation latency: {avg_gen_latency:.2f} seconds. \n" ) From 1129fad31562d3ddca8defedbd63a348ffe77c00 Mon Sep 17 00:00:00 2001 From: ocd_with_naming Date: Sun, 24 Mar 2024 19:58:07 +0800 Subject: [PATCH 8/8] add tokenizer url --- examples/language/grok-1/inference.py | 2 ++ examples/language/grok-1/inference_tp.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/examples/language/grok-1/inference.py b/examples/language/grok-1/inference.py index 8b47cf5f4281..a73820af90bb 100644 --- a/examples/language/grok-1/inference.py +++ b/examples/language/grok-1/inference.py @@ -18,6 +18,8 @@ model.eval() init_time = time.time() - start + # A transformers-compatible version of the grok-1 tokenizer by Xenova + # https://huggingface.co/Xenova/grok-1-tokenizer tokenizer = LlamaTokenizerFast.from_pretrained("Xenova/grok-1-tokenizer") for text in args.text: diff --git a/examples/language/grok-1/inference_tp.py b/examples/language/grok-1/inference_tp.py index 664e837a1235..604de14877f5 100644 --- a/examples/language/grok-1/inference_tp.py +++ b/examples/language/grok-1/inference_tp.py @@ -35,6 +35,8 @@ model.eval() init_time = time.time() - start + # A transformers-compatible version of the grok-1 tokenizer by Xenova + # https://huggingface.co/Xenova/grok-1-tokenizer tokenizer = LlamaTokenizerFast.from_pretrained("Xenova/grok-1-tokenizer") for text in args.text: