18 changes: 9 additions & 9 deletions colossalai/inference/README.md
@@ -94,24 +94,24 @@ For various models, experiments were conducted using multiple batch sizes under

### Single GPU Performance:

Currently, the stats below are measured on a single A100 GPU. Token latency is computed from the combined time of the context (prefill) forward pass and the decoding forward passes, i.e. both stages are included when calculating token generation time. We are actively developing new features and methods to further optimize the performance of LLM inference. Please stay tuned.
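
In practice this combined figure can be obtained by timing one full `generate()` call (prefill plus every decoding step) and dividing by the number of newly generated tokens, which is how the benchmark script added below reports it. A minimal sketch, with `generate_fn`, `inputs`, and `input_len` as illustrative placeholders:

```python
import time

import torch


def per_token_latency(generate_fn, inputs, input_len):
    """Time one full generation (context forward + all decoding forwards)
    and average the elapsed time over the newly generated tokens."""
    torch.cuda.synchronize()
    start = time.time()
    outputs = generate_fn(inputs)    # runs prefill and all decode steps in one call
    torch.cuda.synchronize()
    elapsed = time.time() - start
    new_tokens = outputs.shape[1] - input_len    # tokens produced beyond the prompt
    return elapsed / new_tokens
```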

#### Llama

| batch_size | 8 | 16 | 32 |
| :---------------------: | :----: | :----: | :----: |
-| hugging-face torch fp16 | 199.12 | 246.56 | 246.56 |
-| colossal-inference | 241.12 | 451.84 | 643.52 |
+| hugging-face torch fp16 | 199.12 | 246.56 | 278.4 |
+| colossal-inference | 326.4 | 582.72 | 816.64 |

-![llama](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/inference/Infer-llama.png)
+![llama](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/inference/Infer-llama7b.png)

#### Bloom

-| batch_size | 4 | 8 |
-| :---------------------: | :----: | :----: |
-| hugging-face torch fp16 | 145.28 | 189.68 |
-| colossal-inference | 187.48 | 323.28 |
+| batch_size | 8 | 16 | 32 |
+| :---------------------: | :----: | :----: | :----: |
+| hugging-face torch fp16 | 189.68 | 226.66 | 249.61 |
+| colossal-inference | 323.28 | 538.52 | 611.64 |

-![bloom](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/inference/Infer-bloom.png)
+![bloom](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/inference/Infer-bloom7b.png)

Results for more models are coming soon!
106 changes: 106 additions & 0 deletions examples/inference/bench_bloom.py
@@ -0,0 +1,106 @@
import os
import time

import pytest
import torch
from transformers import BloomForCausalLM, BloomTokenizerFast

import colossalai
from colossalai.cluster import ProcessGroupMesh
from colossalai.inference.tensor_parallel.engine import TPInferEngine
from colossalai.logging import disable_existing_loggers
from colossalai.shardformer import ShardConfig, ShardFormer
from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn

os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
TPSIZE = 1
MAX_BATCH_SIZE = 32
MAX_INPUT_LEN = 1024
MAX_OUTPUT_LEN = 128


def print_perf_stats(latency_set, config, bs, warmup=3):
    # trim warmup queries
    latency_set = list(latency_set)
    latency_set = latency_set[warmup:]
    count = len(latency_set)

    if count > 0:
        latency_set.sort()
        avg = sum(latency_set) / count
        num_layers = getattr(config, "num_layers", config.num_hidden_layers)
        num_parameters = num_layers * config.hidden_size * config.hidden_size * 12
        num_bytes = 2    # float16

        print("Avg Per Token Latency: {0:8.2f} ms".format(avg * 1000))
        print("Avg BW: {0:8.2f} GB/s".format(1 / avg * num_parameters * num_bytes / 1e9))
        print("Avg flops: {0:8.2f} TFlops/s".format(1 / avg * num_parameters * num_bytes * bs / 1e12))
        print("Avg Throughput: tokens/s: {}".format((1000 / (avg * 1000)) * bs))


@parameterize('test_config', [{
    'tp_size': TPSIZE,
}])
def bench_bloom(test_config):
    model_path = "/home/lczyh/data3/models/bloom-7b1"    # local BLOOM-7B1 checkpoint; point this at your own copy
    tokenizer = BloomTokenizerFast.from_pretrained(model_path)
    tokenizer.pad_token = tokenizer.eos_token
    model = BloomForCausalLM.from_pretrained(model_path, pad_token_id=tokenizer.eos_token_id)
    model = model.half()
    # To benchmark torch original, uncomment the following line
    # model.to(torch.cuda.current_device())

    # init TPInferEngine and shard the original model with shardformer
    # To benchmark torch original, comment out the lines that create, prepare, and shard with the shardformer
    infer_engine = TPInferEngine(model, MAX_BATCH_SIZE, MAX_INPUT_LEN, MAX_OUTPUT_LEN)
    shard_config = ShardConfig(enable_tensor_parallelism=test_config['tp_size'] > 1, inference_only=True)
    shardformer = ShardFormer(shard_config=shard_config)
    infer_engine.prepare_with_shard_config(shard_config)
    infer_engine.shard_model_by(shardformer)

    # prepare data for generation
    batch_size = MAX_BATCH_SIZE
    input_len = MAX_INPUT_LEN
    generate_kwargs = dict(max_new_tokens=MAX_OUTPUT_LEN, do_sample=False)
    input_tokens = {
        "input_ids": torch.randint(10, 1000, (batch_size, input_len)),
        "attention_mask": torch.ones((batch_size, input_len))
    }
    for t in input_tokens:
        if torch.is_tensor(input_tokens[t]):
            input_tokens[t] = input_tokens[t].to(torch.cuda.current_device())
            print(f" input_tokens[{t}].shape: {input_tokens[t].shape}")

    iters = 10
    times = []
    for i in range(iters):
        torch.cuda.synchronize()
        start = time.time()
        outputs = infer_engine.generate(input_tokens, generate_kwargs)
        torch.cuda.synchronize()
        end = time.time()
        # infer_engine.cache_manager.free_all()
        out_len = outputs.shape[1]
        print(f" iter {i}: out len {str(out_len)}, generation time {str(end - start)} s")
        times.append((end - start) / (out_len - input_len))

    print_perf_stats(times, model.config, batch_size)


def check_bloom(rank, world_size, port):
    disable_existing_loggers()
    colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
    bench_bloom()


@pytest.mark.dist
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_bloom():
    spawn(check_bloom, TPSIZE)


if __name__ == "__main__":
    test_bloom()
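
For comparison, the "torch original" baseline referred to in the comments above can be timed with the same loop around a plain Hugging Face `generate()` call. A minimal sketch, assuming `model` (fp16 `BloomForCausalLM`), `input_tokens`, and `generate_kwargs` are prepared exactly as in `bench_bloom()` but with the model moved to the GPU instead of being wrapped in `TPInferEngine`:

```python
import time

import torch

# `model`, `input_tokens`, `generate_kwargs`, `MAX_INPUT_LEN`, and `MAX_BATCH_SIZE`
# are assumed to come from the setup in bench_bloom() above.
model = model.to(torch.cuda.current_device())

times = []
for i in range(10):
    torch.cuda.synchronize()
    start = time.time()
    outputs = model.generate(**input_tokens, **generate_kwargs)
    torch.cuda.synchronize()
    end = time.time()
    out_len = outputs.shape[1]
    times.append((end - start) / (out_len - MAX_INPUT_LEN))

print_perf_stats(times, model.config, MAX_BATCH_SIZE)
```

Running the file directly (`python examples/inference/bench_bloom.py`) calls `test_bloom()`, which spawns `TPSIZE` processes, initializes the distributed environment via `colossalai.launch`, and then runs `bench_bloom()`.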