50 changes: 50 additions & 0 deletions LICENSE
@@ -477,3 +477,53 @@ Copyright 2021- HPC-AI Technology Inc. All rights reserved.
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


---------------- LICENSE FOR torch-int ----------------

MIT License

Copyright (c) 2022 Guangxuan Xiao

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


---------------- LICENSE FOR smoothquant ----------------

MIT License

Copyright (c) 2022 MIT HAN Lab

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -6,25 +6,20 @@

import torch
import torch.nn.functional as F
from einops import rearrange
from flash_attn.bert_padding import pad_input, unpad_input
from flash_attn.flash_attn_interface import flash_attn_func, flash_attn_varlen_kvpacked_func
from flash_attn.ops.rms_norm import rms_norm
from transformers.models.llama.modeling_llama import (
LlamaRMSNorm,
LlamaAttention,
LlamaModel,
LlamaForCausalLM,
LlamaModel,
LlamaRMSNorm,
apply_rotary_pos_emb,
repeat_kv,
)

from colossalai.logging import get_dist_logger
from einops import rearrange

from flash_attn.bert_padding import pad_input, unpad_input
from flash_attn.flash_attn_interface import (
flash_attn_func,
flash_attn_varlen_kvpacked_func,
)
from flash_attn.ops.rms_norm import rms_norm


logger = get_dist_logger()

@@ -65,6 +60,7 @@ def attention_forward(
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: bool = False,
use_cache: bool = False,
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
"""
Re-define LLaMA-2 `LlamaAttention` forward method using flash-attention.
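The new `**kwargs` in the signature lets the patched forward absorb extra keyword arguments (e.g. the `padding_mask` that newer `transformers` releases pass to attention modules) instead of raising a `TypeError`. A minimal sketch of how a re-defined forward like this is usually swapped in; `attention_forward` is the function from the diff above, while the helper name is illustrative and not part of this PR:

```
# Sketch only: monkey-patch LlamaAttention so every instance dispatches to the
# flash-attention forward defined above.
from transformers.models.llama.modeling_llama import LlamaAttention

def replace_llama_attn_with_flash_attn() -> None:
    # Rebinding the class attribute affects all existing and future instances.
    LlamaAttention.forward = attention_forward

replace_llama_attn_with_flash_attn()
```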
5 changes: 4 additions & 1 deletion colossalai/booster/plugin/gemini_plugin.py
@@ -245,6 +245,7 @@ class GeminiPlugin(DPPluginBase):
chunk_config_dict (dict, optional): chunk configuration dictionary.
chunk_init_device (torch.device, optional): device to initialize the chunk.
placement_policy (str, optional): "static" and "auto". Defaults to "static".
enable_gradient_accumulation (bool, optional): Whether to enable gradient accumulation. When set to True, gradients are retained after each backward pass so they can be accumulated across micro-batches. Defaults to False.
shard_param_frac (float, optional): fraction of parameters to be sharded. Only for "static" placement.
If `shard_param_frac` is 1.0, it's equal to zero-3. If `shard_param_frac` is 0.0, it's equal to zero-2. Defaults to 1.0.
offload_optim_frac (float, optional): fraction of optimizer states to be offloaded. Only for "static" placement.
@@ -257,7 +258,7 @@ class GeminiPlugin(DPPluginBase):
warmup_non_model_data_ratio (float, optional): ratio of expected non-model data memory during warmup. Only for "auto" placement. Defaults to 0.8.
steady_cuda_cap_ratio (float, optional): ratio of allowed cuda capacity for model data during steady state. Only for "auto" placement. Defaults to 0.9.
precision (str, optional): precision. Support 'fp16' and 'bf16'. Defaults to 'fp16'.
master_weights (bool, optional): master weights. Defaults to True.
master_weights (bool, optional): Whether to keep fp32 master parameter weights in optimizer. Defaults to True.
pin_memory (bool, optional): use pin memory on CPU. Defaults to False.
force_outputs_fp32 (bool, optional): force outputs are fp32. Defaults to False.
strict_ddp_mode (bool, optional): use strict ddp mode (only use dp without other parallelism). Defaults to False.
@@ -291,6 +292,7 @@ def __init__(
chunk_config_dict: Optional[dict] = None,
chunk_init_device: Optional[torch.device] = None,
placement_policy: str = "static",
enable_gradient_accumulation: bool = False,
shard_param_frac: float = 1.0, # only for static placement
offload_optim_frac: float = 0.0, # only for static placement
offload_param_frac: float = 0.0, # only for static placement
@@ -323,6 +325,7 @@
chunk_config_dict=chunk_config_dict,
chunk_init_device=(chunk_init_device or get_current_device()),
placement_policy=placement_policy,
enable_gradient_accumulation=enable_gradient_accumulation,
shard_param_frac=shard_param_frac,
offload_optim_frac=offload_optim_frac,
offload_param_frac=offload_param_frac,
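The new `enable_gradient_accumulation` flag is threaded through to the underlying Gemini configuration in the last hunk above. A hedged usage sketch with the standard `Booster` workflow; the toy model, the `HybridAdam` choice, and the accumulation window of 4 are illustrative, not part of this PR:

```
import torch
import torch.nn as nn

import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin
from colossalai.nn.optimizer import HybridAdam

colossalai.launch_from_torch(config={})

model = nn.Linear(32, 2).cuda()                  # toy model
optimizer = HybridAdam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

plugin = GeminiPlugin(placement_policy="static", enable_gradient_accumulation=True)
booster = Booster(plugin=plugin)
model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)

accum_steps = 4                                  # illustrative accumulation window
for step in range(8):
    inputs = torch.randn(16, 32).cuda()
    labels = torch.randint(0, 2, (16,)).cuda()
    loss = criterion(model(inputs), labels) / accum_steps
    booster.backward(loss, optimizer)            # gradients are retained, not freed
    if (step + 1) % accum_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
```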
2 changes: 1 addition & 1 deletion colossalai/booster/plugin/low_level_zero_plugin.py
@@ -335,4 +335,4 @@ def get_checkpoint_io(self) -> CheckpointIO:

def no_sync(self, model: nn.Module, optimizer: OptimizerWrapper) -> Iterator[None]:
assert isinstance(optimizer, LowLevelZeroOptimizer)
return optimizer.optim.no_sync()
return optimizer.no_sync()
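The fix returns the `LowLevelZeroOptimizer` wrapper's own `no_sync()` instead of reaching into the inner `optim`, so the wrapper can manage its ZeRO gradient bookkeeping itself. A sketch of how the context manager is typically consumed, assuming the booster forwards both arguments to the plugin's `no_sync` shown above; the micro-batch loop is illustrative:

```
# Sketch only: skip gradient synchronization for all but the last micro-batch.
with booster.no_sync(model, optimizer):
    for inputs, labels in micro_batches[:-1]:    # micro_batches: illustrative list
        loss = criterion(model(inputs), labels)
        booster.backward(loss, optimizer)        # gradients accumulate locally

inputs, labels = micro_batches[-1]
loss = criterion(model(inputs), labels)
booster.backward(loss, optimizer)                # synchronization happens here
optimizer.step()
```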
21 changes: 19 additions & 2 deletions colossalai/inference/README.md
@@ -4,7 +4,7 @@

## Introduction

`Colossal Inference` is a module that contains colossal-ai designed inference framework, featuring high performance, steady and easy usability. `Colossal Inference` incorporated the advantages of the latest open-source inference systems, including TGI, vLLM, FasterTransformer, LightLLM and flash attention. while combining the design of Colossal AI, especially Shardformer, to reduce the learning curve for users.
`Colossal Inference` is a module that contains the Colossal-AI-designed inference framework, featuring high performance, stability and ease of use. `Colossal Inference` incorporates the advantages of the latest open-source inference systems, including LightLLM, TGI, vLLM, FasterTransformer and flash attention, while combining the design of Colossal-AI, especially Shardformer, to reduce the learning curve for users.

## Design

@@ -62,6 +62,12 @@ triton==2.0.0.dev20221202
vllm
# to install flash-attention, please use commit hash: 67ae6fd74b4bc99c36b2ce524cf139c35663793c
flash-attention

# install lightllm since we depend on lightllm triton kernels
git clone https://github.com/ModelTC/lightllm
cd lightllm
git checkout 28c1267cfca536b7b4f28e921e03de735b003039
pip3 install -e .
```

### Docker
@@ -73,6 +79,17 @@ You can use docker run to use docker container to set-up environment
docker pull hpcaitech/colossalai-inference:v2
docker run -it --gpus all --name ANY_NAME -v $PWD:/workspace -w /workspace hpcaitech/colossalai-inference:v2 /bin/bash

# inside the docker container
cd /path/to/ColossalAI
pip install -e .

# install lightllm
git clone https://github.com/ModelTC/lightllm
cd lightllm
git checkout 28c1267cfca536b7b4f28e921e03de735b003039
pip3 install -e .
```

### Dive into fast-inference!
@@ -94,7 +111,7 @@ For various models, experiments were conducted using multiple batch sizes under

### Single GPU Performance:

Currently the stats below are calculated based on A100 (single GPU), and we calculate token latency based on average values of context-forward and decoding forward process, which means we combine both of processes to calculate token generation times. We are actively developing new features and methods to furthur optimize the performance of LLM models. Please stay tuned.
Currently the stats below are calculated based on A100 (single GPU), and we calculate token latency based on the average of the context (prefill) forward and decoding forward passes, which means both processes are combined when calculating token generation times. We are actively developing new features and methods to further optimize the performance of LLM models. Please stay tuned.
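Concretely, the accounting described above can be sketched as follows; the function and key names are illustrative, and `ts` corresponds to the per-forward `timestamps` entries recorded by the benchmark script later in this PR:

```
# Sketch of the token-latency accounting described above.
def token_latencies(ts, new_tokens):
    prefill = ts[1] - ts[0]                              # context (prefill) forward
    decode_avg = (ts[-1] - ts[1]) / max(len(ts) - 2, 1)  # average decoding step
    per_token = (ts[-1] - ts[0]) / new_tokens            # combined per-token latency
    return {"prefill_s": prefill, "decode_avg_s": decode_avg, "per_token_s": per_token}
```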

#### Llama

2 changes: 1 addition & 1 deletion colossalai/inference/__init__.py
@@ -1,3 +1,3 @@
from .pipeline import PPInferEngine

__all__ = ['PPInferEngine']
__all__ = ["PPInferEngine"]
2 changes: 1 addition & 1 deletion colossalai/inference/pipeline/__init__.py
@@ -1,3 +1,3 @@
from .engine import PPInferEngine

__all__ = ['PPInferEngine']
__all__ = ["PPInferEngine"]
97 changes: 58 additions & 39 deletions colossalai/inference/pipeline/benchmark/benchmark.py
@@ -1,28 +1,32 @@
import argparse
import time

import torch
import torch.distributed as dist
import transformers

import colossalai
import time
from colossalai.inference import PPInferEngine
from colossalai.inference.pipeline.policy.llama_ppinfer import LlamaForCausalLMPipelinePolicy
import argparse
GIGABYTE = 1024 ** 3

GIGABYTE = 1024**3
MEGABYTE = 1024 * 1024

colossalai.launch_from_torch(config={})

def data_gen(batch_size: int=4, seq_len: int=512):

def data_gen(batch_size: int = 4, seq_len: int = 512):
input_ids = torch.randint(10, 30000, (1, seq_len), dtype=torch.int32)
attention_mask = torch.ones((1, seq_len), dtype=torch.int32)
data = dict(input_ids=input_ids, attention_mask=attention_mask)
for k, v in data.items():
if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__:
if torch.is_tensor(v) or "Tensor" in v.__class__.__name__:
new_shape = [1] * v.dim()
new_shape[0] = batch_size
data[k] = v.to('cuda').repeat(*new_shape)
data[k] = v.to("cuda").repeat(*new_shape)
return data


def print_details_info(timestamps, model_config, args, whole_end2end):
if dist.get_rank() == 0:
prefill = []
@@ -31,32 +35,37 @@ def print_details_info(timestamps, model_config, args, whole_end2end):
for timestamp in timestamps:
prefill.append(timestamp[1] - timestamp[0])
encoder.append(
sum(timestamp[i + 1] - timestamp[i] for i in range(1,len(timestamp) - 1)) / (len(timestamp) - 2))
sum(timestamp[i + 1] - timestamp[i] for i in range(1, len(timestamp) - 1)) / (len(timestamp) - 2)
)
end2end.append(timestamp[-1] - timestamp[0])
print(whole_end2end)
with open(f"{args.log_path}/llama-{args.model}{args.dtype}_pp{args.pp_size}_{args.seq_len}_{args.new_length}_bsz{args.batch_size}_mbsz{args.mb_size}.log","w+") as f:
mb_avg_end2end = sum(end2end)/len(end2end)
mb_avg_latency = mb_avg_end2end/(args.new_length * args.mb_size)
whole_avg_latency = whole_end2end/(args.new_length * args.batch_size)
with open(
f"{args.log_path}/llama-{args.model}{args.dtype}_pp{args.pp_size}_{args.seq_len}_{args.new_length}_bsz{args.batch_size}_mbsz{args.mb_size}.log",
"w+",
) as f:
mb_avg_end2end = sum(end2end) / len(end2end)
mb_avg_latency = mb_avg_end2end / (args.new_length * args.mb_size)
whole_avg_latency = whole_end2end / (args.new_length * args.batch_size)
num_layers = getattr(model_config, "num_layers", model_config.num_hidden_layers)
num_parameters = num_layers * model_config.hidden_size * model_config.hidden_size * 12 / args.pp_size
if args.dtype in ['fp16','bf16']:
if args.dtype in ["fp16", "bf16"]:
num_bytes = 2
else:
num_bytes = 4

f.write(f"llama-{args.model}{args.dtype}_pp{args.pp_size}, input_len:{args.seq_len}, output_len:{args.new_length}, bsz:{args.batch_size}, mbsz:{args.mb_size}\n")
f.write("Average prefill time: {0:8.2f} ms\n".format(sum(prefill)/len(prefill)*1000))
f.write("Average encode time: {0:8.2f} ms\n".format(sum(encoder)/len(encoder)*1000))
f.write("Average micro batch end2end time: {0:8.2f} ms\n".format(mb_avg_end2end*1000))
f.write(
f"llama-{args.model}{args.dtype}_pp{args.pp_size}, input_len:{args.seq_len}, output_len:{args.new_length}, bsz:{args.batch_size}, mbsz:{args.mb_size}\n"
)
f.write("Average prefill time: {0:8.2f} ms\n".format(sum(prefill) / len(prefill) * 1000))
f.write("Average encode time: {0:8.2f} ms\n".format(sum(encoder) / len(encoder) * 1000))
f.write("Average micro batch end2end time: {0:8.2f} ms\n".format(mb_avg_end2end * 1000))
f.write("Average micro batch Per Token Latency: {0:8.2f} ms\n".format(mb_avg_latency * 1000))
f.write("Whole batch end2end time: {0:8.2f} ms\n".format(whole_end2end*1000))
f.write("Whole batch end2end time: {0:8.2f} ms\n".format(whole_end2end * 1000))
f.write("Whole batch Per Token Latency: {0:8.2f} ms\n".format(whole_avg_latency * 1000))
f.write("Throughput: {} tokens/s\n".format((1000/(whole_avg_latency * 1000))))
f.write("flops: {0:8.2f} TFlops/s\n".format(1/whole_avg_latency * num_parameters * num_bytes / 1e12))
f.write("Throughput: {} tokens/s\n".format((1000 / (whole_avg_latency * 1000))))
f.write("flops: {0:8.2f} TFlops/s\n".format(1 / whole_avg_latency * num_parameters * num_bytes / 1e12))
f.write("----------------------------------------------------------\n")


if torch.cuda.is_available():
current_device = torch.cuda.current_device()

@@ -66,7 +75,10 @@ def print_details_info(timestamps, model_config, args, whole_end2end):
max_memory_allocated = torch.cuda.max_memory_allocated()
memory_reserved = torch.cuda.memory_reserved()
max_memory_reserved = torch.cuda.max_memory_reserved()
with open(f"{args.log_path}/llama-{args.model}{args.dtype}_pp{args.pp_size}_{args.seq_len}_{args.new_length}_bsz{args.batch_size}_mbsz{args.mb_size}.log","a") as f:
with open(
f"{args.log_path}/llama-{args.model}{args.dtype}_pp{args.pp_size}_{args.seq_len}_{args.new_length}_bsz{args.batch_size}_mbsz{args.mb_size}.log",
"a",
) as f:
f.write(
f"\nCurrently using GPU: {current_device}\n"
f"free memory : {global_free_memory / GIGABYTE:.4f} GB,\n"
@@ -77,29 +89,37 @@ def print_details_info(timestamps, model_config, args, whole_end2end):
f"Max CUDA memory reserved/cached: {max_memory_reserved / GIGABYTE:.4f} GB,\n"
)

if __name__ == '__main__':

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--model', default='toy', help='the size of model')
parser.add_argument('-b', '--batch_size', type=int, default=8, help='batch size')
parser.add_argument('-s', '--seq_len', type=int, default=8, help='sequence length')
parser.add_argument('--new_length', type=int, default=4, help='new tokens length')
parser.add_argument('--mb_size', type=int, default=1, help='micro_batch_size')
parser.add_argument('--pp_size', type=int, default=2, help='pipeline size')
parser.add_argument('--log_path', type=str, default='./log' ,help='where to store the benchmark log')
parser.add_argument('--dtype', type=str, default='fp16', help='data type')
parser.add_argument("--model", default="toy", help="the size of model")
parser.add_argument("-b", "--batch_size", type=int, default=8, help="batch size")
parser.add_argument("-s", "--seq_len", type=int, default=8, help="sequence length")
parser.add_argument("--new_length", type=int, default=4, help="new tokens length")
parser.add_argument("--mb_size", type=int, default=1, help="micro_batch_size")
parser.add_argument("--pp_size", type=int, default=2, help="pipeline size")
parser.add_argument("--log_path", type=str, default="./log", help="where to store the benchmark log")
parser.add_argument("--dtype", type=str, default="fp16", help="data type")
args = parser.parse_args()

if args.model == 'toy':
if args.model == "toy":
model = transformers.LlamaForCausalLM(transformers.LlamaConfig(num_hidden_layers=8))
elif args.model == '7b':
model = transformers.LlamaForCausalLM(transformers.AutoConfig.from_pretrained('decapoda-research/llama-7b-hf'))
elif args.model == '13b':
model = transformers.LlamaForCausalLM(transformers.AutoConfig.from_pretrained('decapoda-research/llama-13b-hf'))
elif args.model == "7b":
model = transformers.LlamaForCausalLM(transformers.AutoConfig.from_pretrained("decapoda-research/llama-7b-hf"))
elif args.model == "13b":
model = transformers.LlamaForCausalLM(transformers.AutoConfig.from_pretrained("decapoda-research/llama-13b-hf"))
else:
raise NotImplementedError


engine = PPInferEngine(pp_size=args.pp_size, dtype=args.dtype, micro_batch_size=args.mb_size, new_length=args.new_length, model=model, model_policy=LlamaForCausalLMPipelinePolicy(),verbose=True)

engine = PPInferEngine(
pp_size=args.pp_size,
dtype=args.dtype,
micro_batch_size=args.mb_size,
new_length=args.new_length,
model=model,
model_policy=LlamaForCausalLMPipelinePolicy(),
verbose=True,
)
data = data_gen(args.batch_size, args.seq_len)

torch.cuda.synchronize()
@@ -109,4 +129,3 @@ def print_details_info(timestamps, model_config, args, whole_end2end):
whole_end2end = time.time() - whole_end2end

print_details_info(timestamps, model.config, args, whole_end2end)
