Skip to content
Merged

Ra #38

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion applications/Chat/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ The Coati package provides a unified large language model framework that has imp
Image source: https://openai.com/blog/chatgpt
</div>

**As Colossa-AI is undergoing some major updates, this project will be actively maintained to stay in line with the Colossal-AI project.**
**As Colossal-AI is undergoing some major updates, this project will be actively maintained to stay in line with the Colossal-AI project.**


More details can be found in the latest news.
Expand Down
6 changes: 6 additions & 0 deletions applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,11 @@ def main(args):
initial_model = deepcopy(actor).cuda().half()
reward_model = RewardModel(deepcopy(critic.model), deepcopy(critic.value_head)).cuda().half()

if args.use_kernels:
from coati.kernels import convert_to_xformer_model
actor, critic, initial_model, reward_model = map(convert_to_xformer_model,
(actor, critic, initial_model, reward_model))

actor_numel = get_model_numel(actor, strategy)
critic_numel = get_model_numel(critic, strategy)
initial_model_numel = get_model_numel(initial_model, strategy)
Expand Down Expand Up @@ -184,5 +189,6 @@ def main(args):
parser.add_argument('--lora_rank', type=int, default=0)
parser.add_argument('--cuda_mem_frac', type=float, default=1.0)
parser.add_argument('--offload_inference_models', action='store_true', default=False)
parser.add_argument('--use_kernels', action='store_true', default=False)
args = parser.parse_args()
main(args)
6 changes: 6 additions & 0 deletions applications/Chat/coati/kernels/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Re-export the in-place OPT-attention patch helpers as the package API.
from .wrapper import convert_to_xformer_model, recover_from_xformer_model

# Public API: swap OPT attention layers to/from the xformers-backed
# implementation (mutates the model in place, returns it for chaining).
__all__ = [
    'convert_to_xformer_model',
    'recover_from_xformer_model',
]
87 changes: 87 additions & 0 deletions applications/Chat/coati/kernels/opt_attn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
from typing import Optional, Tuple

import torch
import xformers.ops as xops
from torch import Tensor
from transformers.models.opt.modeling_opt import OPTAttention


# This is modified from https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py
class XOPTAttention(OPTAttention):
    """OPT attention layer that runs xformers memory-efficient attention
    during training, and delegates to the stock HuggingFace implementation
    for inference (which supports kv-cache reuse, attention masks and
    output_attentions)."""

    # Alternative _shape kept from upstream but disabled: it would return
    # (bsz, seq_len, num_heads, head_dim) without the head transpose.
    # The inherited OPTAttention._shape is used instead.
    # def _shape(self, tensor: Tensor, seq_len: int, bsz: int):
    #     return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).contiguous()

    def forward(
        self,
        hidden_states: Tensor,
        key_value_states: Optional[Tensor] = None,
        past_key_value: Optional[Tensor] = None,
        attention_mask: Optional[Tensor] = None,
        layer_head_mask: Optional[Tensor] = None,
        output_attentions: bool = False,
    ) -> Tuple[Tensor, Optional[Tensor], Optional[Tuple[Tensor]]]:
        """Compute attention over ``hidden_states`` (Batch x Time x Channel).

        Returns ``(attn_output, attn_weights, past_key_value)``; on the
        training path ``attn_weights`` is always ``None`` because xformers
        never materializes the full attention matrix.

        NOTE(review): on the training path ``attention_mask`` is never used;
        a causal ``LowerTriangularMask`` is applied instead. This presumes
        training batches need only causal masking (no padding mask) —
        confirm with callers.

        Raises:
            AssertionError: if ``layer_head_mask`` is given or
                ``output_attentions`` is True — xformers cannot honor them.
        """
        # Eval/generation path: fall back to the original HF forward, which
        # handles kv-cache, attention_mask and output_attentions correctly.
        if not self.training:
            return super().forward(hidden_states, key_value_states, past_key_value, attention_mask, layer_head_mask,
                                   output_attentions)
        # Leftover no-op string from the upstream HF source (it sits after a
        # statement, so it is not a docstring); kept byte-identical.
        """Input shape: Batch x Time x Channel"""
        assert layer_head_mask is None, 'Xformers attention does not support layer_head_mask'
        assert not output_attentions, 'Xformers attention does not support output_attentions'

        # if key_value_states are provided this layer is used as a cross-attention layer
        # for the decoder
        is_cross_attention = key_value_states is not None

        bsz, tgt_len, _ = hidden_states.size()

        # get query proj
        query_states = self.q_proj(hidden_states)
        # get key, value proj
        if is_cross_attention and past_key_value is not None:
            # reuse k,v, cross_attentions
            key_states = past_key_value[0]
            value_states = past_key_value[1]
        elif is_cross_attention:
            # cross_attentions
            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
        elif past_key_value is not None:
            # reuse k, v, self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
            key_states = torch.cat([past_key_value[0], key_states], dim=2)
            value_states = torch.cat([past_key_value[1], value_states], dim=2)
        else:
            # self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        if self.is_decoder:
            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
            # Further calls to cross_attention layer can then reuse all cross-attention
            # key/value_states (first "if" case)
            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
            # all previous decoder key/value_states. Further calls to uni-directional self-attention
            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
            # if encoder bi-directional self-attention `past_key_value` is always `None`
            past_key_value = (key_states, value_states)

        # Assumes the inherited _shape yields (bsz, num_heads, seq, head_dim)
        # as in HF's modeling_opt — the transpose converts to the
        # (bsz, seq, num_heads, head_dim) layout xformers expects. TODO confirm.
        query_states = self._shape(query_states, tgt_len, bsz).transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        # Causal mask is hard-coded; dropout probability only applies while
        # training (this branch is training-only, but the guard is kept).
        attn_output = xops.memory_efficient_attention(query_states,
                                                      key_states,
                                                      value_states,
                                                      attn_bias=xops.LowerTriangularMask(),
                                                      p=self.dropout if self.training else 0.0,
                                                      scale=self.scaling)

        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
        # partitioned aross GPUs when using tensor-parallelism.
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        # No attention weights are available from the fused kernel.
        attn_weights_reshaped = None

        return attn_output, attn_weights_reshaped, past_key_value
18 changes: 18 additions & 0 deletions applications/Chat/coati/kernels/wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import torch.nn as nn
from transformers.models.opt.modeling_opt import OPTAttention

from .opt_attn import XOPTAttention


def convert_to_xformer_model(model: nn.Module) -> nn.Module:
    """Patch every ``OPTAttention`` submodule of ``model`` in place so its
    forward uses the xformers-backed ``XOPTAttention``; returns ``model``."""
    attention_layers = (m for m in model.modules() if isinstance(m, OPTAttention))
    for layer in attention_layers:
        layer.__class__ = XOPTAttention
    return model


def recover_from_xformer_model(model: nn.Module) -> nn.Module:
    """Undo ``convert_to_xformer_model``: restore every patched attention
    layer of ``model`` back to the stock ``OPTAttention`` class, in place."""
    patched_layers = (m for m in model.modules() if isinstance(m, XOPTAttention))
    for layer in patched_layers:
        layer.__class__ = OPTAttention
    return model
2 changes: 1 addition & 1 deletion applications/Chat/evaluate/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def evaluate(args):
assert answer1_jsons[i]['id'] == answer2_jsons[i]['id']
answer_id = answer1_jsons[i]['id']

ques = answer1_jsons[i]['instruction'] if answer1_jsons[i]['input'] == "" else answer1_jsons[i]['instuction'] + \
ques = answer1_jsons[i]['instruction'] if answer1_jsons[i]['input'] == "" else answer1_jsons[i]['instruction'] + \
" " + answer1_jsons[i]['input']
cat = answer1_jsons[i]['category']
ans1 = answer1_jsons[i]['output']
Expand Down
2 changes: 1 addition & 1 deletion applications/Chat/evaluate/generate_gpt35_answers.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@

def get_answer(question: str, max_tokens: int):
answer = question
prompt = question['instruction'] if question['input'] == "" else question['instuction'] + \
prompt = question['instruction'] if question['input'] == "" else question['instruction'] + \
" " + question['input']
for _ in range(MAX_API_RETRY):
try:
Expand Down
7 changes: 3 additions & 4 deletions applications/Chat/examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
- [LLaMA](#llama)
- [Add your own models](#add-your-own-models)
- [Actor model](#actor-model)
- [LM model](#lm-model)
- [Reward model](#reward-model)
- [Critic model](#critic-model)

Expand Down Expand Up @@ -150,11 +149,11 @@ torchrun --standalone --nproc_per_node=4 train_prompts.py \
--strategy colossalai_zero2 \
--prompt_dataset /path/to/your/prompt_dataset \
--pretrain_dataset /path/to/your/pretrain_dataset \
--rm_pretrain /your/pretrain/rm/defination \
--rm_pretrain /your/pretrain/rm/definition \
--rm_path /your/rm/model/path
```

Prompt dataset: the instruction dataset mentioned in the above figure which includes the instructions, e.g. you can use [seed_prompts_ch.jsonl](https://github.com/XueFuzhao/InstructionWild/blob/main/data/seed_prompts_ch.jsonl) or [seed_prompts_en.jsonl](https://github.com/XueFuzhao/InstructionWild/blob/main/data/seed_prompts_en.jsonl) in InstructionWild.
Prompt dataset: the instruction dataset mentioned in the above figure which includes the instructions, e.g. you can use the [script](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/examples/example_data_reformat.py) to reformat [seed_prompts_ch.jsonl](https://github.com/XueFuzhao/InstructionWild/blob/main/data/seed_prompts_ch.jsonl) or [seed_prompts_en.jsonl](https://github.com/XueFuzhao/InstructionWild/blob/main/data/seed_prompts_en.jsonl) in InstructionWild.
Pretrain dataset: the pretrain dataset including the instruction and corresponding response, e.g. you can use the [InstructWild Data](https://github.com/XueFuzhao/InstructionWild/tree/main/data) in stage 1 supervised instructs tuning.

### Arg List
Expand Down Expand Up @@ -233,7 +232,7 @@ If you want to support your own model in Coati, please refer the pull request fo
You should complete the implementation of four model classes, including Reward model, Critic model, LM model, Actor model

here are some example code for a NewModel named `Coati`.
if it is supported in huggingaface [transformers](https://github.com/huggingface/transformers), you can load it by `from_pretrained`, o
if it is supported in huggingface [transformers](https://github.com/huggingface/transformers), you can load it by `from_pretrained`, o
r you can build your own model by yourself.

### Actor model
Expand Down
10 changes: 5 additions & 5 deletions applications/Chat/examples/community/peft/easy_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length=512, is_
else:
raw_input_ids.append(encoded_ids)

grouped_inpup_ids = []
grouped_input_ids = []
current_input_ids = []
attention_mask = []
if tokenizer.pad_token_id is None:
Expand All @@ -199,7 +199,7 @@ def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length=512, is_
#pad the current_input_ids to max_length with tokenizer.pad_token_id
padded_length = max_length - len(current_input_ids)
current_input_ids.extend([tokenizer.pad_token_id] * padded_length)
grouped_inpup_ids.append(torch.tensor(current_input_ids, dtype=torch.long))
grouped_input_ids.append(torch.tensor(current_input_ids, dtype=torch.long))
attention_mask.append(
torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long))
current_input_ids = []
Expand All @@ -208,7 +208,7 @@ def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length=512, is_
if len(current_input_ids) > 0:
padded_length = max_length - len(current_input_ids)
current_input_ids.extend([tokenizer.pad_token_id] * padded_length)
grouped_inpup_ids.append(torch.tensor(current_input_ids, dtype=torch.long))
grouped_input_ids.append(torch.tensor(current_input_ids, dtype=torch.long))
attention_mask.append(
torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long))
else:
Expand All @@ -218,8 +218,8 @@ def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length=512, is_
input_ids.extend([tokenizer.pad_token_id] * padded_length)
attention_mask.append(
torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long))
grouped_inpup_ids.append(torch.tensor(input_ids, dtype=torch.long))
self.input_ids = grouped_inpup_ids
grouped_input_ids.append(torch.tensor(input_ids, dtype=torch.long))
self.input_ids = grouped_input_ids
self.labels = copy.deepcopy(self.input_ids)
self.file_name = data_file
self.attention_mask = attention_mask
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def main(args):
# configure model
if args.model == 'bloom':
# initial_model = BLOOMActor(pretrained=args.pretrain)
print('Using peft lora to load Bloom model as inital_model')
print('Using peft lora to load Bloom model as initial_model')
initial_model = BLOOMActor(pretrained=args.pretrain, lora_path=args.sft_lora_path)
print('Using peft lora to load Bloom model as initial_model (Done)')
else:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def train(args):

if args.strategy == 'colossalai_gemini':
# this is a hack to deal with the resized embedding
# to make sure all parameters are ColoParameter for Colossal-AI Gemini Compatiblity
# to make sure all parameters are ColoParameter for Colossal-AI Gemini Compatibility
for name, param in model.named_parameters():
if not isinstance(param, ColoParameter):
sub_module_name = '.'.join(name.split('.')[:-1])
Expand Down
12 changes: 12 additions & 0 deletions applications/Chat/examples/example_data_reformat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Reformat an InstructionWild JSONL seed-prompt file into a single JSON
# array file usable as the Prompt dataset in Stage 3.
jsonl_file = 'seed_prompts_xx.jsonl'    # seed_prompts_en.jsonl or seed_prompts_ch.jsonl from InstructionWild
reformat_file = 'prompts_xx.jsonl'    # reformatted file used as Prompt dataset in Stage3


def reformat_jsonl(lines):
    """Join JSONL records into one JSON-array string.

    Each non-empty input line must already be a serialized JSON object.
    Lines are indented with a tab and joined with ',\\n' so the final
    element carries no trailing comma — the original append-per-line
    approach left a trailing comma before ']', producing invalid JSON.
    Blank lines are skipped (they would otherwise emit a bare '\\t,').
    """
    records = ['\t' + line.strip('\n') for line in lines if line.strip()]
    return '[\n' + ',\n'.join(records) + '\n]'


if __name__ == '__main__':
    with open(jsonl_file, 'r', encoding="utf-8") as fin:
        data = reformat_jsonl(fin.readlines())
    # Write with an explicit encoding so non-ASCII prompts (e.g. the
    # Chinese seed file) round-trip on any platform.
    with open(reformat_file, 'w', encoding="utf-8") as fout:
        fout.write(data)
70 changes: 35 additions & 35 deletions applications/Chat/examples/train_prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,45 +36,45 @@ def main(args):
if args.rm_path is not None:
state_dict = torch.load(args.rm_path, map_location='cpu')

# configure model
if args.model == 'gpt2':
initial_model = GPTActor(pretrained=args.pretrain)
elif args.model == 'bloom':
initial_model = BLOOMActor(pretrained=args.pretrain)
elif args.model == 'opt':
initial_model = OPTActor(pretrained=args.pretrain)
elif args.model == 'llama':
initial_model = LlamaActor(pretrained=args.pretrain)
elif args.model == 'roberta':
initial_model = RoBERTaActor(pretrained=args.pretrain)
else:
raise ValueError(f'Unsupported actor model "{args.model}"')
with strategy.model_init_context():
# configure model
if args.model == 'gpt2':
initial_model = GPTActor(pretrained=args.pretrain)
elif args.model == 'bloom':
initial_model = BLOOMActor(pretrained=args.pretrain)
elif args.model == 'opt':
initial_model = OPTActor(pretrained=args.pretrain)
elif args.model == 'llama':
initial_model = LlamaActor(pretrained=args.pretrain)
elif args.model == 'roberta':
initial_model = RoBERTaActor(pretrained=args.pretrain)
else:
raise ValueError(f'Unsupported actor model "{args.model}"')

if args.rm_model == None:
rm_model_name = args.model
else:
rm_model_name = args.rm_model

if rm_model_name == 'gpt2':
reward_model = GPTRM(pretrained=args.rm_pretrain)
elif rm_model_name == 'bloom':
reward_model = BLOOMRM(pretrained=args.rm_pretrain)
elif rm_model_name == 'opt':
reward_model = OPTRM(pretrained=args.rm_pretrain)
elif rm_model_name == 'llama':
reward_model = LlamaRM(pretrained=args.rm_pretrain)
elif rm_model_name == 'roberta':
reward_model = RoBERTaRM(pretrained=args.rm_pretrain)
else:
raise ValueError(f'Unsupported reward model "{rm_model_name}"')
if args.rm_model == None:
rm_model_name = args.model
else:
rm_model_name = args.rm_model

if args.rm_path is not None:
reward_model.load_state_dict(state_dict)
if rm_model_name == 'gpt2':
reward_model = GPTRM(pretrained=args.rm_pretrain)
elif rm_model_name == 'bloom':
reward_model = BLOOMRM(pretrained=args.rm_pretrain)
elif rm_model_name == 'opt':
reward_model = OPTRM(pretrained=args.rm_pretrain)
elif rm_model_name == 'llama':
reward_model = LlamaRM(pretrained=args.rm_pretrain)
elif rm_model_name == 'roberta':
reward_model = RoBERTaRM(pretrained=args.rm_pretrain)
else:
raise ValueError(f'Unsupported reward model "{rm_model_name}"')

initial_model.to(torch.float16).to(torch.cuda.current_device())
reward_model.to(torch.float16).to(torch.cuda.current_device())
if args.rm_path is not None:
reward_model.load_state_dict(state_dict)

initial_model.to(torch.float16).to(torch.cuda.current_device())
reward_model.to(torch.float16).to(torch.cuda.current_device())

with strategy.model_init_context():
if args.model == 'gpt2':
actor = GPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
elif args.model == 'bloom':
Expand Down
2 changes: 1 addition & 1 deletion applications/Chat/examples/train_sft.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def train(args):

if args.strategy == 'colossalai_gemini':
# this is a hack to deal with the resized embedding
# to make sure all parameters are ColoParameter for Colossal-AI Gemini Compatiblity
# to make sure all parameters are ColoParameter for Colossal-AI Gemini Compatibility
for name, param in model.named_parameters():
if not isinstance(param, ColoParameter):
sub_module_name = '.'.join(name.split('.')[:-1])
Expand Down
Empty file.
Loading