Skip to content
Merged

Ra #38

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion applications/Chat/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ The Coati package provides a unified large language model framework that has imp
Image source: https://openai.com/blog/chatgpt
</div>

**As Colossa-AI is undergoing some major updates, this project will be actively maintained to stay in line with the Colossal-AI project.**
**As Colossal-AI is undergoing some major updates, this project will be actively maintained to stay in line with the Colossal-AI project.**


More details can be found in the latest news.
Expand Down
6 changes: 6 additions & 0 deletions applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,11 @@ def main(args):
initial_model = deepcopy(actor).cuda().half()
reward_model = RewardModel(deepcopy(critic.model), deepcopy(critic.value_head)).cuda().half()

if args.use_kernels:
from coati.kernels import convert_to_xformer_model
actor, critic, initial_model, reward_model = map(convert_to_xformer_model,
(actor, critic, initial_model, reward_model))

actor_numel = get_model_numel(actor, strategy)
critic_numel = get_model_numel(critic, strategy)
initial_model_numel = get_model_numel(initial_model, strategy)
Expand Down Expand Up @@ -184,5 +189,6 @@ def main(args):
parser.add_argument('--lora_rank', type=int, default=0)
parser.add_argument('--cuda_mem_frac', type=float, default=1.0)
parser.add_argument('--offload_inference_models', action='store_true', default=False)
parser.add_argument('--use_kernels', action='store_true', default=False)
args = parser.parse_args()
main(args)
6 changes: 6 additions & 0 deletions applications/Chat/coati/kernels/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Re-export the in-place OPT-attention patch helpers as the package API.
from .wrapper import convert_to_xformer_model, recover_from_xformer_model

# Public API: swap OPT attention layers to/from the xformers-backed
# implementation (mutates the model in place, returns it for chaining).
__all__ = [
    'convert_to_xformer_model',
    'recover_from_xformer_model',
]
87 changes: 87 additions & 0 deletions applications/Chat/coati/kernels/opt_attn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
from typing import Optional, Tuple

import torch
import xformers.ops as xops
from torch import Tensor
from transformers.models.opt.modeling_opt import OPTAttention


# This is modified from https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py
class XOPTAttention(OPTAttention):
    """OPT attention layer that runs xformers memory-efficient attention
    during training, and delegates to the stock HuggingFace implementation
    for inference (which supports kv-cache reuse, attention masks and
    output_attentions)."""

    # Alternative _shape kept from upstream but disabled: it would return
    # (bsz, seq_len, num_heads, head_dim) without the head transpose.
    # The inherited OPTAttention._shape is used instead.
    # def _shape(self, tensor: Tensor, seq_len: int, bsz: int):
    #     return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).contiguous()

    def forward(
        self,
        hidden_states: Tensor,
        key_value_states: Optional[Tensor] = None,
        past_key_value: Optional[Tensor] = None,
        attention_mask: Optional[Tensor] = None,
        layer_head_mask: Optional[Tensor] = None,
        output_attentions: bool = False,
    ) -> Tuple[Tensor, Optional[Tensor], Optional[Tuple[Tensor]]]:
        """Compute attention over ``hidden_states`` (Batch x Time x Channel).

        Returns ``(attn_output, attn_weights, past_key_value)``; on the
        training path ``attn_weights`` is always ``None`` because xformers
        never materializes the full attention matrix.

        NOTE(review): on the training path ``attention_mask`` is never used;
        a causal ``LowerTriangularMask`` is applied instead. This presumes
        training batches need only causal masking (no padding mask) —
        confirm with callers.

        Raises:
            AssertionError: if ``layer_head_mask`` is given or
                ``output_attentions`` is True — xformers cannot honor them.
        """
        # Eval/generation path: fall back to the original HF forward, which
        # handles kv-cache, attention_mask and output_attentions correctly.
        if not self.training:
            return super().forward(hidden_states, key_value_states, past_key_value, attention_mask, layer_head_mask,
                                   output_attentions)
        # Leftover no-op string from the upstream HF source (it sits after a
        # statement, so it is not a docstring); kept byte-identical.
        """Input shape: Batch x Time x Channel"""
        assert layer_head_mask is None, 'Xformers attention does not support layer_head_mask'
        assert not output_attentions, 'Xformers attention does not support output_attentions'

        # if key_value_states are provided this layer is used as a cross-attention layer
        # for the decoder
        is_cross_attention = key_value_states is not None

        bsz, tgt_len, _ = hidden_states.size()

        # get query proj
        query_states = self.q_proj(hidden_states)
        # get key, value proj
        if is_cross_attention and past_key_value is not None:
            # reuse k,v, cross_attentions
            key_states = past_key_value[0]
            value_states = past_key_value[1]
        elif is_cross_attention:
            # cross_attentions
            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
        elif past_key_value is not None:
            # reuse k, v, self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
            key_states = torch.cat([past_key_value[0], key_states], dim=2)
            value_states = torch.cat([past_key_value[1], value_states], dim=2)
        else:
            # self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        if self.is_decoder:
            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
            # Further calls to cross_attention layer can then reuse all cross-attention
            # key/value_states (first "if" case)
            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
            # all previous decoder key/value_states. Further calls to uni-directional self-attention
            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
            # if encoder bi-directional self-attention `past_key_value` is always `None`
            past_key_value = (key_states, value_states)

        # Assumes the inherited _shape yields (bsz, num_heads, seq, head_dim)
        # as in HF's modeling_opt — the transpose converts to the
        # (bsz, seq, num_heads, head_dim) layout xformers expects. TODO confirm.
        query_states = self._shape(query_states, tgt_len, bsz).transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        # Causal mask is hard-coded; dropout probability only applies while
        # training (this branch is training-only, but the guard is kept).
        attn_output = xops.memory_efficient_attention(query_states,
                                                      key_states,
                                                      value_states,
                                                      attn_bias=xops.LowerTriangularMask(),
                                                      p=self.dropout if self.training else 0.0,
                                                      scale=self.scaling)

        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
        # partitioned aross GPUs when using tensor-parallelism.
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        # No attention weights are available from the fused kernel.
        attn_weights_reshaped = None

        return attn_output, attn_weights_reshaped, past_key_value
18 changes: 18 additions & 0 deletions applications/Chat/coati/kernels/wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import torch.nn as nn
from transformers.models.opt.modeling_opt import OPTAttention

from .opt_attn import XOPTAttention


def convert_to_xformer_model(model: nn.Module) -> nn.Module:
    """Patch every ``OPTAttention`` submodule of ``model`` in place so its
    forward uses the xformers-backed ``XOPTAttention``; returns ``model``."""
    attention_layers = (m for m in model.modules() if isinstance(m, OPTAttention))
    for layer in attention_layers:
        layer.__class__ = XOPTAttention
    return model


def recover_from_xformer_model(model: nn.Module) -> nn.Module:
    """Undo ``convert_to_xformer_model``: restore every patched attention
    layer of ``model`` back to the stock ``OPTAttention`` class, in place."""
    patched_layers = (m for m in model.modules() if isinstance(m, XOPTAttention))
    for layer in patched_layers:
        layer.__class__ = OPTAttention
    return model
2 changes: 1 addition & 1 deletion applications/Chat/evaluate/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def evaluate(args):
assert answer1_jsons[i]['id'] == answer2_jsons[i]['id']
answer_id = answer1_jsons[i]['id']

ques = answer1_jsons[i]['instruction'] if answer1_jsons[i]['input'] == "" else answer1_jsons[i]['instuction'] + \
ques = answer1_jsons[i]['instruction'] if answer1_jsons[i]['input'] == "" else answer1_jsons[i]['instruction'] + \
" " + answer1_jsons[i]['input']
cat = answer1_jsons[i]['category']
ans1 = answer1_jsons[i]['output']
Expand Down
2 changes: 1 addition & 1 deletion applications/Chat/evaluate/generate_gpt35_answers.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@

def get_answer(question: str, max_tokens: int):
answer = question
prompt = question['instruction'] if question['input'] == "" else question['instuction'] + \
prompt = question['instruction'] if question['input'] == "" else question['instruction'] + \
" " + question['input']
for _ in range(MAX_API_RETRY):
try:
Expand Down
7 changes: 3 additions & 4 deletions applications/Chat/examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
- [LLaMA](#llama)
- [Add your own models](#add-your-own-models)
- [Actor model](#actor-model)
- [LM model](#lm-model)
- [Reward model](#reward-model)
- [Critic model](#critic-model)

Expand Down Expand Up @@ -150,11 +149,11 @@ torchrun --standalone --nproc_per_node=4 train_prompts.py \
--strategy colossalai_zero2 \
--prompt_dataset /path/to/your/prompt_dataset \
--pretrain_dataset /path/to/your/pretrain_dataset \
--rm_pretrain /your/pretrain/rm/defination \
--rm_pretrain /your/pretrain/rm/definition \
--rm_path /your/rm/model/path
```

Prompt dataset: the instruction dataset mentioned in the above figure which includes the instructions, e.g. you can use [seed_prompts_ch.jsonl](https://github.com/XueFuzhao/InstructionWild/blob/main/data/seed_prompts_ch.jsonl) or [seed_prompts_en.jsonl](https://github.com/XueFuzhao/InstructionWild/blob/main/data/seed_prompts_en.jsonl) in InstructionWild.
Prompt dataset: the instruction dataset mentioned in the above figure which includes the instructions, e.g. you can use the [script](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/examples/example_data_reformat.py) to reformat [seed_prompts_ch.jsonl](https://github.com/XueFuzhao/InstructionWild/blob/main/data/seed_prompts_ch.jsonl) or [seed_prompts_en.jsonl](https://github.com/XueFuzhao/InstructionWild/blob/main/data/seed_prompts_en.jsonl) in InstructionWild.
Pretrain dataset: the pretrain dataset including the instruction and corresponding response, e.g. you can use the [InstructWild Data](https://github.com/XueFuzhao/InstructionWild/tree/main/data) in stage 1 supervised instructs tuning.

### Arg List
Expand Down Expand Up @@ -233,7 +232,7 @@ If you want to support your own model in Coati, please refer the pull request fo
You should complete the implementation of four model classes, including Reward model, Critic model, LM model, Actor model

here are some example code for a NewModel named `Coati`.
if it is supported in huggingaface [transformers](https://github.com/huggingface/transformers), you can load it by `from_pretrained`, o
if it is supported in huggingface [transformers](https://github.com/huggingface/transformers), you can load it by `from_pretrained`, o
r you can build your own model by yourself.

### Actor model
Expand Down
10 changes: 5 additions & 5 deletions applications/Chat/examples/community/peft/easy_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length=512, is_
else:
raw_input_ids.append(encoded_ids)

grouped_inpup_ids = []
grouped_input_ids = []
current_input_ids = []
attention_mask = []
if tokenizer.pad_token_id is None:
Expand All @@ -199,7 +199,7 @@ def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length=512, is_
#pad the current_input_ids to max_length with tokenizer.pad_token_id
padded_length = max_length - len(current_input_ids)
current_input_ids.extend([tokenizer.pad_token_id] * padded_length)
grouped_inpup_ids.append(torch.tensor(current_input_ids, dtype=torch.long))
grouped_input_ids.append(torch.tensor(current_input_ids, dtype=torch.long))
attention_mask.append(
torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long))
current_input_ids = []
Expand All @@ -208,7 +208,7 @@ def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length=512, is_
if len(current_input_ids) > 0:
padded_length = max_length - len(current_input_ids)
current_input_ids.extend([tokenizer.pad_token_id] * padded_length)
grouped_inpup_ids.append(torch.tensor(current_input_ids, dtype=torch.long))
grouped_input_ids.append(torch.tensor(current_input_ids, dtype=torch.long))
attention_mask.append(
torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long))
else:
Expand All @@ -218,8 +218,8 @@ def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length=512, is_
input_ids.extend([tokenizer.pad_token_id] * padded_length)
attention_mask.append(
torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long))
grouped_inpup_ids.append(torch.tensor(input_ids, dtype=torch.long))
self.input_ids = grouped_inpup_ids
grouped_input_ids.append(torch.tensor(input_ids, dtype=torch.long))
self.input_ids = grouped_input_ids
self.labels = copy.deepcopy(self.input_ids)
self.file_name = data_file
self.attention_mask = attention_mask
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def main(args):
# configure model
if args.model == 'bloom':
# initial_model = BLOOMActor(pretrained=args.pretrain)
print('Using peft lora to load Bloom model as inital_model')
print('Using peft lora to load Bloom model as initial_model')
initial_model = BLOOMActor(pretrained=args.pretrain, lora_path=args.sft_lora_path)
print('Using peft lora to load Bloom model as initial_model (Done)')
else:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def train(args):

if args.strategy == 'colossalai_gemini':
# this is a hack to deal with the resized embedding
# to make sure all parameters are ColoParameter for Colossal-AI Gemini Compatiblity
# to make sure all parameters are ColoParameter for Colossal-AI Gemini Compatibility
for name, param in model.named_parameters():
if not isinstance(param, ColoParameter):
sub_module_name = '.'.join(name.split('.')[:-1])
Expand Down
12 changes: 12 additions & 0 deletions applications/Chat/examples/example_data_reformat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Reformat an InstructionWild JSONL seed-prompt file into a single JSON
# array file usable as the Prompt dataset in Stage 3.
jsonl_file = 'seed_prompts_xx.jsonl'    # seed_prompts_en.jsonl or seed_prompts_ch.jsonl from InstructionWild
reformat_file = 'prompts_xx.jsonl'    # reformatted file used as Prompt dataset in Stage3


def reformat_jsonl(lines):
    """Join JSONL records into one JSON-array string.

    Each non-empty input line must already be a serialized JSON object.
    Lines are indented with a tab and joined with ',\\n' so the final
    element carries no trailing comma — the original append-per-line
    approach left a trailing comma before ']', producing invalid JSON.
    Blank lines are skipped (they would otherwise emit a bare '\\t,').
    """
    records = ['\t' + line.strip('\n') for line in lines if line.strip()]
    return '[\n' + ',\n'.join(records) + '\n]'


if __name__ == '__main__':
    with open(jsonl_file, 'r', encoding="utf-8") as fin:
        data = reformat_jsonl(fin.readlines())
    # Write with an explicit encoding so non-ASCII prompts (e.g. the
    # Chinese seed file) round-trip on any platform.
    with open(reformat_file, 'w', encoding="utf-8") as fout:
        fout.write(data)
70 changes: 35 additions & 35 deletions applications/Chat/examples/train_prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,45 +36,45 @@ def main(args):
if args.rm_path is not None:
state_dict = torch.load(args.rm_path, map_location='cpu')

# configure model
if args.model == 'gpt2':
initial_model = GPTActor(pretrained=args.pretrain)
elif args.model == 'bloom':
initial_model = BLOOMActor(pretrained=args.pretrain)
elif args.model == 'opt':
initial_model = OPTActor(pretrained=args.pretrain)
elif args.model == 'llama':
initial_model = LlamaActor(pretrained=args.pretrain)
elif args.model == 'roberta':
initial_model = RoBERTaActor(pretrained=args.pretrain)
else:
raise ValueError(f'Unsupported actor model "{args.model}"')
with strategy.model_init_context():
# configure model
if args.model == 'gpt2':
initial_model = GPTActor(pretrained=args.pretrain)
elif args.model == 'bloom':
initial_model = BLOOMActor(pretrained=args.pretrain)
elif args.model == 'opt':
initial_model = OPTActor(pretrained=args.pretrain)
elif args.model == 'llama':
initial_model = LlamaActor(pretrained=args.pretrain)
elif args.model == 'roberta':
initial_model = RoBERTaActor(pretrained=args.pretrain)
else:
raise ValueError(f'Unsupported actor model "{args.model}"')

if args.rm_model == None:
rm_model_name = args.model
else:
rm_model_name = args.rm_model

if rm_model_name == 'gpt2':
reward_model = GPTRM(pretrained=args.rm_pretrain)
elif rm_model_name == 'bloom':
reward_model = BLOOMRM(pretrained=args.rm_pretrain)
elif rm_model_name == 'opt':
reward_model = OPTRM(pretrained=args.rm_pretrain)
elif rm_model_name == 'llama':
reward_model = LlamaRM(pretrained=args.rm_pretrain)
elif rm_model_name == 'roberta':
reward_model = RoBERTaRM(pretrained=args.rm_pretrain)
else:
raise ValueError(f'Unsupported reward model "{rm_model_name}"')
if args.rm_model == None:
rm_model_name = args.model
else:
rm_model_name = args.rm_model

if args.rm_path is not None:
reward_model.load_state_dict(state_dict)
if rm_model_name == 'gpt2':
reward_model = GPTRM(pretrained=args.rm_pretrain)
elif rm_model_name == 'bloom':
reward_model = BLOOMRM(pretrained=args.rm_pretrain)
elif rm_model_name == 'opt':
reward_model = OPTRM(pretrained=args.rm_pretrain)
elif rm_model_name == 'llama':
reward_model = LlamaRM(pretrained=args.rm_pretrain)
elif rm_model_name == 'roberta':
reward_model = RoBERTaRM(pretrained=args.rm_pretrain)
else:
raise ValueError(f'Unsupported reward model "{rm_model_name}"')

initial_model.to(torch.float16).to(torch.cuda.current_device())
reward_model.to(torch.float16).to(torch.cuda.current_device())
if args.rm_path is not None:
reward_model.load_state_dict(state_dict)

initial_model.to(torch.float16).to(torch.cuda.current_device())
reward_model.to(torch.float16).to(torch.cuda.current_device())

with strategy.model_init_context():
if args.model == 'gpt2':
actor = GPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
elif args.model == 'bloom':
Expand Down
2 changes: 1 addition & 1 deletion applications/Chat/examples/train_sft.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def train(args):

if args.strategy == 'colossalai_gemini':
# this is a hack to deal with the resized embedding
# to make sure all parameters are ColoParameter for Colossal-AI Gemini Compatiblity
# to make sure all parameters are ColoParameter for Colossal-AI Gemini Compatibility
for name, param in model.named_parameters():
if not isinstance(param, ColoParameter):
sub_module_name = '.'.join(name.split('.')[:-1])
Expand Down
Empty file.
Loading