Merged

s #43

Changes from all commits (21 commits):
- bfbf650 fix spelling error (digger-yu, May 4, 2023)
- 8ba7858 Update generate_gpt35_answers.py (digger-yu, May 4, 2023)
- 7bd0bee [chat] add opt attn kernel (#3655) (ver217, May 4, 2023)
- 6650dae [doc] fix chat spelling error (#3671) (digger-yu, May 5, 2023)
- 0f785cb [chat] PPO stage3 doc enhancement (#3679) (Camille7777, May 5, 2023)
- 307894f [booster] gemini plugin support shard checkpoint (#3610) (flybird11111, May 5, 2023)
- b36e67c Merge pull request #3680 from digger-yu/digger-yu-patch-2 (TongLi3701, May 5, 2023)
- b49020c [CI] Update test_sharded_optim_with_sync_bn.py (#3688) (digger-yu, May 5, 2023)
- d0915f5 [booster] refactor all dp fashion plugins (#3684) (ver217, May 5, 2023)
- 65bdc31 fix some spelling error with applications/Chat/examples/ (#3692) (digger-yu, May 6, 2023)
- d556648 [example] add finetune bert with booster example (#3693) (ver217, May 6, 2023)
- 2da5d81 [chat] fix train_prompts.py gemini strategy bug (#3666) (zhang-yi-chi, May 6, 2023)
- 2629f97 [tensor] Refactor handle_trans_spec in DistSpecManager (yhna940, May 6, 2023)
- f83ea81 [example] add train resnet/vit with booster example (#3694) (ver217, May 8, 2023)
- 3bf09ef [booster] update prepare dataloader method for plugin (#3706) (ver217, May 8, 2023)
- 6552cbf [booster] fix no_sync method (#3709) (ver217, May 9, 2023)
- 20068ba [booster] add tests for ddp and low level zero's checkpointio (#3715) (flybird11111, May 10, 2023)
- f7361ee [chat] fix community example ray (#3719) (MisterLin1995, May 10, 2023)
- b7141c3 [CI] fix some spelling errors (#3707) (digger-yu, May 10, 2023)
- 899aa86 [CI] fix typo with tests components (#3695) (digger-yu, May 11, 2023)
- 1f73609 [CI] fix typo with tests/ etc. (#3727) (digger-yu, May 11, 2023)
2 changes: 1 addition & 1 deletion applications/Chat/README.md
@@ -59,7 +59,7 @@ The Coati package provides a unified large language model framework that has imp
 Image source: https://openai.com/blog/chatgpt
 </div>
 
-**As Colossa-AI is undergoing some major updates, this project will be actively maintained to stay in line with the Colossal-AI project.**
+**As Colossal-AI is undergoing some major updates, this project will be actively maintained to stay in line with the Colossal-AI project.**
 
 
 More details can be found in the latest news.
6 changes: 6 additions & 0 deletions applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
@@ -101,6 +101,11 @@ def main(args):
     initial_model = deepcopy(actor).cuda().half()
     reward_model = RewardModel(deepcopy(critic.model), deepcopy(critic.value_head)).cuda().half()
 
+    if args.use_kernels:
+        from coati.kernels import convert_to_xformer_model
+        actor, critic, initial_model, reward_model = map(convert_to_xformer_model,
+                                                         (actor, critic, initial_model, reward_model))
+
     actor_numel = get_model_numel(actor, strategy)
     critic_numel = get_model_numel(critic, strategy)
     initial_model_numel = get_model_numel(initial_model, strategy)
@@ -184,5 +189,6 @@ def main(args):
     parser.add_argument('--lora_rank', type=int, default=0)
     parser.add_argument('--cuda_mem_frac', type=float, default=1.0)
     parser.add_argument('--offload_inference_models', action='store_true', default=False)
+    parser.add_argument('--use_kernels', action='store_true', default=False)
     args = parser.parse_args()
     main(args)
6 changes: 6 additions & 0 deletions applications/Chat/coati/kernels/__init__.py
@@ -0,0 +1,6 @@
from .wrapper import convert_to_xformer_model, recover_from_xformer_model

__all__ = [
    'convert_to_xformer_model',
    'recover_from_xformer_model',
]
87 changes: 87 additions & 0 deletions applications/Chat/coati/kernels/opt_attn.py
@@ -0,0 +1,87 @@
from typing import Optional, Tuple

import torch
import xformers.ops as xops
from torch import Tensor
from transformers.models.opt.modeling_opt import OPTAttention


# This is modified from https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py
class XOPTAttention(OPTAttention):
    # def _shape(self, tensor: Tensor, seq_len: int, bsz: int):
    #     return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).contiguous()

    def forward(
        self,
        hidden_states: Tensor,
        key_value_states: Optional[Tensor] = None,
        past_key_value: Optional[Tensor] = None,
        attention_mask: Optional[Tensor] = None,
        layer_head_mask: Optional[Tensor] = None,
        output_attentions: bool = False,
    ) -> Tuple[Tensor, Optional[Tensor], Optional[Tuple[Tensor]]]:
        """Input shape: Batch x Time x Channel"""
        if not self.training:
            return super().forward(hidden_states, key_value_states, past_key_value, attention_mask,
                                   layer_head_mask, output_attentions)
        assert layer_head_mask is None, 'Xformers attention does not support layer_head_mask'
        assert not output_attentions, 'Xformers attention does not support output_attentions'

        # if key_value_states are provided this layer is used as a cross-attention layer
        # for the decoder
        is_cross_attention = key_value_states is not None

        bsz, tgt_len, _ = hidden_states.size()

        # get query proj
        query_states = self.q_proj(hidden_states)
        # get key, value proj
        if is_cross_attention and past_key_value is not None:
            # reuse k, v, cross_attentions
            key_states = past_key_value[0]
            value_states = past_key_value[1]
        elif is_cross_attention:
            # cross_attentions
            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
        elif past_key_value is not None:
            # reuse k, v, self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
            key_states = torch.cat([past_key_value[0], key_states], dim=2)
            value_states = torch.cat([past_key_value[1], value_states], dim=2)
        else:
            # self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        if self.is_decoder:
            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
            # Further calls to cross_attention layer can then reuse all cross-attention
            # key/value_states (first "if" case)
            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
            # all previous decoder key/value_states. Further calls to uni-directional self-attention
            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
            # if encoder bi-directional self-attention `past_key_value` is always `None`
            past_key_value = (key_states, value_states)

        query_states = self._shape(query_states, tgt_len, bsz).transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        attn_output = xops.memory_efficient_attention(query_states,
                                                      key_states,
                                                      value_states,
                                                      attn_bias=xops.LowerTriangularMask(),
                                                      p=self.dropout if self.training else 0.0,
                                                      scale=self.scaling)

        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
        # partitioned across GPUs when using tensor-parallelism.
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        attn_weights_reshaped = None

        return attn_output, attn_weights_reshaped, past_key_value
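
The call at the heart of XOPTAttention expects query/key/value tensors laid out as (batch, seq_len, num_heads, head_dim), which is why the code above transposes after `_shape`. Below is a minimal standalone sketch of that call, assuming xformers is installed and a CUDA device is available; the shapes, dtype, and values are illustrative only.

```python
# Minimal sketch of the xformers call used by XOPTAttention (illustrative shapes).
import torch
import xformers.ops as xops

bsz, seq_len, num_heads, head_dim = 2, 16, 4, 32
q = torch.randn(bsz, seq_len, num_heads, head_dim, device='cuda', dtype=torch.float16)
k = torch.randn_like(q)
v = torch.randn_like(q)

# LowerTriangularMask enforces causal attention, matching OPT's decoder-style
# self-attention; scale mirrors the module's self.scaling (1/sqrt(head_dim)).
out = xops.memory_efficient_attention(q, k, v,
                                      attn_bias=xops.LowerTriangularMask(),
                                      p=0.0,
                                      scale=head_dim ** -0.5)
print(out.shape)  # torch.Size([2, 16, 4, 32])
```
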
18 changes: 18 additions & 0 deletions applications/Chat/coati/kernels/wrapper.py
@@ -0,0 +1,18 @@
import torch.nn as nn
from transformers.models.opt.modeling_opt import OPTAttention

from .opt_attn import XOPTAttention


def convert_to_xformer_model(model: nn.Module) -> nn.Module:
    for module in model.modules():
        if isinstance(module, OPTAttention):
            module.__class__ = XOPTAttention
    return model


def recover_from_xformer_model(model: nn.Module) -> nn.Module:
    for module in model.modules():
        if isinstance(module, XOPTAttention):
            module.__class__ = OPTAttention
    return model
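
A possible round-trip usage of these helpers, shown as a hedged sketch: only the class pointer is swapped, so no weights are copied, and since XOPTAttention falls back to the parent forward when `model.training` is False, recovery is mainly useful before saving or exporting. The checkpoint name and output directory below are illustrative, not part of the PR.

```python
# A minimal usage sketch (assumes the coati package above is importable;
# 'facebook/opt-125m' is just an illustrative checkpoint).
from transformers import OPTModel
from coati.kernels import convert_to_xformer_model, recover_from_xformer_model

model = OPTModel.from_pretrained('facebook/opt-125m')

model = convert_to_xformer_model(model)  # every OPTAttention now runs the xformers path
model.train()
# ... training steps go here; the kernel is only active while model.training is True

model = recover_from_xformer_model(model)    # swap back to vanilla OPTAttention
model.save_pretrained('opt-125m-finetuned')  # hypothetical output directory
```
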
2 changes: 1 addition & 1 deletion applications/Chat/evaluate/evaluate.py
@@ -130,7 +130,7 @@ def evaluate(args):
         assert answer1_jsons[i]['id'] == answer2_jsons[i]['id']
         answer_id = answer1_jsons[i]['id']
 
-        ques = answer1_jsons[i]['instruction'] if answer1_jsons[i]['input'] == "" else answer1_jsons[i]['instuction'] + \
+        ques = answer1_jsons[i]['instruction'] if answer1_jsons[i]['input'] == "" else answer1_jsons[i]['instruction'] + \
             " " + answer1_jsons[i]['input']
         cat = answer1_jsons[i]['category']
         ans1 = answer1_jsons[i]['output']
2 changes: 1 addition & 1 deletion applications/Chat/evaluate/generate_gpt35_answers.py
@@ -35,7 +35,7 @@
 
 def get_answer(question: str, max_tokens: int):
     answer = question
-    prompt = question['instruction'] if question['input'] == "" else question['instuction'] + \
+    prompt = question['instruction'] if question['input'] == "" else question['instruction'] + \
         " " + question['input']
     for _ in range(MAX_API_RETRY):
         try:
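
Both this file and evaluate.py above receive the same one-character fix. A minimal reproduction of the failure mode, using a hypothetical record, shows why the bug only surfaced on records with a non-empty `input` field:

```python
# The misspelled key was only reached when 'input' was non-empty, so the bug
# escaped quick tests on instruction-only records (sample record is hypothetical).
question = {"instruction": "Translate to French.", "input": "Hello."}
try:
    prompt = question['instruction'] if question['input'] == "" else question['instuction'] + \
        " " + question['input']
except KeyError as err:
    print(f"KeyError: {err}")  # KeyError: 'instuction'
```
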
7 changes: 3 additions & 4 deletions applications/Chat/examples/README.md
@@ -24,7 +24,6 @@
 - [LLaMA](#llama)
 - [Add your own models](#add-your-own-models)
   - [Actor model](#actor-model)
-  - [LM model](#lm-model)
   - [Reward model](#reward-model)
   - [Critic model](#critic-model)
 
@@ -150,11 +149,11 @@ torchrun --standalone --nproc_per_node=4 train_prompts.py \
     --strategy colossalai_zero2 \
     --prompt_dataset /path/to/your/prompt_dataset \
     --pretrain_dataset /path/to/your/pretrain_dataset \
-    --rm_pretrain /your/pretrain/rm/defination \
+    --rm_pretrain /your/pretrain/rm/definition \
     --rm_path /your/rm/model/path
 ```
 
-Prompt dataset: the instruction dataset mentioned in the above figure which includes the instructions, e.g. you can use [seed_prompts_ch.jsonl](https://github.com/XueFuzhao/InstructionWild/blob/main/data/seed_prompts_ch.jsonl) or [seed_prompts_en.jsonl](https://github.com/XueFuzhao/InstructionWild/blob/main/data/seed_prompts_en.jsonl) in InstructionWild.
+Prompt dataset: the instruction dataset mentioned in the above figure which includes the instructions, e.g. you can use the [script](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/examples/example_data_reformat.py) to reformat [seed_prompts_ch.jsonl](https://github.com/XueFuzhao/InstructionWild/blob/main/data/seed_prompts_ch.jsonl) or [seed_prompts_en.jsonl](https://github.com/XueFuzhao/InstructionWild/blob/main/data/seed_prompts_en.jsonl) in InstructionWild.
 Pretrain dataset: the pretrain dataset including the instruction and corresponding response, e.g. you can use the [InstructWild Data](https://github.com/XueFuzhao/InstructionWild/tree/main/data) in stage 1 supervised instructs tuning.
 
 ### Arg List
@@ -233,7 +232,7 @@ If you want to support your own model in Coati, please refer the pull request fo
 You should complete the implementation of four model classes, including Reward model, Critic model, LM model, Actor model
 
 here are some example code for a NewModel named `Coati`.
-if it is supported in huggingaface [transformers](https://github.com/huggingface/transformers), you can load it by `from_pretrained`, o
+if it is supported in huggingface [transformers](https://github.com/huggingface/transformers), you can load it by `from_pretrained`, o
 r you can build your own model by yourself.
 
 ### Actor model
2 changes: 1 addition & 1 deletion applications/Chat/examples/community/README.md
@@ -17,7 +17,7 @@ Community examples consist of both inference and training examples that have bee
 | Example | Description | Code Example | Colab | Author |
 |:--------|:------------|:-------------|:------|-------:|
 | Peft | Adding Peft support for SFT and Prompts model training | [Huggingface Peft](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/examples/community/peft) | - | [YY Lin](https://github.com/yynil) |
-| Train prompts on Ray | A Ray based implementation of Train prompts example | [Huggingface Peft](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/examples/community/ray) | - | [MisterLin1995](https://github.com/MisterLin1995) |
+| Train prompts on Ray | A Ray based implementation of Train prompts example | [Training On Ray](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/examples/community/ray) | - | [MisterLin1995](https://github.com/MisterLin1995) |
 |...|...|...|...|...|
 
 ### How to get involved
10 changes: 5 additions & 5 deletions applications/Chat/examples/community/peft/easy_dataset.py
@@ -188,7 +188,7 @@ def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length=512, is_
             else:
                 raw_input_ids.append(encoded_ids)
 
-        grouped_inpup_ids = []
+        grouped_input_ids = []
         current_input_ids = []
         attention_mask = []
         if tokenizer.pad_token_id is None:
@@ -199,7 +199,7 @@ def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length=512, is_
                     # pad the current_input_ids to max_length with tokenizer.pad_token_id
                     padded_length = max_length - len(current_input_ids)
                     current_input_ids.extend([tokenizer.pad_token_id] * padded_length)
-                    grouped_inpup_ids.append(torch.tensor(current_input_ids, dtype=torch.long))
+                    grouped_input_ids.append(torch.tensor(current_input_ids, dtype=torch.long))
                     attention_mask.append(
                         torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long))
                     current_input_ids = []
@@ -208,7 +208,7 @@ def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length=512, is_
             if len(current_input_ids) > 0:
                 padded_length = max_length - len(current_input_ids)
                 current_input_ids.extend([tokenizer.pad_token_id] * padded_length)
-                grouped_inpup_ids.append(torch.tensor(current_input_ids, dtype=torch.long))
+                grouped_input_ids.append(torch.tensor(current_input_ids, dtype=torch.long))
                 attention_mask.append(
                     torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long))
         else:
@@ -218,8 +218,8 @@ def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length=512, is_
                 input_ids.extend([tokenizer.pad_token_id] * padded_length)
                 attention_mask.append(
                     torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long))
-                grouped_inpup_ids.append(torch.tensor(input_ids, dtype=torch.long))
-        self.input_ids = grouped_inpup_ids
+                grouped_input_ids.append(torch.tensor(input_ids, dtype=torch.long))
+        self.input_ids = grouped_input_ids
         self.labels = copy.deepcopy(self.input_ids)
         self.file_name = data_file
         self.attention_mask = attention_mask
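
Aside from the variable rename, the pad-and-mask step visible in the context lines above is worth isolating. A minimal sketch of that step, with hypothetical token ids and pad id:

```python
# Sketch of the pad-and-mask step from easy_dataset.py (hypothetical token ids).
import torch

max_length = 8
pad_token_id = 0                            # stands in for tokenizer.pad_token_id
current_input_ids = [101, 7592, 2088, 102]  # a short, already-tokenized sequence

padded_length = max_length - len(current_input_ids)
current_input_ids.extend([pad_token_id] * padded_length)
input_ids = torch.tensor(current_input_ids, dtype=torch.long)
attention_mask = torch.tensor([1] * (max_length - padded_length) + [0] * padded_length,
                              dtype=torch.long)
print(input_ids)       # tensor([ 101, 7592, 2088,  102,    0,    0,    0,    0])
print(attention_mask)  # tensor([1, 1, 1, 1, 0, 0, 0, 0])
```
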
@@ -41,7 +41,7 @@ def main(args):
     # configure model
     if args.model == 'bloom':
         # initial_model = BLOOMActor(pretrained=args.pretrain)
-        print('Using peft lora to load Bloom model as inital_model')
+        print('Using peft lora to load Bloom model as initial_model')
         initial_model = BLOOMActor(pretrained=args.pretrain, lora_path=args.sft_lora_path)
         print('Using peft lora to load Bloom model as initial_model (Done)')
     else:
@@ -86,7 +86,7 @@ def train(args):
 
     if args.strategy == 'colossalai_gemini':
         # this is a hack to deal with the resized embedding
-        # to make sure all parameters are ColoParameter for Colossal-AI Gemini Compatiblity
+        # to make sure all parameters are ColoParameter for Colossal-AI Gemini Compatibility
         for name, param in model.named_parameters():
             if not isinstance(param, ColoParameter):
                 sub_module_name = '.'.join(name.split('.')[:-1])
12 changes: 12 additions & 0 deletions applications/Chat/examples/example_data_reformat.py
@@ -0,0 +1,12 @@
jsonl_file = 'seed_prompts_xx.jsonl'    # seed_prompts_en.jsonl or seed_prompts_ch.json from InstructionWild
reformat_file = 'prompts_xx.jsonl'    # reformat jsonl file used as Prompt dataset in Stage3

data = ''
with open(jsonl_file, 'r', encoding="utf-8") as f1:
    for jsonstr in f1.readlines():
        jsonstr = '\t' + jsonstr.strip('\n') + ',\n'
        data = data + jsonstr
data = '[\n' + data + ']'

with open(reformat_file, 'w') as f2:
    f2.write(data)
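
A quick sanity check of what this script emits, run on an in-memory example rather than files; the two records are hypothetical. Note that, as written, the last record keeps its trailing comma, so strict JSON parsers such as Python's json.loads will reject the output, and downstream loaders need to tolerate that.

```python
# Sketch of the reformatting logic above on an in-memory example (records are
# hypothetical). The output is a JSON-array-like string, but note the trailing
# comma after the last element, which json.loads rejects.
lines = [
    '{"instruction": "Write a poem.", "input": "", "output": "..."}',
    '{"instruction": "Summarize.", "input": "Long text.", "output": "..."}',
]
data = ''
for jsonstr in lines:
    data += '\t' + jsonstr + ',\n'
data = '[\n' + data + ']'
print(data)
```
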