Merged · 19 commits
3 changes: 1 addition & 2 deletions .github/workflows/run_chatgpt_examples.yml
@@ -28,9 +28,8 @@ jobs:
       - name: Checkout ColossalAI
         uses: actions/checkout@v2
 
-      - name: Install ColossalAI and ChatGPT
+      - name: Install ChatGPT
         run: |
-          pip install -e .
           cd applications/Chat
           pip install -v .
           pip install -r examples/requirements.txt
3 changes: 1 addition & 2 deletions .github/workflows/run_chatgpt_unit_tests.yml
@@ -30,9 +30,8 @@ jobs:
       - name: Checkout ColossalAI
         uses: actions/checkout@v2
 
-      - name: Install ColossalAI and ChatGPT
+      - name: Install ChatGPT
         run: |
-          pip install -e .
           cd applications/Chat
           pip install -v .
           pip install -r requirements-test.txt
13 changes: 11 additions & 2 deletions README.md
@@ -25,6 +25,7 @@
 </div>
 
 ## Latest News
+* [2023/09] [70 Billion Parameter LLaMA2 Model Training Accelerated by 195%](https://www.hpc-ai.tech/blog/70b-llama2-training)
 * [2023/07] [HPC-AI Tech Raises 22 Million USD in Series A Funding](https://www.hpc-ai.tech/blog/hpc-ai-tech-raises-22-million-usd-in-series-a-funding-to-fuel-team-expansion-and-business-growth)
 * [2023/07] [65B Model Pretraining Accelerated by 38%, Best Practices for Building LLaMA-Like Base Models Open-Source](https://www.hpc-ai.tech/blog/large-model-pretraining)
 * [2023/03] [ColossalChat: An Open-Source Solution for Cloning ChatGPT With a Complete RLHF Pipeline](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b)
@@ -50,7 +51,7 @@
     <li>
       <a href="#Parallel-Training-Demo">Parallel Training Demo</a>
       <ul>
-        <li><a href="#LLaMA">LLaMA</a></li>
+        <li><a href="#LLaMA2">LLaMA 1/2</a></li>
         <li><a href="#GPT-3">GPT-3</a></li>
         <li><a href="#GPT-2">GPT-2</a></li>
         <li><a href="#BERT">BERT</a></li>
@@ -217,8 +218,16 @@ Acceleration of [AlphaFold Protein Structure](https://alphafold.ebi.ac.uk/)
 <p align="right">(<a href="#top">back to top</a>)</p>
 
 ## Parallel Training Demo
+### LLaMA2
+<p align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/llama2_pretraining.png" width=600/>
+</p>
+
+- 70 billion parameter LLaMA2 model training accelerated by 195%
+[[code]](https://github.com/hpcaitech/ColossalAI/tree/example/llama/examples/language/llama)
+[[blog]](https://www.hpc-ai.tech/blog/70b-llama2-training)
+
-### LLaMA
+### LLaMA1
 <p align="center">
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/examples/images/LLaMA_pretraining.png" width=600/>
 </p>
75 changes: 63 additions & 12 deletions applications/Chat/coati/dataset/sft_dataset.py
@@ -19,7 +19,7 @@
 from torch.utils.data import Dataset
 from tqdm import tqdm
 from transformers import PreTrainedTokenizer
-
+from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
 from colossalai.logging import get_dist_logger
 
 from .utils import is_rank_0, jload
@@ -71,6 +71,42 @@ def _preprocess(sources: Sequence[str],
     return sequences_token["input_ids"], labels, sequences_token["attention_mask"]
 
 
+def _preprocess_chatglm(sources: Sequence[str],
+                        targets: Sequence[str],
+                        tokenizer: PreTrainedTokenizer,
+                        max_length: int,
+                        ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Preprocess the data by tokenizing.
+    The attention mask is returned as None: ChatGLM computes its own
+    attention mask from the input ids.
+    """
+
+    labels = []
+    input_ids = []
+    for source, target in zip(sources, targets):
+        source_id = tokenizer.encode(text=source, add_special_tokens=False)
+        target_id = tokenizer.encode(text=target, add_special_tokens=False)
+        input_id = tokenizer.build_inputs_with_special_tokens(source_id, target_id)
+        # Truncate from the left, then restore the special tokens if the
+        # truncation cut into them.
+        sp_token_list = [tokenizer.gmask_token_id, tokenizer.bos_token_id]
+        truncate_length = max(0, len(input_id) - max_length)
+        input_id = input_id[truncate_length:]
+        if truncate_length == len(source_id) + 1:
+            input_id = sp_token_list + input_id[1:]
+        elif truncate_length > len(source_id) + 1:
+            input_id = sp_token_list + input_id[2:]
+
+        # Supervise only the tokens from the BOS position onwards.
+        context_length = input_id.index(tokenizer.bos_token_id)
+        mask_position = context_length - 1
+        label = [IGNORE_INDEX] * context_length + input_id[mask_position + 1:]
+
+        # Right-pad inputs and labels to max_length.
+        pad_len = max_length - len(input_id)
+        input_id = input_id + [tokenizer.pad_token_id] * pad_len
+        input_ids.append(input_id)
+        labels.append(label + [IGNORE_INDEX] * pad_len)
+    return torch.tensor(input_ids), torch.tensor(labels), None
+
+
 class SFTDataset(Dataset):
     """
     Dataset for sft model
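
For readers unfamiliar with ChatGLM's input layout, here is a small self-contained sketch (not part of the diff) of the label construction in the hunk above. The token ids are invented for illustration; the real values come from `ChatGLMTokenizer`, and the layout assumed here is prompt + [gMASK, BOS] + completion + [EOS]:

```python
# Toy illustration of the label layout built by _preprocess_chatglm.
# All ids below are made up; real ones come from ChatGLMTokenizer.
IGNORE_INDEX = -100
GMASK, BOS, EOS, PAD = 130001, 130004, 130005, 3   # hypothetical special-token ids
max_length = 12

source_id = [11, 12, 13]   # "prompt" tokens
target_id = [21, 22]       # "completion" tokens
# Assumed ChatGLM layout: prompt + [gMASK, BOS] + completion + [EOS]
input_id = source_id + [GMASK, BOS] + target_id + [EOS]

context_length = input_id.index(BOS)   # 4: everything before BOS is prompt
label = [IGNORE_INDEX] * context_length + input_id[context_length:]

pad_len = max_length - len(input_id)   # right-pad to max_length
input_id = input_id + [PAD] * pad_len
label = label + [IGNORE_INDEX] * pad_len

print(input_id)
# [11, 12, 13, 130001, 130004, 21, 22, 130005, 3, 3, 3, 3]
print(label)
# [-100, -100, -100, -100, 130004, 21, 22, 130005, -100, -100, -100, -100]
```

The prompt and the padding are masked out with `IGNORE_INDEX`, so the loss is computed only on the completion span starting at the BOS position.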
@@ -94,18 +130,25 @@ def __init__(self,
             data["completion"] + tokenizer.eos_token
             for data in tqdm(dataset, disable=not is_rank_0())
         ]
-
-        self.input_ids, self.labels, self.attention_mask = \
-            _preprocess(sources, targets, tokenizer, max_length)
+        if isinstance(tokenizer, ChatGLMTokenizer):
+            self.input_ids, self.labels, self.attention_mask = \
+                _preprocess_chatglm(sources, targets, tokenizer, max_length)
+        else:
+            self.input_ids, self.labels, self.attention_mask = \
+                _preprocess(sources, targets, tokenizer, max_length)
 
     def __len__(self):
         length = self.input_ids.shape[0]
         return length
 
     def __getitem__(self, idx):
-        return dict(input_ids=self.input_ids[idx],
-                    labels=self.labels[idx],
-                    attention_mask=self.attention_mask[idx])
+        if self.attention_mask is not None:
+            return dict(input_ids=self.input_ids[idx],
+                        labels=self.labels[idx],
+                        attention_mask=self.attention_mask[idx])
+        else:
+            return dict(input_ids=self.input_ids[idx],
+                        labels=self.labels[idx])
 
 
 class SupervisedDataset(Dataset):
@@ -137,14 +180,22 @@ def __init__(self,
         ]
 
         logger.info("Tokenizing inputs... This may take some time...")
-        self.input_ids, self.labels, self.attention_mask = \
-            _preprocess(sources, targets, tokenizer, max_length)
+        if isinstance(tokenizer, ChatGLMTokenizer):
+            self.input_ids, self.labels, self.attention_mask = \
+                _preprocess_chatglm(sources, targets, tokenizer, max_length)
+        else:
+            self.input_ids, self.labels, self.attention_mask = \
+                _preprocess(sources, targets, tokenizer, max_length)
 
     def __len__(self):
         length = self.input_ids.shape[0]
         return length
 
     def __getitem__(self, idx):
-        return dict(input_ids=self.input_ids[idx],
-                    labels=self.labels[idx],
-                    attention_mask=self.attention_mask[idx])
+        if self.attention_mask is not None:
+            return dict(input_ids=self.input_ids[idx],
+                        labels=self.labels[idx],
+                        attention_mask=self.attention_mask[idx])
+        else:
+            return dict(input_ids=self.input_ids[idx],
+                        labels=self.labels[idx])
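
Since `__getitem__` now omits `attention_mask` for ChatGLM samples, downstream code has to tolerate the missing key. A minimal collator sketch (an assumption about usage, not code from this PR) could look like:

```python
from typing import Dict, List

import torch


def collate(samples: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
    """Stack dataset samples that may or may not carry an attention mask."""
    batch = {
        "input_ids": torch.stack([s["input_ids"] for s in samples]),
        "labels": torch.stack([s["labels"] for s in samples]),
    }
    # Present for ordinary tokenizers; absent for ChatGLM, which derives
    # its own attention mask from the input ids.
    if "attention_mask" in samples[0]:
        batch["attention_mask"] = torch.stack([s["attention_mask"] for s in samples])
    return batch
```

`model(**batch)` then works in both cases, because the missing key is simply never passed rather than being forwarded as `None`.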
3 changes: 3 additions & 0 deletions applications/Chat/coati/models/chatglm/__init__.py
@@ -0,0 +1,3 @@
+from .chatglm_actor import ChatGLMActor
+
+__all__ = ['ChatGLMActor']
34 changes: 34 additions & 0 deletions applications/Chat/coati/models/chatglm/chatglm_actor.py
@@ -0,0 +1,34 @@
+from typing import Optional
+
+import torch
+from .configuration_chatglm import ChatGLMConfig
+from .modeling_chatglm import ChatGLMForConditionalGeneration
+
+from ..base import Actor
+
+
+class ChatGLMActor(Actor):
+    """
+    ChatGLM Actor model.
+
+    Args:
+        pretrained (str): Pretrained model name or path.
+        config (ChatGLMConfig): Model config.
+        checkpoint (bool): Enable gradient checkpointing.
+
+    LoRA is not supported for now.
+    """
+
+    def __init__(self,
+                 pretrained: str = None,
+                 config: Optional[ChatGLMConfig] = None,
+                 checkpoint: bool = False) -> None:
+        if pretrained is not None:
+            model = ChatGLMForConditionalGeneration.from_pretrained(pretrained)
+        elif config is not None:
+            model = ChatGLMForConditionalGeneration(config)
+        else:
+            model = ChatGLMForConditionalGeneration(ChatGLMConfig())
+        if checkpoint:
+            model.gradient_checkpointing_enable()
+        super().__init__(model, lora_rank=0, lora_train_bias='none')
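
A hypothetical usage sketch of the new actor (not part of the diff); the hub id "THUDM/chatglm-6b" and the calls around the constructor are assumptions, not something this PR pins down:

```python
from coati.models.chatglm import ChatGLMActor

# Assumed checkpoint name; substitute your own local path or hub id.
# `checkpoint=True` enables gradient checkpointing, trading compute for
# activation memory during SFT.
actor = ChatGLMActor(pretrained="THUDM/chatglm-6b", checkpoint=True)

# With neither `pretrained` nor `config`, the constructor falls back to a
# randomly initialised model built from the default ChatGLMConfig.
scratch_actor = ChatGLMActor()
```

Note that `lora_rank` is hard-wired to 0 in `super().__init__`, matching the docstring's statement that LoRA is not yet supported.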