16 changes: 4 additions & 12 deletions examples/language/roberta/README.md
@@ -1,9 +1,9 @@
# Introduction
This repo introduces how to pretrain a Chinese roberta-large from scratch, including preprocessing, pretraining, and finetuning. The repo can help you quickly train a high-quality BERT model.
This example introduces how to pretrain RoBERTa from scratch, including preprocessing, pretraining, and finetuning. The example can help you quickly train a high-quality RoBERTa model.

## 0. Prerequisite
- Install Colossal-AI
- Edit the port in /etc/ssh/sshd_config and /etc/ssh/ssh_config so that every host exposes the same SSH port for both server and client. If you are a root user, also set **PermitRootLogin** in /etc/ssh/sshd_config to "yes"
- Edit the port in `/etc/ssh/sshd_config` and `/etc/ssh/ssh_config` so that every host exposes the same SSH port for both server and client. If you are a root user, also set **PermitRootLogin** in `/etc/ssh/sshd_config` to "yes"
- Ensure that each host can log in to every other host without a password. If you have n hosts, this needs to be done n<sup>2</sup> times

```
@@ -33,7 +33,7 @@ service ssh restart
```bash
cd preprocessing
```
Following the `README.md`, preprocess the original corpus into h5py+numpy format.
Following the `README.md`, preprocess the original corpus into h5py plus numpy format.

## 2. Pretrain

@@ -47,12 +47,4 @@ following the `README.md`, load the h5py generated by preprocess of step 1 to pr
The checkpoint produced by this repo can directly replace `pytorch_model.bin` from [hfl/chinese-roberta-wwm-ext-large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large/tree/main). Then use transformers from Hugging Face to finetune downstream applications.
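
A minimal loading sketch (the local directory name here is an assumption; first copy the produced checkpoint over `pytorch_model.bin` inside a local copy of that model):

```python
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Hypothetical local copy of hfl/chinese-roberta-wwm-ext-large whose
# pytorch_model.bin has been replaced with the checkpoint from this example.
model_dir = "./chinese-roberta-wwm-ext-large"

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForMaskedLM.from_pretrained(model_dir)
model.train()  # ready to be finetuned on a downstream task
```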

## Contributors
The repo is contributed by the AI team from [Moore Threads](https://www.mthreads.com/). If you find any problems with pretraining, please file an issue or send an email to yehua.zhang@mthreads.com. Finally, any form of contribution is welcome!

```
@misc{
title={A simple Chinese RoBERTa Example for Whole Word Masked},
author={Yehua Zhang, Chen Zhang},
year={2022}
}
```
The example is contributed by the AI team from [Moore Threads](https://www.mthreads.com/). If you find any problems with pretraining, please file an issue or send an email to yehua.zhang@mthreads.com. Finally, any form of contribution is welcome!
9 changes: 0 additions & 9 deletions examples/language/roberta/configs/colossalai_ddp.py

This file was deleted.

37 changes: 0 additions & 37 deletions examples/language/roberta/configs/colossalai_zero.py

This file was deleted.

7 changes: 3 additions & 4 deletions examples/language/roberta/preprocessing/get_mask.py
@@ -163,16 +163,15 @@ def create_masked_lm_predictions(self, tokens):

def get_new_segment(self, segment):
"""
Input a sentence and return a processed sentence: to support Chinese whole-word masking, words that have been split apart are given a special mark ("#") so that downstream processing modules know which characters belong to the same word.
:param segment: a sentence
:return: a processed sentence
Input a sentence and return a processed sentence: to support Chinese whole-word masking, characters belonging to the same word are tagged with a special mark ("#") so that the subsequent processing module knows which characters form one word.
:param segment: a sentence
"""
seq_cws = jieba.lcut(''.join(segment))
seq_cws_dict = {x: 1 for x in seq_cws}
new_segment = []
i = 0
while i < len(segment):
if len(self.rec.findall(segment[i])) == 0: # Not Chinese: append the original text as is.
if len(self.rec.findall(segment[i])) == 0:
new_segment.append(segment[i])
i += 1
continue
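
The `get_new_segment` docstring above describes whole-word marking for Chinese text. A rough standalone illustration of the idea (not the repo's exact code; the sample text and the "##" marker are assumptions):

```python
import jieba

# Segment the text with jieba, then tag the non-leading characters of each
# word so that a later masking step can mask whole words together.
segment = list("使用全词掩码预训练模型")
marked = []
for word in jieba.lcut("".join(segment)):
    for i, ch in enumerate(word):
        marked.append(ch if i == 0 else "##" + ch)
print(marked)  # non-leading characters of multi-character words carry the "##" tag
```
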
19 changes: 6 additions & 13 deletions examples/language/roberta/preprocessing/sentence_split.py
@@ -10,26 +10,19 @@
import functools

def split_sentence(document: str, flag: str = "all", limit: int = 510) -> List[str]:
"""
Args:
document:
flag: Type:str, "all" splits on both Chinese and English punctuation, "zh" splits on Chinese punctuation only, "en" splits on English punctuation only
limit: maximum length of a single sentence, 510 characters by default
Returns: Type:list
"""
sent_list = []
try:
if flag == "zh":
document = re.sub('(?P<quotation_mark>([。?!…](?![”’"\'])))', r'\g<quotation_mark>\n', document) # Single-character sentence delimiters
document = re.sub('(?P<quotation_mark>([。?!]|…{1,2})[”’"\'])', r'\g<quotation_mark>\n', document) # Special quotation marks
document = re.sub('(?P<quotation_mark>([。?!…](?![”’"\'])))', r'\g<quotation_mark>\n', document)
document = re.sub('(?P<quotation_mark>([。?!]|…{1,2})[”’"\'])', r'\g<quotation_mark>\n', document)
elif flag == "en":
document = re.sub('(?P<quotation_mark>([.?!](?![”’"\'])))', r'\g<quotation_mark>\n', document) # English single-character sentence delimiters
document = re.sub('(?P<quotation_mark>([?!.]["\']))', r'\g<quotation_mark>\n', document) # Special quotation marks
document = re.sub('(?P<quotation_mark>([.?!](?![”’"\'])))', r'\g<quotation_mark>\n', document)
document = re.sub('(?P<quotation_mark>([?!.]["\']))', r'\g<quotation_mark>\n', document) # Special quotation marks
else:
document = re.sub('(?P<quotation_mark>([。?!….?!](?![”’"\'])))', r'\g<quotation_mark>\n', document) # Single-character sentence delimiters
document = re.sub('(?P<quotation_mark>([。?!….?!](?![”’"\'])))', r'\g<quotation_mark>\n', document)

document = re.sub('(?P<quotation_mark>(([。?!.!?]|…{1,2})[”’"\']))', r'\g<quotation_mark>\n',
document) # Special quotation marks
document) # Special quotation marks

sent_list_ori = document.splitlines()
for sent in sent_list_ori:
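
A small usage sketch for `split_sentence` (illustrative only; the sample text is an assumption, and it assumes the script is run from the `preprocessing` directory so the module is importable):

```python
from sentence_split import split_sentence

text = "今天天气很好。我们去公园吧!Do you want to join? Sure."
print(split_sentence(text, flag="all"))  # split on both Chinese and English punctuation
print(split_sentence(text, flag="zh"))   # split on Chinese punctuation only
```
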
12 changes: 5 additions & 7 deletions examples/language/roberta/preprocessing/tokenize_mask.py
@@ -15,8 +15,8 @@
def get_raw_instance(document, max_sequence_length=512):

"""
Get the preliminary training instances: split the whole document into multiple parts according to max_sequence_length and return them as multiple processed instances.
:param document: a whole document
Get the initial training instances: split the whole document into multiple parts according to max_sequence_length and return them as multiple processed instances.
:param document: document
:param max_sequence_length:
:return: a list. each element is a sequence of text
"""
@@ -26,10 +26,9 @@ def get_raw_instance(document, max_sequence_length=512):
sizes = [len(seq) for seq in document]

result_list = []
curr_seq = [] # the sequence currently being built
curr_seq = []
sz_idx = 0
while sz_idx < len(sizes):
# If the current sequence plus the new sentence stays within the maximum length, merge them; otherwise the limit is exceeded, so append the current sequence to the result list as a new sequence.

if len(curr_seq) + sizes[sz_idx] <= max_sequence_length_allowed: # or len(curr_seq)==0:
curr_seq += document[sz_idx]
@@ -43,14 +42,13 @@ def get_raw_instance(document, max_sequence_length=512):
else:
result_list.append(curr_seq)
curr_seq = []
# Handle the last sequence: if it is too short, discard it.

if len(curr_seq) > max_sequence_length_allowed / 2: # /2
result_list.append(curr_seq)

# # Compute how many instances can be obtained in total
# num_instance=int(len(big_list)/max_sequence_length_allowed)+1
# print("num_instance:",num_instance)
# # Split into multiple parts and append them to the list

# result_list=[]
# for j in range(num_instance):
# index=j*max_sequence_length_allowed
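
The packing loop in `get_raw_instance` above can be illustrated with a rough standalone sketch (not the repo's exact code; sentence lengths and the length budget are made up):

```python
# Greedy packing: keep appending sentences while they fit within the budget;
# once one does not fit, flush the accumulated sequence to the result list.
sentences = [list(range(n)) for n in (200, 180, 150, 300, 60)]  # fake tokenized sentences
budget = 510  # assumed per-sequence allowance after special tokens

result, curr = [], []
for sent in sentences:
    if len(curr) + len(sent) <= budget:
        curr += sent
    else:
        result.append(curr)
        curr = list(sent)
if len(curr) > budget / 2:  # keep the final, possibly short, sequence only if it is long enough
    result.append(curr)

print([len(seq) for seq in result])  # [380, 510]
```
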
24 changes: 24 additions & 0 deletions examples/language/roberta/pretraining/arguments.py
@@ -6,6 +6,30 @@

def parse_args():
parser = colossalai.get_default_parser()

parser.add_argument(
"--distplan",
type=str,
default='CAI_Gemini',
help="The distributed plan [colossalai, zero1, zero2, torch_ddp, torch_zero].",
)
parser.add_argument(
"--tp_degree",
type=int,
default=1,
help="Tensor Parallelism Degree. Valid when using colossalai as dist plan.",
)
parser.add_argument(
"--placement",
type=str,
default='cpu',
help="Placement Policy for Gemini. Valid when using colossalai as dist plan.",
)
parser.add_argument(
"--shardinit",
action='store_true',
help="Shard the tensors when init the model to shrink peak memory size on the assigned device. Valid when using colossalai as dist plan.",
)

parser.add_argument(
'--lr',
12 changes: 6 additions & 6 deletions examples/language/roberta/pretraining/evaluation.py
@@ -5,11 +5,11 @@
from utils.global_vars import get_timers, get_tensorboard_writer
from nvidia_bert_dataset_provider import NvidiaBertDatasetProvider

def evaluate(engine, args, logger, global_step):
def evaluate(model, args, logger, global_step, criterion):
evaluate_dataset_provider = NvidiaBertDatasetProvider(args, evaluate=True)
start_shard = 0

engine.eval()
model.eval()
timers = get_timers()
eval_step = 0
eval_loss = 0
@@ -39,9 +39,9 @@ def evaluate(engine, args, logger, global_step):
mlm_label = batch_data[3].cuda()
# nsp_label = batch_data[5].cuda()

output = engine(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
output = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)

loss = engine.criterion(output.logits, mlm_label)#prediction_scores
loss = criterion(output.logits, mlm_label)#prediction_scores
evaluate_dataset_provider.prefetch_batch()

eval_loss += loss.float().item()
@@ -67,5 +67,5 @@ def evaluate(engine, args, logger, global_step):
logger.info('')

evaluate_dataset_provider.release_shard()
engine.train()
return cur_loss
model.train()
return cur_loss
4 changes: 2 additions & 2 deletions examples/language/roberta/pretraining/pretrain_utils.py
@@ -5,7 +5,7 @@
from transformers import BertForPreTraining, RobertaForMaskedLM, RobertaConfig
from transformers import GPT2Config, GPT2LMHeadModel
from transformers import AutoTokenizer, AutoModelForMaskedLM
from colossalai.nn.optimizer import FusedAdam
from colossalai.nn.optimizer import FusedAdam, HybridAdam
from torch.optim import AdamW
from colossalai.core import global_context as gpc
import torch
@@ -83,7 +83,7 @@ def get_optimizer(model, lr):
'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
'weight_decay': 0.0
}]
optimizer = FusedAdam(optimizer_grouped_parameters, lr=lr, betas=[0.9, 0.95])
optimizer = HybridAdam(optimizer_grouped_parameters, lr=lr, betas=[0.9, 0.95])
return optimizer


2 changes: 0 additions & 2 deletions examples/language/roberta/pretraining/run_pretrain.sh
@@ -7,7 +7,6 @@ tensorboard_path="$root_path/tensorboard"
log_path="$root_path/exp_log"
ckpt_path="$root_path/ckpt"

colossal_config="$root_path/../configs/colossalai_ddp.py"

mkdir -p $tensorboard_path
mkdir -p $log_path
@@ -32,7 +31,6 @@ env OMP_NUM_THREADS=40 colossalai run --hostfile ./hostfile \
--tensorboard_path $tensorboard_path \
--log_path $log_path \
--ckpt_path $ckpt_path \
--colossal_config $colossal_config \
--log_interval 50 \
--mlm bert \
--wandb \
@@ -7,7 +7,6 @@ tensorboard_path="$root_path/tensorboard"
log_path="$root_path/exp_log"
ckpt_path="$root_path/ckpt"

colossal_config="$root_path/../configs/colossalai_ddp.py"

mkdir -p $tensorboard_path
mkdir -p $log_path
@@ -32,7 +31,6 @@ env OMP_NUM_THREADS=40 colossalai run --hostfile ./hostfile \
--tensorboard_path $tensorboard_path \
--log_path $log_path \
--ckpt_path $ckpt_path \
--colossal_config $colossal_config \
--log_interval 50 \
--mlm bert \
--wandb \