16 changes: 4 additions & 12 deletions examples/language/roberta/README.md
@@ -1,9 +1,9 @@
# Introduction
This repo introduces how to pretrain a Chinese roberta-large from scratch, including preprocessing, pretraining, and finetuning. The repo can help you quickly train a high-quality BERT model.
This example introduces how to pretrain RoBERTa from scratch, including preprocessing, pretraining, and finetuning. The example can help you quickly train a high-quality RoBERTa model.

## 0. Prerequisite
- Install Colossal-AI
- Edit the port in /etc/ssh/sshd_config and /etc/ssh/ssh_config so that every host exposes the same SSH port for both server and client. If you are a root user, also set **PermitRootLogin** in /etc/ssh/sshd_config to "yes"
- Edit the port in `/etc/ssh/sshd_config` and `/etc/ssh/ssh_config` so that every host exposes the same SSH port for both server and client. If you are a root user, also set **PermitRootLogin** in `/etc/ssh/sshd_config` to "yes"
- Ensure that each host can log in to every other host without a password. If you have n hosts, this needs to be done n<sup>2</sup> times

```
@@ -33,7 +33,7 @@ service ssh restart
```bash
cd preprocessing
```
Following the `README.md`, preprocess the original corpus into h5py+numpy format.
Following the `README.md`, preprocess the original corpus into h5py plus numpy format.

## 2. Pretrain

@@ -47,12 +47,4 @@ following the `README.md`, load the h5py generated by preprocess of step 1 to pr
The checkpoint produced by this repo can directly replace `pytorch_model.bin` from [hfl/chinese-roberta-wwm-ext-large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large/tree/main). Then use transformers from Hugging Face to finetune downstream applications.
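
A minimal loading sketch (the local directory name here is an assumption; first copy the produced checkpoint over `pytorch_model.bin` inside a local copy of that model):

```python
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Hypothetical local copy of hfl/chinese-roberta-wwm-ext-large whose
# pytorch_model.bin has been replaced with the checkpoint from this example.
model_dir = "./chinese-roberta-wwm-ext-large"

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForMaskedLM.from_pretrained(model_dir)
model.train()  # ready to be finetuned on a downstream task
```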

## Contributors
The repo is contributed by the AI team from [Moore Threads](https://www.mthreads.com/). If you find any problems with pretraining, please file an issue or send an email to yehua.zhang@mthreads.com. Finally, any form of contribution is welcome!

```
@misc{
title={A simple Chinese RoBERTa Example for Whole Word Masked},
author={Yehua Zhang, Chen Zhang},
year={2022}
}
```
The example is contributed by the AI team from [Moore Threads](https://www.mthreads.com/). If you find any problems with pretraining, please file an issue or send an email to yehua.zhang@mthreads.com. Finally, any form of contribution is welcome!
9 changes: 0 additions & 9 deletions examples/language/roberta/configs/colossalai_ddp.py

This file was deleted.

37 changes: 0 additions & 37 deletions examples/language/roberta/configs/colossalai_zero.py

This file was deleted.

7 changes: 3 additions & 4 deletions examples/language/roberta/preprocessing/get_mask.py
@@ -163,16 +163,15 @@ def create_masked_lm_predictions(self, tokens):

def get_new_segment(self, segment):
"""
Input a sentence and return a processed sentence: to support Chinese whole-word masking, words that have been split apart are given a special mark ("#") so that downstream processing modules know which characters belong to the same word.
:param segment: a sentence
:return: a processed sentence
Input a sentence and return a processed sentence: to support Chinese whole-word masking, characters belonging to the same word are tagged with a special mark ("#") so that the subsequent processing module knows which characters form one word.
:param segment: a sentence
"""
seq_cws = jieba.lcut(''.join(segment))
seq_cws_dict = {x: 1 for x in seq_cws}
new_segment = []
i = 0
while i < len(segment):
if len(self.rec.findall(segment[i])) == 0: # Not Chinese: append the original text as is.
if len(self.rec.findall(segment[i])) == 0:
new_segment.append(segment[i])
i += 1
continue
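
The `get_new_segment` docstring above describes whole-word marking for Chinese text. A rough standalone illustration of the idea (not the repo's exact code; the sample text and the "##" marker are assumptions):

```python
import jieba

# Segment the text with jieba, then tag the non-leading characters of each
# word so that a later masking step can mask whole words together.
segment = list("使用全词掩码预训练模型")
marked = []
for word in jieba.lcut("".join(segment)):
    for i, ch in enumerate(word):
        marked.append(ch if i == 0 else "##" + ch)
print(marked)  # non-leading characters of multi-character words carry the "##" tag
```
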
19 changes: 6 additions & 13 deletions examples/language/roberta/preprocessing/sentence_split.py
@@ -10,26 +10,19 @@
import functools

def split_sentence(document: str, flag: str = "all", limit: int = 510) -> List[str]:
"""
Args:
document:
flag: Type:str, "all" splits on both Chinese and English punctuation, "zh" splits on Chinese punctuation only, "en" splits on English punctuation only
limit: maximum length of a single sentence, 510 characters by default
Returns: Type:list
"""
sent_list = []
try:
if flag == "zh":
document = re.sub('(?P<quotation_mark>([。?!…](?![”’"\'])))', r'\g<quotation_mark>\n', document) # Single-character sentence delimiters
document = re.sub('(?P<quotation_mark>([。?!]|…{1,2})[”’"\'])', r'\g<quotation_mark>\n', document) # Special quotation marks
document = re.sub('(?P<quotation_mark>([。?!…](?![”’"\'])))', r'\g<quotation_mark>\n', document)
document = re.sub('(?P<quotation_mark>([。?!]|…{1,2})[”’"\'])', r'\g<quotation_mark>\n', document)
elif flag == "en":
document = re.sub('(?P<quotation_mark>([.?!](?![”’"\'])))', r'\g<quotation_mark>\n', document) # English single-character sentence delimiters
document = re.sub('(?P<quotation_mark>([?!.]["\']))', r'\g<quotation_mark>\n', document) # Special quotation marks
document = re.sub('(?P<quotation_mark>([.?!](?![”’"\'])))', r'\g<quotation_mark>\n', document)
document = re.sub('(?P<quotation_mark>([?!.]["\']))', r'\g<quotation_mark>\n', document) # Special quotation marks
else:
document = re.sub('(?P<quotation_mark>([。?!….?!](?![”’"\'])))', r'\g<quotation_mark>\n', document) # Single-character sentence delimiters
document = re.sub('(?P<quotation_mark>([。?!….?!](?![”’"\'])))', r'\g<quotation_mark>\n', document)

document = re.sub('(?P<quotation_mark>(([。?!.!?]|…{1,2})[”’"\']))', r'\g<quotation_mark>\n',
document) # Special quotation marks
document) # Special quotation marks

sent_list_ori = document.splitlines()
for sent in sent_list_ori:
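
A small usage sketch for `split_sentence` (illustrative only; the sample text is an assumption, and it assumes the script is run from the `preprocessing` directory so the module is importable):

```python
from sentence_split import split_sentence

text = "今天天气很好。我们去公园吧!Do you want to join? Sure."
print(split_sentence(text, flag="all"))  # split on both Chinese and English punctuation
print(split_sentence(text, flag="zh"))   # split on Chinese punctuation only
```
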
12 changes: 5 additions & 7 deletions examples/language/roberta/preprocessing/tokenize_mask.py
@@ -15,8 +15,8 @@
def get_raw_instance(document, max_sequence_length=512):

"""
Get the preliminary training instances: split the whole document into multiple parts according to max_sequence_length and return them as multiple processed instances.
:param document: a whole document
Get the initial training instances: split the whole document into multiple parts according to max_sequence_length and return them as multiple processed instances.
:param document: document
:param max_sequence_length:
:return: a list. each element is a sequence of text
"""
@@ -26,10 +26,9 @@ def get_raw_instance(document, max_sequence_length=512):
sizes = [len(seq) for seq in document]

result_list = []
curr_seq = [] # the sequence currently being built
curr_seq = []
sz_idx = 0
while sz_idx < len(sizes):
# If the current sequence plus the new sentence stays within the maximum length, merge them; otherwise the limit is exceeded, so append the current sequence to the result list as a new sequence.

if len(curr_seq) + sizes[sz_idx] <= max_sequence_length_allowed: # or len(curr_seq)==0:
curr_seq += document[sz_idx]
@@ -43,14 +42,13 @@ def get_raw_instance(document, max_sequence_length=512):
else:
result_list.append(curr_seq)
curr_seq = []
# Handle the last sequence: if it is too short, discard it.

if len(curr_seq) > max_sequence_length_allowed / 2: # /2
result_list.append(curr_seq)

# # Compute how many instances can be obtained in total
# num_instance=int(len(big_list)/max_sequence_length_allowed)+1
# print("num_instance:",num_instance)
# # Split into multiple parts and append them to the list

# result_list=[]
# for j in range(num_instance):
# index=j*max_sequence_length_allowed
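
The packing loop in `get_raw_instance` above can be illustrated with a rough standalone sketch (not the repo's exact code; sentence lengths and the length budget are made up):

```python
# Greedy packing: keep appending sentences while they fit within the budget;
# once one does not fit, flush the accumulated sequence to the result list.
sentences = [list(range(n)) for n in (200, 180, 150, 300, 60)]  # fake tokenized sentences
budget = 510  # assumed per-sequence allowance after special tokens

result, curr = [], []
for sent in sentences:
    if len(curr) + len(sent) <= budget:
        curr += sent
    else:
        result.append(curr)
        curr = list(sent)
if len(curr) > budget / 2:  # keep the final, possibly short, sequence only if it is long enough
    result.append(curr)

print([len(seq) for seq in result])  # [380, 510]
```
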
24 changes: 24 additions & 0 deletions examples/language/roberta/pretraining/arguments.py
@@ -6,6 +6,30 @@

def parse_args():
parser = colossalai.get_default_parser()

parser.add_argument(
"--distplan",
type=str,
default='CAI_Gemini',
help="The distributed plan [colossalai, zero1, zero2, torch_ddp, torch_zero].",
)
parser.add_argument(
"--tp_degree",
type=int,
default=1,
help="Tensor Parallelism Degree. Valid when using colossalai as dist plan.",
)
parser.add_argument(
"--placement",
type=str,
default='cpu',
help="Placement Policy for Gemini. Valid when using colossalai as dist plan.",
)
parser.add_argument(
"--shardinit",
action='store_true',
help="Shard the tensors when init the model to shrink peak memory size on the assigned device. Valid when using colossalai as dist plan.",
)

parser.add_argument(
'--lr',
12 changes: 6 additions & 6 deletions examples/language/roberta/pretraining/evaluation.py
@@ -5,11 +5,11 @@
from utils.global_vars import get_timers, get_tensorboard_writer
from nvidia_bert_dataset_provider import NvidiaBertDatasetProvider

def evaluate(engine, args, logger, global_step):
def evaluate(model, args, logger, global_step, criterion):
evaluate_dataset_provider = NvidiaBertDatasetProvider(args, evaluate=True)
start_shard = 0

engine.eval()
model.eval()
timers = get_timers()
eval_step = 0
eval_loss = 0
@@ -39,9 +39,9 @@ def evaluate(engine, args, logger, global_step):
mlm_label = batch_data[3].cuda()
# nsp_label = batch_data[5].cuda()

output = engine(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
output = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)

loss = engine.criterion(output.logits, mlm_label)#prediction_scores
loss = criterion(output.logits, mlm_label)#prediction_scores
evaluate_dataset_provider.prefetch_batch()

eval_loss += loss.float().item()
@@ -67,5 +67,5 @@ def evaluate(engine, args, logger, global_step):
logger.info('')

evaluate_dataset_provider.release_shard()
engine.train()
return cur_loss
model.train()
return cur_loss
4 changes: 2 additions & 2 deletions examples/language/roberta/pretraining/pretrain_utils.py
@@ -5,7 +5,7 @@
from transformers import BertForPreTraining, RobertaForMaskedLM, RobertaConfig
from transformers import GPT2Config, GPT2LMHeadModel
from transformers import AutoTokenizer, AutoModelForMaskedLM
from colossalai.nn.optimizer import FusedAdam
from colossalai.nn.optimizer import FusedAdam, HybridAdam
from torch.optim import AdamW
from colossalai.core import global_context as gpc
import torch
@@ -83,7 +83,7 @@ def get_optimizer(model, lr):
'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
'weight_decay': 0.0
}]
optimizer = FusedAdam(optimizer_grouped_parameters, lr=lr, betas=[0.9, 0.95])
optimizer = HybridAdam(optimizer_grouped_parameters, lr=lr, betas=[0.9, 0.95])
return optimizer


2 changes: 0 additions & 2 deletions examples/language/roberta/pretraining/run_pretrain.sh
@@ -7,7 +7,6 @@ tensorboard_path="$root_path/tensorboard"
log_path="$root_path/exp_log"
ckpt_path="$root_path/ckpt"

colossal_config="$root_path/../configs/colossalai_ddp.py"

mkdir -p $tensorboard_path
mkdir -p $log_path
@@ -32,7 +31,6 @@ env OMP_NUM_THREADS=40 colossalai run --hostfile ./hostfile \
--tensorboard_path $tensorboard_path \
--log_path $log_path \
--ckpt_path $ckpt_path \
--colossal_config $colossal_config \
--log_interval 50 \
--mlm bert \
--wandb \
@@ -7,7 +7,6 @@ tensorboard_path="$root_path/tensorboard"
log_path="$root_path/exp_log"
ckpt_path="$root_path/ckpt"

colossal_config="$root_path/../configs/colossalai_ddp.py"

mkdir -p $tensorboard_path
mkdir -p $log_path
@@ -32,7 +31,6 @@ env OMP_NUM_THREADS=40 colossalai run --hostfile ./hostfile \
--tensorboard_path $tensorboard_path \
--log_path $log_path \
--ckpt_path $ckpt_path \
--colossal_config $colossal_config \
--log_interval 50 \
--mlm bert \
--wandb \