From 7e10f4a929b030d00f264b4ed015870a39d06040 Mon Sep 17 00:00:00 2001 From: FoolPlayer <498107402@qq.com> Date: Thu, 7 Sep 2023 14:28:39 +0800 Subject: [PATCH 1/9] add gpt2 HybridParallelPlugin example --- .../language/gpt/hybridparallelism/data.py | 127 +++++++ .../gpt_hybridparallelism.py | 311 ++++++++++++++++++ .../language/gpt/hybridparallelism/run.sh | 2 + 3 files changed, 440 insertions(+) create mode 100644 examples/language/gpt/hybridparallelism/data.py create mode 100644 examples/language/gpt/hybridparallelism/gpt_hybridparallelism.py create mode 100644 examples/language/gpt/hybridparallelism/run.sh diff --git a/examples/language/gpt/hybridparallelism/data.py b/examples/language/gpt/hybridparallelism/data.py new file mode 100644 index 000000000000..981cedcca8c2 --- /dev/null +++ b/examples/language/gpt/hybridparallelism/data.py @@ -0,0 +1,127 @@ +import datasets +from transformers import AutoTokenizer, PreTrainedTokenizer + +from colossalai.booster.plugin.dp_plugin_base import DPPluginBase + + +class GLUEDataBuilder: + + task_text_field_map = { + "cola": ["sentence"], + "sst2": ["sentence"], + "mrpc": ["sentence1", "sentence2"], + "qqp": ["question1", "question2"], + "stsb": ["sentence1", "sentence2"], + "mnli": ["premise", "hypothesis"], + "qnli": ["question", "sentence"], + "rte": ["sentence1", "sentence2"], + "wnli": ["sentence1", "sentence2"], + "ax": ["premise", "hypothesis"], + } + + glue_task_num_labels = { + "cola": 2, + "sst2": 2, + "mrpc": 2, + "qqp": 2, + "stsb": 1, + "mnli": 3, + "qnli": 2, + "rte": 2, + "wnli": 2, + "ax": 3, + } + + loader_columns = [ + "datasets_idx", + "input_ids", + "token_type_ids", + "attention_mask", + "start_positions", + "end_positions", + "labels", + ] + + def __init__( + self, + model_name_or_path: str, + plugin: DPPluginBase, + task_name: str = "mrpc", + max_seq_length: int = 128, + train_batch_size: int = 32, + eval_batch_size: int = 32, + **kwargs, + ): + super().__init__() + self.model_name_or_path = 
model_name_or_path + self.task_name = task_name + self.max_seq_length = max_seq_length + self.train_batch_size = train_batch_size + self.eval_batch_size = eval_batch_size + self.plugin = plugin + + self.text_fields = self.task_text_field_map[task_name] + self.num_labels = self.glue_task_num_labels[task_name] + self.tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True) + self.setup() + + def setup(self): + self.dataset = datasets.load_dataset("glue", self.task_name) + + for split in self.dataset.keys(): + self.dataset[split] = self.dataset[split].map( + self.convert_to_features, + batched=True, + remove_columns=["label"], + ) + self.columns = [c for c in self.dataset[split].column_names if c in self.loader_columns] + self.dataset[split].set_format(type="torch", columns=self.columns) + + self.eval_splits = [x for x in self.dataset.keys() if "validation" in x] + + def prepare_data(self): + datasets.load_dataset("glue", self.task_name) + AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True) + + def train_dataloader(self): + return self.plugin.prepare_dataloader(self.dataset["train"], + batch_size=self.train_batch_size, + shuffle=True, + drop_last=True) + + def val_dataloader(self): + if len(self.eval_splits) == 1: + return self.plugin.prepare_dataloader(self.dataset["validation"], batch_size=self.eval_batch_size) + elif len(self.eval_splits) > 1: + return [ + self.plugin.prepare_dataloader(self.dataset[x], batch_size=self.eval_batch_size) + for x in self.eval_splits + ] + + def test_dataloader(self): + if len(self.eval_splits) == 1: + return self.plugin.prepare_dataloader(self.dataset["test"], batch_size=self.eval_batch_size) + elif len(self.eval_splits) > 1: + return [ + self.plugin.prepare_dataloader(self.dataset[x], batch_size=self.eval_batch_size) + for x in self.eval_splits + ] + + def convert_to_features(self, example_batch): + + # Either encode single sentence or sentence pairs + if 
len(self.text_fields) > 1: + texts_or_text_pairs = list(zip(example_batch[self.text_fields[0]], example_batch[self.text_fields[1]])) + else: + texts_or_text_pairs = example_batch[self.text_fields[0]] + + # Tokenize the text/text pairs + features = self.tokenizer.batch_encode_plus(texts_or_text_pairs, + max_length=self.max_seq_length, + padding='max_length', + truncation=True) + + # Rename label to labels to make it easier to pass to model forward + features["labels"] = example_batch["label"] + + return features diff --git a/examples/language/gpt/hybridparallelism/gpt_hybridparallelism.py b/examples/language/gpt/hybridparallelism/gpt_hybridparallelism.py new file mode 100644 index 000000000000..1215ca225b81 --- /dev/null +++ b/examples/language/gpt/hybridparallelism/gpt_hybridparallelism.py @@ -0,0 +1,311 @@ +import argparse +from contextlib import nullcontext +from typing import Callable, List, Union + +import evaluate +import torch +import torch.distributed as dist +import torch.nn as nn +from data import GLUEDataBuilder +from torch.optim import Adam, Optimizer +from torch.optim.lr_scheduler import _LRScheduler as LRScheduler +from torch.utils.data import DataLoader +from tqdm import tqdm +from transformers import AutoConfig, GPT2ForSequenceClassification, get_linear_schedule_with_warmup + +import colossalai +from colossalai.booster import Booster +from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin, TorchDDPPlugin +from colossalai.cluster import DistCoordinator +from colossalai.lazy import LazyInitContext +from colossalai.nn.optimizer import HybridAdam +from colossalai.utils import get_current_device + +# ============================== +# Prepare Hyperparameters +# ============================== +NUM_EPOCHS = 3 +BATCH_SIZE = 32 +LEARNING_RATE = 2.4e-5 +WEIGHT_DECAY = 0.01 +WARMUP_FRACTION = 0.1 + +output_transform_fn = lambda x: x +criterion = lambda x: x.loss + + +def move_to_cuda(batch): + return {k: v.cuda() for k, v in 
batch.items()} + + +@torch.no_grad() +def evaluate_model( + model: nn.Module, + optimizer, + criterion, + test_dataloader: Union[DataLoader, List[DataLoader]], + num_labels: int, + task_name: str, + eval_splits: List[str], + booster: Booster, + coordinator: DistCoordinator, +): + metric = evaluate.load("glue", task_name, process_id=coordinator.rank, num_process=coordinator.world_size) + model.eval() + + def evaluate_subset(dataloader: DataLoader): + accum_loss = torch.zeros(1, device=get_current_device()) + for batch in dataloader: + batch = move_to_cuda(batch) + labels = batch["labels"] + batch_size = batch["input_ids"].shape[0] + if hasattr(booster.plugin, "stage_manager") and booster.plugin.stage_manager is not None: + pg_mesh = booster.plugin.pg_mesh + pp_group = booster.plugin.pp_group + current_pp_group_ranks = pg_mesh.get_ranks_in_group(pp_group) + current_rank = dist.get_rank() + #TODO pass dataloader to execute_pipeline directly + batch = iter([batch]) + outputs = booster.execute_pipeline(batch, + model, + criterion, + optimizer, + return_loss=True, + return_outputs=True) + + if booster.plugin.stage_manager.is_last_stage(): + val_loss = outputs["loss"] + + logits = outputs["outputs"]["logits"] + + accum_loss.add_(val_loss) + + if num_labels > 1: + preds = torch.argmax(logits, axis=1) + elif num_labels == 1: + preds = logits.squeeze() + + dist.broadcast(preds, src=current_rank, group=pp_group) + dist.broadcast(val_loss, src=current_rank, group=pp_group) + + metric.add_batch(predictions=preds, references=labels) + elif current_rank in current_pp_group_ranks: + val_loss = torch.empty((1,), device=get_current_device()) + preds = torch.empty((batch_size,), dtype=torch.int64, device=get_current_device()) + + dist.broadcast(preds, src=current_pp_group_ranks[-1], group=pp_group) + dist.broadcast(val_loss, src=current_pp_group_ranks[-1], group=pp_group) + + accum_loss.add_(val_loss) + metric.add_batch(predictions=preds, references=labels) + + else: + batch = 
move_to_cuda(batch) + outputs = model(**batch) + val_loss, logits = outputs[:2] + accum_loss.add_(val_loss) + + if num_labels > 1: + preds = torch.argmax(logits, axis=1) + elif num_labels == 1: + preds = logits.squeeze() + + metric.add_batch(predictions=preds, references=labels) + + results = metric.compute() + dist.all_reduce(accum_loss.div_(len(dataloader))) + if coordinator.is_master() and results is not None: + results['loss'] = accum_loss.item() / coordinator.world_size + + return results + + if isinstance(test_dataloader, DataLoader): + return evaluate_subset(test_dataloader) + else: + assert len(test_dataloader) == len(eval_splits) + final_results = {} + for split, sub_loader in zip(eval_splits, test_dataloader): + results = evaluate_subset(sub_loader) + final_results.update({f'{k}_{split}': v for k, v in results.items()}) + return final_results + + +def train_epoch(epoch: int, model: nn.Module, optimizer: Optimizer, _criterion: Callable, lr_scheduler: LRScheduler, + train_dataloader: DataLoader, booster: Booster, coordinator: DistCoordinator): + + model.train() + is_pp_last_stage = hasattr( + booster.plugin, + "stage_manager") and booster.plugin.stage_manager is not None and booster.plugin.stage_manager.is_last_stage() + with tqdm(train_dataloader, + desc=f'Epoch [{epoch + 1}/{NUM_EPOCHS}]', + disable=not (coordinator.is_master() or is_pp_last_stage)) as pbar: + for batch in pbar: + # Forward pass + batch = move_to_cuda(batch) + if hasattr(booster.plugin, "stage_manager") and booster.plugin.stage_manager is not None: + #TODO pass train_dataloader to execute_pipeline directly + batch = iter([batch]) + outputs = booster.execute_pipeline(batch, + model, + _criterion, + optimizer, + return_loss=True, + return_outputs=True) + # Backward and optimize + if booster.plugin.stage_manager.is_last_stage(): + loss = outputs['loss'] + pbar.set_postfix({'loss': loss.item()}) + else: + outputs = model(**batch) + loss = _criterion(outputs, None) + # Backward + 
booster.backward(loss, optimizer) + pbar.set_postfix({'loss': loss.item()}) + + optimizer.step() + optimizer.zero_grad() + lr_scheduler.step() + + +def main(): + # ============================== + # Parse Arguments + # ============================== + parser = argparse.ArgumentParser() + parser.add_argument('-t', '--task', default='mrpc', help="GLUE task to run") + parser.add_argument('-p', + '--plugin', + type=str, + default='torch_ddp', + choices=['torch_ddp', 'torch_ddp_fp16', 'gemini', 'low_level_zero', 'hybrid_parallel'], + help="plugin to use") + parser.add_argument( + "--model_type", + type=str, + default="gpt2", + help="only support gpt2", + ) + parser.add_argument('--target_f1', type=float, default=None, help="target f1 score. Raise exception if not reached") + parser.add_argument('--use_lazy_init', type=bool, default=False, help="for initiating lazy init context") + parser.add_argument('--pretrained_path', type=str, default=None, help="The path so save the pretrained weight") + args = parser.parse_args() + + if args.model_type == 'gpt2': + model_name = "gpt2" + else: + raise RuntimeError + # ============================== + # Launch Distributed Environment + # ============================== + colossalai.launch_from_torch(config={}, seed=42) + coordinator = DistCoordinator() + + # local_batch_size = BATCH_SIZE // coordinator.world_size + lr = LEARNING_RATE * coordinator.world_size + + # ============================== + # Instantiate Plugin and Booster + # ============================== + booster_kwargs = {} + if args.plugin == 'torch_ddp_fp16': + booster_kwargs['mixed_precision'] = 'fp16' + if args.plugin.startswith('torch_ddp'): + plugin = TorchDDPPlugin() + elif args.plugin == 'gemini': + plugin = GeminiPlugin(initial_scale=2**5) + elif args.plugin == 'low_level_zero': + plugin = LowLevelZeroPlugin(initial_scale=2**5) + elif args.plugin == 'hybrid_parallel': + + # modify the param accordingly for finetuning test cases + plugin = 
HybridParallelPlugin(tp_size=1, + pp_size=2, + num_microbatches=None, + microbatch_size=1, + enable_all_optimization=True, + zero_stage=1, + precision='fp16', + initial_scale=1) + + booster = Booster(plugin=plugin, **booster_kwargs) + + # ============================== + # Prepare Dataloader + # ============================== + data_builder = GLUEDataBuilder(model_name, + plugin, + args.task, + train_batch_size=BATCH_SIZE, + eval_batch_size=BATCH_SIZE) + train_dataloader = data_builder.train_dataloader() + test_dataloader = data_builder.test_dataloader() + + # ==================================== + # Prepare model, optimizer + # ==================================== + # bert pretrained model + + if model_name == "gpt2": + if args.pretrained_path is None: + cfg = AutoConfig.from_pretrained(model_name, num_labels=data_builder.num_labels) + model = GPT2ForSequenceClassification.from_pretrained(model_name, config=cfg).cuda() + else: + model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_path, local_files_only=True).cuda() + else: + raise RuntimeError + + # optimizer + no_decay = ["bias", "LayerNorm.weight"] + optimizer_grouped_parameters = [ + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": WEIGHT_DECAY, + }, + { + "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], + "weight_decay": 0.0, + }, + ] + + optimizer = HybridAdam(optimizer_grouped_parameters, lr=lr, eps=1e-8) + + # lr scheduler + total_steps = len(train_dataloader) * NUM_EPOCHS + num_warmup_steps = int(WARMUP_FRACTION * total_steps) + lr_scheduler = get_linear_schedule_with_warmup( + optimizer, + num_warmup_steps=num_warmup_steps, + num_training_steps=total_steps, + ) + + def _criterion(outputs, inputs): + outputs = output_transform_fn(outputs) + loss = criterion(outputs) + return loss + + # ============================== + # Boost with ColossalAI + # ============================== + model, 
optimizer, _criterion, _, lr_scheduler = booster.boost(model, + optimizer, + criterion=_criterion, + lr_scheduler=lr_scheduler) + + # ============================== + # Train model + # ============================== + for epoch in range(NUM_EPOCHS): + train_epoch(epoch, model, optimizer, _criterion, lr_scheduler, train_dataloader, booster, coordinator) + + results = evaluate_model(model, optimizer, _criterion, test_dataloader, data_builder.num_labels, args.task, + data_builder.eval_splits, booster, coordinator) + + if coordinator.is_master(): + print(results) + if args.target_f1 is not None and 'f1' in results: + assert results['f1'] >= args.target_f1, f'f1 score {results["f1"]} is lower than target {args.target_f1}' + + +if __name__ == '__main__': + main() diff --git a/examples/language/gpt/hybridparallelism/run.sh b/examples/language/gpt/hybridparallelism/run.sh new file mode 100644 index 000000000000..4dd726bf37b4 --- /dev/null +++ b/examples/language/gpt/hybridparallelism/run.sh @@ -0,0 +1,2 @@ +torchrun --standalone --nproc_per_node 4 --master_port 29800 gpt_hybridparallelism.py --target_f1 0.6 --plugin hybrid_parallel --model_type "gpt2" +# torchrun --standalone --nproc_per_node 4 --master_port 29800 gpt_hybridparallelism.py --target_f1 0.6 --plugin hybrid_parallel --model_type "gpt2" --pretrained_path "you/path/to/pretrained_model" From 6c798a1225888ca3f0a2fc20137c779beac728ea Mon Sep 17 00:00:00 2001 From: FoolPlayer <498107402@qq.com> Date: Thu, 14 Sep 2023 10:07:46 +0800 Subject: [PATCH 2/9] update readme and testci --- examples/language/gpt/README.md | 10 ++++++++++ examples/language/gpt/hybridparallelism/run.sh | 5 ++++- examples/language/gpt/test_ci.sh | 1 + 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/examples/language/gpt/README.md b/examples/language/gpt/README.md index 47d24a4d69cb..03679e66404a 100644 --- a/examples/language/gpt/README.md +++ b/examples/language/gpt/README.md @@ -65,6 +65,16 @@ Titans provides a customized GPT 
model, which uses distributed operators as buil In [./titans/README.md], we provide a hybrid parallelism of ZeRO, TP and PP. You can switch parallel strategies using a config file. +### Hybridparallelism + +Hybridparallelism provides a user-friendly plugin to set multiple parallelism methods for training and inference. In [./hybridparallelism], we provide an example to finetune gpt2 using Hybridparallelism. + +Quick run +```bash +cd ./hybridparallelism +bash run.sh +``` + ## Performance Testbed: a cluster of 8xA100 (80GB) and 1xAMD EPYC 7543 32-Core Processor (512 GB). GPUs are connected via PCI-e. diff --git a/examples/language/gpt/hybridparallelism/run.sh b/examples/language/gpt/hybridparallelism/run.sh index 4dd726bf37b4..f95ac8b8fcb7 100644 --- a/examples/language/gpt/hybridparallelism/run.sh +++ b/examples/language/gpt/hybridparallelism/run.sh @@ -1,2 +1,5 @@ +# load via internet torchrun --standalone --nproc_per_node 4 --master_port 29800 gpt_hybridparallelism.py --target_f1 0.6 --plugin hybrid_parallel --model_type "gpt2" -# torchrun --standalone --nproc_per_node 4 --master_port 29800 gpt_hybridparallelism.py --target_f1 0.6 --plugin hybrid_parallel --model_type "gpt2" --pretrained_path "you/path/to/pretrained_model" + +# load from local +# torchrun --standalone --nproc_per_node 4 --master_port 29800 gpt_hybridparallelism.py --target_f1 0.6 --plugin hybrid_parallel --model_type "gpt2" --pretrained_path "your/path/to/pretrained_model" diff --git a/examples/language/gpt/test_ci.sh b/examples/language/gpt/test_ci.sh index d67c17229e71..f95d267436dc 100644 --- a/examples/language/gpt/test_ci.sh +++ b/examples/language/gpt/test_ci.sh @@ -1,2 +1,3 @@ set -x cd gemini && bash test_ci.sh +cd hybridparallelism && bash run.sh From 0aca4a27f3328ad769653052a60ab852b5769cb0 Mon Sep 17 00:00:00 2001 From: FoolPlayer <498107402@qq.com> Date: Thu, 14 Sep 2023 15:50:15 +0800 Subject: [PATCH 3/9] update test ci --- .../gpt_hybridparallelism.py | 78 ++++++++----------- 
examples/language/gpt/test_ci.sh | 2 + 2 files changed, 35 insertions(+), 45 deletions(-) diff --git a/examples/language/gpt/hybridparallelism/gpt_hybridparallelism.py b/examples/language/gpt/hybridparallelism/gpt_hybridparallelism.py index 1215ca225b81..03e5ec91b3fe 100644 --- a/examples/language/gpt/hybridparallelism/gpt_hybridparallelism.py +++ b/examples/language/gpt/hybridparallelism/gpt_hybridparallelism.py @@ -41,7 +41,6 @@ def move_to_cuda(batch): @torch.no_grad() def evaluate_model( model: nn.Module, - optimizer, criterion, test_dataloader: Union[DataLoader, List[DataLoader]], num_labels: int, @@ -54,30 +53,24 @@ def evaluate_model( model.eval() def evaluate_subset(dataloader: DataLoader): + use_pipeline = isinstance(booster.plugin, HybridParallelPlugin) and booster.plugin.pp_size > 1 + is_pp_last_stage = use_pipeline and booster.plugin.stage_manager.is_last_stage() + accum_loss = torch.zeros(1, device=get_current_device()) for batch in dataloader: batch = move_to_cuda(batch) labels = batch["labels"] - batch_size = batch["input_ids"].shape[0] - if hasattr(booster.plugin, "stage_manager") and booster.plugin.stage_manager is not None: + if use_pipeline: pg_mesh = booster.plugin.pg_mesh pp_group = booster.plugin.pp_group current_pp_group_ranks = pg_mesh.get_ranks_in_group(pp_group) current_rank = dist.get_rank() - #TODO pass dataloader to execute_pipeline directly batch = iter([batch]) - outputs = booster.execute_pipeline(batch, - model, - criterion, - optimizer, - return_loss=True, - return_outputs=True) - - if booster.plugin.stage_manager.is_last_stage(): - val_loss = outputs["loss"] + outputs = booster.execute_pipeline(batch, model, criterion, return_loss=True, return_outputs=True) + if is_pp_last_stage: logits = outputs["outputs"]["logits"] - + val_loss = outputs["loss"] accum_loss.add_(val_loss) if num_labels > 1: @@ -85,19 +78,15 @@ def evaluate_subset(dataloader: DataLoader): elif num_labels == 1: preds = logits.squeeze() - dist.broadcast(preds, 
src=current_rank, group=pp_group) - dist.broadcast(val_loss, src=current_rank, group=pp_group) + dist.broadcast_object_list([preds, val_loss], src=current_pp_group_ranks[-1], group=pp_group) metric.add_batch(predictions=preds, references=labels) elif current_rank in current_pp_group_ranks: - val_loss = torch.empty((1,), device=get_current_device()) - preds = torch.empty((batch_size,), dtype=torch.int64, device=get_current_device()) + object_list = [None, None] + dist.broadcast_object_list(object_list, src=current_pp_group_ranks[-1], group=pp_group) - dist.broadcast(preds, src=current_pp_group_ranks[-1], group=pp_group) - dist.broadcast(val_loss, src=current_pp_group_ranks[-1], group=pp_group) - - accum_loss.add_(val_loss) - metric.add_batch(predictions=preds, references=labels) + metric.add_batch(predictions=object_list[0].to(get_current_device()), references=labels) + accum_loss.add_(object_list[1].to(get_current_device())) else: batch = move_to_cuda(batch) @@ -133,31 +122,33 @@ def evaluate_subset(dataloader: DataLoader): def train_epoch(epoch: int, model: nn.Module, optimizer: Optimizer, _criterion: Callable, lr_scheduler: LRScheduler, train_dataloader: DataLoader, booster: Booster, coordinator: DistCoordinator): + use_pipeline = isinstance(booster.plugin, HybridParallelPlugin) and booster.plugin.pp_size > 1 + is_pp_last_stage = use_pipeline and booster.plugin.stage_manager.is_last_stage() + total_step = len(train_dataloader) + model.train() - is_pp_last_stage = hasattr( - booster.plugin, - "stage_manager") and booster.plugin.stage_manager is not None and booster.plugin.stage_manager.is_last_stage() - with tqdm(train_dataloader, + optimizer.zero_grad() + train_dataloader_iter = iter(train_dataloader) + with tqdm(range(total_step), desc=f'Epoch [{epoch + 1}/{NUM_EPOCHS}]', disable=not (coordinator.is_master() or is_pp_last_stage)) as pbar: - for batch in pbar: - # Forward pass - batch = move_to_cuda(batch) - if hasattr(booster.plugin, "stage_manager") and 
booster.plugin.stage_manager is not None: - #TODO pass train_dataloader to execute_pipeline directly - batch = iter([batch]) - outputs = booster.execute_pipeline(batch, + # Forward pass + for _ in pbar: + if use_pipeline: + outputs = booster.execute_pipeline(train_dataloader_iter, model, _criterion, optimizer, return_loss=True, return_outputs=True) # Backward and optimize - if booster.plugin.stage_manager.is_last_stage(): + if is_pp_last_stage: loss = outputs['loss'] pbar.set_postfix({'loss': loss.item()}) else: - outputs = model(**batch) + data = next(train_dataloader_iter) + data = move_to_cuda(data) + outputs = model(**data) loss = _criterion(outputs, None) # Backward booster.backward(loss, optimizer) @@ -184,11 +175,10 @@ def main(): "--model_type", type=str, default="gpt2", - help="only support gpt2", + help="only gpt2 now", ) parser.add_argument('--target_f1', type=float, default=None, help="target f1 score. Raise exception if not reached") parser.add_argument('--use_lazy_init', type=bool, default=False, help="for initiating lazy init context") - parser.add_argument('--pretrained_path', type=str, default=None, help="The path so save the pretrained weight") args = parser.parse_args() if args.model_type == 'gpt2': @@ -244,14 +234,12 @@ def main(): # ==================================== # Prepare model, optimizer # ==================================== - # bert pretrained model + # gpt2 pretrained model + + cfg = AutoConfig.from_pretrained(model_name, num_labels=data_builder.num_labels) if model_name == "gpt2": - if args.pretrained_path is None: - cfg = AutoConfig.from_pretrained(model_name, num_labels=data_builder.num_labels) - model = GPT2ForSequenceClassification.from_pretrained(model_name, config=cfg).cuda() - else: - model = GPT2ForSequenceClassification.from_pretrained(args.pretrained_path, local_files_only=True).cuda() + model = GPT2ForSequenceClassification.from_pretrained(model_name, config=cfg).cuda() else: raise RuntimeError @@ -298,7 +286,7 @@ def 
_criterion(outputs, inputs): for epoch in range(NUM_EPOCHS): train_epoch(epoch, model, optimizer, _criterion, lr_scheduler, train_dataloader, booster, coordinator) - results = evaluate_model(model, optimizer, _criterion, test_dataloader, data_builder.num_labels, args.task, + results = evaluate_model(model, _criterion, test_dataloader, data_builder.num_labels, args.task, data_builder.eval_splits, booster, coordinator) if coordinator.is_master(): diff --git a/examples/language/gpt/test_ci.sh b/examples/language/gpt/test_ci.sh index f95d267436dc..80e00988b012 100644 --- a/examples/language/gpt/test_ci.sh +++ b/examples/language/gpt/test_ci.sh @@ -1,3 +1,5 @@ set -x +pip install -r requirements.txt + cd gemini && bash test_ci.sh cd hybridparallelism && bash run.sh From e42909fce9fad7a5dfa490587e9a46721d2ed109 Mon Sep 17 00:00:00 2001 From: FoolPlayer <498107402@qq.com> Date: Fri, 15 Sep 2023 10:13:33 +0800 Subject: [PATCH 4/9] fix test_ci bug --- examples/language/gpt/test_ci.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/language/gpt/test_ci.sh b/examples/language/gpt/test_ci.sh index 80e00988b012..b9e4e43a8d35 100644 --- a/examples/language/gpt/test_ci.sh +++ b/examples/language/gpt/test_ci.sh @@ -2,4 +2,4 @@ set -x pip install -r requirements.txt cd gemini && bash test_ci.sh -cd hybridparallelism && bash run.sh +cd ../hybridparallelism && bash run.sh From 0f9d45977a3584f360dc32c400c8a64d0857c34b Mon Sep 17 00:00:00 2001 From: FoolPlayer <498107402@qq.com> Date: Fri, 15 Sep 2023 11:32:22 +0800 Subject: [PATCH 5/9] update requirements --- examples/language/gpt/requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/language/gpt/requirements.txt b/examples/language/gpt/requirements.txt index ef58bb76bfc8..59a0bd6143d4 100644 --- a/examples/language/gpt/requirements.txt +++ b/examples/language/gpt/requirements.txt @@ -1,2 +1,4 @@ transformers >= 4.23 colossalai +evaluate +tqdm From 
78bd1fcefb0874af3994560ee41a510a6f994bd9 Mon Sep 17 00:00:00 2001 From: FoolPlayer <498107402@qq.com> Date: Fri, 15 Sep 2023 14:20:58 +0800 Subject: [PATCH 6/9] add requirements --- examples/language/gpt/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/language/gpt/requirements.txt b/examples/language/gpt/requirements.txt index 59a0bd6143d4..b058e6dd7625 100644 --- a/examples/language/gpt/requirements.txt +++ b/examples/language/gpt/requirements.txt @@ -2,3 +2,4 @@ transformers >= 4.23 colossalai evaluate tqdm +scipy From b742ebc81bb54a8e9e423e868382813ea873b2e0 Mon Sep 17 00:00:00 2001 From: FoolPlayer <498107402@qq.com> Date: Fri, 15 Sep 2023 14:40:37 +0800 Subject: [PATCH 7/9] update requirements --- examples/language/gpt/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/language/gpt/requirements.txt b/examples/language/gpt/requirements.txt index b058e6dd7625..7a11d6588110 100644 --- a/examples/language/gpt/requirements.txt +++ b/examples/language/gpt/requirements.txt @@ -3,3 +3,4 @@ colossalai evaluate tqdm scipy +sklearn From 3edd122d9af8789fa2ae996981e05da63948ed92 Mon Sep 17 00:00:00 2001 From: FoolPlayer <498107402@qq.com> Date: Fri, 15 Sep 2023 14:56:26 +0800 Subject: [PATCH 8/9] add requirement --- examples/language/gpt/requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/language/gpt/requirements.txt b/examples/language/gpt/requirements.txt index 7a11d6588110..1a173f228aee 100644 --- a/examples/language/gpt/requirements.txt +++ b/examples/language/gpt/requirements.txt @@ -3,4 +3,5 @@ colossalai evaluate tqdm scipy -sklearn +scikit-learn +numpy From efacbb4eab899f427b79091deb5003463e90153e Mon Sep 17 00:00:00 2001 From: FoolPlayer <498107402@qq.com> Date: Fri, 15 Sep 2023 16:22:38 +0800 Subject: [PATCH 9/9] rename file --- .../{gpt_hybridparallelism.py => finetune.py} | 0 examples/language/gpt/hybridparallelism/run.sh | 4 ++-- 2 files changed, 2 insertions(+), 2 
deletions(-) rename examples/language/gpt/hybridparallelism/{gpt_hybridparallelism.py => finetune.py} (100%) diff --git a/examples/language/gpt/hybridparallelism/gpt_hybridparallelism.py b/examples/language/gpt/hybridparallelism/finetune.py similarity index 100% rename from examples/language/gpt/hybridparallelism/gpt_hybridparallelism.py rename to examples/language/gpt/hybridparallelism/finetune.py diff --git a/examples/language/gpt/hybridparallelism/run.sh b/examples/language/gpt/hybridparallelism/run.sh index f95ac8b8fcb7..679cbbf9b1e2 100644 --- a/examples/language/gpt/hybridparallelism/run.sh +++ b/examples/language/gpt/hybridparallelism/run.sh @@ -1,5 +1,5 @@ # load via internet -torchrun --standalone --nproc_per_node 4 --master_port 29800 gpt_hybridparallelism.py --target_f1 0.6 --plugin hybrid_parallel --model_type "gpt2" +torchrun --standalone --nproc_per_node 4 --master_port 29800 finetune.py --target_f1 0.6 --plugin hybrid_parallel --model_type "gpt2" # load from local -# torchrun --standalone --nproc_per_node 4 --master_port 29800 gpt_hybridparallelism.py --target_f1 0.6 --plugin hybrid_parallel --model_type "gpt2" --pretrained_path "your/path/to/pretrained_model" +# torchrun --standalone --nproc_per_node 4 --master_port 29800 finetune.py --target_f1 0.6 --plugin hybrid_parallel --model_type "gpt2" --pretrained_path "your/path/to/pretrained_model"