85 changes: 13 additions & 72 deletions applications/Chat/benchmarks/README.md
@@ -1,70 +1,5 @@
# Benchmarks

## Benchmark GPT on dummy prompt data

We provide various GPT models (the string in parentheses is the corresponding model name used in this script):

- GPT2-S (s)
- GPT2-M (m)
- GPT2-L (l)
- GPT2-XL (xl)
- GPT2-4B (4b)
- GPT2-6B (6b)
- GPT2-8B (8b)
- GPT2-10B (10b)
- GPT2-12B (12b)
- GPT2-15B (15b)
- GPT2-18B (18b)
- GPT2-20B (20b)
- GPT2-24B (24b)
- GPT2-28B (28b)
- GPT2-32B (32b)
- GPT2-36B (36b)
- GPT2-40B (40b)
- GPT3 (175b)

We also provide various training strategies:

- ddp: torch DDP
- colossalai_gemini: ColossalAI GeminiDDP with `placement_policy="cuda"`, like zero3
- colossalai_gemini_cpu: ColossalAI GeminiDDP with `placement_policy="cpu"`, like zero3-offload
- colossalai_zero2: ColossalAI zero2
- colossalai_zero2_cpu: ColossalAI zero2-offload
- colossalai_zero1: ColossalAI zero1
- colossalai_zero1_cpu: ColossalAI zero1-offload

Only `torchrun` launching is supported for now. For example:

```shell
# run GPT2-S on single-node single-GPU with min batch size
torchrun --standalone --nproc_per_node 1 benchmark_gpt_dummy.py --model s --strategy ddp --experience_batch_size 1 --train_batch_size 1
# run GPT2-XL on single-node 4-GPU
torchrun --standalone --nproc_per_node 4 benchmark_gpt_dummy.py --model xl --strategy colossalai_zero2
# run GPT3 on 8-node 8-GPU
torchrun --nnodes 8 --nproc_per_node 8 \
--rdzv_id=$JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$HOST_NODE_ADDR \
benchmark_gpt_dummy.py --model 175b --strategy colossalai_gemini
```

> ⚠ Batch sizes in CLI args and the reported throughput/TFLOPS are all per-GPU values.

In this benchmark, we assume the actor and critic share the same model architecture/size for simplicity. In practice, a smaller critic may be used to reduce training cost.

We also provide a simple shell script that runs a set of benchmarks. It only supports single-node runs, but it is easy to adapt to multiple nodes by modifying the launch command in the script.

Usage:

```shell
# run for GPUS=(1 2 4 8) x strategy=("ddp" "colossalai_zero2" "colossalai_gemini" "colossalai_zero2_cpu" "colossalai_gemini_cpu") x model=("s" "m" "l" "xl" "2b" "4b" "6b" "8b" "10b") x batch_size=(1 2 4 8 16 32 64 128 256)
./benchmark_gpt_dummy.sh
# run for GPUS=2 x strategy=("ddp" "colossalai_zero2" "colossalai_gemini" "colossalai_zero2_cpu" "colossalai_gemini_cpu") x model=("s" "m" "l" "xl" "2b" "4b" "6b" "8b" "10b") x batch_size=(1 2 4 8 16 32 64 128 256)
./benchmark_gpt_dummy.sh 2
# run for GPUS=2 x strategy=ddp x model=("s" "m" "l" "xl" "2b" "4b" "6b" "8b" "10b") x batch_size=(1 2 4 8 16 32 64 128 256)
./benchmark_gpt_dummy.sh 2 ddp
# run for GPUS=2 x strategy=ddp x model=l x batch_size=(1 2 4 8 16 32 64 128 256)
./benchmark_gpt_dummy.sh 2 ddp l
```

## Benchmark OPT with LoRA on dummy prompt data

We provide various OPT models (the string in parentheses is the corresponding model name used in this script):
@@ -80,15 +15,21 @@
- OPT-10B (10b)
- OPT-13B (13b)

We also provide various training strategies (see the illustrative sketch after this list):

- ddp: torch DDP
- colossalai_gemini: ColossalAI GeminiDDP with `placement_policy="cuda"`, like zero3
- colossalai_gemini_cpu: ColossalAI GeminiDDP with `placement_policy="cpu"`, like zero3-offload
- colossalai_zero2: ColossalAI zero2
- colossalai_zero2_cpu: ColossalAI zero2-offload
- colossalai_zero1: ColossalAI zero1
- colossalai_zero1_cpu: ColossalAI zero1-offload
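
For illustration, here is a minimal sketch of how these strategy names might map to strategy objects. The import path and constructor signature are assumptions based on coati's API of this era, not code taken from this PR:

```python
# Illustrative only: map a `--strategy` CLI string to a strategy object.
# `ColossalAIStrategy`, `DDPStrategy`, and their arguments are assumptions.
from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy

def build_strategy(name: str):
    if name == 'ddp':
        return DDPStrategy()
    if name.startswith('colossalai_gemini'):
        # Gemini behaves like ZeRO-3; the `_cpu` variant offloads to host memory
        placement = 'cpu' if name.endswith('_cpu') else 'cuda'
        return ColossalAIStrategy(stage=3, placement_policy=placement)
    if name.startswith('colossalai_zero'):
        # handles colossalai_zero1 / colossalai_zero2, optionally CPU-offloaded
        stage = int(name.split('_')[1].removeprefix('zero'))
        placement = 'cpu' if name.endswith('_cpu') else 'cuda'
        return ColossalAIStrategy(stage=stage, placement_policy=placement)
    raise ValueError(f'Unsupported strategy "{name}"')
```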

Only `torchrun` launching is supported for now. For example:

```shell
# run OPT-125M with no lora (lora_rank=0) on single-node single-GPU with min batch size
torchrun --standalone --nproc_per_node 1 benchmark_opt_lora_dummy.py --model 125m --strategy ddp --experience_batch_size 1 --train_batch_size 1 --lora_rank 0
# run OPT-350M with lora_rank=4 on single-node 4-GPU
torchrun --standalone --nproc_per_node 4 benchmark_opt_lora_dummy.py --model 350m --strategy colossalai_zero2 --lora_rank 4
torchrun --standalone --nproc_per_node 1 benchmark_opt_lora_dummy.py --model 125m --critic_model 125m --strategy ddp --experience_batch_size 1 --train_batch_size 1 --lora_rank 0
# run Actor (OPT-1.3B) and Critic (OPT-350M) with lora_rank=4 on single-node 4-GPU
torchrun --standalone --nproc_per_node 4 benchmark_opt_lora_dummy.py --model 1.3b --critic_model 350m --strategy colossalai_zero2 --lora_rank 4
```
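
Multi-node launches follow the same `torchrun` rendezvous pattern used elsewhere in these benchmarks; for example (`$JOB_ID` and `$HOST_NODE_ADDR` are placeholders you must set yourself, and the model scale is illustrative):

```shell
# run Actor (OPT-13B) and Critic (OPT-350M) on 2 nodes with 8 GPUs each
torchrun --nnodes 2 --nproc_per_node 8 \
    --rdzv_id=$JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$HOST_NODE_ADDR \
    benchmark_opt_lora_dummy.py --model 13b --critic_model 350m --strategy colossalai_gemini
```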

> ⚠ Batch sizes in CLI args and the reported throughput/TFLOPS are all per-GPU values.

In this benchmark, we assume the actor and critic share the same model architecture/size for simplicity. In practice, a smaller critic may be used to reduce training cost.
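
For example, a rough sketch of pairing a large actor with a smaller critic (the class names follow coati's OPT wrappers but are assumptions here, as is the LoRA rank):

```python
from coati.models.opt import OPTActor, OPTCritic

# hypothetical pairing: large actor, smaller critic, to cut training cost
actor = OPTActor(pretrained='facebook/opt-1.3b', lora_rank=4)
critic = OPTCritic(pretrained='facebook/opt-350m', lora_rank=4)
```
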
186 changes: 0 additions & 186 deletions applications/Chat/benchmarks/benchmark_gpt_dummy.py

This file was deleted.

45 changes: 0 additions & 45 deletions applications/Chat/benchmarks/benchmark_gpt_dummy.sh

This file was deleted.

6 changes: 3 additions & 3 deletions applications/Chat/benchmarks/benchmark_opt_lora_dummy.py
@@ -140,8 +140,7 @@ def main(args):
ptx_coef=0,
max_epochs=args.max_epochs,
train_batch_size=args.train_batch_size,
experience_batch_size=args.experience_batch_size,
tokenizer=preprocess_batch,
offload_inference_models=args.offload_inference_models,
max_length=512,
do_sample=True,
temperature=1.0,
@@ -179,10 +178,11 @@ def main(args):
parser.add_argument('--num_episodes', type=int, default=3)
parser.add_argument('--max_timesteps', type=int, default=8)
parser.add_argument('--update_timesteps', type=int, default=8)
parser.add_argument('--max_epochs', type=int, default=3)
parser.add_argument('--max_epochs', type=int, default=1)
parser.add_argument('--train_batch_size', type=int, default=8)
parser.add_argument('--experience_batch_size', type=int, default=8)
parser.add_argument('--lora_rank', type=int, default=0)
parser.add_argument('--cuda_mem_frac', type=float, default=1.0)
parser.add_argument('--offload_inference_models', action='store_true', default=False)
args = parser.parse_args()
main(args)
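
A hypothetical single-GPU invocation exercising the flags touched in this file (the values are illustrative):

```shell
torchrun --standalone --nproc_per_node 1 benchmark_opt_lora_dummy.py \
    --model 125m --critic_model 125m --strategy colossalai_zero2 \
    --offload_inference_models --cuda_mem_frac 0.9
```
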
3 changes: 0 additions & 3 deletions applications/Chat/coati/trainer/base.py
@@ -15,7 +15,6 @@ class Trainer(ABC):
Args:
strategy (Strategy): the strategy to use for training
max_epochs (int, defaults to 1): the number of epochs of training process
tokenizer (Callable, optional): the tokenizer to use for tokenizing the input
dataloader_pin_memory (bool, defaults to True): whether to pin memory for data loader
callbacks (List[Callback], defaults to []): the callbacks to call during training process
generate_kwargs (dict, optional): the kwargs to use while model generating
@@ -24,14 +23,12 @@ class Trainer(ABC):
def __init__(self,
strategy: Strategy,
max_epochs: int = 1,
tokenizer: Optional[Callable[[Any], dict]] = None,
dataloader_pin_memory: bool = True,
callbacks: List[Callback] = [],
**generate_kwargs) -> None:
super().__init__()
self.strategy = strategy
self.max_epochs = max_epochs
self.tokenizer = tokenizer
self.generate_kwargs = generate_kwargs
self.dataloader_pin_memory = dataloader_pin_memory
self.callbacks = callbacks
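
After this change, the base trainer no longer takes a tokenizer. A minimal sketch of how a subclass would call it now (the subclass and import paths are hypothetical, and abstract methods are omitted):

```python
from typing import List

from coati.trainer.base import Trainer
from coati.trainer.callbacks import Callback
from coati.trainer.strategies import Strategy

class DummyTrainer(Trainer):
    """Hypothetical subclass, only to show the slimmed-down __init__."""

    def __init__(self, strategy: Strategy, callbacks: List[Callback] = []) -> None:
        super().__init__(strategy=strategy,
                         max_epochs=1,
                         dataloader_pin_memory=True,
                         callbacks=callbacks,
                         max_length=512,   # forwarded via **generate_kwargs
                         do_sample=True)
```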