From f9dc4de2c20d1cf2b6ee3e19080b8b4a20342681 Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Thu, 11 Nov 2021 00:01:04 +0000 Subject: [PATCH 1/7] add autotuning hf examples --- autotuning/.gitignore | 4 + autotuning/hf/README.md | 51 +++++++ autotuning/hf/bert-base/README.md | 46 ++++++ autotuning/hf/bert-base/test_tune.sh | 121 ++++++++++++++++ autotuning/hf/bert-large/README.md | 44 ++++++ autotuning/hf/bert-large/test_tune.sh | 121 ++++++++++++++++ autotuning/hf/deberta/README.md | 62 ++++++++ .../hf/deberta/ds_config_fp16_tune.json | 16 +++ autotuning/hf/deberta/test_tune.sh | 128 +++++++++++++++++ autotuning/hf/distilbert/README.md | 46 ++++++ autotuning/hf/distilbert/ds_config_tune.json | 12 ++ autotuning/hf/distilbert/test_tune.sh | 121 ++++++++++++++++ .../hf/dsconfigs/ds_config_fp16_tune.json | 15 ++ .../hf/dsconfigs/ds_config_fp16_z0.json | 9 ++ .../hf/dsconfigs/ds_config_fp16_z1.json | 9 ++ .../hf/dsconfigs/ds_config_fp16_z2.json | 9 ++ .../hf/dsconfigs/ds_config_fp16_z3.json | 9 ++ autotuning/hf/dsconfigs/ds_config_tune.json | 12 ++ autotuning/hf/dsconfigs/ds_config_z0.json | 6 + autotuning/hf/dsconfigs/ds_config_z1.json | 6 + autotuning/hf/dsconfigs/ds_config_z2.json | 6 + autotuning/hf/dsconfigs/ds_config_z3.json | 6 + autotuning/hf/gpt2-large/README.md | 48 +++++++ autotuning/hf/gpt2-large/test_tune.sh | 133 +++++++++++++++++ autotuning/hf/gpt2-medium/README.md | 46 ++++++ autotuning/hf/gpt2-medium/test_tune.sh | 126 ++++++++++++++++ autotuning/hf/gpt2-xl/README.md | 45 ++++++ autotuning/hf/gpt2-xl/test_tune.sh | 126 ++++++++++++++++ autotuning/hf/gpt2/README.md | 47 ++++++ autotuning/hf/gpt2/test_tune.sh | 135 ++++++++++++++++++ 30 files changed, 1565 insertions(+) create mode 100644 autotuning/.gitignore create mode 100644 autotuning/hf/README.md create mode 100644 autotuning/hf/bert-base/README.md create mode 100755 autotuning/hf/bert-base/test_tune.sh create mode 100644 autotuning/hf/bert-large/README.md create mode 100755 autotuning/hf/bert-large/test_tune.sh create mode 100644 autotuning/hf/deberta/README.md create mode 100644 autotuning/hf/deberta/ds_config_fp16_tune.json create mode 100755 autotuning/hf/deberta/test_tune.sh create mode 100644 autotuning/hf/distilbert/README.md create mode 100644 autotuning/hf/distilbert/ds_config_tune.json create mode 100755 autotuning/hf/distilbert/test_tune.sh create mode 100644 autotuning/hf/dsconfigs/ds_config_fp16_tune.json create mode 100644 autotuning/hf/dsconfigs/ds_config_fp16_z0.json create mode 100644 autotuning/hf/dsconfigs/ds_config_fp16_z1.json create mode 100644 autotuning/hf/dsconfigs/ds_config_fp16_z2.json create mode 100644 autotuning/hf/dsconfigs/ds_config_fp16_z3.json create mode 100644 autotuning/hf/dsconfigs/ds_config_tune.json create mode 100644 autotuning/hf/dsconfigs/ds_config_z0.json create mode 100644 autotuning/hf/dsconfigs/ds_config_z1.json create mode 100644 autotuning/hf/dsconfigs/ds_config_z2.json create mode 100644 autotuning/hf/dsconfigs/ds_config_z3.json create mode 100644 autotuning/hf/gpt2-large/README.md create mode 100755 autotuning/hf/gpt2-large/test_tune.sh create mode 100644 autotuning/hf/gpt2-medium/README.md create mode 100755 autotuning/hf/gpt2-medium/test_tune.sh create mode 100644 autotuning/hf/gpt2-xl/README.md create mode 100755 autotuning/hf/gpt2-xl/test_tune.sh create mode 100644 autotuning/hf/gpt2/README.md create mode 100755 autotuning/hf/gpt2/test_tune.sh diff --git a/autotuning/.gitignore b/autotuning/.gitignore new file mode 100644 index 000000000..82319e4a0 --- /dev/null +++ 
b/autotuning/.gitignore
@@ -0,0 +1,4 @@
+autotuning_results*
+autotuning_exps*
+output*
+mnli
diff --git a/autotuning/hf/README.md b/autotuning/hf/README.md
new file mode 100644
index 000000000..c8095c594
--- /dev/null
+++ b/autotuning/hf/README.md
@@ -0,0 +1,51 @@
+# Autotuning Hugging Face Examples
+
+This showcases the Autotuning feature in DeepSpeed (DS) with Hugging Face (HF) models.
+
+## List of Models
+
+- [DistilBERT](distilbert)
+- [BERT-base](bert-base)
+- [BERT-large](bert-large)
+- [GPT2](gpt2)
+- [GPT2-medium](gpt2-medium)
+- [GPT2-large](gpt2-large)
+- [GPT2-xl](gpt2-xl)
+- [DeBERTa](deberta)
+
+Each model folder has a `test_tune.sh` script:
+
+- `./test_tune.sh tune` tunes the model training and then runs it using the selected tuned DeepSpeed configuration.
+- `./test_tune.sh 0` runs the model using HF without DeepSpeed.
+- `./test_tune.sh z0` runs the model using HF + DS with ZeRO optimization disabled.
+- `./test_tune.sh z1` runs the model using HF + DS with ZeRO optimization stage 1.
+- `./test_tune.sh z2` runs the model using HF + DS with ZeRO optimization stage 2.
+- `./test_tune.sh z3` runs the model using HF + DS with ZeRO optimization stage 3.
+
+
+## Testing Environment
+
+The training runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resources as the training. The HF packages below are used.
+
+- transformers (4.12.0.dev0)
+- datasets (1.11.0)
+
+## Throughput Comparison
+
+The table below compares throughput (samples per second). The train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve each throughput value are also shown in parentheses. Assume that the strategy users would use in the hand-tuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. A quick-start sketch of the kind of commands behind these numbers follows the notation paragraph below.
+ - `baseline` is the vanilla HF without DeepSpeed (DS) and mbs is hand-tuned.
+ - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values.
+ - `HF + DS autotuning` is HF with DS, and the DS configuration is selected from autotuning.
+
+Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), gradient accumulation steps (gas), train micro batch size per GPU (mbs or tmbspg).
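+
+For example, the commands below show roughly how a tuning run and a plain HF + DS run are launched for BERT-base. This is a minimal sketch (it assumes a local checkout of `transformers` under `$HF_PATH` and 16 GPUs on one node); the exact script, task, and flags vary per model folder, see each `test_tune.sh`.
+
+```bash
+HF_PATH=~/projects
+
+# Autotuning: DeepSpeed runs a set of profiling experiments and reports the
+# best-performing ZeRO stage / micro batch size it found.
+deepspeed --autotuning run --num_nodes=1 --num_gpus=16 \
+  $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py \
+  --deepspeed dsconfigs/ds_config_tune.json \
+  --model_name_or_path bert-base-cased \
+  --task_name mnli \
+  --do_train \
+  --max_seq_length 128 \
+  --per_device_train_batch_size 64 \
+  --learning_rate 2e-5 \
+  --output_dir ./output_tune \
+  --overwrite_output_dir \
+  --max_steps 200 \
+  --save_strategy "no"
+
+# Plain HF + DS run with a fixed ZeRO stage (stage 1 here).
+deepspeed --num_nodes=1 --num_gpus=16 \
+  $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py \
+  --deepspeed dsconfigs/ds_config_z1.json \
+  --model_name_or_path bert-base-cased \
+  --task_name mnli \
+  --do_train \
+  --max_seq_length 128 \
+  --per_device_train_batch_size 64 \
+  --learning_rate 2e-5 \
+  --output_dir ./output_z1 \
+  --overwrite_output_dir \
+  --max_steps 200 \
+  --save_strategy "no"
+```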
+
+| Model name  | num_params | baseline (vanilla HF)         | HF + DS hand-tuned                   | HF + DS autotuning (fast-mode) | throughput improvement over baseline | autotuning time (mins) | number of experiments |
+| :---------: | :--------: | :---------------------------: | :----------------------------------: | :----------------------------: | :----------------------------------: | :--------------------: | :-------------------: |
+| DistilBERT  | 66M        | 5161.902 (gas = 1, mbs = 256) | 5305.067 (z = 0, gas = 1, mbs = 256) | 5305.067 (z0_gas1_tmbspg256)   | 1.03x                                | 11                     | 11                    |
+| BERT-base   | 0.11B      | 2502.236 (gas = 1, mbs = 128) | 2523.684 (z = 0, gas = 1, mbs = 128) | 2682.849 (z0_gas1_tmbspg220)   | 1.07x                                | 43                     | 35                    |
+| BERT-large  | 0.34B      | 742.692 (gas = 1, mbs = 64)   | 766.929 (z = 1, gas = 1, mbs = 64)   | 808.168 (z1_gas1_tmbspg93)     | 1.09x                                | 36                     | 22                    |
+| GPT2        | 0.12B      | 284.142 (gas = 1, mbs = 8)    | 397.827 (z = 1, gas = 1, mbs = 8)    | 431.586 (z1_gas1_tmbspg14)     | 1.52x                                | 25                     | 17                    |
+| GPT2-medium | 0.35B      | 71.61 (gas = 1, mbs = 2)      | 142.211 (z = 1, gas = 1, mbs = 4)    | 163.3 (z1_gas1_tmbspg6)        | 2.28x                                | 25                     | 15                    |
+| GPT2-large  | 0.77B      | 27.874 (gas = 1, mbs = 1)     | 56.797 (z = 1, gas = 1, mbs = 2)     | 69.061 (z1_gas1_tmbspg3)       | 2.48x                                | 27                     | 13                    |
+| GPT2-xl     | 1.5B       | Not runnable                  | 27.462 (gas = 1, mbs = 1)            | 27.497 (z1_gas1_tmbspg1)       | inf                                  | 21                     | 9                     |
+| DeBERTa     | 1.5B       | Not runnable                  | 140.587 (z = 1, gas = 1, mbs = 8)    | 162.395 (z1_gas1_tmbspg11)     | inf                                  | 40                     | 12                    |
diff --git a/autotuning/hf/bert-base/README.md b/autotuning/hf/bert-base/README.md
new file mode 100644
index 000000000..48e0a3310
--- /dev/null
+++ b/autotuning/hf/bert-base/README.md
@@ -0,0 +1,46 @@
+# [bert-base-cased](https://huggingface.co/bert-base-cased)
+
+This model has the following configuration:
+
+- 12-layer
+- 768 hidden dimension
+- 12 attention heads
+- 110M parameters.
+
+## Environment
+
+The training uses fp32 and runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resources as the training. `max_train_batch_size` is not defined.
+
+- transformers (4.12.0.dev0)
+- datasets (1.11.0)
+
+## Throughput Comparison
+
+The table below compares throughput (samples per second). The train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve each throughput value are also shown in parentheses. Assume that the strategy users would use in the hand-tuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory.
+ - `baseline` is the vanilla Hugging Face (HF) without DeepSpeed (DS) and mbs is hand-tuned.
+ - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values.
+ - `HF + DS autotuning` is HF with DS, and the DS configuration is selected from autotuning.
+
+Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), gradient accumulation steps (gas), train micro batch size per GPU (mbs or tmbspg).
+
+| Model name | baseline (vanilla HF)         | HF + DS hand-tuned                   | HF + DS autotuning           |
+| ---------- | ----------------------------- | ------------------------------------ | ---------------------------- |
+| BERT-base  | 2502.236 (gas = 1, mbs = 128) | 2523.684 (z = 0, gas = 1, mbs = 128) | 2682.849 (z0_gas1_tmbspg220) |
+
+## Detailed `HF + DS autotuning` Result Summary
+
+Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed forward, backward and step functions.
The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training. + +- Fast-mode Autotuning time: 43 mins +- Number of experiments: 35 +- Throughput Improvement over baseline: 1.07x + +| tuning_space | num_experiments | best_metric_val | best_exp_name | +| :----------- | --------------: | --------------: | :---------------- | +| z0 | 9 | 2880.94 | z0_gas1_tmbspg220 | +| z1 | 7 | 2861.43 | z1_gas1_tmbspg220 | +| z2 | 8 | 2714.96 | z2_gas1_tmbspg240 | +| z3 | 11 | 2420.78 | z3_gas1_tmbspg240 | +| global | 35 | 2880.94 | z0_gas1_tmbspg220 | + +Tuning completed in 0:43:33.853567. Total number of experiments: 35. diff --git a/autotuning/hf/bert-base/test_tune.sh b/autotuning/hf/bert-base/test_tune.sh new file mode 100755 index 000000000..560ddb1e6 --- /dev/null +++ b/autotuning/hf/bert-base/test_tune.sh @@ -0,0 +1,121 @@ +TASK_NAME=mnli +MODEL_NAME=bert-base-cased +HF_PATH=~/projects +PER_DEVICE_TRAIN_BATCH_SIZE=64 +MAX_TRAIN_BATCH_SIZE=4096 +NEPOCHS=1 +NGPUS=16 +NNODES=1 +MAX_STEPS=200 +OUTPUT_DIR=./${TASK_NAME}/output_b${PER_DEVICE_TRAIN_BATCH_SIZE}_g${NGPUS}_$MAX_STEPS + +TEST=$1 + +if [ ${TEST} == "0" ] +then + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + # --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_0 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z0" ] +then + deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z0.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z0 \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "z1" ] +then + deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z1.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z1 \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "z2" ] +then + deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z2.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z2 \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "z3" ] +then + deepspeed --num_nodes=$NNODES 
$HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z3.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z3 \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "tune" ] +then + deepspeed --autotuning run --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_tune.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_tune \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "fs" ] +then + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_fs \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --sharded_ddp zero_dp_2 +fi diff --git a/autotuning/hf/bert-large/README.md b/autotuning/hf/bert-large/README.md new file mode 100644 index 000000000..97d393d13 --- /dev/null +++ b/autotuning/hf/bert-large/README.md @@ -0,0 +1,44 @@ +# [bert-large-uncased](https://huggingface.co/bert-large-uncased) + +This model has the following configuration: + +- 24-layer +- 1024 hidden dimension +- 16 attention heads +- 336M parameters + +The training use fp32 and runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. `max_train_batch_size` is not defined. + +- transformers (4.12.0.dev0) +- datasets (1.11.0) + +## Throughput Comparsion + +The table below shows the throughput (samples per second) comparsion. The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would usein the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. + - `baseline` is the vanila Hugging Face (HF) without DeepSpeed (DS) and mbs is hand-tuned. + - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values. + - `HF + DS autotuning` is HF with DS, and the DS configuration selected from autotuning. + +Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), graident accumulation steps (gas), train micro batch size per GPU (mbs or tmbspg). 
+ +| Model name | baseline (vanila HF) | HF + DS handtuned | HF + DS autotuning | +| ---------- | --------------------------- | --------------------------------- | -------------------------- | +| BERT-large | 742.692 (gas = 1, mbs = 64) | 766.929 (z = 1, gas =1, mbs = 64) | 808.168 (z1_gas1_tmbspg93) | + +## Detailed `HF + DS autotuning` Result Summary + +Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed foward, backward and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training. + +- Fast-mode Autotuning time: 36 mins +- Number of experiments: 22 +- Throughput Improvement over baseline: 1.09x + +| tuning_space | num_experiments | best_metric_val | best_exp_name | +| :----------- | --------------: | --------------: | :--------------- | +| z0 | 6 | 835.244 | z0_gas1_tmbspg93 | +| z1 | 6 | 842.243 | z1_gas1_tmbspg93 | +| z2 | 9 | 764.524 | z2_gas1_tmbspg94 | +| z3 | 1 | 0 | z3_gas1_tmbspg94 | +| global | 22 | 842.243 | z1_gas1_tmbspg93 | + +Tuning completed in 0:36:16.261417. Total number of experiments: 23. diff --git a/autotuning/hf/bert-large/test_tune.sh b/autotuning/hf/bert-large/test_tune.sh new file mode 100755 index 000000000..b8254f1a0 --- /dev/null +++ b/autotuning/hf/bert-large/test_tune.sh @@ -0,0 +1,121 @@ +TASK_NAME=mnli +MODEL_NAME=bert-large-uncased +HF_PATH=~/projects +PER_DEVICE_TRAIN_BATCH_SIZE=64 +MAX_TRAIN_BATCH_SIZE=4096 +NEPOCHS=1 +NGPUS=16 +NNODES=1 +MAX_STEPS=200 +OUTPUT_DIR=./${TASK_NAME}/output_b${PER_DEVICE_TRAIN_BATCH_SIZE}_g${NGPUS}_$MAX_STEPS + +TEST=$1 + +if [ ${TEST} == "0" ] +then + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_0 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z0" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z0.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z0 \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "z1" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z1.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z1 \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "z2" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS 
$HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z2.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z2 \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "z3" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z3.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z3 \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "tune" ] +then + deepspeed --autotuning run --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_tune.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_tune \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "fs" ] +then + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_fs \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --sharded_ddp zero_dp_2 +fi diff --git a/autotuning/hf/deberta/README.md b/autotuning/hf/deberta/README.md new file mode 100644 index 000000000..9f47712ff --- /dev/null +++ b/autotuning/hf/deberta/README.md @@ -0,0 +1,62 @@ +# [deberta-v2-xxlarge-mnli](https://huggingface.co/microsoft/deberta-v2-xxlarge) + +This model has the following configuration: + +- 48-layer +- 1536 hidden dimension +- 1.5B parameters. + +Refer to [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://github.com/microsoft/DeBERTa). +## Environment + +The training use fp16 and runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. + +- transformers (4.12.0.dev0) +- datasets (1.11.0) + +## Throughput Comparsion + +The table below shows the throughput (samples per second) comparsion. The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would usein the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. + - `baseline` is the vanila Hugging Face (HF) without DeepSpeed (DS) and mbs is hand-tuned. + - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values. 
+ - `HF + DS autotuning` is HF with DS, and the DS configuration is selected from autotuning.
+
+| Model name | baseline (vanilla HF) | HF + DS hand-tuned                | HF + DS autotuning (fast-mode) |
+| ---------- | --------------------- | --------------------------------- | ------------------------------ |
+| DeBERTa    | Not runnable          | 140.587 (z = 1, gas = 1, mbs = 8) | 162.395 (z1_gas1_tmbspg11)     |
+
+Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), gradient accumulation steps (gas), train micro batch size per GPU (mbs or tmbspg), reduce_bucket_size (rbs), allgather_bucket_size (abs).
+
+## Detailed `HF + DS autotuning` Result Summary
+
+Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed forward, backward and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training.
+
+### Fast-mode
+- Autotuning time: 40 mins
+- Number of experiments: 12
+- Throughput Improvement over baseline: Inf
+
+| tuning_space | num_experiments | best_metric_val | best_exp_name    |
+| :----------- | --------------: | --------------: | :--------------- |
+| z0           |               1 |               0 | z0_gas1_tmbspg1  |
+| z1           |               6 |         177.843 | z1_gas1_tmbspg11 |
+| z2           |               4 |         154.002 | z2_gas1_tmbspg14 |
+| z3           |               1 |               0 | z3_gas1_tmbspg14 |
+| global       |              12 |         177.843 | z1_gas1_tmbspg11 |
+
+Tuning completed in 0:39:25.253998. Total number of experiments: 12.
+
+### Full-mode
+- Autotuning time: 1 hr 2 mins
+- Number of experiments: 24
+- Throughput Improvement over baseline: Inf
+
+| tuning_space      | num_experiments | best_metric_val | best_exp_name                          |
+| :---------------- | --------------: | --------------: | :------------------------------------- |
+| z0                |               1 |               0 | z0_gas1_tmbspg1                        |
+| z1                |               6 |         177.843 | z1_gas1_tmbspg11                       |
+| z1_rbs_abs_tmbspg |              12 |         193.577 | z1_rbs5.0e+07_abs1.0e+09_gas1_tmbspg11 |
+| z2                |               4 |         154.002 | z2_gas1_tmbspg14                       |
+| z3                |               1 |               0 | z3_gas1_tmbspg14                       |
+| global            |              24 |         193.577 | z1_rbs5.0e+07_abs1.0e+09_gas1_tmbspg11 |
+
+Tuning completed in 1:02:32.759424. Total number of experiments: 24.
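+
+For reference, the full-mode winner `z1_rbs5.0e+07_abs1.0e+09_gas1_tmbspg11` corresponds to ZeRO stage 1 with a reduce_bucket_size of 5.0e+07, an allgather_bucket_size of 1.0e+09, and a micro batch size of 11 per GPU. The snippet below is a sketch (the file name is illustrative, not part of this example) of pinning those values in a static DeepSpeed config so the result can be reused without re-running the autotuner:
+
+```bash
+# Pin the full-mode autotuning result in a reusable DeepSpeed config.
+# 50000000 = 5.0e+07 (rbs), 1000000000 = 1.0e+09 (abs).
+cat > ds_config_fp16_z1_tuned.json <<'EOF'
+{
+  "train_micro_batch_size_per_gpu": 11,
+  "gradient_accumulation_steps": 1,
+  "zero_optimization": {
+    "stage": 1,
+    "reduce_bucket_size": 50000000,
+    "allgather_bucket_size": 1000000000
+  },
+  "fp16": {
+    "enabled": true,
+    "initial_scale_power": 12
+  }
+}
+EOF
+```
+
+The resulting file can then be passed to `--deepspeed` in place of `ds_config_fp16_tune.json`, together with `--per_device_train_batch_size 11`.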
diff --git a/autotuning/hf/deberta/ds_config_fp16_tune.json b/autotuning/hf/deberta/ds_config_fp16_tune.json new file mode 100644 index 000000000..b405929bb --- /dev/null +++ b/autotuning/hf/deberta/ds_config_fp16_tune.json @@ -0,0 +1,16 @@ +{ + "train_micro_batch_size_per_gpu": "auto", + "fp16": { + "enabled": true, + "initial_scale_power": 12 + }, + "autotuning": { + "enabled": true, + "overwrite": false, + "fast": true, + "arg_mappings": { + "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", + "gradient_accumulation_steps ": "--gradient_accumulation_steps" + } + } +} \ No newline at end of file diff --git a/autotuning/hf/deberta/test_tune.sh b/autotuning/hf/deberta/test_tune.sh new file mode 100755 index 000000000..45b60dae3 --- /dev/null +++ b/autotuning/hf/deberta/test_tune.sh @@ -0,0 +1,128 @@ +MODEL_NAME=microsoft/deberta-v2-xxlarge +TASK_NAME=mnli +PER_DEVICE_TRAIN_BATCH_SIZE=1 +HF_PATH=~/projects + +NEPOCHS=1 +NGPUS=16 +NNODES=1 +MAX_STEPS=200 +OUTPUT_DIR=./output_b${PER_DEVICE_TRAIN_BATCH_SIZE}_g${NGPUS}_$MAX_STEPS + +TEST=$1 + +if [ ${TEST} == "0" ] +then + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train --do_eval \ + --fp16 \ + --max_seq_length 256 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 3e-6 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_0 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z0" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_fp16_z0.json\ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train --do_eval \ + --fp16 \ + --max_seq_length 256 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 3e-6 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z0 \ + --overwrite_output_dir \ + --save_steps 0 \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z1" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_fp16_z1.json\ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train --do_eval \ + --fp16 \ + --max_seq_length 256 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 3e-6 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z1 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z2" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_fp16_z2.json\ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train --do_eval \ + --fp16 \ + --max_seq_length 256 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 3e-6 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z2 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z3" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed 
../dsconfigs/ds_config_fp16_z3.json\ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train --do_eval \ + --fp16 \ + --max_seq_length 256 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 3e-6 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z3 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "tune" ] +then + deepspeed --autotuning run --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ./ds_config_fp16_tune.json\ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train --do_eval \ + --fp16 \ + --max_seq_length 256 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 3e-6 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_tune \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "fs" ] +then + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train --do_eval \ + --fp16 \ + --max_seq_length 256 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 3e-6 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_fs \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" + --sharded_ddp zero_dp_2 +fi diff --git a/autotuning/hf/distilbert/README.md b/autotuning/hf/distilbert/README.md new file mode 100644 index 000000000..5a86266f0 --- /dev/null +++ b/autotuning/hf/distilbert/README.md @@ -0,0 +1,46 @@ +# [distilbert-base-uncased](https://huggingface.co/distilbert-base-uncased) + +This model has the following configuration: + +- 12-layer +- 768 hidden dimension +- 12 attention heads +- 66M parameters. + +## Environment + +The training uses 1 node with 16 Nvidia V100 GPUs, fp32, max_train_batch_size = 4096. The autotuning uses the same hardware resource as the training. + +- transformers (4.12.0.dev0) +- datasets (1.11.0) + +## Throughput Comparsion + +The table below shows the throughput (samples per second) comparsion. The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would usein the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. + - `baseline` is the vanila Hugging Face (HF) without DeepSpeed (DS) and mbs is hand-tuned. + - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values. + - `HF + DS autotuning` is HF with DS, and the DS configuration selected from autotuning. + +Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), graident accumulation steps (gas), train micro batch size per GPU (mbs or tmbspg). 
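+
+Because the tuning config caps `max_train_batch_size` at 4096, the effective train batch size (micro batch size per GPU x gradient accumulation steps x number of GPUs) reaches the cap exactly at mbs = 256 on 16 GPUs, which is where both the hand-tuned and the autotuned runs land. A quick sanity check:
+
+```bash
+# effective train batch size = mbs * gas * ngpus
+mbs=256 gas=1 ngpus=16
+echo $((mbs * gas * ngpus))   # 4096, i.e. the max_train_batch_size cap
+```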
+ +| Model name | baseline (vanila HF) | HF + DS hand-tuned | HF + DS autotuning (fast-mode) | +| ---------- | ----------------------------- | ------------------------------------ | ------------------------------ | +| DistilBERT | 5161.902 (gas = 1, mbs = 256) | 5305.067 (z = 0, gas = 1 mbs = 256), | 5305.067 (z0_gas1_tmbspg256) | + +## Detailed `HF + DS autotuning` Result Summary + +Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed foward, backward and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training. + +- Fast-mode Autotuning time: 11 mins +- Number of experiments: 11 +- Throughput Improvement: 1.03x + +| tuning_space | num_experiments | best_metric_val | best_exp_name | +| :----------- | --------------: | --------------: | :---------------- | +| z0 | 5 | 5759.96 | z0_gas1_tmbspg256 | +| z1 | 2 | 5667.06 | z1_gas1_tmbspg256 | +| z2 | 2 | 5366.97 | z2_gas1_tmbspg256 | +| z3 | 2 | 4892.49 | z3_gas1_tmbspg256 | +| global | 11 | 5759.96 | z0_gas1_tmbspg256 | + +Tuning completed in 0:10:45.085016. Total number of experiments: 11. diff --git a/autotuning/hf/distilbert/ds_config_tune.json b/autotuning/hf/distilbert/ds_config_tune.json new file mode 100644 index 000000000..23a48ddf9 --- /dev/null +++ b/autotuning/hf/distilbert/ds_config_tune.json @@ -0,0 +1,12 @@ +{ + "train_micro_batch_size_per_gpu": "auto", + "autotuning": { + "enabled": true, + "overwrite": false, + "max_train_batch_size": 4096, + "arg_mappings": { + "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", + "gradient_accumulation_steps ": "--gradient_accumulation_steps" + } + } +} diff --git a/autotuning/hf/distilbert/test_tune.sh b/autotuning/hf/distilbert/test_tune.sh new file mode 100755 index 000000000..560ddb1e6 --- /dev/null +++ b/autotuning/hf/distilbert/test_tune.sh @@ -0,0 +1,121 @@ +TASK_NAME=mnli +MODEL_NAME=bert-base-cased +HF_PATH=~/projects +PER_DEVICE_TRAIN_BATCH_SIZE=64 +MAX_TRAIN_BATCH_SIZE=4096 +NEPOCHS=1 +NGPUS=16 +NNODES=1 +MAX_STEPS=200 +OUTPUT_DIR=./${TASK_NAME}/output_b${PER_DEVICE_TRAIN_BATCH_SIZE}_g${NGPUS}_$MAX_STEPS + +TEST=$1 + +if [ ${TEST} == "0" ] +then + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + # --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_0 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z0" ] +then + deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z0.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z0 \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "z1" ] +then + deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed 
../dsconfigs/ds_config_z1.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z1 \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "z2" ] +then + deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z2.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z2 \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "z3" ] +then + deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z3.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z3 \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "tune" ] +then + deepspeed --autotuning run --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_tune.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_tune \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "fs" ] +then + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_fs \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --sharded_ddp zero_dp_2 +fi diff --git a/autotuning/hf/dsconfigs/ds_config_fp16_tune.json b/autotuning/hf/dsconfigs/ds_config_fp16_tune.json new file mode 100644 index 000000000..7ae31168b --- /dev/null +++ b/autotuning/hf/dsconfigs/ds_config_fp16_tune.json @@ -0,0 +1,15 @@ +{ + "train_micro_batch_size_per_gpu": "auto", + "fp16": { + "enabled": true + }, + "autotuning": { + "enabled": true, + "overwrite": false, + "fast": true, + "arg_mappings": { + "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", + "gradient_accumulation_steps ": "--gradient_accumulation_steps" + } + } +} diff --git a/autotuning/hf/dsconfigs/ds_config_fp16_z0.json b/autotuning/hf/dsconfigs/ds_config_fp16_z0.json new file mode 100644 index 000000000..ff375bb3e --- /dev/null +++ b/autotuning/hf/dsconfigs/ds_config_fp16_z0.json @@ -0,0 +1,9 @@ +{ + "train_micro_batch_size_per_gpu": "auto", + "zero_optimization": { + "stage": 0 + }, + "fp16": { + 
"enabled": true + } +} diff --git a/autotuning/hf/dsconfigs/ds_config_fp16_z1.json b/autotuning/hf/dsconfigs/ds_config_fp16_z1.json new file mode 100644 index 000000000..209706d24 --- /dev/null +++ b/autotuning/hf/dsconfigs/ds_config_fp16_z1.json @@ -0,0 +1,9 @@ +{ + "train_micro_batch_size_per_gpu": "auto", + "zero_optimization": { + "stage": 1 + }, + "fp16": { + "enabled": true + } +} diff --git a/autotuning/hf/dsconfigs/ds_config_fp16_z2.json b/autotuning/hf/dsconfigs/ds_config_fp16_z2.json new file mode 100644 index 000000000..d3782ab14 --- /dev/null +++ b/autotuning/hf/dsconfigs/ds_config_fp16_z2.json @@ -0,0 +1,9 @@ +{ + "train_micro_batch_size_per_gpu": "auto", + "zero_optimization": { + "stage": 2 + }, + "fp16": { + "enabled": true + } +} diff --git a/autotuning/hf/dsconfigs/ds_config_fp16_z3.json b/autotuning/hf/dsconfigs/ds_config_fp16_z3.json new file mode 100644 index 000000000..d0affd293 --- /dev/null +++ b/autotuning/hf/dsconfigs/ds_config_fp16_z3.json @@ -0,0 +1,9 @@ +{ + "train_micro_batch_size_per_gpu": "auto", + "zero_optimization": { + "stage": 3 + }, + "fp16": { + "enabled": true + } +} diff --git a/autotuning/hf/dsconfigs/ds_config_tune.json b/autotuning/hf/dsconfigs/ds_config_tune.json new file mode 100644 index 000000000..413e19630 --- /dev/null +++ b/autotuning/hf/dsconfigs/ds_config_tune.json @@ -0,0 +1,12 @@ +{ + "train_micro_batch_size_per_gpu": "auto", + "autotuning": { + "enabled": true, + "overwrite": false, + "fast": true, + "arg_mappings": { + "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", + "gradient_accumulation_steps ": "--gradient_accumulation_steps" + } + } +} diff --git a/autotuning/hf/dsconfigs/ds_config_z0.json b/autotuning/hf/dsconfigs/ds_config_z0.json new file mode 100644 index 000000000..6247e56c4 --- /dev/null +++ b/autotuning/hf/dsconfigs/ds_config_z0.json @@ -0,0 +1,6 @@ +{ + "train_micro_batch_size_per_gpu": "auto", + "zero_optimization": { + "stage": 0 + } +} diff --git a/autotuning/hf/dsconfigs/ds_config_z1.json b/autotuning/hf/dsconfigs/ds_config_z1.json new file mode 100644 index 000000000..fd39970a4 --- /dev/null +++ b/autotuning/hf/dsconfigs/ds_config_z1.json @@ -0,0 +1,6 @@ +{ + "train_micro_batch_size_per_gpu": "auto", + "zero_optimization": { + "stage": 1 + } +} diff --git a/autotuning/hf/dsconfigs/ds_config_z2.json b/autotuning/hf/dsconfigs/ds_config_z2.json new file mode 100644 index 000000000..b898aee82 --- /dev/null +++ b/autotuning/hf/dsconfigs/ds_config_z2.json @@ -0,0 +1,6 @@ +{ + "train_micro_batch_size_per_gpu": "auto", + "zero_optimization": { + "stage": 2 + } +} diff --git a/autotuning/hf/dsconfigs/ds_config_z3.json b/autotuning/hf/dsconfigs/ds_config_z3.json new file mode 100644 index 000000000..5b118864e --- /dev/null +++ b/autotuning/hf/dsconfigs/ds_config_z3.json @@ -0,0 +1,6 @@ +{ + "train_micro_batch_size_per_gpu": "auto", + "zero_optimization": { + "stage": 3 + } +} diff --git a/autotuning/hf/gpt2-large/README.md b/autotuning/hf/gpt2-large/README.md new file mode 100644 index 000000000..b9b52dc0e --- /dev/null +++ b/autotuning/hf/gpt2-large/README.md @@ -0,0 +1,48 @@ +# [gpt2-large](https://huggingface.co/gpt2-large) + +This model has the following configuration: + +- 36-layer +- 1280 hidden dimension +- 20 attention heads +- 774M parameters. 
+
+Refer to [GPT-2/GPT and causal language modeling](https://github.com/huggingface/transformers/tree/master/examples/pytorch/language-modeling#gpt-2gpt-and-causal-language-modeling).
+
+## Environment
+
+The training uses fp16 and runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resources as the training. `max_train_batch_size` is not defined.
+
+- transformers (4.12.0.dev0)
+- datasets (1.11.0)
+
+## Throughput Comparison
+
+The table below compares throughput (samples per second). The train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve each throughput value are also shown in parentheses. Assume that the strategy users would use in the hand-tuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory.
+ - `baseline` is the vanilla Hugging Face (HF) without DeepSpeed (DS) and mbs is hand-tuned.
+ - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values.
+ - `HF + DS autotuning` is HF with DS, and the DS configuration is selected from autotuning.
+
+| Model name | baseline (vanilla HF) | HF + DS hand-tuned      | HF + DS autotuning (fast-mode) |
+| ---------- | --------------------- | ----------------------- | ------------------------------ |
+| GPT2-large | 27.874 (mbs = 1)      | 56.797 (z = 1, mbs = 2) | 69.061 (z = 1, mbs = 3)        |
+
+Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), gradient accumulation steps (gas), train micro batch size per GPU (mbs or tmbspg).
+
+## Detailed `HF + DS autotuning` Result Summary
+
+Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed forward, backward and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training.
+
+- Fast-mode Autotuning time: 27 mins
+- Number of experiments: 13
+- Throughput Improvement over baseline: 2.48x
+
+| tuning_space | num_experiments | best_metric_val | best_exp_name   |
+| :----------- | --------------: | --------------: | :-------------- |
+| z0           |               4 |         59.0229 | z0_gas1_tmbspg2 |
+| z1           |               5 |         87.3017 | z1_gas1_tmbspg3 |
+| z2           |               3 |         77.8338 | z2_gas1_tmbspg3 |
+| z3           |               1 |               0 | z3_gas1_tmbspg3 |
+| global       |              13 |         87.3017 | z1_gas1_tmbspg3 |
+
+Tuning completed in 0:27:33.988447. Total number of experiments: 13.
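+
+To reuse the selected configuration without re-running the autotuner, the winning ZeRO stage and micro batch size can be passed back to a plain DeepSpeed launch. A sketch (mirroring `test_tune.sh` below; `HF_PATH` points at a local `transformers` checkout and the static stage-1 config lives under `../dsconfigs/`):
+
+```bash
+# Re-run gpt2-large with the autotuning winner: ZeRO stage 1, mbs = 3.
+HF_PATH=~/projects
+deepspeed --num_nodes=1 --num_gpus=16 \
+  $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py \
+  --deepspeed ../dsconfigs/ds_config_fp16_z1.json \
+  --model_name_or_path gpt2-large \
+  --dataset_name wikitext \
+  --dataset_config_name wikitext-2-raw-v1 \
+  --do_train \
+  --fp16 \
+  --per_device_train_batch_size 3 \
+  --learning_rate 2e-5 \
+  --output_dir ./output_z1_tuned \
+  --overwrite_output_dir \
+  --save_steps 0 \
+  --max_steps 200 \
+  --save_strategy "no"
+```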
diff --git a/autotuning/hf/gpt2-large/test_tune.sh b/autotuning/hf/gpt2-large/test_tune.sh new file mode 100755 index 000000000..c477e60b8 --- /dev/null +++ b/autotuning/hf/gpt2-large/test_tune.sh @@ -0,0 +1,133 @@ +MODEL_NAME=gpt2-large +PER_DEVICE_TRAIN_BATCH_SIZE=1 +HF_PATH=~/projects + +NEPOCHS=1 +NGPUS=16 +NNODES=1 +MAX_STEPS=200 +OUTPUT_DIR=./output_b${PER_DEVICE_TRAIN_BATCH_SIZE}_g${NGPUS}_$MAX_STEPS + +TEST=$1 + +if [ ${TEST} == "0" ] +then + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py \ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --do_eval \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_0 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z0" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_z0.json\ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --do_eval \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z0 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z1" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_z1.json\ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --do_eval \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z1 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z2" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_z2.json\ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --do_eval \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z2 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z3" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_z3.json\ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --do_eval \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z3 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "tune" ] +then + deepspeed --autotuning run --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed 
../dsconfigs/ds_config_fp16_tune.json\ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --do_eval \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_tune \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "fs" ] +then + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py \ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --do_eval \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_fs \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" + --sharded_ddp zero_dp_2 +fi diff --git a/autotuning/hf/gpt2-medium/README.md b/autotuning/hf/gpt2-medium/README.md new file mode 100644 index 000000000..ebec83b05 --- /dev/null +++ b/autotuning/hf/gpt2-medium/README.md @@ -0,0 +1,46 @@ +# [gpt2-medium](https://huggingface.co/gpt2-medium) + +This model has the following configuration: +- 24-layer +- 1024 hidden dimension +- 16 attention heads +- 345M parameters. + +Refer to [GPT-2/GPT and causal language modeling](https://github.com/huggingface/transformers/tree/master/examples/pytorch/language-modeling#gpt-2gpt-and-causal-language-modeling) + +## Environment + +The training use fp16 and runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. `max_train_batch_size` is not defined. + +- transformers (4.12.0.dev0) +- datasets (1.11.0) + +## Throughput Comparsion + +The table below shows the throughput (samples per second) comparsion. The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would usein the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. + - `baseline` is the vanila Hugging Face (HF) without DeepSpeed (DS) and mbs is hand-tuned. + - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values. + - `HF + DS autotuning` is HF with DS, and the DS configuration selected from autotuning. + + +| Model name | baseline (vanila HF) | HF + DS hand-tuned | HF + DS autotuning (fast-mode) | +| ----------- | ------------------------ | --------------------------------- | ------------------------------ | +| GPT2-medium | 71.61 (gas = 1, mbs = 2) | 142.211 (z = 1, gas = 1, mbs = 4) | 163.3 (z1_gas1_tmbspg6) | + +## Detailed `HF + DS autotuning` Result Summary + +Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed foward, backward and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training. 
+ +- Fast-mode Autotuning time: 25 mins +- Number of experiments: 15 +- Throughput Improvement over baseline: 2.28x + +| tuning_space | num_experiments | best_metric_val | best_exp_name | +| :----------- | --------------: | --------------: | :-------------- | +| z0 | 6 | 167.688 | z0_gas1_tmbspg5 | +| z1 | 5 | 175.46 | z1_gas1_tmbspg6 | +| z2 | 3 | 161.619 | z2_gas1_tmbspg6 | +| z3 | 1 | 0 | z3_gas1_tmbspg6 | +| global | 15 | 175.46 | z1_gas1_tmbspg6 | + +Tuning completed in 0:25:18.653731. Total number of experiments: 15. diff --git a/autotuning/hf/gpt2-medium/test_tune.sh b/autotuning/hf/gpt2-medium/test_tune.sh new file mode 100755 index 000000000..4e2907f94 --- /dev/null +++ b/autotuning/hf/gpt2-medium/test_tune.sh @@ -0,0 +1,126 @@ +TASK_NAME=mnli +MODEL_NAME=bert-base-cased +HF_PATH=~/projects +PER_DEVICE_TRAIN_BATCH_SIZE=64 +MAX_TRAIN_BATCH_SIZE=4096 +NEPOCHS=1 +NGPUS=16 +NNODES=1 +MAX_STEPS=200 +OUTPUT_DIR=./${TASK_NAME}/output_b${PER_DEVICE_TRAIN_BATCH_SIZE}_g${NGPUS}_$MAX_STEPS + +TEST=$1 + +if [ ${TEST} == "0" ] +then + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + # --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_0 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z0" ] +then + deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z0.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z0 \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "z1" ] +then + deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z1.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z1 \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "z2" ] +then + deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z2.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z2 \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "z3" ] +then + deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z3.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + 
--per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z3 \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "tune" ] +then + deepspeed --autotuning run --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_tune.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_tune \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "fs" ] +then + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_fs \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --sharded_ddp zero_dp_2 +fi + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" + --sharded_ddp zero_dp_2 +fi diff --git a/autotuning/hf/gpt2-xl/README.md b/autotuning/hf/gpt2-xl/README.md new file mode 100644 index 000000000..390796767 --- /dev/null +++ b/autotuning/hf/gpt2-xl/README.md @@ -0,0 +1,45 @@ +# [gpt2-xl](https://huggingface.co/gpt2-xl) + +This model has the following configuration: +- 48-layer +- 1600 hidden dimension +- 25 attention heads +- 1.5B parameters. + +Refer to [GPT-2/GPT and causal language modeling](https://github.com/huggingface/transformers/tree/master/examples/pytorch/language-modeling#gpt-2gpt-and-causal-language-modeling) + +## Environment + +The training use fp16 and runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. `max_train_batch_size` is not defined. + +- transformers (4.12.0.dev0) +- datasets (1.11.0) +## Throughput Comparsion + +The table below shows the throughput (samples per second) comparsion. The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would usein the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. + - `baseline` is the vanila Hugging Face (HF) without DeepSpeed (DS) and mbs is hand-tuned. + - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values. + - `HF + DS autotuning` is HF with DS, and the DS configuration selected from autotuning. + +| Model name | baseline (vanila HF) | HF + DS hand-tuned | HF + DS autotuning (fast-mode) | +| ---------- | -------------------- | --------------------------------- | -------------------------------- | +| GPT2-xl | Not runnable | Zero1 (27.462, gas = 1, mbs = 1), | Zero1 (27.497, gas = 1, mbs = 1) | + +Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), graident accumulation steps (gas), train micro batch size per GPU (mbs or tmbspg). 
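+
+Although the fp32 HF baseline does not fit in GPU memory at this model size, the autotuned setting (ZeRO stage 1, fp16, mbs = 1) does run. As a sketch (assuming the same causal language modeling setup as the gpt2-large folder, with `HF_PATH` pointing at a local `transformers` checkout), it can be reproduced with:
+
+```bash
+# Run gpt2-xl with the autotuned setting: ZeRO stage 1, fp16, mbs = 1.
+# --gradient_accumulation_steps can be raised if a larger effective batch is needed.
+HF_PATH=~/projects
+deepspeed --num_nodes=1 --num_gpus=16 \
+  $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py \
+  --deepspeed ../dsconfigs/ds_config_fp16_z1.json \
+  --model_name_or_path gpt2-xl \
+  --dataset_name wikitext \
+  --dataset_config_name wikitext-2-raw-v1 \
+  --do_train \
+  --fp16 \
+  --per_device_train_batch_size 1 \
+  --learning_rate 2e-5 \
+  --output_dir ./output_z1_tuned \
+  --overwrite_output_dir \
+  --save_steps 0 \
+  --max_steps 200 \
+  --save_strategy "no"
+```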
+ +## Detailed `HF + DS autotuning` Result Summary + +Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed foward, backward and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training. + +- Fast-mode Autotuning time: 21 mins +- Number of experiments: 9 +- Throughput Improvement over baseline: Inf + +| tuning_space | num_experiments | best_metric_val | best_exp_name | +| :----------- | --------------: | --------------: | :-------------- | +| z1 | 3 | 40.1749 | z1_gas1_tmbspg1 | +| z2 | 3 | 33.0472 | z2_gas1_tmbspg1 | +| z3 | 3 | 12.8604 | z3_gas1_tmbspg1 | +| global | 9 | 40.1749 | z1_gas1_tmbspg1 | + +Tuning completed in 0:20:55.156000. Total number of experiments: 9. diff --git a/autotuning/hf/gpt2-xl/test_tune.sh b/autotuning/hf/gpt2-xl/test_tune.sh new file mode 100755 index 000000000..4e2907f94 --- /dev/null +++ b/autotuning/hf/gpt2-xl/test_tune.sh @@ -0,0 +1,126 @@ +TASK_NAME=mnli +MODEL_NAME=bert-base-cased +HF_PATH=~/projects +PER_DEVICE_TRAIN_BATCH_SIZE=64 +MAX_TRAIN_BATCH_SIZE=4096 +NEPOCHS=1 +NGPUS=16 +NNODES=1 +MAX_STEPS=200 +OUTPUT_DIR=./${TASK_NAME}/output_b${PER_DEVICE_TRAIN_BATCH_SIZE}_g${NGPUS}_$MAX_STEPS + +TEST=$1 + +if [ ${TEST} == "0" ] +then + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + # --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_0 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z0" ] +then + deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z0.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z0 \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "z1" ] +then + deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z1.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z1 \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "z2" ] +then + deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z2.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z2 \ + --save_steps 0 \ + --overwrite_output_dir 
\ + --max_steps $MAX_STEPS +elif [ ${TEST} == "z3" ] +then + deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z3.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z3 \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "tune" ] +then + deepspeed --autotuning run --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_tune.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_tune \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "fs" ] +then + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_fs \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --sharded_ddp zero_dp_2 +fi + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" + --sharded_ddp zero_dp_2 +fi diff --git a/autotuning/hf/gpt2/README.md b/autotuning/hf/gpt2/README.md new file mode 100644 index 000000000..fb7016825 --- /dev/null +++ b/autotuning/hf/gpt2/README.md @@ -0,0 +1,47 @@ +# [gpt2](https://huggingface.co/gpt2) + +This model has the following configuration: + +- 12-layer +- 768 hidden dimension +- 12 attention heads +- 117M parameters. + +Refer to [GPT-2/GPT and causal language modeling](https://github.com/huggingface/transformers/tree/master/examples/pytorch/language-modeling#gpt-2gpt-and-causal-language-modeling) + +## Environment + +The training use fp16 and runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. `max_train_batch_size` is not defined. + +- transformers (4.12.0.dev0) +- datasets (1.11.0) +## Throughput Comparsion + +The table below shows the throughput (samples per second) comparsion. The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would usein the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. + - `baseline` is the vanila Hugging Face (HF) without DeepSpeed (DS) and mbs is hand-tuned. + - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values. + - `HF + DS autotuning` is HF with DS, and the DS configuration selected from autotuning. 
+ +| Model name | baseline (vanila HF) | HF + DS hand-tuned | HF + DS autotuning (fast-mode) | +| ---------- | -------------------- | ------------------------ | ------------------------------ | +| GPT2 | 284.142 (mbs = 8) | 397.827 (z = 1, mbs = 8) | 431.586 (z1_gas1_tmbspg14) | + +Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), graident accumulation steps (gas), train micro batch size per GPU (mbs or tmbspg). + +## Detailed `HF + DS autotuning` Result Summary + +Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed foward, backward and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training. + +- Fast-mode Autotuning time: 25 mins +- Number of experiments: 17 +- Throughput Improvement over baseline: 1.52x + +| tuning_space | num_experiments | best_metric_val | best_exp_name | +| :----------- | --------------: | --------------: | :--------------- | +| z0 | 9 | 441.693 | z0_gas1_tmbspg11 | +| z1 | 6 | 452.004 | z1_gas1_tmbspg15 | +| z2 | 1 | 0 | z2_gas1_tmbspg15 | +| z3 | 1 | 0 | z3_gas1_tmbspg15 | +| global | 17 | 452.004 | z1_gas1_tmbspg15 | + +Tuning completed in 0:24:19.976427. Total number of experiments: 17. diff --git a/autotuning/hf/gpt2/test_tune.sh b/autotuning/hf/gpt2/test_tune.sh new file mode 100755 index 000000000..88785da24 --- /dev/null +++ b/autotuning/hf/gpt2/test_tune.sh @@ -0,0 +1,135 @@ +MODEL_NAME=gpt2 +PER_DEVICE_TRAIN_BATCH_SIZE=1 +HF_PATH=~/projects +MAX_TRAIN_BATCH_SIZE=512 + +NEPOCHS=1 +NGPUS=16 +NNODES=1 +MAX_STEPS=200 +OUTPUT_DIR=./output_b${PER_DEVICE_TRAIN_BATCH_SIZE}_g${NGPUS}_$MAX_STEPS + +TEST=$1 + + +if [ ${TEST} == "0" ] +then + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py \ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --do_eval \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_0 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z0" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_z0.json\ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --do_eval \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z0 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z1" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_z1.json\ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --do_eval \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z1 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z2" ] 
+then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_z2.json\ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --do_eval \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z2 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z3" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_z3.json\ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --do_eval \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z3 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "tune" ] +then + deepspeed --autotuning run --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_tune.json\ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --do_eval \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_tune \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "fs" ] +then + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py \ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --do_eval \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_fs \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" + --sharded_ddp zero_dp_2 +fi From 7c94fe61a80c1c74c35b45b5ef612852ad9493d1 Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Thu, 11 Nov 2021 00:06:59 +0000 Subject: [PATCH 2/7] add links --- autotuning/README.md | 3 +++ autotuning/hf/README.md | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 autotuning/README.md diff --git a/autotuning/README.md b/autotuning/README.md new file mode 100644 index 000000000..d028a945e --- /dev/null +++ b/autotuning/README.md @@ -0,0 +1,3 @@ +# Autotuning Examples + +This showcases the [autotuning](https://github.com/microsoft/DeepSpeed/tree/master/deepspeed/autotuning) feature in DeepSpeed (DS). diff --git a/autotuning/hf/README.md b/autotuning/hf/README.md index c8095c594..e101842af 100644 --- a/autotuning/hf/README.md +++ b/autotuning/hf/README.md @@ -1,6 +1,6 @@ # Autotuning Hugging Face Examples -This showcases the Autotuning feature in DeepSpeed (DS) with Hugging Face (HF) models. +This showcases the [autotuning](https://github.com/microsoft/DeepSpeed/tree/master/deepspeed/autotuning) feature in DeepSpeed (DS) with Hugging Face (HF) models. 
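Under the hood, the `tune` mode of each `test_tune.sh` script launches the training script through the `deepspeed` launcher with `--autotuning run` and a DeepSpeed config whose tunable fields are left to the autotuner. A trimmed sketch of that launch pattern is shown below (paths, config names, and arguments are taken from the GPT2 script in this PR; the real scripts pass additional Trainer arguments such as `--learning_rate` and `--do_eval`):

```bash
# Sketch of the autotuning launch used by the `tune` branch of test_tune.sh
# (GPT2 variant); argument list trimmed for brevity.
NNODES=1
NGPUS=16
HF_PATH=~/projects

deepspeed --autotuning run --num_nodes=$NNODES --num_gpus=$NGPUS \
  $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py \
  --deepspeed ../dsconfigs/ds_config_fp16_tune.json \
  --model_name_or_path gpt2 \
  --dataset_name wikitext \
  --dataset_config_name wikitext-2-raw-v1 \
  --do_train \
  --fp16 \
  --per_device_train_batch_size 1 \
  --max_steps 200 \
  --output_dir ./output_tune \
  --overwrite_output_dir \
  --save_strategy "no"
```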
## List of Models From c5829565f4964bb28351a7a3ba1a56ad356d61fb Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Thu, 11 Nov 2021 19:08:20 +0000 Subject: [PATCH 3/7] fix typos --- .../tests/fixtures/tests_samples/GermEval/dev.txt | 2 +- autotuning/hf/README.md | 2 +- autotuning/hf/bert-base/README.md | 2 +- autotuning/hf/bert-large/README.md | 2 +- autotuning/hf/deberta/README.md | 6 +++--- autotuning/hf/distilbert/README.md | 2 +- autotuning/hf/gpt2-large/README.md | 6 +++--- autotuning/hf/gpt2-medium/README.md | 3 ++- autotuning/hf/gpt2-xl/README.md | 6 +++--- autotuning/hf/gpt2/README.md | 5 +++-- 10 files changed, 19 insertions(+), 17 deletions(-) diff --git a/MoQ/huggingface-transformers/tests/fixtures/tests_samples/GermEval/dev.txt b/MoQ/huggingface-transformers/tests/fixtures/tests_samples/GermEval/dev.txt index de0015823..1aba64f7a 100644 --- a/MoQ/huggingface-transformers/tests/fixtures/tests_samples/GermEval/dev.txt +++ b/MoQ/huggingface-transformers/tests/fixtures/tests_samples/GermEval/dev.txt @@ -10,7 +10,7 @@ homo I-OTH " O in O enger O -Auseinandersetzung O +Ause inandersetzung O mit O diesem O Bild O diff --git a/autotuning/hf/README.md b/autotuning/hf/README.md index e101842af..9f1a5ab56 100644 --- a/autotuning/hf/README.md +++ b/autotuning/hf/README.md @@ -32,7 +32,7 @@ The training runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the sa ## Throughput Comparsion -The table below shows the throughput (samples per second) comparsion. The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would usein the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. +The table below shows the throughput (samples per second) comparsion. The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would use in the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. - `baseline` is the vanila HF without DeepSpeed (DS) and mbs is hand-tuned. - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values. - `HF + DS autotuning` is HF with DS, and the DS configuration selected from autotuning. diff --git a/autotuning/hf/bert-base/README.md b/autotuning/hf/bert-base/README.md index 48e0a3310..ebb922078 100644 --- a/autotuning/hf/bert-base/README.md +++ b/autotuning/hf/bert-base/README.md @@ -16,7 +16,7 @@ The training use fp32 and runs on 1 node with 16 Nvidia V100 GPUs. The autotunin ## Throughput Comparsion -The table below shows the throughput (samples per second) comparsion. The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would usein the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. +The table below shows the throughput (samples per second) comparsion. The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would use in the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. 
- `baseline` is the vanila Hugging Face (HF) without DeepSpeed (DS) and mbs is hand-tuned. - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values. - `HF + DS autotuning` is HF with DS, and the DS configuration selected from autotuning. diff --git a/autotuning/hf/bert-large/README.md b/autotuning/hf/bert-large/README.md index 97d393d13..4ee41c3e2 100644 --- a/autotuning/hf/bert-large/README.md +++ b/autotuning/hf/bert-large/README.md @@ -14,7 +14,7 @@ The training use fp32 and runs on 1 node with 16 Nvidia V100 GPUs. The autotunin ## Throughput Comparsion -The table below shows the throughput (samples per second) comparsion. The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would usein the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. +The table below shows the throughput (samples per second) comparsion. The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would use in the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. - `baseline` is the vanila Hugging Face (HF) without DeepSpeed (DS) and mbs is hand-tuned. - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values. - `HF + DS autotuning` is HF with DS, and the DS configuration selected from autotuning. diff --git a/autotuning/hf/deberta/README.md b/autotuning/hf/deberta/README.md index 9f47712ff..51a8af4ac 100644 --- a/autotuning/hf/deberta/README.md +++ b/autotuning/hf/deberta/README.md @@ -16,17 +16,17 @@ The training use fp16 and runs on 1 node with 16 Nvidia V100 GPUs. The autotunin ## Throughput Comparsion -The table below shows the throughput (samples per second) comparsion. The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would usein the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. +The table below shows the throughput (samples per second) comparsion. The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would use in the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. - `baseline` is the vanila Hugging Face (HF) without DeepSpeed (DS) and mbs is hand-tuned. - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values. - `HF + DS autotuning` is HF with DS, and the DS configuration selected from autotuning. +Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), graident accumulation steps (gas), train micro batch size per GPU (mbs or tmbspg), reduce_bucket_size (rbs), allgather_bucket_size (abs). 
+ | Model name | baseline (vanila HF) | HF + DS hand-tuned | HF + DS autotuning (fast-mode) | | ---------- | -------------------- | --------------------------------- | ------------------------------ | | DeBERTa | Not runnable | 140.587 (z = 1, gas = 1 mbs = 8), | 162.395 (z1_gas1_tmbspg11) | -Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), graident accumulation steps (gas), train micro batch size per GPU (mbs or tmbspg), reduce_bucket_size (rbs), allgather_bucket_size (abs). - ## Detailed `HF + DS autotuning` Result Summary Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed foward, backward and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training. diff --git a/autotuning/hf/distilbert/README.md b/autotuning/hf/distilbert/README.md index 5a86266f0..2520324c6 100644 --- a/autotuning/hf/distilbert/README.md +++ b/autotuning/hf/distilbert/README.md @@ -16,7 +16,7 @@ The training uses 1 node with 16 Nvidia V100 GPUs, fp32, max_train_batch_size = ## Throughput Comparsion -The table below shows the throughput (samples per second) comparsion. The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would usein the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. +The table below shows the throughput (samples per second) comparsion. The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would use in the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. - `baseline` is the vanila Hugging Face (HF) without DeepSpeed (DS) and mbs is hand-tuned. - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values. - `HF + DS autotuning` is HF with DS, and the DS configuration selected from autotuning. diff --git a/autotuning/hf/gpt2-large/README.md b/autotuning/hf/gpt2-large/README.md index b9b52dc0e..f259447e0 100644 --- a/autotuning/hf/gpt2-large/README.md +++ b/autotuning/hf/gpt2-large/README.md @@ -18,17 +18,17 @@ The training use fp16 and runs on 1 node with 16 Nvidia V100 GPUs. The autotunin ## Throughput Comparsion -The table below shows the throughput (samples per second) comparsion. The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would usein the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. +The table below shows the throughput (samples per second) comparsion. The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would use in the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. - `baseline` is the vanila Hugging Face (HF) without DeepSpeed (DS) and mbs is hand-tuned. - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values. 
- `HF + DS autotuning` is HF with DS, and the DS configuration selected from autotuning. +Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), graident accumulation steps (gas), train micro batch size per GPU (mbs or tmbspg). + | Model name | baseline (vanila HF) | HF + DS hand-tuned | HF + DS autotuning (fast-mode) | | ---------- | -------------------- | ------------------------ | ------------------------------ | | GPT2-large | 27.874 (mbs = 1) | 56.797 (z = 1, mbs = 2), | 69.061 (z = 1, mbs = 3) | -Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), graident accumulation steps (gas), train micro batch size per GPU (mbs or tmbspg). - ## Detailed `HF + DS autotuning` Result Summary Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed foward, backward and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training. diff --git a/autotuning/hf/gpt2-medium/README.md b/autotuning/hf/gpt2-medium/README.md index ebec83b05..5507d5683 100644 --- a/autotuning/hf/gpt2-medium/README.md +++ b/autotuning/hf/gpt2-medium/README.md @@ -17,11 +17,12 @@ The training use fp16 and runs on 1 node with 16 Nvidia V100 GPUs. The autotunin ## Throughput Comparsion -The table below shows the throughput (samples per second) comparsion. The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would usein the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. +The table below shows the throughput (samples per second) comparsion. The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would use in the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. - `baseline` is the vanila Hugging Face (HF) without DeepSpeed (DS) and mbs is hand-tuned. - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values. - `HF + DS autotuning` is HF with DS, and the DS configuration selected from autotuning. +Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), graident accumulation steps (gas), train micro batch size per GPU (mbs or tmbspg). | Model name | baseline (vanila HF) | HF + DS hand-tuned | HF + DS autotuning (fast-mode) | | ----------- | ------------------------ | --------------------------------- | ------------------------------ | diff --git a/autotuning/hf/gpt2-xl/README.md b/autotuning/hf/gpt2-xl/README.md index 390796767..e3bda042c 100644 --- a/autotuning/hf/gpt2-xl/README.md +++ b/autotuning/hf/gpt2-xl/README.md @@ -16,17 +16,17 @@ The training use fp16 and runs on 1 node with 16 Nvidia V100 GPUs. The autotunin - datasets (1.11.0) ## Throughput Comparsion -The table below shows the throughput (samples per second) comparsion. The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would usein the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. +The table below shows the throughput (samples per second) comparsion. 
The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would use in the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. - `baseline` is the vanila Hugging Face (HF) without DeepSpeed (DS) and mbs is hand-tuned. - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values. - `HF + DS autotuning` is HF with DS, and the DS configuration selected from autotuning. +Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), graident accumulation steps (gas), train micro batch size per GPU (mbs or tmbspg). + | Model name | baseline (vanila HF) | HF + DS hand-tuned | HF + DS autotuning (fast-mode) | | ---------- | -------------------- | --------------------------------- | -------------------------------- | | GPT2-xl | Not runnable | Zero1 (27.462, gas = 1, mbs = 1), | Zero1 (27.497, gas = 1, mbs = 1) | -Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), graident accumulation steps (gas), train micro batch size per GPU (mbs or tmbspg). - ## Detailed `HF + DS autotuning` Result Summary Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed foward, backward and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training. diff --git a/autotuning/hf/gpt2/README.md b/autotuning/hf/gpt2/README.md index fb7016825..2b5ab65a3 100644 --- a/autotuning/hf/gpt2/README.md +++ b/autotuning/hf/gpt2/README.md @@ -17,16 +17,17 @@ The training use fp16 and runs on 1 node with 16 Nvidia V100 GPUs. The autotunin - datasets (1.11.0) ## Throughput Comparsion -The table below shows the throughput (samples per second) comparsion. The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would usein the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. +The table below shows the throughput (samples per second) comparsion. The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would use in the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. - `baseline` is the vanila Hugging Face (HF) without DeepSpeed (DS) and mbs is hand-tuned. - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values. - `HF + DS autotuning` is HF with DS, and the DS configuration selected from autotuning. +Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), graident accumulation steps (gas), train micro batch size per GPU (mbs or tmbspg). + | Model name | baseline (vanila HF) | HF + DS hand-tuned | HF + DS autotuning (fast-mode) | | ---------- | -------------------- | ------------------------ | ------------------------------ | | GPT2 | 284.142 (mbs = 8) | 397.827 (z = 1, mbs = 8) | 431.586 (z1_gas1_tmbspg14) | -Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), graident accumulation steps (gas), train micro batch size per GPU (mbs or tmbspg). 
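The experiment name reported by the autotuner encodes the selected configuration: `z1_gas1_tmbspg14` means ZeRO stage 1, gradient accumulation steps 1, and a train micro-batch size per GPU of 14. Below is a hedged sketch of re-running that selected setting by hand; it reuses the stage-1 config from this PR, and the batch size is simply read off the experiment name (the tuned run launched by `./test_tune.sh tune` already does this automatically):

```bash
# Illustrative: reproduce the configuration selected by the autotuner for GPT2
# (ZeRO stage 1, gas = 1, mbs = 14) using the stage-1 config from this PR.
HF_PATH=~/projects

deepspeed --num_nodes=1 --num_gpus=16 \
  $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py \
  --deepspeed ../dsconfigs/ds_config_fp16_z1.json \
  --model_name_or_path gpt2 \
  --dataset_name wikitext \
  --dataset_config_name wikitext-2-raw-v1 \
  --do_train \
  --fp16 \
  --per_device_train_batch_size 14 \
  --max_steps 200 \
  --output_dir ./output_z1_tmbspg14 \
  --overwrite_output_dir \
  --save_strategy "no"
```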
## Detailed `HF + DS autotuning` Result Summary From 55694cde4661113554610cbe1a897692a924a46f Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Fri, 12 Nov 2021 20:53:57 +0000 Subject: [PATCH 4/7] add hf install detais --- autotuning/hf/README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/autotuning/hf/README.md b/autotuning/hf/README.md index 9f1a5ab56..a2e0ae3b6 100644 --- a/autotuning/hf/README.md +++ b/autotuning/hf/README.md @@ -27,6 +27,17 @@ Each model folder has a `test_tune.sh` script: The training runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. The HF packages below are used. +HF examples require installing the transformers package from source. + +```bash + git clone https://github.com/huggingface/transformers.git + cd transformers + pip install . +``` +datasets can be installed by `pip install datasets` + +Below are the versions used in this test. + - transformers (4.12.0.dev0) - datasets (1.11.0) From dff9b78feda0f5ec15240d29488d7dca961b1ed7 Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Sat, 13 Nov 2021 01:31:20 +0000 Subject: [PATCH 5/7] fix typos --- autotuning/hf/README.md | 20 ++++++++++---------- autotuning/hf/bert-base/README.md | 25 ++++++++++++++++++------- autotuning/hf/bert-base/test_tune.sh | 2 +- autotuning/hf/bert-large/README.md | 25 ++++++++++++++++++------- autotuning/hf/deberta/README.md | 28 +++++++++++++++++++--------- autotuning/hf/distilbert/README.md | 26 ++++++++++++++++++-------- autotuning/hf/gpt2-large/README.md | 27 +++++++++++++++++++-------- autotuning/hf/gpt2-medium/README.md | 26 ++++++++++++++++++-------- autotuning/hf/gpt2-xl/README.md | 25 ++++++++++++++++++------- autotuning/hf/gpt2/README.md | 25 ++++++++++++++++++------- 10 files changed, 157 insertions(+), 72 deletions(-) diff --git a/autotuning/hf/README.md b/autotuning/hf/README.md index a2e0ae3b6..d51d5624a 100644 --- a/autotuning/hf/README.md +++ b/autotuning/hf/README.md @@ -15,7 +15,7 @@ This showcases the [autotuning](https://github.com/microsoft/DeepSpeed/tree/mast Each model folder has a `test_tune.sh` script: -- `./test_tune.sh tune` tunes the model training and and then runs it using the selected tuned DeepSpeed configuration. +- `./test_tune.sh tune` tunes the model training and then runs it using the selected tuned DeepSpeed configuration. - `./test_tune.sh 0` runs the model using HF without DeepSpeed. - `./test_tune.sh z0` runs the model using HF + DS with ZeRO optimization disabled. - `./test_tune.sh z1` runs the model using HF + DS with ZeRO optimization stage 1. @@ -25,30 +25,30 @@ Each model folder has a `test_tune.sh` script: ## Testing Environment -The training runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. The HF packages below are used. - -HF examples require installing the transformers package from source. +The training runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. +The HF packages below are used. +HF examples require installing the `transformers` package from source: ```bash git clone https://github.com/huggingface/transformers.git cd transformers pip install . ``` -datasets can be installed by `pip install datasets` +The `datasets` package can be installed by `pip install datasets` Below are the versions used in this test. 
-- transformers (4.12.0.dev0) +- transformers (4.12.0) - datasets (1.11.0) -## Throughput Comparsion +## Throughput Comparison -The table below shows the throughput (samples per second) comparsion. The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would use in the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. +The table below shows the throughput (samples per second) comparison. The corresponding train micro-batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would use in the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. - `baseline` is the vanila HF without DeepSpeed (DS) and mbs is hand-tuned. - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values. - - `HF + DS autotuning` is HF with DS, and the DS configuration selected from autotuning. + - `HF + DS autotuning` is HF with DS, and the DS configuration is selected from autotuning. -Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), graident accumulation steps (gas), train micro batch size per GPU (mbs or tmbspg). +Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), gradient accumulation steps (gas), train micro-batch size per GPU (mbs or tmbspg). | Model name | num_params | baseline (vanila HF) | HF + DS hand-tuned | HF + DS autotuning (fast-mode) | throughput improvement over baseline | autotuning time (mins) | number of experiments | | :----------: | :--------: | :---------------------------: | :----------------------------------: | :----------------------------: | :----------------------------------: | :--------------------: | :-------------------: | diff --git a/autotuning/hf/bert-base/README.md b/autotuning/hf/bert-base/README.md index ebb922078..21f508da8 100644 --- a/autotuning/hf/bert-base/README.md +++ b/autotuning/hf/bert-base/README.md @@ -10,18 +10,29 @@ This model has the following configuration: ## Environment The training use fp32 and runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. `max_train_batch_size` is not defined. +The HF packages below are used. -- transformers (4.12.0.dev0) +HF examples require installing the `transformers` package from source: +```bash + git clone https://github.com/huggingface/transformers.git + cd transformers + pip install . +``` +The `datasets` package can be installed by `pip install datasets` + +Below are the versions used in this test. + +- transformers (4.12.0) - datasets (1.11.0) -## Throughput Comparsion +## Throughput Comparison -The table below shows the throughput (samples per second) comparsion. The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would use in the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. - - `baseline` is the vanila Hugging Face (HF) without DeepSpeed (DS) and mbs is hand-tuned. +The table below shows the throughput (samples per second) comparison. The corresponding train micro-batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. 
Assume the strategy users would use in the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. + - `baseline` is the vanila HF without DeepSpeed (DS) and mbs is hand-tuned. - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values. - - `HF + DS autotuning` is HF with DS, and the DS configuration selected from autotuning. + - `HF + DS autotuning` is HF with DS, and the DS configuration is selected from autotuning. -Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), graident accumulation steps (gas), train micro batch size per GPU (mbs or tmbspg). +Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), gradient accumulation steps (gas), train micro-batch size per GPU (mbs or tmbspg). | Model name | baseline (vanila HF) | HF + DS handtuned | HF + DS autotuning | | ---------- | ----------------------------- | ------------------------------------ | ---------------------------- | @@ -29,7 +40,7 @@ Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), graident accumulati ## Detailed `HF + DS autotuning` Result Summary -Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed foward, backward and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training. +Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed forward, backward, and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training. - Fast-mode Autotuning time: 43 mins - Number of experiments: 35 diff --git a/autotuning/hf/bert-base/test_tune.sh b/autotuning/hf/bert-base/test_tune.sh index 560ddb1e6..cb6ecb01c 100755 --- a/autotuning/hf/bert-base/test_tune.sh +++ b/autotuning/hf/bert-base/test_tune.sh @@ -20,7 +20,7 @@ then --do_eval \ --max_seq_length 128 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ - # --learning_rate 2e-5 \ + --learning_rate 2e-5 \ --num_train_epochs $NEPOCHS \ --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_0 \ --overwrite_output_dir \ diff --git a/autotuning/hf/bert-large/README.md b/autotuning/hf/bert-large/README.md index 4ee41c3e2..157dba0c1 100644 --- a/autotuning/hf/bert-large/README.md +++ b/autotuning/hf/bert-large/README.md @@ -8,18 +8,29 @@ This model has the following configuration: - 336M parameters The training use fp32 and runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. `max_train_batch_size` is not defined. +The HF packages below are used. -- transformers (4.12.0.dev0) +HF examples require installing the `transformers` package from source: +```bash + git clone https://github.com/huggingface/transformers.git + cd transformers + pip install . +``` +The `datasets` package can be installed by `pip install datasets` + +Below are the versions used in this test. + +- transformers (4.12.0) - datasets (1.11.0) -## Throughput Comparsion +## Throughput Comparison -The table below shows the throughput (samples per second) comparsion. The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. 
Assume the strategy users would use in the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. - - `baseline` is the vanila Hugging Face (HF) without DeepSpeed (DS) and mbs is hand-tuned. +The table below shows the throughput (samples per second) comparison. The corresponding train micro-batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would use in the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. + - `baseline` is the vanila HF without DeepSpeed (DS) and mbs is hand-tuned. - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values. - - `HF + DS autotuning` is HF with DS, and the DS configuration selected from autotuning. + - `HF + DS autotuning` is HF with DS, and the DS configuration is selected from autotuning. -Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), graident accumulation steps (gas), train micro batch size per GPU (mbs or tmbspg). +Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), gradient accumulation steps (gas), train micro-batch size per GPU (mbs or tmbspg). | Model name | baseline (vanila HF) | HF + DS handtuned | HF + DS autotuning | | ---------- | --------------------------- | --------------------------------- | -------------------------- | @@ -27,7 +38,7 @@ Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), graident accumulati ## Detailed `HF + DS autotuning` Result Summary -Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed foward, backward and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training. +Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed forward, backward, and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training. - Fast-mode Autotuning time: 36 mins - Number of experiments: 22 diff --git a/autotuning/hf/deberta/README.md b/autotuning/hf/deberta/README.md index 51a8af4ac..552d845c5 100644 --- a/autotuning/hf/deberta/README.md +++ b/autotuning/hf/deberta/README.md @@ -10,18 +10,28 @@ Refer to [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://g ## Environment The training use fp16 and runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. +The HF packages below are used. -- transformers (4.12.0.dev0) -- datasets (1.11.0) +HF examples require installing the `transformers` package from source: +```bash + git clone https://github.com/huggingface/transformers.git + cd transformers + pip install . +``` +The `datasets` package can be installed by `pip install datasets` + +Below are the versions used in this test. -## Throughput Comparsion +- transformers (4.12.0) +- datasets (1.11.0) +## Throughput Comparison -The table below shows the throughput (samples per second) comparsion. The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. 
Assume the strategy users would use in the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. - - `baseline` is the vanila Hugging Face (HF) without DeepSpeed (DS) and mbs is hand-tuned. +The table below shows the throughput (samples per second) comparison. The corresponding train micro-batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would use in the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. + - `baseline` is the vanila HF without DeepSpeed (DS) and mbs is hand-tuned. - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values. - - `HF + DS autotuning` is HF with DS, and the DS configuration selected from autotuning. + - `HF + DS autotuning` is HF with DS, and the DS configuration is selected from autotuning. -Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), graident accumulation steps (gas), train micro batch size per GPU (mbs or tmbspg), reduce_bucket_size (rbs), allgather_bucket_size (abs). +Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), gradient accumulation steps (gas), train micro-batch size per GPU (mbs or tmbspg), reduce_bucket_size (rbs), allgather_bucket_size (abs). | Model name | baseline (vanila HF) | HF + DS hand-tuned | HF + DS autotuning (fast-mode) | | ---------- | -------------------- | --------------------------------- | ------------------------------ | @@ -29,7 +39,7 @@ Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), graident accumulati ## Detailed `HF + DS autotuning` Result Summary -Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed foward, backward and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training. +Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed forward, backward, and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training. ### Fast-mode - Autotuning time: 40 mins - Number of experiments: 12 @@ -45,7 +55,7 @@ Note that the performance metric used in autotuning is calculated using the timi Tuning completed in 0:39:25.253998. Total number of experiments: 12. -### Full-mode +### Full-mode ("fast" set to false) - Autotuning time: 1 hr 2 mins - Number of experiments: 24 - Throughput Improvement over baseline: Inf diff --git a/autotuning/hf/distilbert/README.md b/autotuning/hf/distilbert/README.md index 2520324c6..088cb4877 100644 --- a/autotuning/hf/distilbert/README.md +++ b/autotuning/hf/distilbert/README.md @@ -10,18 +10,28 @@ This model has the following configuration: ## Environment The training uses 1 node with 16 Nvidia V100 GPUs, fp32, max_train_batch_size = 4096. The autotuning uses the same hardware resource as the training. +The HF packages below are used. -- transformers (4.12.0.dev0) -- datasets (1.11.0) +HF examples require installing the `transformers` package from source: +```bash + git clone https://github.com/huggingface/transformers.git + cd transformers + pip install . 
+``` +The `datasets` package can be installed by `pip install datasets` + +Below are the versions used in this test. -## Throughput Comparsion +- transformers (4.12.0) +- datasets (1.11.0) +## Throughput Comparison -The table below shows the throughput (samples per second) comparsion. The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would use in the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. - - `baseline` is the vanila Hugging Face (HF) without DeepSpeed (DS) and mbs is hand-tuned. +The table below shows the throughput (samples per second) comparison. The corresponding train micro-batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would use in the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. + - `baseline` is the vanila HF without DeepSpeed (DS) and mbs is hand-tuned. - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values. - - `HF + DS autotuning` is HF with DS, and the DS configuration selected from autotuning. + - `HF + DS autotuning` is HF with DS, and the DS configuration is selected from autotuning. -Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), graident accumulation steps (gas), train micro batch size per GPU (mbs or tmbspg). +Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), gradient accumulation steps (gas), train micro-batch size per GPU (mbs or tmbspg). | Model name | baseline (vanila HF) | HF + DS hand-tuned | HF + DS autotuning (fast-mode) | | ---------- | ----------------------------- | ------------------------------------ | ------------------------------ | @@ -29,7 +39,7 @@ Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), graident accumulati ## Detailed `HF + DS autotuning` Result Summary -Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed foward, backward and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training. +Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed forward, backward, and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training. - Fast-mode Autotuning time: 11 mins - Number of experiments: 11 diff --git a/autotuning/hf/gpt2-large/README.md b/autotuning/hf/gpt2-large/README.md index f259447e0..a736db485 100644 --- a/autotuning/hf/gpt2-large/README.md +++ b/autotuning/hf/gpt2-large/README.md @@ -12,18 +12,29 @@ Refer to [GPT-2/GPT and causal language modeling](https://github.com/huggingface ## Environment The training use fp16 and runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. `max_train_batch_size` is not defined. +The HF packages below are used. 
-- transformers (4.12.0.dev0) -- datasets (1.11.0) +HF examples require installing the `transformers` package from source: +```bash + git clone https://github.com/huggingface/transformers.git + cd transformers + pip install . +``` +The `datasets` package can be installed by `pip install datasets` -## Throughput Comparsion +Below are the versions used in this test. -The table below shows the throughput (samples per second) comparsion. The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would use in the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. - - `baseline` is the vanila Hugging Face (HF) without DeepSpeed (DS) and mbs is hand-tuned. +- transformers (4.12.0) +- datasets (1.11.0)datasets (1.11.0) + +## Throughput Comparison + +The table below shows the throughput (samples per second) comparison. The corresponding train micro-batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would use in the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. + - `baseline` is the vanila HF without DeepSpeed (DS) and mbs is hand-tuned. - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values. - - `HF + DS autotuning` is HF with DS, and the DS configuration selected from autotuning. + - `HF + DS autotuning` is HF with DS, and the DS configuration is selected from autotuning. -Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), graident accumulation steps (gas), train micro batch size per GPU (mbs or tmbspg). +Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), gradient accumulation steps (gas), train micro-batch size per GPU (mbs or tmbspg). | Model name | baseline (vanila HF) | HF + DS hand-tuned | HF + DS autotuning (fast-mode) | | ---------- | -------------------- | ------------------------ | ------------------------------ | @@ -31,7 +42,7 @@ Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), graident accumulati ## Detailed `HF + DS autotuning` Result Summary -Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed foward, backward and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training. +Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed forward, backward, and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training. - Fast-mode Autotuning time: 27 mins - Number of experiments: 13 diff --git a/autotuning/hf/gpt2-medium/README.md b/autotuning/hf/gpt2-medium/README.md index 5507d5683..e97a1f9b3 100644 --- a/autotuning/hf/gpt2-medium/README.md +++ b/autotuning/hf/gpt2-medium/README.md @@ -11,18 +11,28 @@ Refer to [GPT-2/GPT and causal language modeling](https://github.com/huggingface ## Environment The training use fp16 and runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. `max_train_batch_size` is not defined. 
+The HF packages below are used. -- transformers (4.12.0.dev0) -- datasets (1.11.0) +HF examples require installing the `transformers` package from source: +```bash + git clone https://github.com/huggingface/transformers.git + cd transformers + pip install . +``` +The `datasets` package can be installed by `pip install datasets` + +Below are the versions used in this test. -## Throughput Comparsion +- transformers (4.12.0) +- datasets (1.11.0) +## Throughput Comparison -The table below shows the throughput (samples per second) comparsion. The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would use in the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. - - `baseline` is the vanila Hugging Face (HF) without DeepSpeed (DS) and mbs is hand-tuned. +The table below shows the throughput (samples per second) comparison. The corresponding train micro-batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would use in the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. + - `baseline` is the vanila HF without DeepSpeed (DS) and mbs is hand-tuned. - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values. - - `HF + DS autotuning` is HF with DS, and the DS configuration selected from autotuning. + - `HF + DS autotuning` is HF with DS, and the DS configuration is selected from autotuning. -Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), graident accumulation steps (gas), train micro batch size per GPU (mbs or tmbspg). +Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), gradient accumulation steps (gas), train micro-batch size per GPU (mbs or tmbspg). | Model name | baseline (vanila HF) | HF + DS hand-tuned | HF + DS autotuning (fast-mode) | | ----------- | ------------------------ | --------------------------------- | ------------------------------ | @@ -30,7 +40,7 @@ Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), graident accumulati ## Detailed `HF + DS autotuning` Result Summary -Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed foward, backward and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training. +Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed forward, backward, and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training. - Fast-mode Autotuning time: 25 mins - Number of experiments: 15 diff --git a/autotuning/hf/gpt2-xl/README.md b/autotuning/hf/gpt2-xl/README.md index e3bda042c..f6d81b264 100644 --- a/autotuning/hf/gpt2-xl/README.md +++ b/autotuning/hf/gpt2-xl/README.md @@ -11,17 +11,28 @@ Refer to [GPT-2/GPT and causal language modeling](https://github.com/huggingface ## Environment The training use fp16 and runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. `max_train_batch_size` is not defined. 
+The HF packages below are used. -- transformers (4.12.0.dev0) +HF examples require installing the `transformers` package from source: +```bash + git clone https://github.com/huggingface/transformers.git + cd transformers + pip install . +``` +The `datasets` package can be installed by `pip install datasets` + +Below are the versions used in this test. + +- transformers (4.12.0) - datasets (1.11.0) -## Throughput Comparsion +## Throughput Comparison -The table below shows the throughput (samples per second) comparsion. The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would use in the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. - - `baseline` is the vanila Hugging Face (HF) without DeepSpeed (DS) and mbs is hand-tuned. +The table below shows the throughput (samples per second) comparison. The corresponding train micro-batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would use in the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. + - `baseline` is the vanila HF without DeepSpeed (DS) and mbs is hand-tuned. - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values. - - `HF + DS autotuning` is HF with DS, and the DS configuration selected from autotuning. + - `HF + DS autotuning` is HF with DS, and the DS configuration is selected from autotuning. -Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), graident accumulation steps (gas), train micro batch size per GPU (mbs or tmbspg). +Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), gradient accumulation steps (gas), train micro-batch size per GPU (mbs or tmbspg). | Model name | baseline (vanila HF) | HF + DS hand-tuned | HF + DS autotuning (fast-mode) | | ---------- | -------------------- | --------------------------------- | -------------------------------- | @@ -29,7 +40,7 @@ Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), graident accumulati ## Detailed `HF + DS autotuning` Result Summary -Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed foward, backward and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training. +Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed forward, backward, and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training. - Fast-mode Autotuning time: 21 mins - Number of experiments: 9 diff --git a/autotuning/hf/gpt2/README.md b/autotuning/hf/gpt2/README.md index 2b5ab65a3..507f2b6fb 100644 --- a/autotuning/hf/gpt2/README.md +++ b/autotuning/hf/gpt2/README.md @@ -12,17 +12,28 @@ Refer to [GPT-2/GPT and causal language modeling](https://github.com/huggingface ## Environment The training use fp16 and runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. `max_train_batch_size` is not defined. +The HF packages below are used. 
-- transformers (4.12.0.dev0) +HF examples require installing the `transformers` package from source: +```bash + git clone https://github.com/huggingface/transformers.git + cd transformers + pip install . +``` +The `datasets` package can be installed by `pip install datasets` + +Below are the versions used in this test. + +- transformers (4.12.0) - datasets (1.11.0) -## Throughput Comparsion +## Throughput Comparison -The table below shows the throughput (samples per second) comparsion. The corresponding train micro batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would use in the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. - - `baseline` is the vanila Hugging Face (HF) without DeepSpeed (DS) and mbs is hand-tuned. +The table below shows the throughput (samples per second) comparison. The corresponding train micro-batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would use in the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. + - `baseline` is the vanila HF without DeepSpeed (DS) and mbs is hand-tuned. - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values. - - `HF + DS autotuning` is HF with DS, and the DS configuration selected from autotuning. + - `HF + DS autotuning` is HF with DS, and the DS configuration is selected from autotuning. -Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), graident accumulation steps (gas), train micro batch size per GPU (mbs or tmbspg). +Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), gradient accumulation steps (gas), train micro-batch size per GPU (mbs or tmbspg). | Model name | baseline (vanila HF) | HF + DS hand-tuned | HF + DS autotuning (fast-mode) | | ---------- | -------------------- | ------------------------ | ------------------------------ | @@ -31,7 +42,7 @@ Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), graident accumulati ## Detailed `HF + DS autotuning` Result Summary -Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed foward, backward and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training. +Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed forward, backward, and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training. 
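To make the gap concrete: end-to-end throughput is samples processed per second of wall-clock step time, so it can be estimated from the number of GPUs, the micro-batch size per GPU, the gradient accumulation steps, and the measured step latency. A small sketch with hypothetical numbers (the step time below is invented for illustration, not a measured value):

```bash
# ngpus, mbs, gas come from the run configuration (e.g. z1_gas1_tmbspg14 on 16 GPUs);
# step_time is the full wall-clock time of one training step, which is longer than the
# sum of the DeepSpeed forward/backward/step timings used by the autotuning metric.
awk 'BEGIN { ngpus = 16; mbs = 14; gas = 1; step_time = 0.5;
             printf "%.1f samples/s\n", ngpus * mbs * gas / step_time }'
```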
- Fast-mode Autotuning time: 25 mins - Number of experiments: 17 From 94cd5e28850485df955e9409b8a3a71742929e75 Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Sat, 13 Nov 2021 04:02:44 +0000 Subject: [PATCH 6/7] fix format --- autotuning/hf/README.md | 2 +- autotuning/hf/bert-base/README.md | 23 ++--- autotuning/hf/bert-large/README.md | 2 +- autotuning/hf/deberta/README.md | 2 +- autotuning/hf/deberta/test_tune.sh | 1 - autotuning/hf/distilbert/test_tune.sh | 56 ++++++----- autotuning/hf/gpt2-large/test_tune.sh | 1 - autotuning/hf/gpt2-medium/test_tune.sh | 122 +++++++++++++----------- autotuning/hf/gpt2-xl/test_tune.sh | 123 ++++++++++++++----------- autotuning/hf/gpt2/README.md | 2 +- autotuning/hf/gpt2/test_tune.sh | 1 - 11 files changed, 182 insertions(+), 153 deletions(-) diff --git a/autotuning/hf/README.md b/autotuning/hf/README.md index d51d5624a..567deda04 100644 --- a/autotuning/hf/README.md +++ b/autotuning/hf/README.md @@ -53,7 +53,7 @@ Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), gradient accumulati | Model name | num_params | baseline (vanila HF) | HF + DS hand-tuned | HF + DS autotuning (fast-mode) | throughput improvement over baseline | autotuning time (mins) | number of experiments | | :----------: | :--------: | :---------------------------: | :----------------------------------: | :----------------------------: | :----------------------------------: | :--------------------: | :-------------------: | | DistilBERT | 66M | 5161.902 (gas = 1, mbs = 256) | 5305.067 (z = 0, gas = 1 mbs = 256) | 5305.067 (z0_gas1_tmbspg256) | 1.03x | 11 | 11 | -| BERT-base | 0.11B | 2502.236 (gas = 1,mbs = 128) | 2523.684 (z = 0, gas = 1, mbs = 128) | 2682.849 (z0_gas1_tmbspg220) | 1.07x | 43 | 35 | +| BERT-base | 0.11B | 2502.236 (gas = 1,mbs = 128) | 2523.684 (z = 0, gas = 1, mbs = 128) | 2736.561 (z0_gas1_tmbspg235) | 1.09x | 35 | 34 | | BERT-large | 0.34B | 742.692 (gas = 1,mbs = 64) | 766.929 (z = 1, gas = 1, mbs = 64) | 808.168 (z1_gas1_tmbspg93) | 1.09x | 36 | 22 | | GPT2 | 0.12B | 284.142 (gas = 1,mbs = 8) | 397.827 (z = 1, gas = 1, mbs = 8) | 431.586 (z1_gas1_tmbspg14) | 1.52x | 25 | 17 | | GPT2-medium | 0.35B | 71.61 (gas = 1, mbs = 2) | 142.211 (z = 1, gas = 1, mbs = 4) | 163.3 (z1_gas1_tmbspg6) | 2.28 | 15 | 25 | diff --git a/autotuning/hf/bert-base/README.md b/autotuning/hf/bert-base/README.md index 21f508da8..02450fdd3 100644 --- a/autotuning/hf/bert-base/README.md +++ b/autotuning/hf/bert-base/README.md @@ -9,7 +9,7 @@ This model has the following configuration: ## Environment -The training use fp32 and runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. `max_train_batch_size` is not defined. +The training use fp32 and runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. `max_train_batch_size` is set to `4096`. The HF packages below are used. 
HF examples require installing the `transformers` package from source: @@ -36,22 +36,23 @@ Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), gradient accumulati | Model name | baseline (vanila HF) | HF + DS handtuned | HF + DS autotuning | | ---------- | ----------------------------- | ------------------------------------ | ---------------------------- | -| BERT-base | 2502.236 (gas = 1, mbs = 128) | 2523.684 (z = 0, gas = 1, mbs = 128) | 2682.849 (z0_gas1_tmbspg220) | +| BERT-base | 2502.236 (gas = 1, mbs = 128) | 2523.684 (z = 0, gas = 1, mbs = 128) | 2736.561 (z0_gas1_tmbspg235) | ## Detailed `HF + DS autotuning` Result Summary Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed forward, backward, and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training. -- Fast-mode Autotuning time: 43 mins -- Number of experiments: 35 -- Throughput Improvement over baseline: 1.07x +- Fast-mode Autotuning time: 35 mins +- Number of experiments: 34 +- Throughput Improvement over baseline: 1.09x + | tuning_space | num_experiments | best_metric_val | best_exp_name | | :----------- | --------------: | --------------: | :---------------- | -| z0 | 9 | 2880.94 | z0_gas1_tmbspg220 | -| z1 | 7 | 2861.43 | z1_gas1_tmbspg220 | -| z2 | 8 | 2714.96 | z2_gas1_tmbspg240 | -| z3 | 11 | 2420.78 | z3_gas1_tmbspg240 | -| global | 35 | 2880.94 | z0_gas1_tmbspg220 | +| z0 | 9 | 2930.18 | z0_gas1_tmbspg235 | +| z1 | 7 | 2930.17 | z1_gas1_tmbspg235 | +| z2 | 8 | 2744.16 | z2_gas1_tmbspg235 | +| z3 | 10 | 2479.47 | z3_gas1_tmbspg238 | +| global | 34 | 2930.18 | z0_gas1_tmbspg235 | -Tuning completed in 0:43:33.853567. Total number of experiments: 35. +Tuning completed in 0:34:41.842250. Total number of experiments: 34. diff --git a/autotuning/hf/bert-large/README.md b/autotuning/hf/bert-large/README.md index 157dba0c1..c89c171cc 100644 --- a/autotuning/hf/bert-large/README.md +++ b/autotuning/hf/bert-large/README.md @@ -7,7 +7,7 @@ This model has the following configuration: - 16 attention heads - 336M parameters -The training use fp32 and runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. `max_train_batch_size` is not defined. +The training use fp32 and runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. `max_train_batch_size` is set to `4096`. The HF packages below are used. HF examples require installing the `transformers` package from source: diff --git a/autotuning/hf/deberta/README.md b/autotuning/hf/deberta/README.md index 552d845c5..9144376cd 100644 --- a/autotuning/hf/deberta/README.md +++ b/autotuning/hf/deberta/README.md @@ -9,7 +9,7 @@ This model has the following configuration: Refer to [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://github.com/microsoft/DeBERTa). ## Environment -The training use fp16 and runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. +The training use fp16 and runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. `max_train_batch_size` is not defined. The HF packages below are used. 
HF examples require installing the `transformers` package from source: diff --git a/autotuning/hf/deberta/test_tune.sh b/autotuning/hf/deberta/test_tune.sh index 45b60dae3..1b283ca57 100755 --- a/autotuning/hf/deberta/test_tune.sh +++ b/autotuning/hf/deberta/test_tune.sh @@ -2,7 +2,6 @@ MODEL_NAME=microsoft/deberta-v2-xxlarge TASK_NAME=mnli PER_DEVICE_TRAIN_BATCH_SIZE=1 HF_PATH=~/projects - NEPOCHS=1 NGPUS=16 NNODES=1 diff --git a/autotuning/hf/distilbert/test_tune.sh b/autotuning/hf/distilbert/test_tune.sh index 560ddb1e6..fc1a0c978 100755 --- a/autotuning/hf/distilbert/test_tune.sh +++ b/autotuning/hf/distilbert/test_tune.sh @@ -1,7 +1,7 @@ TASK_NAME=mnli -MODEL_NAME=bert-base-cased +MODEL_NAME=distilbert-base-uncased HF_PATH=~/projects -PER_DEVICE_TRAIN_BATCH_SIZE=64 +PER_DEVICE_TRAIN_BATCH_SIZE=256 MAX_TRAIN_BATCH_SIZE=4096 NEPOCHS=1 NGPUS=16 @@ -17,105 +17,103 @@ then --model_name_or_path $MODEL_NAME \ --task_name $TASK_NAME \ --do_train \ - --do_eval \ --max_seq_length 128 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ - # --learning_rate 2e-5 \ + --learning_rate 2e-5 \ --num_train_epochs $NEPOCHS \ - --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_0 \ + --output_dir ${OUTPUT_DIR}_0 \ --overwrite_output_dir \ --save_steps 0 \ --max_steps $MAX_STEPS \ --save_strategy "no" elif [ ${TEST} == "z0" ] then - deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z0.json \ + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z0.json \ --model_name_or_path $MODEL_NAME \ --task_name $TASK_NAME \ --do_train \ - --do_eval \ --max_seq_length 128 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ --learning_rate 2e-5 \ --num_train_epochs $NEPOCHS \ - --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z0 \ - --save_steps 0 \ + --output_dir ${OUTPUT_DIR}_z0 \ --overwrite_output_dir \ - --max_steps $MAX_STEPS + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" elif [ ${TEST} == "z1" ] then deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z1.json \ --model_name_or_path $MODEL_NAME \ --task_name $TASK_NAME \ --do_train \ - --do_eval \ --max_seq_length 128 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ --learning_rate 2e-5 \ --num_train_epochs $NEPOCHS \ - --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z1 \ - --save_steps 0 \ + --output_dir ${OUTPUT_DIR}_z1 \ --overwrite_output_dir \ - --max_steps $MAX_STEPS + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" elif [ ${TEST} == "z2" ] then deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z2.json \ --model_name_or_path $MODEL_NAME \ --task_name $TASK_NAME \ --do_train \ - --do_eval \ --max_seq_length 128 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ --learning_rate 2e-5 \ --num_train_epochs $NEPOCHS \ - --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z2 \ - --save_steps 0 \ + --output_dir ${OUTPUT_DIR}_z2 \ --overwrite_output_dir \ - --max_steps $MAX_STEPS + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" elif [ ${TEST} == "z3" ] then deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed 
../dsconfigs/ds_config_z3.json \ --model_name_or_path $MODEL_NAME \ --task_name $TASK_NAME \ --do_train \ - --do_eval \ --max_seq_length 128 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ --learning_rate 2e-5 \ --num_train_epochs $NEPOCHS \ - --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z3 \ - --save_steps 0 \ + --output_dir ${OUTPUT_DIR}_z3 \ --overwrite_output_dir \ - --max_steps $MAX_STEPS + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" elif [ ${TEST} == "tune" ] then - deepspeed --autotuning run --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_tune.json \ + deepspeed --autotuning tune --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ./ds_config_tune.json \ --model_name_or_path $MODEL_NAME \ --task_name $TASK_NAME \ --do_train \ - --do_eval \ --max_seq_length 128 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ - --learning_rate 2e-5 \ --num_train_epochs $NEPOCHS \ --output_dir ${OUTPUT_DIR}_tune \ - --save_steps 0 \ --overwrite_output_dir \ - --max_steps $MAX_STEPS + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" elif [ ${TEST} == "fs" ] then python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py \ --model_name_or_path $MODEL_NAME \ --task_name $TASK_NAME \ --do_train \ - --do_eval \ --max_seq_length 128 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ --learning_rate 2e-5 \ --num_train_epochs $NEPOCHS \ - --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_fs \ + --output_dir ${OUTPUT_DIR}_fs \ --overwrite_output_dir \ --save_steps 0 \ --max_steps $MAX_STEPS \ + --save_strategy "no" --sharded_ddp zero_dp_2 fi diff --git a/autotuning/hf/gpt2-large/test_tune.sh b/autotuning/hf/gpt2-large/test_tune.sh index c477e60b8..c5fa9b608 100755 --- a/autotuning/hf/gpt2-large/test_tune.sh +++ b/autotuning/hf/gpt2-large/test_tune.sh @@ -1,7 +1,6 @@ MODEL_NAME=gpt2-large PER_DEVICE_TRAIN_BATCH_SIZE=1 HF_PATH=~/projects - NEPOCHS=1 NGPUS=16 NNODES=1 diff --git a/autotuning/hf/gpt2-medium/test_tune.sh b/autotuning/hf/gpt2-medium/test_tune.sh index 4e2907f94..567deb4ff 100755 --- a/autotuning/hf/gpt2-medium/test_tune.sh +++ b/autotuning/hf/gpt2-medium/test_tune.sh @@ -1,124 +1,140 @@ -TASK_NAME=mnli -MODEL_NAME=bert-base-cased +MODEL_NAME=gpt2-medium +PER_DEVICE_TRAIN_BATCH_SIZE=1 HF_PATH=~/projects -PER_DEVICE_TRAIN_BATCH_SIZE=64 -MAX_TRAIN_BATCH_SIZE=4096 NEPOCHS=1 NGPUS=16 NNODES=1 MAX_STEPS=200 -OUTPUT_DIR=./${TASK_NAME}/output_b${PER_DEVICE_TRAIN_BATCH_SIZE}_g${NGPUS}_$MAX_STEPS +OUTPUT_DIR=./output_b${PER_DEVICE_TRAIN_BATCH_SIZE}_g${NGPUS}_$MAX_STEPS TEST=$1 if [ ${TEST} == "0" ] then - python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py \ + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py \ --model_name_or_path $MODEL_NAME \ - --task_name $TASK_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ --do_train \ - --do_eval \ - --max_seq_length 128 \ + --fp16 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ - # --learning_rate 2e-5 \ + --learning_rate 2e-5 \ --num_train_epochs $NEPOCHS \ - --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_0 \ + --output_dir ${OUTPUT_DIR}_0 \ 
--overwrite_output_dir \ --save_steps 0 \ --max_steps $MAX_STEPS \ --save_strategy "no" elif [ ${TEST} == "z0" ] then - deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z0.json \ + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_z0.json\ --model_name_or_path $MODEL_NAME \ - --task_name $TASK_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ --do_train \ - --do_eval \ - --max_seq_length 128 \ + --fp16 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ --learning_rate 2e-5 \ --num_train_epochs $NEPOCHS \ - --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z0 \ - --save_steps 0 \ + --output_dir ${OUTPUT_DIR}_z0 \ --overwrite_output_dir \ - --max_steps $MAX_STEPS + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" elif [ ${TEST} == "z1" ] then - deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z1.json \ + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_z1.json\ --model_name_or_path $MODEL_NAME \ - --task_name $TASK_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ --do_train \ - --do_eval \ - --max_seq_length 128 \ + --fp16 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ --learning_rate 2e-5 \ --num_train_epochs $NEPOCHS \ - --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z1 \ - --save_steps 0 \ + --output_dir ${OUTPUT_DIR}_z1 \ --overwrite_output_dir \ - --max_steps $MAX_STEPS + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" elif [ ${TEST} == "z2" ] then - deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z2.json \ + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_z2.json\ --model_name_or_path $MODEL_NAME \ - --task_name $TASK_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ --do_train \ - --do_eval \ - --max_seq_length 128 \ + --fp16 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ --learning_rate 2e-5 \ --num_train_epochs $NEPOCHS \ - --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z2 \ - --save_steps 0 \ + --output_dir ${OUTPUT_DIR}_z2 \ --overwrite_output_dir \ - --max_steps $MAX_STEPS + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" elif [ ${TEST} == "z3" ] then - deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z3.json \ + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_z3.json\ --model_name_or_path $MODEL_NAME \ - --task_name $TASK_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ --do_train \ - --do_eval \ - --max_seq_length 128 \ + --fp16 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ --learning_rate 2e-5 \ --num_train_epochs $NEPOCHS \ - --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z3 \ - --save_steps 0 \ + --output_dir ${OUTPUT_DIR}_z3 \ 
--overwrite_output_dir \ - --max_steps $MAX_STEPS + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" elif [ ${TEST} == "tune" ] then - deepspeed --autotuning run --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_tune.json \ + deepspeed --autotuning run --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_tune.json\ --model_name_or_path $MODEL_NAME \ - --task_name $TASK_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ --do_train \ - --do_eval \ - --max_seq_length 128 \ + --fp16 \ + --block_size 512 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ --learning_rate 2e-5 \ --num_train_epochs $NEPOCHS \ --output_dir ${OUTPUT_DIR}_tune \ - --save_steps 0 \ --overwrite_output_dir \ - --max_steps $MAX_STEPS -elif [ ${TEST} == "fs" ] + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "tune_test" ] then - python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py \ + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_tune_test.json \ --model_name_or_path $MODEL_NAME \ - --task_name $TASK_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ --do_train \ - --do_eval \ - --max_seq_length 128 \ + --fp16 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ --learning_rate 2e-5 \ --num_train_epochs $NEPOCHS \ - --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_fs \ + --output_dir ${OUTPUT_DIR}_tune_test \ --overwrite_output_dir \ --save_steps 0 \ --max_steps $MAX_STEPS \ - --sharded_ddp zero_dp_2 -fi + --save_strategy "no" +elif [ ${TEST} == "fs" ] +then + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py \ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_fs \ + --overwrite_output_dir \ --save_steps 0 \ --max_steps $MAX_STEPS \ --save_strategy "no" diff --git a/autotuning/hf/gpt2-xl/test_tune.sh b/autotuning/hf/gpt2-xl/test_tune.sh index 4e2907f94..d5ae1678c 100755 --- a/autotuning/hf/gpt2-xl/test_tune.sh +++ b/autotuning/hf/gpt2-xl/test_tune.sh @@ -1,124 +1,141 @@ -TASK_NAME=mnli -MODEL_NAME=bert-base-cased +MODEL_NAME=gpt2-xl +PER_DEVICE_TRAIN_BATCH_SIZE=1 HF_PATH=~/projects -PER_DEVICE_TRAIN_BATCH_SIZE=64 -MAX_TRAIN_BATCH_SIZE=4096 NEPOCHS=1 NGPUS=16 NNODES=1 MAX_STEPS=200 -OUTPUT_DIR=./${TASK_NAME}/output_b${PER_DEVICE_TRAIN_BATCH_SIZE}_g${NGPUS}_$MAX_STEPS +OUTPUT_DIR=./output_b${PER_DEVICE_TRAIN_BATCH_SIZE}_g${NGPUS}_$MAX_STEPS +TEST=$1 TEST=$1 if [ ${TEST} == "0" ] then - python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py \ + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py \ --model_name_or_path $MODEL_NAME \ - --task_name $TASK_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ --do_train \ - --do_eval \ - --max_seq_length 128 \ + 
--fp16 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ - # --learning_rate 2e-5 \ + --learning_rate 2e-5 \ --num_train_epochs $NEPOCHS \ - --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_0 \ + --output_dir ${OUTPUT_DIR}_0 \ --overwrite_output_dir \ --save_steps 0 \ --max_steps $MAX_STEPS \ --save_strategy "no" elif [ ${TEST} == "z0" ] then - deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z0.json \ + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_z0.json\ --model_name_or_path $MODEL_NAME \ - --task_name $TASK_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ --do_train \ - --do_eval \ - --max_seq_length 128 \ + --fp16 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ --learning_rate 2e-5 \ --num_train_epochs $NEPOCHS \ - --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z0 \ - --save_steps 0 \ + --output_dir ${OUTPUT_DIR}_z0 \ --overwrite_output_dir \ - --max_steps $MAX_STEPS + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" elif [ ${TEST} == "z1" ] then - deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z1.json \ + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_z1.json\ --model_name_or_path $MODEL_NAME \ - --task_name $TASK_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ --do_train \ - --do_eval \ - --max_seq_length 128 \ + --fp16 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ --learning_rate 2e-5 \ --num_train_epochs $NEPOCHS \ - --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z1 \ - --save_steps 0 \ + --output_dir ${OUTPUT_DIR}_z1 \ --overwrite_output_dir \ - --max_steps $MAX_STEPS + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" elif [ ${TEST} == "z2" ] then - deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z2.json \ + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_z2.json\ --model_name_or_path $MODEL_NAME \ - --task_name $TASK_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ --do_train \ - --do_eval \ - --max_seq_length 128 \ + --fp16 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ --learning_rate 2e-5 \ --num_train_epochs $NEPOCHS \ - --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z2 \ - --save_steps 0 \ + --output_dir ${OUTPUT_DIR}_z2 \ --overwrite_output_dir \ - --max_steps $MAX_STEPS + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" elif [ ${TEST} == "z3" ] then - deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z3.json \ + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_z3.json\ --model_name_or_path $MODEL_NAME \ - --task_name $TASK_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ --do_train \ - --do_eval \ - --max_seq_length 128 \ + 
--fp16 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ --learning_rate 2e-5 \ --num_train_epochs $NEPOCHS \ - --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z3 \ - --save_steps 0 \ + --output_dir ${OUTPUT_DIR}_z3 \ --overwrite_output_dir \ - --max_steps $MAX_STEPS + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" elif [ ${TEST} == "tune" ] then - deepspeed --autotuning run --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_tune.json \ + deepspeed --autotuning run --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_tune.json\ --model_name_or_path $MODEL_NAME \ - --task_name $TASK_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ --do_train \ - --do_eval \ - --max_seq_length 128 \ + --fp16 \ + --block_size 512 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ --learning_rate 2e-5 \ --num_train_epochs $NEPOCHS \ --output_dir ${OUTPUT_DIR}_tune \ - --save_steps 0 \ --overwrite_output_dir \ - --max_steps $MAX_STEPS -elif [ ${TEST} == "fs" ] + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "tune_test" ] then - python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py \ + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_tune_test.json \ --model_name_or_path $MODEL_NAME \ - --task_name $TASK_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ --do_train \ - --do_eval \ - --max_seq_length 128 \ + --fp16 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ --learning_rate 2e-5 \ --num_train_epochs $NEPOCHS \ - --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_fs \ + --output_dir ${OUTPUT_DIR}_tune_test \ --overwrite_output_dir \ --save_steps 0 \ --max_steps $MAX_STEPS \ - --sharded_ddp zero_dp_2 -fi + --save_strategy "no" +elif [ ${TEST} == "fs" ] +then + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py \ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_fs \ + --overwrite_output_dir \ --save_steps 0 \ --max_steps $MAX_STEPS \ --save_strategy "no" diff --git a/autotuning/hf/gpt2/README.md b/autotuning/hf/gpt2/README.md index 507f2b6fb..bcd59f4c8 100644 --- a/autotuning/hf/gpt2/README.md +++ b/autotuning/hf/gpt2/README.md @@ -11,7 +11,7 @@ Refer to [GPT-2/GPT and causal language modeling](https://github.com/huggingface ## Environment -The training use fp16 and runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. `max_train_batch_size` is not defined. +The training use fp16 and runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. `max_train_batch_size` is set to 512. The HF packages below are used. 
HF examples require installing the `transformers` package from source: diff --git a/autotuning/hf/gpt2/test_tune.sh b/autotuning/hf/gpt2/test_tune.sh index 88785da24..c9338cdcd 100755 --- a/autotuning/hf/gpt2/test_tune.sh +++ b/autotuning/hf/gpt2/test_tune.sh @@ -2,7 +2,6 @@ MODEL_NAME=gpt2 PER_DEVICE_TRAIN_BATCH_SIZE=1 HF_PATH=~/projects MAX_TRAIN_BATCH_SIZE=512 - NEPOCHS=1 NGPUS=16 NNODES=1 From 0805868e5db3881c70baee140a48bc4190ed2a0c Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Sat, 13 Nov 2021 09:46:30 +0000 Subject: [PATCH 7/7] fix format --- autotuning/hf/bert-base/ds_config_tune.json | 12 ++++++++++++ autotuning/hf/bert-base/test_tune.sh | 9 +-------- autotuning/hf/bert-large/README.md | 2 +- autotuning/hf/bert-large/ds_config_tune.json | 11 +++++++++++ autotuning/hf/bert-large/test_tune.sh | 9 +-------- autotuning/hf/deberta/test_tune.sh | 14 +++++++------- autotuning/hf/distilbert/README.md | 15 ++++++++++++++- autotuning/hf/distilbert/test_tune.sh | 4 ++-- autotuning/hf/gpt2-xl/test_tune.sh | 3 +-- autotuning/hf/gpt2/README.md | 4 ++-- autotuning/hf/gpt2/test_tune.sh | 1 - 11 files changed, 52 insertions(+), 32 deletions(-) create mode 100644 autotuning/hf/bert-base/ds_config_tune.json create mode 100644 autotuning/hf/bert-large/ds_config_tune.json diff --git a/autotuning/hf/bert-base/ds_config_tune.json b/autotuning/hf/bert-base/ds_config_tune.json new file mode 100644 index 000000000..23a48ddf9 --- /dev/null +++ b/autotuning/hf/bert-base/ds_config_tune.json @@ -0,0 +1,12 @@ +{ + "train_micro_batch_size_per_gpu": "auto", + "autotuning": { + "enabled": true, + "overwrite": false, + "max_train_batch_size": 4096, + "arg_mappings": { + "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", + "gradient_accumulation_steps ": "--gradient_accumulation_steps" + } + } +} diff --git a/autotuning/hf/bert-base/test_tune.sh b/autotuning/hf/bert-base/test_tune.sh index cb6ecb01c..532efc902 100755 --- a/autotuning/hf/bert-base/test_tune.sh +++ b/autotuning/hf/bert-base/test_tune.sh @@ -17,7 +17,6 @@ then --model_name_or_path $MODEL_NAME \ --task_name $TASK_NAME \ --do_train \ - --do_eval \ --max_seq_length 128 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ --learning_rate 2e-5 \ @@ -33,7 +32,6 @@ then --model_name_or_path $MODEL_NAME \ --task_name $TASK_NAME \ --do_train \ - --do_eval \ --max_seq_length 128 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ --learning_rate 2e-5 \ @@ -48,7 +46,6 @@ then --model_name_or_path $MODEL_NAME \ --task_name $TASK_NAME \ --do_train \ - --do_eval \ --max_seq_length 128 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ --learning_rate 2e-5 \ @@ -63,7 +60,6 @@ then --model_name_or_path $MODEL_NAME \ --task_name $TASK_NAME \ --do_train \ - --do_eval \ --max_seq_length 128 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ --learning_rate 2e-5 \ @@ -78,7 +74,6 @@ then --model_name_or_path $MODEL_NAME \ --task_name $TASK_NAME \ --do_train \ - --do_eval \ --max_seq_length 128 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ --learning_rate 2e-5 \ @@ -89,11 +84,10 @@ then --max_steps $MAX_STEPS elif [ ${TEST} == "tune" ] then - deepspeed --autotuning run --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_tune.json \ + deepspeed --autotuning run --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ./ds_config_tune.json \ --model_name_or_path $MODEL_NAME \ 
--task_name $TASK_NAME \ --do_train \ - --do_eval \ --max_seq_length 128 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ --learning_rate 2e-5 \ @@ -108,7 +102,6 @@ then --model_name_or_path $MODEL_NAME \ --task_name $TASK_NAME \ --do_train \ - --do_eval \ --max_seq_length 128 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ --learning_rate 2e-5 \ diff --git a/autotuning/hf/bert-large/README.md b/autotuning/hf/bert-large/README.md index c89c171cc..157dba0c1 100644 --- a/autotuning/hf/bert-large/README.md +++ b/autotuning/hf/bert-large/README.md @@ -7,7 +7,7 @@ This model has the following configuration: - 16 attention heads - 336M parameters -The training use fp32 and runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. `max_train_batch_size` is set to `4096`. +The training use fp32 and runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. `max_train_batch_size` is not defined. The HF packages below are used. HF examples require installing the `transformers` package from source: diff --git a/autotuning/hf/bert-large/ds_config_tune.json b/autotuning/hf/bert-large/ds_config_tune.json new file mode 100644 index 000000000..e79f9c450 --- /dev/null +++ b/autotuning/hf/bert-large/ds_config_tune.json @@ -0,0 +1,11 @@ +{ + "train_micro_batch_size_per_gpu": "auto", + "autotuning": { + "enabled": true, + "overwrite": false, + "arg_mappings": { + "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", + "gradient_accumulation_steps ": "--gradient_accumulation_steps" + } + } +} diff --git a/autotuning/hf/bert-large/test_tune.sh b/autotuning/hf/bert-large/test_tune.sh index b8254f1a0..e63f917b8 100755 --- a/autotuning/hf/bert-large/test_tune.sh +++ b/autotuning/hf/bert-large/test_tune.sh @@ -17,7 +17,6 @@ then --model_name_or_path $MODEL_NAME \ --task_name $TASK_NAME \ --do_train \ - --do_eval \ --max_seq_length 128 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ --learning_rate 2e-5 \ @@ -33,7 +32,6 @@ then --model_name_or_path $MODEL_NAME \ --task_name $TASK_NAME \ --do_train \ - --do_eval \ --max_seq_length 128 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ --learning_rate 2e-5 \ @@ -48,7 +46,6 @@ then --model_name_or_path $MODEL_NAME \ --task_name $TASK_NAME \ --do_train \ - --do_eval \ --max_seq_length 128 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ --learning_rate 2e-5 \ @@ -63,7 +60,6 @@ then --model_name_or_path $MODEL_NAME \ --task_name $TASK_NAME \ --do_train \ - --do_eval \ --max_seq_length 128 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ --learning_rate 2e-5 \ @@ -78,7 +74,6 @@ then --model_name_or_path $MODEL_NAME \ --task_name $TASK_NAME \ --do_train \ - --do_eval \ --max_seq_length 128 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ --learning_rate 2e-5 \ @@ -89,11 +84,10 @@ then --max_steps $MAX_STEPS elif [ ${TEST} == "tune" ] then - deepspeed --autotuning run --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_tune.json \ + deepspeed --autotuning run --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ./ds_config_tune.json \ --model_name_or_path $MODEL_NAME \ --task_name $TASK_NAME \ --do_train \ - --do_eval \ --max_seq_length 128 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ 
--learning_rate 2e-5 \ @@ -108,7 +102,6 @@ then --model_name_or_path $MODEL_NAME \ --task_name $TASK_NAME \ --do_train \ - --do_eval \ --max_seq_length 128 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ --learning_rate 2e-5 \ diff --git a/autotuning/hf/deberta/test_tune.sh b/autotuning/hf/deberta/test_tune.sh index 1b283ca57..d4de499ee 100755 --- a/autotuning/hf/deberta/test_tune.sh +++ b/autotuning/hf/deberta/test_tune.sh @@ -15,7 +15,7 @@ then python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py \ --model_name_or_path $MODEL_NAME \ --task_name $TASK_NAME \ - --do_train --do_eval \ + --do_train \ --fp16 \ --max_seq_length 256 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ @@ -31,7 +31,7 @@ then deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_fp16_z0.json\ --model_name_or_path $MODEL_NAME \ --task_name $TASK_NAME \ - --do_train --do_eval \ + --do_train \ --fp16 \ --max_seq_length 256 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ @@ -48,7 +48,7 @@ then deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_fp16_z1.json\ --model_name_or_path $MODEL_NAME \ --task_name $TASK_NAME \ - --do_train --do_eval \ + --do_train \ --fp16 \ --max_seq_length 256 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ @@ -64,7 +64,7 @@ then deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_fp16_z2.json\ --model_name_or_path $MODEL_NAME \ --task_name $TASK_NAME \ - --do_train --do_eval \ + --do_train \ --fp16 \ --max_seq_length 256 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ @@ -80,7 +80,7 @@ then deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_fp16_z3.json\ --model_name_or_path $MODEL_NAME \ --task_name $TASK_NAME \ - --do_train --do_eval \ + --do_train \ --fp16 \ --max_seq_length 256 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ @@ -96,7 +96,7 @@ then deepspeed --autotuning run --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ./ds_config_fp16_tune.json\ --model_name_or_path $MODEL_NAME \ --task_name $TASK_NAME \ - --do_train --do_eval \ + --do_train \ --fp16 \ --max_seq_length 256 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ @@ -112,7 +112,7 @@ then python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py \ --model_name_or_path $MODEL_NAME \ --task_name $TASK_NAME \ - --do_train --do_eval \ + --do_train \ --fp16 \ --max_seq_length 256 \ --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ diff --git a/autotuning/hf/distilbert/README.md b/autotuning/hf/distilbert/README.md index 088cb4877..dce99207c 100644 --- a/autotuning/hf/distilbert/README.md +++ b/autotuning/hf/distilbert/README.md @@ -9,7 +9,7 @@ This model has the following configuration: ## Environment -The training uses 1 node with 16 Nvidia V100 GPUs, fp32, max_train_batch_size = 4096. The autotuning uses the same hardware resource as the training. 
+The training uses 1 node with 16 Nvidia V100 GPUs, fp32, max_train_batch_size = 4096. The autotuning uses the same hardware resource as the training. `"max_train_batch_size"` is set to `4096`. The HF packages below are used. HF examples require installing the `transformers` package from source: @@ -37,6 +37,8 @@ Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), gradient accumulati | ---------- | ----------------------------- | ------------------------------------ | ------------------------------ | | DistilBERT | 5161.902 (gas = 1, mbs = 256) | 5305.067 (z = 0, gas = 1 mbs = 256), | 5305.067 (z0_gas1_tmbspg256) | +3700.296 + ## Detailed `HF + DS autotuning` Result Summary Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed forward, backward, and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training. @@ -54,3 +56,14 @@ Note that the performance metric used in autotuning is calculated using the timi | global | 11 | 5759.96 | z0_gas1_tmbspg256 | Tuning completed in 0:10:45.085016. Total number of experiments: 11. + + +| tuning_space | num_experiments | best_metric_val | best_exp_name | +| :----------- | --------------: | --------------: | :----------------- | +| z0 | 7 | 5759.98 | z0_gas22_tmbspg179 | +| z1 | 2 | 5543.49 | z1_gas1_tmbspg269 | +| z2 | 2 | 5044.88 | z2_gas15_tmbspg269 | +| z3 | 2 | 4627.63 | z3_gas1_tmbspg269 | +| global | 13 | 5759.98 | z0_gas22_tmbspg179 | + +Tuning completed in 0:25:44.502148. Total number of experiments: 13. diff --git a/autotuning/hf/distilbert/test_tune.sh b/autotuning/hf/distilbert/test_tune.sh index fc1a0c978..08b92d56e 100755 --- a/autotuning/hf/distilbert/test_tune.sh +++ b/autotuning/hf/distilbert/test_tune.sh @@ -1,7 +1,7 @@ TASK_NAME=mnli MODEL_NAME=distilbert-base-uncased HF_PATH=~/projects -PER_DEVICE_TRAIN_BATCH_SIZE=256 +PER_DEVICE_TRAIN_BATCH_SIZE=64 MAX_TRAIN_BATCH_SIZE=4096 NEPOCHS=1 NGPUS=16 @@ -88,7 +88,7 @@ then --save_strategy "no" elif [ ${TEST} == "tune" ] then - deepspeed --autotuning tune --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ./ds_config_tune.json \ + deepspeed --autotuning run --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ./ds_config_tune.json \ --model_name_or_path $MODEL_NAME \ --task_name $TASK_NAME \ --do_train \ diff --git a/autotuning/hf/gpt2-xl/test_tune.sh b/autotuning/hf/gpt2-xl/test_tune.sh index d5ae1678c..3c144635e 100755 --- a/autotuning/hf/gpt2-xl/test_tune.sh +++ b/autotuning/hf/gpt2-xl/test_tune.sh @@ -4,10 +4,9 @@ HF_PATH=~/projects NEPOCHS=1 NGPUS=16 NNODES=1 -MAX_STEPS=200 +MAX_STEPS=50 OUTPUT_DIR=./output_b${PER_DEVICE_TRAIN_BATCH_SIZE}_g${NGPUS}_$MAX_STEPS -TEST=$1 TEST=$1 if [ ${TEST} == "0" ] diff --git a/autotuning/hf/gpt2/README.md b/autotuning/hf/gpt2/README.md index bcd59f4c8..bb426910c 100644 --- a/autotuning/hf/gpt2/README.md +++ b/autotuning/hf/gpt2/README.md @@ -11,7 +11,7 @@ Refer to [GPT-2/GPT and causal language modeling](https://github.com/huggingface ## Environment -The training use fp16 and runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. `max_train_batch_size` is set to 512. +The training use fp16 and runs on 1 node with 16 Nvidia V100 GPUs. 
The autotuning uses the same hardware resource as the training. `max_train_batch_size` is not defined. The HF packages below are used. HF examples require installing the `transformers` package from source: @@ -37,7 +37,7 @@ Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), gradient accumulati | Model name | baseline (vanila HF) | HF + DS hand-tuned | HF + DS autotuning (fast-mode) | | ---------- | -------------------- | ------------------------ | ------------------------------ | -| GPT2 | 284.142 (mbs = 8) | 397.827 (z = 1, mbs = 8) | 431.586 (z1_gas1_tmbspg14) | +| GPT2 | 284.142 (mbs = 8) | 397.827 (z = 1, mbs = 8) | 431.586 (z1_gas1_tmbspg15) | ## Detailed `HF + DS autotuning` Result Summary diff --git a/autotuning/hf/gpt2/test_tune.sh b/autotuning/hf/gpt2/test_tune.sh index c9338cdcd..b570c455c 100755 --- a/autotuning/hf/gpt2/test_tune.sh +++ b/autotuning/hf/gpt2/test_tune.sh @@ -1,7 +1,6 @@ MODEL_NAME=gpt2 PER_DEVICE_TRAIN_BATCH_SIZE=1 HF_PATH=~/projects -MAX_TRAIN_BATCH_SIZE=512 NEPOCHS=1 NGPUS=16 NNODES=1
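As a reading aid for the experiment names used throughout these results: `z<stage>_gas<g>_tmbspg<m>` encodes the ZeRO stage, the gradient accumulation steps, and the train micro-batch size per GPU, so the effective global train batch size is `m * g * number_of_GPUs`. For example, on the 16-GPU setup used here:

```bash
# z1_gas1_tmbspg15 -> ZeRO stage 1, gas = 1, micro-batch 15 per GPU, 16 GPUs
awk 'BEGIN { printf "global train batch size = %d\n", 15 * 1 * 16 }'   # prints 240
```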