diff --git a/MoQ/huggingface-transformers/tests/fixtures/tests_samples/GermEval/dev.txt b/MoQ/huggingface-transformers/tests/fixtures/tests_samples/GermEval/dev.txt index de0015823..1aba64f7a 100644 --- a/MoQ/huggingface-transformers/tests/fixtures/tests_samples/GermEval/dev.txt +++ b/MoQ/huggingface-transformers/tests/fixtures/tests_samples/GermEval/dev.txt @@ -10,7 +10,7 @@ homo I-OTH " O in O enger O -Auseinandersetzung O +Ause inandersetzung O mit O diesem O Bild O diff --git a/autotuning/.gitignore b/autotuning/.gitignore new file mode 100644 index 000000000..82319e4a0 --- /dev/null +++ b/autotuning/.gitignore @@ -0,0 +1,4 @@ +autotuning_results* +autotuning_exps* +output* +mnli diff --git a/autotuning/README.md b/autotuning/README.md new file mode 100644 index 000000000..d028a945e --- /dev/null +++ b/autotuning/README.md @@ -0,0 +1,3 @@ +# Autotuning Examples + +This showcases the [autotuning](https://github.com/microsoft/DeepSpeed/tree/master/deepspeed/autotuning) feature in DeepSpeed (DS). diff --git a/autotuning/hf/README.md b/autotuning/hf/README.md new file mode 100644 index 000000000..567deda04 --- /dev/null +++ b/autotuning/hf/README.md @@ -0,0 +1,62 @@ +# Autotuning Hugging Face Examples + +This showcases the [autotuning](https://github.com/microsoft/DeepSpeed/tree/master/deepspeed/autotuning) feature in DeepSpeed (DS) with Hugging Face (HF) models. + +## List of Models + +- [DistilBERT](distilbert) +- [BERT-base](bert-base) +- [BERT-large](bert-large) +- [GPT2](gpt2) +- [GPT2-medium](gpt2-medium) +- [GPT2-large](gpt2-large) +- [GPT2-xl](gpt2-xl) +- [DeBERTa](deberta) + +Each model folder has a `test_tune.sh` script: + +- `./test_tune.sh tune` tunes the model training and then runs it using the selected tuned DeepSpeed configuration. +- `./test_tune.sh 0` runs the model using HF without DeepSpeed. +- `./test_tune.sh z0` runs the model using HF + DS with ZeRO optimization disabled. +- `./test_tune.sh z1` runs the model using HF + DS with ZeRO optimization stage 1. +- `./test_tune.sh z2` runs the model using HF + DS with ZeRO optimization stage 2. +- `./test_tune.sh z3` runs the model using HF + DS with ZeRO optimization stage 3. + + +## Testing Environment + +The training runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. +The HF packages below are used. + +HF examples require installing the `transformers` package from source: +```bash + git clone https://github.com/huggingface/transformers.git + cd transformers + pip install . +``` +The `datasets` package can be installed by `pip install datasets` + +Below are the versions used in this test. + +- transformers (4.12.0) +- datasets (1.11.0) + +## Throughput Comparison + +The table below shows the throughput (samples per second) comparison. The corresponding train micro-batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would use in the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. + - `baseline` is the vanila HF without DeepSpeed (DS) and mbs is hand-tuned. + - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values. + - `HF + DS autotuning` is HF with DS, and the DS configuration is selected from autotuning. + +Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), gradient accumulation steps (gas), train micro-batch size per GPU (mbs or tmbspg). 
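+
+For reference, the experiment names in the table below (for example `z1_gas1_tmbspg14`) encode the ZeRO stage, gradient accumulation steps, and train micro-batch size per GPU that the autotuner selected. The effective train batch size follows the usual DeepSpeed relation `train_batch_size = mbs * gas * num_gpus`; a quick sanity check for the GPT2 result on the 16-GPU setup used here (a sketch, not part of the scripts in this repo):
+
+```bash
+# z1_gas1_tmbspg14 -> ZeRO stage 1, gas = 1, mbs (tmbspg) = 14
+mbs=14; gas=1; ngpus=16
+echo "effective train_batch_size = $((mbs * gas * ngpus))"   # 224
+```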
+
+| Model name   | num_params | baseline (vanilla HF)         | HF + DS hand-tuned                   | HF + DS autotuning (fast-mode) | throughput improvement over baseline | autotuning time (mins) | number of experiments |
+| :----------: | :--------: | :---------------------------: | :----------------------------------: | :----------------------------: | :----------------------------------: | :--------------------: | :-------------------: |
+| DistilBERT   | 66M        | 5161.902 (gas = 1, mbs = 256) | 5305.067 (z = 0, gas = 1, mbs = 256) | 5305.067 (z0_gas1_tmbspg256)   | 1.03x                                | 11                     | 11                    |
+| BERT-base    | 0.11B      | 2502.236 (gas = 1, mbs = 128) | 2523.684 (z = 0, gas = 1, mbs = 128) | 2736.561 (z0_gas1_tmbspg235)   | 1.09x                                | 35                     | 34                    |
+| BERT-large   | 0.34B      | 742.692 (gas = 1, mbs = 64)   | 766.929 (z = 1, gas = 1, mbs = 64)   | 808.168 (z1_gas1_tmbspg93)     | 1.09x                                | 36                     | 22                    |
+| GPT2         | 0.12B      | 284.142 (gas = 1, mbs = 8)    | 397.827 (z = 1, gas = 1, mbs = 8)    | 431.586 (z1_gas1_tmbspg14)     | 1.52x                                | 25                     | 17                    |
+| GPT2-medium  | 0.35B      | 71.61 (gas = 1, mbs = 2)      | 142.211 (z = 1, gas = 1, mbs = 4)    | 163.3 (z1_gas1_tmbspg6)        | 2.28x                                | 25                     | 15                    |
+| GPT2-large   | 0.77B      | 27.874 (gas = 1, mbs = 1)     | 56.797 (z = 1, gas = 1, mbs = 2)     | 69.061 (z = 1, mbs = 3)        | 2.48x                                | 27                     | 13                    |
+| GPT2-xl      | 1.5B       | Not runnable                  | 27.462 (gas = 1, mbs = 1)            | 27.497 (z1_gas1_tmbspg1)       | inf                                  | 21                     | 9                     |
+| DeBERTa      | 1.5B       | Not runnable                  | 140.587 (z = 1, gas = 1, mbs = 8)    | 162.395 (z1_gas1_tmbspg11)     | inf                                  | 40                     | 12                    |
diff --git a/autotuning/hf/bert-base/README.md b/autotuning/hf/bert-base/README.md
new file mode 100644
index 000000000..02450fdd3
--- /dev/null
+++ b/autotuning/hf/bert-base/README.md
@@ -0,0 +1,58 @@
+# [bert-base-cased](https://huggingface.co/bert-base-cased)
+
+This model has the following configuration:
+
+- 12-layer
+- 768 hidden dimension
+- 12 attention heads
+- 110M parameters.
+
+## Environment
+
+The training uses fp32 and runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. `max_train_batch_size` is set to `4096`.
+The HF packages below are used.
+
+HF examples require installing the `transformers` package from source:
+```bash
+ git clone https://github.com/huggingface/transformers.git
+ cd transformers
+ pip install .
+```
+The `datasets` package can be installed by `pip install datasets`.
+
+Below are the versions used in this test.
+
+- transformers (4.12.0)
+- datasets (1.11.0)
+
+## Throughput Comparison
+
+The table below shows the throughput (samples per second) comparison. The corresponding train micro-batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve each throughput value are also shown in parentheses. Assume the hand-tuning strategy is to start from `mbs = 1` and double `mbs` each time until running out of GPU memory.
+ - `baseline` is the vanilla HF without DeepSpeed (DS) and mbs is hand-tuned.
+ - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while the rest of the DS configuration uses default values.
+ - `HF + DS autotuning` is HF with DS, and the DS configuration is selected by autotuning.
+
+Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), gradient accumulation steps (gas), train micro-batch size per GPU (mbs or tmbspg).
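+
+The numbers in the table below can be reproduced with the `test_tune.sh` script in this folder (a sketch, assuming it is run from `autotuning/hf/bert-base` and that `HF_PATH` inside the script points at your `transformers` checkout):
+
+```bash
+./test_tune.sh tune   # autotune, then train with the selected DeepSpeed config
+./test_tune.sh 0      # HF baseline without DeepSpeed
+./test_tune.sh z1     # HF + DS with ZeRO stage 1 and default settings
+```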
+ +| Model name | baseline (vanila HF) | HF + DS handtuned | HF + DS autotuning | +| ---------- | ----------------------------- | ------------------------------------ | ---------------------------- | +| BERT-base | 2502.236 (gas = 1, mbs = 128) | 2523.684 (z = 0, gas = 1, mbs = 128) | 2736.561 (z0_gas1_tmbspg235) | + +## Detailed `HF + DS autotuning` Result Summary + +Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed forward, backward, and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training. + +- Fast-mode Autotuning time: 35 mins +- Number of experiments: 34 +- Throughput Improvement over baseline: 1.09x + + +| tuning_space | num_experiments | best_metric_val | best_exp_name | +| :----------- | --------------: | --------------: | :---------------- | +| z0 | 9 | 2930.18 | z0_gas1_tmbspg235 | +| z1 | 7 | 2930.17 | z1_gas1_tmbspg235 | +| z2 | 8 | 2744.16 | z2_gas1_tmbspg235 | +| z3 | 10 | 2479.47 | z3_gas1_tmbspg238 | +| global | 34 | 2930.18 | z0_gas1_tmbspg235 | + +Tuning completed in 0:34:41.842250. Total number of experiments: 34. diff --git a/autotuning/hf/bert-base/ds_config_tune.json b/autotuning/hf/bert-base/ds_config_tune.json new file mode 100644 index 000000000..23a48ddf9 --- /dev/null +++ b/autotuning/hf/bert-base/ds_config_tune.json @@ -0,0 +1,12 @@ +{ + "train_micro_batch_size_per_gpu": "auto", + "autotuning": { + "enabled": true, + "overwrite": false, + "max_train_batch_size": 4096, + "arg_mappings": { + "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", + "gradient_accumulation_steps ": "--gradient_accumulation_steps" + } + } +} diff --git a/autotuning/hf/bert-base/test_tune.sh b/autotuning/hf/bert-base/test_tune.sh new file mode 100755 index 000000000..532efc902 --- /dev/null +++ b/autotuning/hf/bert-base/test_tune.sh @@ -0,0 +1,114 @@ +TASK_NAME=mnli +MODEL_NAME=bert-base-cased +HF_PATH=~/projects +PER_DEVICE_TRAIN_BATCH_SIZE=64 +MAX_TRAIN_BATCH_SIZE=4096 +NEPOCHS=1 +NGPUS=16 +NNODES=1 +MAX_STEPS=200 +OUTPUT_DIR=./${TASK_NAME}/output_b${PER_DEVICE_TRAIN_BATCH_SIZE}_g${NGPUS}_$MAX_STEPS + +TEST=$1 + +if [ ${TEST} == "0" ] +then + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_0 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z0" ] +then + deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z0.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z0 \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "z1" ] +then + deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z1.json \ + 
--model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z1 \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "z2" ] +then + deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z2.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z2 \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "z3" ] +then + deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z3.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z3 \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "tune" ] +then + deepspeed --autotuning run --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ./ds_config_tune.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_tune \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "fs" ] +then + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_fs \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --sharded_ddp zero_dp_2 +fi diff --git a/autotuning/hf/bert-large/README.md b/autotuning/hf/bert-large/README.md new file mode 100644 index 000000000..157dba0c1 --- /dev/null +++ b/autotuning/hf/bert-large/README.md @@ -0,0 +1,55 @@ +# [bert-large-uncased](https://huggingface.co/bert-large-uncased) + +This model has the following configuration: + +- 24-layer +- 1024 hidden dimension +- 16 attention heads +- 336M parameters + +The training use fp32 and runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. `max_train_batch_size` is not defined. +The HF packages below are used. + +HF examples require installing the `transformers` package from source: +```bash + git clone https://github.com/huggingface/transformers.git + cd transformers + pip install . +``` +The `datasets` package can be installed by `pip install datasets` + +Below are the versions used in this test. 
+ +- transformers (4.12.0) +- datasets (1.11.0) + +## Throughput Comparison + +The table below shows the throughput (samples per second) comparison. The corresponding train micro-batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would use in the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. + - `baseline` is the vanila HF without DeepSpeed (DS) and mbs is hand-tuned. + - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values. + - `HF + DS autotuning` is HF with DS, and the DS configuration is selected from autotuning. + +Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), gradient accumulation steps (gas), train micro-batch size per GPU (mbs or tmbspg). + +| Model name | baseline (vanila HF) | HF + DS handtuned | HF + DS autotuning | +| ---------- | --------------------------- | --------------------------------- | -------------------------- | +| BERT-large | 742.692 (gas = 1, mbs = 64) | 766.929 (z = 1, gas =1, mbs = 64) | 808.168 (z1_gas1_tmbspg93) | + +## Detailed `HF + DS autotuning` Result Summary + +Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed forward, backward, and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training. + +- Fast-mode Autotuning time: 36 mins +- Number of experiments: 22 +- Throughput Improvement over baseline: 1.09x + +| tuning_space | num_experiments | best_metric_val | best_exp_name | +| :----------- | --------------: | --------------: | :--------------- | +| z0 | 6 | 835.244 | z0_gas1_tmbspg93 | +| z1 | 6 | 842.243 | z1_gas1_tmbspg93 | +| z2 | 9 | 764.524 | z2_gas1_tmbspg94 | +| z3 | 1 | 0 | z3_gas1_tmbspg94 | +| global | 22 | 842.243 | z1_gas1_tmbspg93 | + +Tuning completed in 0:36:16.261417. Total number of experiments: 23. 
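+
+The `HF + DS hand-tuned` column above follows the hand-tuning strategy described in the Throughput Comparison section. A rough sketch of such a sweep, reusing the same `run_glue.py` arguments as the `z1` branch of `test_tune.sh` in this folder (a hypothetical loop, not part of the script):
+
+```bash
+# Double the per-GPU micro-batch size until a run fails (typically with OOM).
+for mbs in 1 2 4 8 16 32 64 128; do
+  deepspeed --num_nodes=1 --num_gpus=16 \
+    $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py \
+    --deepspeed ../dsconfigs/ds_config_z1.json \
+    --model_name_or_path bert-large-uncased --task_name mnli --do_train \
+    --max_seq_length 128 --per_device_train_batch_size $mbs \
+    --learning_rate 2e-5 --num_train_epochs 1 --max_steps 200 \
+    --output_dir ./mnli/sweep_b${mbs} --overwrite_output_dir --save_steps 0 \
+    || break
+done
+```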
diff --git a/autotuning/hf/bert-large/ds_config_tune.json b/autotuning/hf/bert-large/ds_config_tune.json new file mode 100644 index 000000000..e79f9c450 --- /dev/null +++ b/autotuning/hf/bert-large/ds_config_tune.json @@ -0,0 +1,11 @@ +{ + "train_micro_batch_size_per_gpu": "auto", + "autotuning": { + "enabled": true, + "overwrite": false, + "arg_mappings": { + "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", + "gradient_accumulation_steps ": "--gradient_accumulation_steps" + } + } +} diff --git a/autotuning/hf/bert-large/test_tune.sh b/autotuning/hf/bert-large/test_tune.sh new file mode 100755 index 000000000..e63f917b8 --- /dev/null +++ b/autotuning/hf/bert-large/test_tune.sh @@ -0,0 +1,114 @@ +TASK_NAME=mnli +MODEL_NAME=bert-large-uncased +HF_PATH=~/projects +PER_DEVICE_TRAIN_BATCH_SIZE=64 +MAX_TRAIN_BATCH_SIZE=4096 +NEPOCHS=1 +NGPUS=16 +NNODES=1 +MAX_STEPS=200 +OUTPUT_DIR=./${TASK_NAME}/output_b${PER_DEVICE_TRAIN_BATCH_SIZE}_g${NGPUS}_$MAX_STEPS + +TEST=$1 + +if [ ${TEST} == "0" ] +then + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_0 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z0" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z0.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z0 \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "z1" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z1.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z1 \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "z2" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z2.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z2 \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "z3" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z3.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --max_seq_length 128 \ + --per_device_train_batch_size 
$PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_z3 \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "tune" ] +then + deepspeed --autotuning run --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ./ds_config_tune.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_tune \ + --save_steps 0 \ + --overwrite_output_dir \ + --max_steps $MAX_STEPS +elif [ ${TEST} == "fs" ] +then + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_b${PER_DEVICE_TRAIN_BATCH_SIZE}_fs \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --sharded_ddp zero_dp_2 +fi diff --git a/autotuning/hf/deberta/README.md b/autotuning/hf/deberta/README.md new file mode 100644 index 000000000..9144376cd --- /dev/null +++ b/autotuning/hf/deberta/README.md @@ -0,0 +1,72 @@ +# [deberta-v2-xxlarge-mnli](https://huggingface.co/microsoft/deberta-v2-xxlarge) + +This model has the following configuration: + +- 48-layer +- 1536 hidden dimension +- 1.5B parameters. + +Refer to [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://github.com/microsoft/DeBERTa). +## Environment + +The training use fp16 and runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. `max_train_batch_size` is not defined. +The HF packages below are used. + +HF examples require installing the `transformers` package from source: +```bash + git clone https://github.com/huggingface/transformers.git + cd transformers + pip install . +``` +The `datasets` package can be installed by `pip install datasets` + +Below are the versions used in this test. + +- transformers (4.12.0) +- datasets (1.11.0) +## Throughput Comparison + +The table below shows the throughput (samples per second) comparison. The corresponding train micro-batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would use in the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. + - `baseline` is the vanila HF without DeepSpeed (DS) and mbs is hand-tuned. + - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values. + - `HF + DS autotuning` is HF with DS, and the DS configuration is selected from autotuning. + +Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), gradient accumulation steps (gas), train micro-batch size per GPU (mbs or tmbspg), reduce_bucket_size (rbs), allgather_bucket_size (abs). 
+ +| Model name | baseline (vanila HF) | HF + DS hand-tuned | HF + DS autotuning (fast-mode) | +| ---------- | -------------------- | --------------------------------- | ------------------------------ | +| DeBERTa | Not runnable | 140.587 (z = 1, gas = 1 mbs = 8), | 162.395 (z1_gas1_tmbspg11) | + +## Detailed `HF + DS autotuning` Result Summary + +Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed forward, backward, and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training. +### Fast-mode +- Autotuning time: 40 mins +- Number of experiments: 12 +- Throughput Improvement over baseline: Inf + +| tuning_space | num_experiments | best_metric_val | best_exp_name | +| :----------- | --------------: | --------------: | :--------------- | +| z0 | 1 | 0 | z0_gas1_tmbspg1 | +| z1 | 6 | 177.843 | z1_gas1_tmbspg11 | +| z2 | 4 | 154.002 | z2_gas1_tmbspg14 | +| z3 | 1 | 0 | z3_gas1_tmbspg14 | +| global | 12 | 177.843 | z1_gas1_tmbspg11 | + +Tuning completed in 0:39:25.253998. Total number of experiments: 12. + +### Full-mode ("fast" set to false) +- Autotuning time: 1 hr 2 mins +- Number of experiments: 24 +- Throughput Improvement over baseline: Inf + +| tuning_space | num_experiments | best_metric_val | best_exp_name | +| :---------------- | --------------: | --------------: | :------------------------------------- | +| z0 | 1 | 0 | z0_gas1_tmbspg1 | +| z1 | 6 | 177.843 | z1_gas1_tmbspg11 | +| z1_rbs_abs_tmbspg | 12 | 193.577 | z1_rbs5.0e+07_abs1.0e+09_gas1_tmbspg11 | +| z2 | 4 | 154.002 | z2_gas1_tmbspg14 | +| z3 | 1 | 0 | z3_gas1_tmbspg14 | +| global | 24 | 193.577 | z1_rbs5.0e+07_abs1.0e+09_gas1_tmbspg11 | + +Tuning completed in 1:02:32.759424. Total number of experiments: 24. 
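+
+In the full-mode result above, the best experiment name `z1_rbs5.0e+07_abs1.0e+09_gas1_tmbspg11` also encodes the tuned ZeRO communication bucket sizes (rbs and abs). A minimal sketch of a DeepSpeed config that carries those values forward for regular training (assuming the same fp16 settings as `ds_config_fp16_tune.json` in this folder):
+
+```json
+{
+  "train_micro_batch_size_per_gpu": 11,
+  "fp16": { "enabled": true, "initial_scale_power": 12 },
+  "zero_optimization": {
+    "stage": 1,
+    "reduce_bucket_size": 5e7,
+    "allgather_bucket_size": 1e9
+  }
+}
+```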
diff --git a/autotuning/hf/deberta/ds_config_fp16_tune.json b/autotuning/hf/deberta/ds_config_fp16_tune.json new file mode 100644 index 000000000..b405929bb --- /dev/null +++ b/autotuning/hf/deberta/ds_config_fp16_tune.json @@ -0,0 +1,16 @@ +{ + "train_micro_batch_size_per_gpu": "auto", + "fp16": { + "enabled": true, + "initial_scale_power": 12 + }, + "autotuning": { + "enabled": true, + "overwrite": false, + "fast": true, + "arg_mappings": { + "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", + "gradient_accumulation_steps ": "--gradient_accumulation_steps" + } + } +} \ No newline at end of file diff --git a/autotuning/hf/deberta/test_tune.sh b/autotuning/hf/deberta/test_tune.sh new file mode 100755 index 000000000..d4de499ee --- /dev/null +++ b/autotuning/hf/deberta/test_tune.sh @@ -0,0 +1,127 @@ +MODEL_NAME=microsoft/deberta-v2-xxlarge +TASK_NAME=mnli +PER_DEVICE_TRAIN_BATCH_SIZE=1 +HF_PATH=~/projects +NEPOCHS=1 +NGPUS=16 +NNODES=1 +MAX_STEPS=200 +OUTPUT_DIR=./output_b${PER_DEVICE_TRAIN_BATCH_SIZE}_g${NGPUS}_$MAX_STEPS + +TEST=$1 + +if [ ${TEST} == "0" ] +then + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --fp16 \ + --max_seq_length 256 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 3e-6 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_0 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z0" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_fp16_z0.json\ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --fp16 \ + --max_seq_length 256 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 3e-6 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z0 \ + --overwrite_output_dir \ + --save_steps 0 \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z1" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_fp16_z1.json\ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --fp16 \ + --max_seq_length 256 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 3e-6 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z1 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z2" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_fp16_z2.json\ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --fp16 \ + --max_seq_length 256 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 3e-6 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z2 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z3" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_fp16_z3.json\ + --model_name_or_path $MODEL_NAME \ 
+ --task_name $TASK_NAME \ + --do_train \ + --fp16 \ + --max_seq_length 256 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 3e-6 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z3 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "tune" ] +then + deepspeed --autotuning run --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ./ds_config_fp16_tune.json\ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --fp16 \ + --max_seq_length 256 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 3e-6 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_tune \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "fs" ] +then + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --fp16 \ + --max_seq_length 256 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 3e-6 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_fs \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" + --sharded_ddp zero_dp_2 +fi diff --git a/autotuning/hf/distilbert/README.md b/autotuning/hf/distilbert/README.md new file mode 100644 index 000000000..dce99207c --- /dev/null +++ b/autotuning/hf/distilbert/README.md @@ -0,0 +1,69 @@ +# [distilbert-base-uncased](https://huggingface.co/distilbert-base-uncased) + +This model has the following configuration: + +- 12-layer +- 768 hidden dimension +- 12 attention heads +- 66M parameters. + +## Environment + +The training uses 1 node with 16 Nvidia V100 GPUs, fp32, max_train_batch_size = 4096. The autotuning uses the same hardware resource as the training. `"max_train_batch_size"` is set to `4096`. +The HF packages below are used. + +HF examples require installing the `transformers` package from source: +```bash + git clone https://github.com/huggingface/transformers.git + cd transformers + pip install . +``` +The `datasets` package can be installed by `pip install datasets` + +Below are the versions used in this test. + +- transformers (4.12.0) +- datasets (1.11.0) +## Throughput Comparison + +The table below shows the throughput (samples per second) comparison. The corresponding train micro-batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would use in the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. + - `baseline` is the vanila HF without DeepSpeed (DS) and mbs is hand-tuned. + - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values. + - `HF + DS autotuning` is HF with DS, and the DS configuration is selected from autotuning. + +Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), gradient accumulation steps (gas), train micro-batch size per GPU (mbs or tmbspg). 
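+
+With `max_train_batch_size` capped at `4096`, 16 GPUs, and `gas = 1`, the largest micro-batch size the autotuner will consider is 4096 / (16 * 1) = 256, which matches the `z0_gas1_tmbspg256` entry in the table below (a rough check, assuming the cap applies to the effective train batch size):
+
+```bash
+max_tbs=4096; ngpus=16; gas=1
+echo "largest mbs under the cap: $((max_tbs / (ngpus * gas)))"   # 256
+```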
+
+| Model name | baseline (vanilla HF)         | HF + DS hand-tuned                   | HF + DS autotuning (fast-mode) |
+| ---------- | ----------------------------- | ------------------------------------ | ------------------------------ |
+| DistilBERT | 5161.902 (gas = 1, mbs = 256) | 5305.067 (z = 0, gas = 1, mbs = 256) | 5305.067 (z0_gas1_tmbspg256)   |
+
+## Detailed `HF + DS autotuning` Result Summary
+
+Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed forward, backward, and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training.
+
+- Fast-mode Autotuning time: 11 mins
+- Number of experiments: 11
+- Throughput Improvement over baseline: 1.03x
+
+| tuning_space | num_experiments | best_metric_val | best_exp_name     |
+| :----------- | --------------: | --------------: | :---------------- |
+| z0           | 5               | 5759.96         | z0_gas1_tmbspg256 |
+| z1           | 2               | 5667.06         | z1_gas1_tmbspg256 |
+| z2           | 2               | 5366.97         | z2_gas1_tmbspg256 |
+| z3           | 2               | 4892.49         | z3_gas1_tmbspg256 |
+| global       | 11              | 5759.96         | z0_gas1_tmbspg256 |
+
+Tuning completed in 0:10:45.085016. Total number of experiments: 11.
+
+
+| tuning_space | num_experiments | best_metric_val | best_exp_name      |
+| :----------- | --------------: | --------------: | :----------------- |
+| z0           | 7               | 5759.98         | z0_gas22_tmbspg179 |
+| z1           | 2               | 5543.49         | z1_gas1_tmbspg269  |
+| z2           | 2               | 5044.88         | z2_gas15_tmbspg269 |
+| z3           | 2               | 4627.63         | z3_gas1_tmbspg269  |
+| global       | 13              | 5759.98         | z0_gas22_tmbspg179 |
+
+Tuning completed in 0:25:44.502148. Total number of experiments: 13.
diff --git a/autotuning/hf/distilbert/ds_config_tune.json b/autotuning/hf/distilbert/ds_config_tune.json
new file mode 100644
index 000000000..23a48ddf9
--- /dev/null
+++ b/autotuning/hf/distilbert/ds_config_tune.json
@@ -0,0 +1,12 @@
+{
+    "train_micro_batch_size_per_gpu": "auto",
+    "autotuning": {
+        "enabled": true,
+        "overwrite": false,
+        "max_train_batch_size": 4096,
+        "arg_mappings": {
+            "train_micro_batch_size_per_gpu": "--per_device_train_batch_size",
+            "gradient_accumulation_steps ": "--gradient_accumulation_steps"
+        }
+    }
+}
diff --git a/autotuning/hf/distilbert/test_tune.sh b/autotuning/hf/distilbert/test_tune.sh
new file mode 100755
index 000000000..08b92d56e
--- /dev/null
+++ b/autotuning/hf/distilbert/test_tune.sh
@@ -0,0 +1,119 @@
+TASK_NAME=mnli
+MODEL_NAME=distilbert-base-uncased
+HF_PATH=~/projects
+PER_DEVICE_TRAIN_BATCH_SIZE=64
+MAX_TRAIN_BATCH_SIZE=4096
+NEPOCHS=1
+NGPUS=16
+NNODES=1
+MAX_STEPS=200
+OUTPUT_DIR=./${TASK_NAME}/output_b${PER_DEVICE_TRAIN_BATCH_SIZE}_g${NGPUS}_$MAX_STEPS
+
+TEST=$1
+
+if [ ${TEST} == "0" ]
+then
+    python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py \
+        --model_name_or_path $MODEL_NAME \
+        --task_name $TASK_NAME \
+        --do_train \
+        --max_seq_length 128 \
+        --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \
+        --learning_rate 2e-5 \
+        --num_train_epochs $NEPOCHS \
+        --output_dir ${OUTPUT_DIR}_0 \
+        --overwrite_output_dir \
+        --save_steps 0 \
+        --max_steps $MAX_STEPS \
+        --save_strategy "no"
+elif [ ${TEST} == "z0" ]
+then
+    deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z0.json \
+        --model_name_or_path $MODEL_NAME \
+        --task_name $TASK_NAME \
+        --do_train \
+        --max_seq_length 128
\ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z0 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z1" ] +then + deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z1.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z1 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z2" ] +then + deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z2.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z2 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z3" ] +then + deepspeed --num_nodes=$NNODES $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ../dsconfigs/ds_config_z3.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z3 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "tune" ] +then + deepspeed --autotuning run --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py --deepspeed ./ds_config_tune.json \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_tune \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "fs" ] +then + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/text-classification/run_glue.py \ + --model_name_or_path $MODEL_NAME \ + --task_name $TASK_NAME \ + --do_train \ + --max_seq_length 128 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_fs \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" + --sharded_ddp zero_dp_2 +fi diff --git a/autotuning/hf/dsconfigs/ds_config_fp16_tune.json b/autotuning/hf/dsconfigs/ds_config_fp16_tune.json new file mode 100644 index 000000000..7ae31168b --- /dev/null +++ b/autotuning/hf/dsconfigs/ds_config_fp16_tune.json @@ -0,0 +1,15 @@ +{ + "train_micro_batch_size_per_gpu": "auto", + "fp16": { + "enabled": true + }, + "autotuning": { + "enabled": true, + "overwrite": false, + "fast": true, + "arg_mappings": { + "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", + "gradient_accumulation_steps ": "--gradient_accumulation_steps" + } + } +} diff --git 
a/autotuning/hf/dsconfigs/ds_config_fp16_z0.json b/autotuning/hf/dsconfigs/ds_config_fp16_z0.json new file mode 100644 index 000000000..ff375bb3e --- /dev/null +++ b/autotuning/hf/dsconfigs/ds_config_fp16_z0.json @@ -0,0 +1,9 @@ +{ + "train_micro_batch_size_per_gpu": "auto", + "zero_optimization": { + "stage": 0 + }, + "fp16": { + "enabled": true + } +} diff --git a/autotuning/hf/dsconfigs/ds_config_fp16_z1.json b/autotuning/hf/dsconfigs/ds_config_fp16_z1.json new file mode 100644 index 000000000..209706d24 --- /dev/null +++ b/autotuning/hf/dsconfigs/ds_config_fp16_z1.json @@ -0,0 +1,9 @@ +{ + "train_micro_batch_size_per_gpu": "auto", + "zero_optimization": { + "stage": 1 + }, + "fp16": { + "enabled": true + } +} diff --git a/autotuning/hf/dsconfigs/ds_config_fp16_z2.json b/autotuning/hf/dsconfigs/ds_config_fp16_z2.json new file mode 100644 index 000000000..d3782ab14 --- /dev/null +++ b/autotuning/hf/dsconfigs/ds_config_fp16_z2.json @@ -0,0 +1,9 @@ +{ + "train_micro_batch_size_per_gpu": "auto", + "zero_optimization": { + "stage": 2 + }, + "fp16": { + "enabled": true + } +} diff --git a/autotuning/hf/dsconfigs/ds_config_fp16_z3.json b/autotuning/hf/dsconfigs/ds_config_fp16_z3.json new file mode 100644 index 000000000..d0affd293 --- /dev/null +++ b/autotuning/hf/dsconfigs/ds_config_fp16_z3.json @@ -0,0 +1,9 @@ +{ + "train_micro_batch_size_per_gpu": "auto", + "zero_optimization": { + "stage": 3 + }, + "fp16": { + "enabled": true + } +} diff --git a/autotuning/hf/dsconfigs/ds_config_tune.json b/autotuning/hf/dsconfigs/ds_config_tune.json new file mode 100644 index 000000000..413e19630 --- /dev/null +++ b/autotuning/hf/dsconfigs/ds_config_tune.json @@ -0,0 +1,12 @@ +{ + "train_micro_batch_size_per_gpu": "auto", + "autotuning": { + "enabled": true, + "overwrite": false, + "fast": true, + "arg_mappings": { + "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", + "gradient_accumulation_steps ": "--gradient_accumulation_steps" + } + } +} diff --git a/autotuning/hf/dsconfigs/ds_config_z0.json b/autotuning/hf/dsconfigs/ds_config_z0.json new file mode 100644 index 000000000..6247e56c4 --- /dev/null +++ b/autotuning/hf/dsconfigs/ds_config_z0.json @@ -0,0 +1,6 @@ +{ + "train_micro_batch_size_per_gpu": "auto", + "zero_optimization": { + "stage": 0 + } +} diff --git a/autotuning/hf/dsconfigs/ds_config_z1.json b/autotuning/hf/dsconfigs/ds_config_z1.json new file mode 100644 index 000000000..fd39970a4 --- /dev/null +++ b/autotuning/hf/dsconfigs/ds_config_z1.json @@ -0,0 +1,6 @@ +{ + "train_micro_batch_size_per_gpu": "auto", + "zero_optimization": { + "stage": 1 + } +} diff --git a/autotuning/hf/dsconfigs/ds_config_z2.json b/autotuning/hf/dsconfigs/ds_config_z2.json new file mode 100644 index 000000000..b898aee82 --- /dev/null +++ b/autotuning/hf/dsconfigs/ds_config_z2.json @@ -0,0 +1,6 @@ +{ + "train_micro_batch_size_per_gpu": "auto", + "zero_optimization": { + "stage": 2 + } +} diff --git a/autotuning/hf/dsconfigs/ds_config_z3.json b/autotuning/hf/dsconfigs/ds_config_z3.json new file mode 100644 index 000000000..5b118864e --- /dev/null +++ b/autotuning/hf/dsconfigs/ds_config_z3.json @@ -0,0 +1,6 @@ +{ + "train_micro_batch_size_per_gpu": "auto", + "zero_optimization": { + "stage": 3 + } +} diff --git a/autotuning/hf/gpt2-large/README.md b/autotuning/hf/gpt2-large/README.md new file mode 100644 index 000000000..a736db485 --- /dev/null +++ b/autotuning/hf/gpt2-large/README.md @@ -0,0 +1,59 @@ +# [gpt2-large](https://huggingface.co/gpt2-large) + +This model has the following configuration: + +- 
36-layer
+- 1280 hidden dimension
+- 20 attention heads
+- 774M parameters.
+
+Refer to [GPT-2/GPT and causal language modeling](https://github.com/huggingface/transformers/tree/master/examples/pytorch/language-modeling#gpt-2gpt-and-causal-language-modeling).
+
+## Environment
+
+The training uses fp16 and runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. `max_train_batch_size` is not defined.
+The HF packages below are used.
+
+HF examples require installing the `transformers` package from source:
+```bash
+ git clone https://github.com/huggingface/transformers.git
+ cd transformers
+ pip install .
+```
+The `datasets` package can be installed by `pip install datasets`.
+
+Below are the versions used in this test.
+
+- transformers (4.12.0)
+- datasets (1.11.0)
+
+## Throughput Comparison
+
+The table below shows the throughput (samples per second) comparison. The corresponding train micro-batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve each throughput value are also shown in parentheses. Assume the hand-tuning strategy is to start from `mbs = 1` and double `mbs` each time until running out of GPU memory.
+ - `baseline` is the vanilla HF without DeepSpeed (DS) and mbs is hand-tuned.
+ - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while the rest of the DS configuration uses default values.
+ - `HF + DS autotuning` is HF with DS, and the DS configuration is selected by autotuning.
+
+Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), gradient accumulation steps (gas), train micro-batch size per GPU (mbs or tmbspg).
+
+| Model name | baseline (vanilla HF) | HF + DS hand-tuned      | HF + DS autotuning (fast-mode) |
+| ---------- | --------------------- | ----------------------- | ------------------------------ |
+| GPT2-large | 27.874 (mbs = 1)      | 56.797 (z = 1, mbs = 2) | 69.061 (z = 1, mbs = 3)        |
+
+## Detailed `HF + DS autotuning` Result Summary
+
+Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed forward, backward, and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training.
+
+- Fast-mode Autotuning time: 27 mins
+- Number of experiments: 13
+- Throughput Improvement over baseline: 2.48x
+
+| tuning_space | num_experiments | best_metric_val | best_exp_name   |
| :----------- | --------------: | --------------: | :-------------- |
+| z0           | 4               | 59.0229         | z0_gas1_tmbspg2 |
+| z1           | 5               | 87.3017         | z1_gas1_tmbspg3 |
+| z2           | 3               | 77.8338         | z2_gas1_tmbspg3 |
+| z3           | 1               | 0               | z3_gas1_tmbspg3 |
+| global       | 13              | 87.3017         | z1_gas1_tmbspg3 |
+
+Tuning completed in 0:27:33.988447. Total number of experiments: 13.
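+
+To train with the configuration the autotuner selected (ZeRO stage 1, `mbs = 3`), the tuned micro-batch size can be plugged into the same command the `z1` branch of `test_tune.sh` below uses; `ds_config_fp16_z1.json` sets `train_micro_batch_size_per_gpu` to `"auto"`, so it picks the value up from `--per_device_train_batch_size` (a sketch, assuming the paths used by the script):
+
+```bash
+deepspeed --num_nodes=1 --num_gpus=16 \
+  $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py \
+  --deepspeed ../dsconfigs/ds_config_fp16_z1.json \
+  --model_name_or_path gpt2-large --dataset_name wikitext \
+  --dataset_config_name wikitext-2-raw-v1 --do_train --fp16 \
+  --per_device_train_batch_size 3 --learning_rate 2e-5 \
+  --num_train_epochs 1 --max_steps 200 --output_dir ./output_tuned \
+  --overwrite_output_dir --save_steps 0 --save_strategy "no"
+```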
diff --git a/autotuning/hf/gpt2-large/test_tune.sh b/autotuning/hf/gpt2-large/test_tune.sh new file mode 100755 index 000000000..c5fa9b608 --- /dev/null +++ b/autotuning/hf/gpt2-large/test_tune.sh @@ -0,0 +1,132 @@ +MODEL_NAME=gpt2-large +PER_DEVICE_TRAIN_BATCH_SIZE=1 +HF_PATH=~/projects +NEPOCHS=1 +NGPUS=16 +NNODES=1 +MAX_STEPS=200 +OUTPUT_DIR=./output_b${PER_DEVICE_TRAIN_BATCH_SIZE}_g${NGPUS}_$MAX_STEPS + +TEST=$1 + +if [ ${TEST} == "0" ] +then + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py \ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --do_eval \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_0 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z0" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_z0.json\ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --do_eval \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z0 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z1" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_z1.json\ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --do_eval \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z1 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z2" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_z2.json\ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --do_eval \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z2 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z3" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_z3.json\ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --do_eval \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z3 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "tune" ] +then + deepspeed --autotuning run --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed 
../dsconfigs/ds_config_fp16_tune.json\ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --do_eval \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_tune \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "fs" ] +then + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py \ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --do_eval \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_fs \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" + --sharded_ddp zero_dp_2 +fi diff --git a/autotuning/hf/gpt2-medium/README.md b/autotuning/hf/gpt2-medium/README.md new file mode 100644 index 000000000..e97a1f9b3 --- /dev/null +++ b/autotuning/hf/gpt2-medium/README.md @@ -0,0 +1,57 @@ +# [gpt2-medium](https://huggingface.co/gpt2-medium) + +This model has the following configuration: +- 24-layer +- 1024 hidden dimension +- 16 attention heads +- 345M parameters. + +Refer to [GPT-2/GPT and causal language modeling](https://github.com/huggingface/transformers/tree/master/examples/pytorch/language-modeling#gpt-2gpt-and-causal-language-modeling) + +## Environment + +The training use fp16 and runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. `max_train_batch_size` is not defined. +The HF packages below are used. + +HF examples require installing the `transformers` package from source: +```bash + git clone https://github.com/huggingface/transformers.git + cd transformers + pip install . +``` +The `datasets` package can be installed by `pip install datasets` + +Below are the versions used in this test. + +- transformers (4.12.0) +- datasets (1.11.0) +## Throughput Comparison + +The table below shows the throughput (samples per second) comparison. The corresponding train micro-batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve the throughput value is also shown in the parentheses. Assume the strategy users would use in the handtuning process is to start from `mbs = 1` and increase mbs by 2 each time until running out of GPU memory. + - `baseline` is the vanila HF without DeepSpeed (DS) and mbs is hand-tuned. + - `HF + DS hand-tuned` is HF with DS, and mbs is hand-tuned while other DS configuration uses default values. + - `HF + DS autotuning` is HF with DS, and the DS configuration is selected from autotuning. + +Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), gradient accumulation steps (gas), train micro-batch size per GPU (mbs or tmbspg). + +| Model name | baseline (vanila HF) | HF + DS hand-tuned | HF + DS autotuning (fast-mode) | +| ----------- | ------------------------ | --------------------------------- | ------------------------------ | +| GPT2-medium | 71.61 (gas = 1, mbs = 2) | 142.211 (z = 1, gas = 1, mbs = 4) | 163.3 (z1_gas1_tmbspg6) | + +## Detailed `HF + DS autotuning` Result Summary + +Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed forward, backward, and step functions. 
The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training. + +- Fast-mode Autotuning time: 25 mins +- Number of experiments: 15 +- Throughput Improvement over baseline: 2.28x + +| tuning_space | num_experiments | best_metric_val | best_exp_name | +| :----------- | --------------: | --------------: | :-------------- | +| z0 | 6 | 167.688 | z0_gas1_tmbspg5 | +| z1 | 5 | 175.46 | z1_gas1_tmbspg6 | +| z2 | 3 | 161.619 | z2_gas1_tmbspg6 | +| z3 | 1 | 0 | z3_gas1_tmbspg6 | +| global | 15 | 175.46 | z1_gas1_tmbspg6 | + +Tuning completed in 0:25:18.653731. Total number of experiments: 15. diff --git a/autotuning/hf/gpt2-medium/test_tune.sh b/autotuning/hf/gpt2-medium/test_tune.sh new file mode 100755 index 000000000..567deb4ff --- /dev/null +++ b/autotuning/hf/gpt2-medium/test_tune.sh @@ -0,0 +1,142 @@ +MODEL_NAME=gpt2-medium +PER_DEVICE_TRAIN_BATCH_SIZE=1 +HF_PATH=~/projects +NEPOCHS=1 +NGPUS=16 +NNODES=1 +MAX_STEPS=200 +OUTPUT_DIR=./output_b${PER_DEVICE_TRAIN_BATCH_SIZE}_g${NGPUS}_$MAX_STEPS + +TEST=$1 + +if [ ${TEST} == "0" ] +then + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py \ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_0 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z0" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_z0.json\ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z0 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z1" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_z1.json\ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z1 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z2" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_z2.json\ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z2 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z3" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS 
$HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_z3.json\ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z3 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "tune" ] +then + deepspeed --autotuning run --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_tune.json\ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --fp16 \ + --block_size 512 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_tune \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "tune_test" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_tune_test.json \ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_tune_test \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "fs" ] +then + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py \ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_fs \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" + --sharded_ddp zero_dp_2 +fi diff --git a/autotuning/hf/gpt2-xl/README.md b/autotuning/hf/gpt2-xl/README.md new file mode 100644 index 000000000..f6d81b264 --- /dev/null +++ b/autotuning/hf/gpt2-xl/README.md @@ -0,0 +1,56 @@ +# [gpt2-xl](https://huggingface.co/gpt2-xl) + +This model has the following configuration: +- 48-layer +- 1600 hidden dimension +- 25 attention heads +- 1.5B parameters. + +Refer to [GPT-2/GPT and causal language modeling](https://github.com/huggingface/transformers/tree/master/examples/pytorch/language-modeling#gpt-2gpt-and-causal-language-modeling) + +## Environment + +The training use fp16 and runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. `max_train_batch_size` is not defined. +The HF packages below are used. + +HF examples require installing the `transformers` package from source: +```bash + git clone https://github.com/huggingface/transformers.git + cd transformers + pip install . +``` +The `datasets` package can be installed by `pip install datasets` + +Below are the versions used in this test. + +- transformers (4.12.0) +- datasets (1.11.0) +## Throughput Comparison + +The table below shows the throughput (samples per second) comparison. 
The corresponding train micro-batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve each throughput value are shown in parentheses. Assume the hand-tuning strategy is to start from `mbs = 1` and double `mbs` until running out of GPU memory.
+ - `baseline` is vanilla HF without DeepSpeed (DS); mbs is hand-tuned.
+ - `HF + DS hand-tuned` is HF with DS; mbs is hand-tuned while the rest of the DS configuration uses default values.
+ - `HF + DS autotuning` is HF with DS; the DS configuration is selected by autotuning.
+
+Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), gradient accumulation steps (gas), train micro-batch size per GPU (mbs or tmbspg).
+
+| Model name | baseline (vanilla HF) | HF + DS hand-tuned               | HF + DS autotuning (fast-mode) |
+| ---------- | --------------------- | -------------------------------- | ------------------------------ |
+| GPT2-xl    | Not runnable          | 27.462 (z = 1, gas = 1, mbs = 1) | 27.497 (z1_gas1_tmbspg1)       |
+
+## Detailed `HF + DS autotuning` Result Summary
+
+Note that the performance metric used in autotuning is calculated using the timings captured within the DeepSpeed forward, backward, and step functions. The sum of these timings is less than the actual training step latency; thus, the throughput values used by autotuning are higher than the end-to-end throughput observed in training.
+
+- Fast-mode Autotuning time: 21 mins
+- Number of experiments: 9
+- Throughput Improvement over baseline: Inf (the baseline is not runnable)
+
+| tuning_space | num_experiments | best_metric_val | best_exp_name   |
+| :----------- | --------------: | --------------: | :-------------- |
+| z1           |               3 |         40.1749 | z1_gas1_tmbspg1 |
+| z2           |               3 |         33.0472 | z2_gas1_tmbspg1 |
+| z3           |               3 |         12.8604 | z3_gas1_tmbspg1 |
+| global       |               9 |         40.1749 | z1_gas1_tmbspg1 |
+
+Tuning completed in 0:20:55.156000. Total number of experiments: 9.
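+
+## Reproducing the Tuning Run
+
+The command below is a condensed sketch of what `./test_tune.sh tune` in this folder launches; see the script for the full set of flags. `$HF_PATH` is assumed to point at the parent directory of your local `transformers` checkout, `../dsconfigs/ds_config_fp16_tune.json` is the tuning config used by the script, and the output directory name here is only illustrative.
+
+```bash
+deepspeed --autotuning run --num_nodes=1 --num_gpus=16 \
+    $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py \
+    --deepspeed ../dsconfigs/ds_config_fp16_tune.json \
+    --model_name_or_path gpt2-xl \
+    --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
+    --do_train --fp16 --block_size 512 \
+    --per_device_train_batch_size 1 --max_steps 50 \
+    --output_dir ./output_tune --overwrite_output_dir --save_strategy "no"
+```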
diff --git a/autotuning/hf/gpt2-xl/test_tune.sh b/autotuning/hf/gpt2-xl/test_tune.sh new file mode 100755 index 000000000..3c144635e --- /dev/null +++ b/autotuning/hf/gpt2-xl/test_tune.sh @@ -0,0 +1,142 @@ +MODEL_NAME=gpt2-xl +PER_DEVICE_TRAIN_BATCH_SIZE=1 +HF_PATH=~/projects +NEPOCHS=1 +NGPUS=16 +NNODES=1 +MAX_STEPS=50 +OUTPUT_DIR=./output_b${PER_DEVICE_TRAIN_BATCH_SIZE}_g${NGPUS}_$MAX_STEPS + +TEST=$1 + +if [ ${TEST} == "0" ] +then + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py \ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_0 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z0" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_z0.json\ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z0 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z1" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_z1.json\ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z1 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z2" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_z2.json\ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z2 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z3" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_z3.json\ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z3 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "tune" ] +then + deepspeed --autotuning run --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_tune.json\ + --model_name_or_path $MODEL_NAME \ + 
--dataset_name wikitext \
+    --dataset_config_name wikitext-2-raw-v1 \
+    --do_train \
+    --fp16 \
+    --block_size 512 \
+    --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \
+    --learning_rate 2e-5 \
+    --num_train_epochs $NEPOCHS \
+    --output_dir ${OUTPUT_DIR}_tune \
+    --overwrite_output_dir \
+    --save_steps 0 \
+    --max_steps $MAX_STEPS \
+    --save_strategy "no"
+elif [ ${TEST} == "tune_test" ]
+then
+    deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_tune_test.json \
+    --model_name_or_path $MODEL_NAME \
+    --dataset_name wikitext \
+    --dataset_config_name wikitext-2-raw-v1 \
+    --do_train \
+    --fp16 \
+    --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \
+    --learning_rate 2e-5 \
+    --num_train_epochs $NEPOCHS \
+    --output_dir ${OUTPUT_DIR}_tune_test \
+    --overwrite_output_dir \
+    --save_steps 0 \
+    --max_steps $MAX_STEPS \
+    --save_strategy "no"
+elif [ ${TEST} == "fs" ]
+then
+    python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py \
+    --model_name_or_path $MODEL_NAME \
+    --dataset_name wikitext \
+    --dataset_config_name wikitext-2-raw-v1 \
+    --do_train \
+    --fp16 \
+    --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \
+    --learning_rate 2e-5 \
+    --num_train_epochs $NEPOCHS \
+    --output_dir ${OUTPUT_DIR}_fs \
+    --overwrite_output_dir \
+    --save_steps 0 \
+    --max_steps $MAX_STEPS \
+    --save_strategy "no" \
+    --sharded_ddp zero_dp_2
+fi
diff --git a/autotuning/hf/gpt2/README.md b/autotuning/hf/gpt2/README.md
new file mode 100644
index 000000000..bb426910c
--- /dev/null
+++ b/autotuning/hf/gpt2/README.md
@@ -0,0 +1,59 @@
+# [gpt2](https://huggingface.co/gpt2)
+
+This model has the following configuration:
+
+- 12-layer
+- 768 hidden dimension
+- 12 attention heads
+- 117M parameters.
+
+Refer to [GPT-2/GPT and causal language modeling](https://github.com/huggingface/transformers/tree/master/examples/pytorch/language-modeling#gpt-2gpt-and-causal-language-modeling).
+
+## Environment
+
+The training uses fp16 and runs on 1 node with 16 Nvidia V100 GPUs. The autotuning uses the same hardware resource as the training. `max_train_batch_size` is not defined.
+The HF packages below are used.
+
+HF examples require installing the `transformers` package from source:
+```bash
+ git clone https://github.com/huggingface/transformers.git
+ cd transformers
+ pip install .
+```
+The `datasets` package can be installed with `pip install datasets`.
+
+Below are the versions used in this test.
+
+- transformers (4.12.0)
+- datasets (1.11.0)
+
+## Throughput Comparison
+
+The table below shows the throughput (samples per second) comparison. The corresponding train micro-batch size per GPU (mbs or tmbspg) and ZeRO stage used to achieve each throughput value are shown in parentheses. Assume the hand-tuning strategy is to start from `mbs = 1` and double `mbs` until running out of GPU memory.
+ - `baseline` is vanilla HF without DeepSpeed (DS); mbs is hand-tuned.
+ - `HF + DS hand-tuned` is HF with DS; mbs is hand-tuned while the rest of the DS configuration uses default values.
+ - `HF + DS autotuning` is HF with DS; the DS configuration is selected by autotuning.
+
+Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), gradient accumulation steps (gas), train micro-batch size per GPU (mbs or tmbspg).
+ +| Model name | baseline (vanila HF) | HF + DS hand-tuned | HF + DS autotuning (fast-mode) | +| ---------- | -------------------- | ------------------------ | ------------------------------ | +| GPT2 | 284.142 (mbs = 8) | 397.827 (z = 1, mbs = 8) | 431.586 (z1_gas1_tmbspg15) | + + +## Detailed `HF + DS autotuning` Result Summary + +Note that the performance metric used in autotuning is calculated using the timings captured within DeepSpeed forward, backward, and step functions. The sum of these timings is less than the actual training step latency, thus the throughput metric values used by autotuning would be higher than the end-to-end throughput in training. + +- Fast-mode Autotuning time: 25 mins +- Number of experiments: 17 +- Throughput Improvement over baseline: 1.52x + +| tuning_space | num_experiments | best_metric_val | best_exp_name | +| :----------- | --------------: | --------------: | :--------------- | +| z0 | 9 | 441.693 | z0_gas1_tmbspg11 | +| z1 | 6 | 452.004 | z1_gas1_tmbspg15 | +| z2 | 1 | 0 | z2_gas1_tmbspg15 | +| z3 | 1 | 0 | z3_gas1_tmbspg15 | +| global | 17 | 452.004 | z1_gas1_tmbspg15 | + +Tuning completed in 0:24:19.976427. Total number of experiments: 17. diff --git a/autotuning/hf/gpt2/test_tune.sh b/autotuning/hf/gpt2/test_tune.sh new file mode 100755 index 000000000..b570c455c --- /dev/null +++ b/autotuning/hf/gpt2/test_tune.sh @@ -0,0 +1,133 @@ +MODEL_NAME=gpt2 +PER_DEVICE_TRAIN_BATCH_SIZE=1 +HF_PATH=~/projects +NEPOCHS=1 +NGPUS=16 +NNODES=1 +MAX_STEPS=200 +OUTPUT_DIR=./output_b${PER_DEVICE_TRAIN_BATCH_SIZE}_g${NGPUS}_$MAX_STEPS + +TEST=$1 + + +if [ ${TEST} == "0" ] +then + python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py \ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --do_eval \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_0 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z0" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_z0.json\ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --do_eval \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z0 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z1" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_z1.json\ + --model_name_or_path $MODEL_NAME \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --do_train \ + --do_eval \ + --fp16 \ + --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \ + --learning_rate 2e-5 \ + --num_train_epochs $NEPOCHS \ + --output_dir ${OUTPUT_DIR}_z1 \ + --overwrite_output_dir \ + --save_steps 0 \ + --max_steps $MAX_STEPS \ + --save_strategy "no" +elif [ ${TEST} == "z2" ] +then + deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_z2.json\ 
+    --model_name_or_path $MODEL_NAME \
+    --dataset_name wikitext \
+    --dataset_config_name wikitext-2-raw-v1 \
+    --do_train \
+    --do_eval \
+    --fp16 \
+    --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \
+    --learning_rate 2e-5 \
+    --num_train_epochs $NEPOCHS \
+    --output_dir ${OUTPUT_DIR}_z2 \
+    --overwrite_output_dir \
+    --save_steps 0 \
+    --max_steps $MAX_STEPS \
+    --save_strategy "no"
+elif [ ${TEST} == "z3" ]
+then
+    deepspeed --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_z3.json\
+    --model_name_or_path $MODEL_NAME \
+    --dataset_name wikitext \
+    --dataset_config_name wikitext-2-raw-v1 \
+    --do_train \
+    --do_eval \
+    --fp16 \
+    --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \
+    --learning_rate 2e-5 \
+    --num_train_epochs $NEPOCHS \
+    --output_dir ${OUTPUT_DIR}_z3 \
+    --overwrite_output_dir \
+    --save_steps 0 \
+    --max_steps $MAX_STEPS \
+    --save_strategy "no"
+elif [ ${TEST} == "tune" ]
+then
+    deepspeed --autotuning run --num_nodes=$NNODES --num_gpus=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py --deepspeed ../dsconfigs/ds_config_fp16_tune.json\
+    --model_name_or_path $MODEL_NAME \
+    --dataset_name wikitext \
+    --dataset_config_name wikitext-2-raw-v1 \
+    --do_train \
+    --do_eval \
+    --fp16 \
+    --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \
+    --learning_rate 2e-5 \
+    --num_train_epochs $NEPOCHS \
+    --output_dir ${OUTPUT_DIR}_tune \
+    --overwrite_output_dir \
+    --save_steps 0 \
+    --max_steps $MAX_STEPS \
+    --save_strategy "no"
+elif [ ${TEST} == "fs" ]
+then
+    python -m torch.distributed.launch --nproc_per_node=$NGPUS $HF_PATH/transformers/examples/pytorch/language-modeling/run_clm.py \
+    --model_name_or_path $MODEL_NAME \
+    --dataset_name wikitext \
+    --dataset_config_name wikitext-2-raw-v1 \
+    --do_train \
+    --do_eval \
+    --fp16 \
+    --per_device_train_batch_size $PER_DEVICE_TRAIN_BATCH_SIZE \
+    --learning_rate 2e-5 \
+    --num_train_epochs $NEPOCHS \
+    --output_dir ${OUTPUT_DIR}_fs \
+    --overwrite_output_dir \
+    --save_steps 0 \
+    --max_steps $MAX_STEPS \
+    --save_strategy "no" \
+    --sharded_ddp zero_dp_2
+fi
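+
+# Usage sketch (assumed invocation; the positional argument selects one of the branches above):
+#   ./test_tune.sh tune   # run DeepSpeed autotuning via `deepspeed --autotuning run`
+#   ./test_tune.sh 0      # plain HF baseline without DeepSpeed
+#   ./test_tune.sh z0     # HF + DS with ../dsconfigs/ds_config_fp16_z0.json (ZeRO stage 0)
+#   ./test_tune.sh z1     # HF + DS with ZeRO stage 1
+#   ./test_tune.sh z2     # HF + DS with ZeRO stage 2
+#   ./test_tune.sh z3     # HF + DS with ZeRO stage 3
+#   ./test_tune.sh fs     # HF with sharded DDP (--sharded_ddp zero_dp_2), no DeepSpeed
+# Any other argument falls through the if/elif chain above and runs nothing.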