From c3ec4e4d9e2b0ec52633c0cb668504790a1b778c Mon Sep 17 00:00:00 2001
From: genghaozhe <939857490@qq.com>
Date: Mon, 27 May 2024 06:53:28 +0000
Subject: [PATCH 1/4] [ci/tests] simplify some test cases to reduce testing time

---
 .../test_model/test_shard_bert.py   | 21 ------------
 .../test_model/test_shard_blip2.py  |  8 ++---
 .../test_model/test_shard_bloom.py  | 22 +----------
 .../test_model/test_shard_falcon.py | 23 ++-----------
 .../test_model/test_shard_gpt2.py   | 16 ---------
 .../test_model/test_shard_llama.py  | 34 ++-----------------
 6 files changed, 9 insertions(+), 115 deletions(-)

diff --git a/tests/test_shardformer/test_model/test_shard_bert.py b/tests/test_shardformer/test_model/test_shard_bert.py
index 3ec394768669..1869ad575c3f 100644
--- a/tests/test_shardformer/test_model/test_shard_bert.py
+++ b/tests/test_shardformer/test_model/test_shard_bert.py
@@ -122,20 +122,6 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "precision": "fp16",
             "initial_scale": 1,
         },
-        {
-            "tp_size": 2,
-            "pp_size": 1,
-            "enable_all_optimization": True,
-            "use_lazy_init": True,
-            "precision": "fp32",
-        },
-        {
-            "tp_size": 1,
-            "pp_size": 2,
-            "num_microbatches": 4,
-            "use_lazy_init": True,
-            "precision": "fp32",
-        },
         {
             "tp_size": 2,
             "pp_size": 2,
@@ -145,13 +131,6 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "precision": "fp16",
             "initial_scale": 1,
         },
-        {
-            "tp_size": 4,
-            "pp_size": 1,
-            "enable_all_optimization": True,
-            "use_lazy_init": False,
-            "precision": "fp32",
-        },
         {"tp_size": 2, "pp_size": 1, "enable_all_optimization": True, "use_lazy_init": False, "precision": "fp32"},
         {
             "tp_size": 2,
diff --git a/tests/test_shardformer/test_model/test_shard_blip2.py b/tests/test_shardformer/test_model/test_shard_blip2.py
index 712c5c1e19fd..aab75ff97115 100644
--- a/tests/test_shardformer/test_model/test_shard_blip2.py
+++ b/tests/test_shardformer/test_model/test_shard_blip2.py
@@ -65,10 +65,10 @@ def check_forward_backward(org_model, sharded_model, data_gen_fn, output_transfo
 )
 
 
-@parameterize("enable_fused_normalization", [True, False])
-@parameterize("enable_tensor_parallelism", [True, False])
-@parameterize("enable_flash_attention", [True, False])
-@parameterize("enable_jit_fused", [True, False])
+@parameterize("enable_fused_normalization", [True])
+@parameterize("enable_tensor_parallelism", [True])
+@parameterize("enable_flash_attention", [True])
+@parameterize("enable_jit_fused", [True])
 def run_blip2_test(
     enable_fused_normalization,
     enable_tensor_parallelism,
diff --git a/tests/test_shardformer/test_model/test_shard_bloom.py b/tests/test_shardformer/test_model/test_shard_bloom.py
index 6ab0369e0b91..31fa917416c5 100644
--- a/tests/test_shardformer/test_model/test_shard_bloom.py
+++ b/tests/test_shardformer/test_model/test_shard_bloom.py
@@ -110,24 +110,14 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "precision": "fp32",
             "initial_scale": 1,
         },
-        {
-            "tp_size": 4,
-            "pp_size": 1,
-            "num_microbatches": 1,
-            "enable_sequence_parallelism": True,
-            "sequence_parallelism_mode": "split_gather",
-            "enable_flash_attention": False,
-            "use_lazy_init": True,
-            "precision": "fp16",
-            "initial_scale": 1,
-        },
         {
             "tp_size": 2,
             "pp_size": 2,
             "num_microbatches": 4,
             "enable_all_optimization": True,
             "use_lazy_init": True,
             "precision": "fp16",
+            "zero_stage": 1,
             "initial_scale": 1,
         },
         {
@@ -139,16 +129,6 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "precision": "fp32",
         },
         {"tp_size": 4, "pp_size": 1, "enable_all_optimization": True, "use_lazy_init": False, "precision": "fp32"},
-        {"tp_size": 2, "pp_size": 1, "enable_all_optimization": True, "use_lazy_init": False, "precision": "fp32"},
-        {
-            "tp_size": 2,
-            "pp_size": 1,
-            "enable_all_optimization": True,
-            "use_lazy_init": True,
-            "zero_stage": 2,
-            "precision": "fp16",
-            "initial_scale": 1,
-        },
         {
             "tp_size": 1,
             "pp_size": 2,
diff --git a/tests/test_shardformer/test_model/test_shard_falcon.py b/tests/test_shardformer/test_model/test_shard_falcon.py
index 8074f9d61140..3eb82864a63f 100644
--- a/tests/test_shardformer/test_model/test_shard_falcon.py
+++ b/tests/test_shardformer/test_model/test_shard_falcon.py
@@ -92,21 +92,12 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "tp_size": 2,
             "pp_size": 2,
             "num_microbatches": 4,
-            "enable_all_optimization": True,
-            "use_lazy_init": True,
-            "precision": "fp16",
-            "initial_scale": 1,
-        },
-        {
-            "tp_size": 1,
-            "pp_size": 2,
-            "num_microbatches": 4,
             "enable_all_optimization": False,
             "use_lazy_init": False,
-            "precision": "fp32",
+            "precision": "fp16",
+            "initial_scale": 1,
         },
         {"tp_size": 4, "pp_size": 1, "enable_all_optimization": True, "use_lazy_init": False, "precision": "fp32"},
-        {"tp_size": 2, "pp_size": 1, "enable_all_optimization": True, "use_lazy_init": False, "precision": "fp32"},
         {
             "tp_size": 2,
             "pp_size": 1,
@@ -116,16 +107,6 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "precision": "fp16",
             "initial_scale": 1,
         },
-        {
-            "tp_size": 1,
-            "pp_size": 2,
-            "num_microbatches": 2,
-            "enable_all_optimization": True,
-            "use_lazy_init": True,
-            "zero_stage": 1,
-            "precision": "fp16",
-            "initial_scale": 1,
-        },
     ],
 )
 def run_falcon_test(test_config):
diff --git a/tests/test_shardformer/test_model/test_shard_gpt2.py b/tests/test_shardformer/test_model/test_shard_gpt2.py
index 72ea2b0895e9..b86dab6dd22d 100644
--- a/tests/test_shardformer/test_model/test_shard_gpt2.py
+++ b/tests/test_shardformer/test_model/test_shard_gpt2.py
@@ -162,15 +162,6 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "precision": "fp16",
             "initial_scale": 1,
         },
-        {
-            "tp_size": 1,
-            "pp_size": 2,
-            "num_microbatches": 4,
-            "enable_all_optimization": True,
-            "use_lazy_init": True,
-            "precision": "fp16",
-            "initial_scale": 1,
-        },
         {
             "tp_size": 4,
             "pp_size": 1,
@@ -178,13 +169,6 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "use_lazy_init": False,
             "precision": "fp32",
         },
-        {
-            "tp_size": 2,
-            "pp_size": 1,
-            "enable_all_optimization": False,
-            "use_lazy_init": False,
-            "precision": "fp32",
-        },
         {
             "tp_size": 2,
             "pp_size": 2,
diff --git a/tests/test_shardformer/test_model/test_shard_llama.py b/tests/test_shardformer/test_model/test_shard_llama.py
index c38570f8599c..11faf8a7bb22 100644
--- a/tests/test_shardformer/test_model/test_shard_llama.py
+++ b/tests/test_shardformer/test_model/test_shard_llama.py
@@ -145,17 +145,6 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "precision": "fp16",
             "initial_scale": 1,
         },
-        {
-            "tp_size": 4,
-            "pp_size": 1,
-            "num_microbatches": 1,
-            "enable_sequence_parallelism": True,
-            "sequence_parallelism_mode": "ring",
-            "enable_flash_attention": False,
-            "use_lazy_init": True,
-            "precision": "fp32",
-            "initial_scale": 1,
-        },
         {
             "tp_size": 4,
             "pp_size": 1,
@@ -167,17 +156,6 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "precision": "fp16",
             "initial_scale": 1,
         },
-        {
-            "tp_size": 1,
-            "pp_size": 1,
-            "sp_size": 2,
-            "num_microbatches": 1,
-            "enable_sequence_parallelism": True,
-            "sequence_parallelism_mode": "all_to_all",
-            "use_lazy_init": True,
-            "precision": "fp16",
-            "initial_scale": 1,
-        },
         {
             "tp_size": 1,
             "pp_size": 1,
@@ -222,21 +200,13 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "gradient_checkpoint_config": PipelineGradientCheckpointConfig(num_ckpt_layers_per_stage=[4, 0]),
         },
         {
-            "tp_size": 4,
-            "pp_size": 1,
-            "enable_all_optimization": False,
-            "use_lazy_init": False,
-            "precision": "fp32",
-        },
-        {
-            "tp_size": 1,
-            "pp_size": 4,
+            "tp_size": 2,
+            "pp_size": 2,
             "num_microbatches": 4,
             "enable_all_optimization": False,
             "use_lazy_init": False,
             "precision": "fp32",
         },
-        {"tp_size": 2, "pp_size": 1, "enable_all_optimization": False, "use_lazy_init": False, "precision": "fp32"},
         {
             "tp_size": 2,
             "pp_size": 1,

From 43b912c03ea7e68079830d015d86f1942f4af2ca Mon Sep 17 00:00:00 2001
From: genghaozhe <939857490@qq.com>
Date: Tue, 28 May 2024 06:37:40 +0000
Subject: [PATCH 2/4] [ci/tests] continue to remove test cases to reduce CI
 time cost

---
 .../test_model/test_shard_bert.py  |  1 -
 .../test_model/test_shard_bloom.py |  1 -
 .../test_model/test_shard_gpt2.py  | 16 ----------------
 3 files changed, 18 deletions(-)

diff --git a/tests/test_shardformer/test_model/test_shard_bert.py b/tests/test_shardformer/test_model/test_shard_bert.py
index 1869ad575c3f..0808e3490b04 100644
--- a/tests/test_shardformer/test_model/test_shard_bert.py
+++ b/tests/test_shardformer/test_model/test_shard_bert.py
@@ -131,7 +131,6 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "precision": "fp16",
             "initial_scale": 1,
         },
-        {"tp_size": 2, "pp_size": 1, "enable_all_optimization": True, "use_lazy_init": False, "precision": "fp32"},
         {
             "tp_size": 2,
             "pp_size": 1,
diff --git a/tests/test_shardformer/test_model/test_shard_bloom.py b/tests/test_shardformer/test_model/test_shard_bloom.py
index 31fa917416c5..feabc908394c 100644
--- a/tests/test_shardformer/test_model/test_shard_bloom.py
+++ b/tests/test_shardformer/test_model/test_shard_bloom.py
@@ -128,7 +128,6 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "use_lazy_init": False,
             "precision": "fp32",
         },
-        {"tp_size": 4, "pp_size": 1, "enable_all_optimization": True, "use_lazy_init": False, "precision": "fp32"},
         {
             "tp_size": 1,
             "pp_size": 2,
diff --git a/tests/test_shardformer/test_model/test_shard_gpt2.py b/tests/test_shardformer/test_model/test_shard_gpt2.py
index b86dab6dd22d..ab70da3ded13 100644
--- a/tests/test_shardformer/test_model/test_shard_gpt2.py
+++ b/tests/test_shardformer/test_model/test_shard_gpt2.py
@@ -162,13 +162,6 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "precision": "fp16",
             "initial_scale": 1,
         },
-        {
-            "tp_size": 4,
-            "pp_size": 1,
-            "enable_all_optimization": False,
-            "use_lazy_init": False,
-            "precision": "fp32",
-        },
         {
             "tp_size": 2,
             "pp_size": 2,
@@ -177,15 +170,6 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "use_lazy_init": True,
             "precision": "fp32",
         },
-        {
-            "tp_size": 2,
-            "pp_size": 1,
-            "enable_all_optimization": True,
-            "use_lazy_init": True,
-            "zero_stage": 2,
-            "precision": "fp16",
-            "initial_scale": 1,
-        },
         {
             "tp_size": 1,
             "pp_size": 2,

From 713f7bef609afc097c3f597d03dd76cd211bd645 Mon Sep 17 00:00:00 2001
From: genghaozhe <939857490@qq.com>
Date: Tue, 28 May 2024 06:42:24 +0000
Subject: [PATCH 3/4] restore some test configs

---
 tests/test_shardformer/test_model/test_shard_blip2.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_shardformer/test_model/test_shard_blip2.py b/tests/test_shardformer/test_model/test_shard_blip2.py
index aab75ff97115..04a8f57e9df6 100644
--- a/tests/test_shardformer/test_model/test_shard_blip2.py
+++ b/tests/test_shardformer/test_model/test_shard_blip2.py
@@ -65,8 +65,8 @@ def check_forward_backward(org_model, sharded_model, data_gen_fn, output_transfo
 )
 
 
-@parameterize("enable_fused_normalization", [True])
-@parameterize("enable_tensor_parallelism", [True])
+@parameterize("enable_fused_normalization", [True, False])
+@parameterize("enable_tensor_parallelism", [True, False])
 @parameterize("enable_flash_attention", [True])
 @parameterize("enable_jit_fused", [True])
 def run_blip2_test(

From f9e2e084e522bdbe418e51abb6b4c5a98c9eeade Mon Sep 17 00:00:00 2001
From: genghaozhe <939857490@qq.com>
Date: Tue, 28 May 2024 10:50:16 +0000
Subject: [PATCH 4/4] [ci/tests] continue to reduce CI time cost

---
 .../test_model/test_shard_bert.py  | 10 ----------
 .../test_model/test_shard_gpt2.py  |  8 --------
 .../test_model/test_shard_llama.py | 19 -------------------
 .../test_model/test_shard_vit.py   |  1 -
 4 files changed, 38 deletions(-)

diff --git a/tests/test_shardformer/test_model/test_shard_bert.py b/tests/test_shardformer/test_model/test_shard_bert.py
index 0808e3490b04..b97de0ef86cf 100644
--- a/tests/test_shardformer/test_model/test_shard_bert.py
+++ b/tests/test_shardformer/test_model/test_shard_bert.py
@@ -140,16 +140,6 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "precision": "fp16",
             "initial_scale": 1,
         },
-        {
-            "tp_size": 1,
-            "pp_size": 2,
-            "num_microbatches": 2,
-            "enable_all_optimization": True,
-            "use_lazy_init": True,
-            "zero_stage": 1,
-            "precision": "fp16",
-            "initial_scale": 1,
-        },
     ],
 )
 def run_bert_test(test_config):
diff --git a/tests/test_shardformer/test_model/test_shard_gpt2.py b/tests/test_shardformer/test_model/test_shard_gpt2.py
index ab70da3ded13..f9e368c0ebf3 100644
--- a/tests/test_shardformer/test_model/test_shard_gpt2.py
+++ b/tests/test_shardformer/test_model/test_shard_gpt2.py
@@ -162,14 +162,6 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "precision": "fp16",
             "initial_scale": 1,
         },
-        {
-            "tp_size": 2,
-            "pp_size": 2,
-            "num_microbatches": 4,
-            "enable_all_optimization": False,
-            "use_lazy_init": True,
-            "precision": "fp32",
-        },
         {
             "tp_size": 1,
             "pp_size": 2,
diff --git a/tests/test_shardformer/test_model/test_shard_llama.py b/tests/test_shardformer/test_model/test_shard_llama.py
index 11faf8a7bb22..1628bf2f398a 100644
--- a/tests/test_shardformer/test_model/test_shard_llama.py
+++ b/tests/test_shardformer/test_model/test_shard_llama.py
@@ -168,17 +168,6 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "precision": "fp16",
             "initial_scale": 1,
         },
-        {
-            "tp_size": 1,
-            "pp_size": 1,
-            "num_microbatches": 1,
-            "enable_sequence_parallelism": True,
-            "sequence_parallelism_mode": "all_to_all",
-            "enable_flash_attention": False,
-            "use_lazy_init": True,
-            "precision": "fp16",
-            "initial_scale": 1,
-        },
         {
             "tp_size": 2,
             "pp_size": 2,
@@ -199,14 +188,6 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "enable_gradient_checkpointing": True,
             "gradient_checkpoint_config": PipelineGradientCheckpointConfig(num_ckpt_layers_per_stage=[4, 0]),
         },
-        {
-            "tp_size": 2,
-            "pp_size": 2,
-            "num_microbatches": 4,
-            "enable_all_optimization": False,
-            "use_lazy_init": False,
-            "precision": "fp32",
-        },
         {
             "tp_size": 2,
             "pp_size": 1,
diff --git a/tests/test_shardformer/test_model/test_shard_vit.py b/tests/test_shardformer/test_model/test_shard_vit.py
index d33b52b422dc..99d6f7d2ccc1 100644
--- a/tests/test_shardformer/test_model/test_shard_vit.py
+++ b/tests/test_shardformer/test_model/test_shard_vit.py
@@ -108,7 +108,6 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "precision": "fp32",
         },
         {"tp_size": 4, "pp_size": 1, "enable_all_optimization": True, "use_lazy_init": False, "precision": "fp32"},
-        {"tp_size": 2, "pp_size": 1, "enable_all_optimization": True, "use_lazy_init": False, "precision": "fp32"},
         {
             "tp_size": 2,
             "pp_size": 1,
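
Note on why these trims shrink CI time: stacked @parameterize decorators expand a
test into the cartesian product of their value lists, so every list entry or
test_config dict a patch deletes removes whole multiples of forward/backward runs.
Below is a minimal, self-contained sketch of that expansion; the decorator here is
a hypothetical stand-in written for illustration, not the actual implementation
shipped in colossalai.testing, whose details may differ.

from functools import wraps


def parameterize(name, values):
    """Re-run the wrapped test once per value in `values`.

    Stacking several of these decorators nests the loops, so the wrapped
    test runs once per element of the cartesian product of all the lists.
    """

    def decorator(fn):
        @wraps(fn)
        def wrapper(**kwargs):
            for value in values:
                fn(**{**kwargs, name: value})

        return wrapper

    return decorator


@parameterize("enable_fused_normalization", [True, False])
@parameterize("enable_tensor_parallelism", [True, False])
@parameterize("enable_flash_attention", [True])
@parameterize("enable_jit_fused", [True])
def run_blip2_test(**flags):
    print(flags)


run_blip2_test()  # 2 * 2 * 1 * 1 = 4 invocations

Under this model, PATCH 1 cuts the blip2 matrix from 2^4 = 16 runs to 1, and
PATCH 3 deliberately restores two of the four flags, settling at 4 runs. The
test_config deletions in the other files act the same way: each removed dict is
one fewer full parallel-configuration (tp_size/pp_size/precision) pass through
check_forward_backward.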