From c794669a8c3d7ba83c4a43b2af4fda44f69c7791 Mon Sep 17 00:00:00 2001
From: Baizhou Zhang <eddiezhang@pku.edu.cn>
Date: Wed, 30 Aug 2023 23:06:18 +0800
Subject: [PATCH] [shardformer] fix submodule replacement bug when enabling pp

---
 colossalai/shardformer/shard/sharder.py       | 25 ++++++++++---------
 ...st_hybrid_parallel_plugin_checkpoint_io.py |  2 ++
 .../test_model/test_shard_chatglm2.py         |  2 ++
 .../test_model/test_shard_gpt2.py             |  2 ++
 .../test_model/test_shard_opt.py              |  2 ++
 5 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/colossalai/shardformer/shard/sharder.py b/colossalai/shardformer/shard/sharder.py
index 0ed745a1fc4a..9ed384266a80 100644
--- a/colossalai/shardformer/shard/sharder.py
+++ b/colossalai/shardformer/shard/sharder.py
@@ -92,22 +92,21 @@ def _recursive_replace_layer(
             param_replacement (List[Callable]): The function list to get parameter shard information in policy
             method_replacement (Dict[str, Callable]):  Key is the method name, value is the method for replacement
             sub_module_replacement ((List[SubModuleReplacementDescription]): The function list to get sub module shard information in policy
+            include (Set[nn.Module], optional): The set of modules kept on the current device when pipeline parallelism is enabled. Defaults to None.
         """
-        # released layers are not shardable
-        can_replace_param_or_layer = include is None or module in include
         if (isinstance(origin_cls, str) and origin_cls == module.__class__.__name__) or \
            (module.__class__ == origin_cls):
             if attr_replacement is not None:
                 self._replace_attr(module, attr_replacement)
 
-            if param_replacement is not None and can_replace_param_or_layer:
+            if param_replacement is not None and (include is None or module in include):
                 self._replace_param(module, param_replacement)
 
             if method_replacement is not None:
                 self._replace_method(module, method_replacement)
 
-            if sub_module_replacement is not None and can_replace_param_or_layer:
-                self._replace_sub_module(module, sub_module_replacement)
+            if sub_module_replacement is not None:
+                self._replace_sub_module(module, sub_module_replacement, include)
 
         for name, child in module.named_children():
             self._recursive_replace_layer(child,
@@ -154,18 +153,17 @@ def _replace_method(self, module: nn.Module, method_replacement: Dict[str, Calla
             bound_method = MethodType(new_method, module)
             setattr(module, method_name, bound_method)
 
-    def _replace_sub_module(
-        self,
-        org_layer: nn.Module,
-        sub_module_replacement: List[SubModuleReplacementDescription],
-    ) -> None:
+    def _replace_sub_module(self,
+                            org_layer: nn.Module,
+                            sub_module_replacement: List[SubModuleReplacementDescription],
+                            include: Optional[Set[nn.Module]] = None) -> None:
         r"""
         Shard one layer according to the policy, the layer should be the same class as the key in policy's argument_policy return dict
 
         Args:
             org_layer (torch.nn.Module): The origin layer object to shard
             sub_module_replacement (List[SubModuleReplacementDescription]): The sub module replacement description list
-
+            include (Set[nn.Module], optional): The set of modules kept on the current device when pipeline parallelism is enabled. Defaults to None.
         """
         for description in sub_module_replacement:
             suffix = description.suffix
@@ -174,9 +172,12 @@ def _replace_sub_module(
 
             assert target_module is not None, 'target_module should not be None'
 
-            # TODO: support different parallel mode
             native_sub_module = getattr_(org_layer, suffix, ignore=True)
 
+            # Skip replacement if the submodule is not kept on the current device when pipeline parallelism is enabled.
+            if (include is not None) and (native_sub_module is not None) and (native_sub_module not in include):
+                continue
+
             assert not isinstance(native_sub_module, target_module), \
                 f"The module with suffix {suffix} has been replaced, please check the policy"
 
diff --git a/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py b/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py
index ea0922ef5dec..67d73c31f6e0 100644
--- a/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py
+++ b/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py
@@ -7,6 +7,7 @@
 import colossalai
 from colossalai.booster import Booster
 from colossalai.booster.plugin import HybridParallelPlugin
+from colossalai.shardformer.layer.utils import Randomizer
 from colossalai.tensor.d_tensor.api import clear_layout_converter
 from colossalai.testing import (
     check_state_dict_equal,
@@ -100,6 +101,7 @@ def _criterion(outputs, inputs):
         booster.load_model(new_model, model_ckpt_path)
         check_state_dict_equal(model.unwrap().state_dict(), new_model.unwrap().state_dict(), False)
 
+    Randomizer.reset_index()
     clear_layout_converter()
 
 
diff --git a/tests/test_shardformer/test_model/test_shard_chatglm2.py b/tests/test_shardformer/test_model/test_shard_chatglm2.py
index 05ca05dea4d6..48f651c727f4 100644
--- a/tests/test_shardformer/test_model/test_shard_chatglm2.py
+++ b/tests/test_shardformer/test_model/test_shard_chatglm2.py
@@ -4,6 +4,7 @@
 
 import colossalai
 from colossalai.logging import disable_existing_loggers
+from colossalai.shardformer.layer.utils import Randomizer
 from colossalai.tensor.d_tensor.api import clear_layout_converter
 from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
 from tests.kit.model_zoo import model_zoo
@@ -105,6 +106,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     # check grads
     check_all_grad_tensors(grads_to_check)
 
+    Randomizer.reset_index()
     torch.cuda.empty_cache()
 
 
diff --git a/tests/test_shardformer/test_model/test_shard_gpt2.py b/tests/test_shardformer/test_model/test_shard_gpt2.py
index 563084ed0f09..115a1bd79d41 100644
--- a/tests/test_shardformer/test_model/test_shard_gpt2.py
+++ b/tests/test_shardformer/test_model/test_shard_gpt2.py
@@ -4,6 +4,7 @@
 
 import colossalai
 from colossalai.logging import disable_existing_loggers
+from colossalai.shardformer.layer.utils import Randomizer
 from colossalai.tensor.d_tensor.api import clear_layout_converter
 from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
 from tests.kit.model_zoo import model_zoo
@@ -97,6 +98,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     # check grads
     check_all_grad_tensors(grads_to_check)
 
+    Randomizer.reset_index()
     torch.cuda.empty_cache()
 
 
diff --git a/tests/test_shardformer/test_model/test_shard_opt.py b/tests/test_shardformer/test_model/test_shard_opt.py
index 25b1eefc6016..3e74859ad1a8 100644
--- a/tests/test_shardformer/test_model/test_shard_opt.py
+++ b/tests/test_shardformer/test_model/test_shard_opt.py
@@ -6,6 +6,7 @@
 
 import colossalai
 from colossalai.logging import disable_existing_loggers
+from colossalai.shardformer.layer.utils import Randomizer
 from colossalai.tensor.d_tensor.api import clear_layout_converter
 from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
 from tests.kit.model_zoo import model_zoo
@@ -107,6 +108,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     # check grads
     check_all_grad_tensors(grads_to_check)
 
+    Randomizer.reset_index()
     torch.cuda.empty_cache()