30 changes: 0 additions & 30 deletions colossalai/pipeline/policy/base.py
@@ -109,33 +109,3 @@ def parallelize_model(self,
self.replace_forward(module)
shared_params = self.get_shared_params(module)
return hold_params, hold_buffers, shared_params

@staticmethod
def distribute_layers(num_layers: int, num_stages: int) -> List[int]:
"""
divide layers into stages
"""
quotient = num_layers // num_stages
remainder = num_layers % num_stages

# calculate the num_layers per stage
layers_per_stage = [quotient] * num_stages

# deal with the rest layers
if remainder > 0:
start_position = num_layers // 2 - remainder // 2
for i in range(start_position, start_position + remainder):
layers_per_stage[i] += 1
return layers_per_stage

@staticmethod
def get_stage_index(layers_per_stage: List[int], stage: int) -> List[int]:
"""
get the start index and end index of layers for each stage.
"""
num_layers_per_stage_accumulated = np.insert(np.cumsum(layers_per_stage), 0, 0)

start_idx = num_layers_per_stage_accumulated[stage]
end_idx = num_layers_per_stage_accumulated[stage + 1]

return [start_idx, end_idx]
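(These two staticmethods are not dropped from the codebase: the base_policy.py hunk below re-adds them, along with the numpy import they depend on, moving them from the pipeline-specific policy into the shared shardformer base policy. A worked example follows that hunk.)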
2 changes: 1 addition & 1 deletion colossalai/shardformer/policies/auto_policy.py
@@ -29,7 +29,7 @@ class PolicyLocation:
"transformers.models.bert.modeling_bert.BertModel":
PolicyLocation(file_name="bert", class_name="BertModelPolicy"),
"transformers.models.bert.modeling_bert.BertForPreTraining":
-        PolicyLocation(file_name="bert", class_name="BertForPretrainingPolicy"),
+        PolicyLocation(file_name="bert", class_name="BertForPreTrainingPolicy"),
"transformers.models.bert.modeling_bert.BertLMHeadModel":
PolicyLocation(file_name="bert", class_name="BertLMHeadModelPolicy"),
"transformers.models.bert.modeling_bert.BertForMaskedLM":
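The casing fix above ("BertForPretrainingPolicy" -> "BertForPreTrainingPolicy") matters because the registry stores each policy class as a string that is resolved by name at runtime. A minimal sketch of that lookup, assuming an importlib-plus-getattr resolution (load_policy is a hypothetical helper; ColossalAI's real function names may differ):

import importlib

def load_policy(file_name: str, class_name: str):
    # e.g. file_name="bert", class_name="BertForPreTrainingPolicy"
    module = importlib.import_module(f"colossalai.shardformer.policies.{file_name}")
    # getattr is case-sensitive, so the old string "BertForPretrainingPolicy"
    # would raise AttributeError: the class in bert.py spells "PreTraining"
    # with a capital T, matching the HuggingFace BertForPreTraining model.
    return getattr(module, class_name)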
31 changes: 31 additions & 0 deletions colossalai/shardformer/policies/base_policy.py
@@ -4,6 +4,7 @@
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, Optional, Union

import numpy as np
import torch.nn as nn
from torch import Tensor
from torch.nn import Module
@@ -176,3 +177,33 @@ def get_shared_params(self) -> List[Dict[int, Tensor]]:
List[Dict[int, Tensor]]: List of parameters that should be shared across stages. E.g. [{0: module.model.embed_tokens.weight, 3: module.lm_head.weight}]
"""
return []

@staticmethod
def distribute_layers(num_layers: int, num_stages: int) -> List[int]:
"""Divide layers into stages

"""
quotient = num_layers // num_stages
remainder = num_layers % num_stages

# calculate the num_layers per stage
layers_per_stage = [quotient] * num_stages

        # deal with the remaining layers
        if remainder > 0:
            # num_stages (not num_layers) keeps the loop indices inside layers_per_stage
            start_position = num_stages // 2 - remainder // 2
for i in range(start_position, start_position + remainder):
layers_per_stage[i] += 1
return layers_per_stage

@staticmethod
def get_stage_index(layers_per_stage: List[int], stage: int) -> List[int]:
"""
get the start index and end index of layers for each stage.
"""
num_layers_per_stage_accumulated = np.insert(np.cumsum(layers_per_stage), 0, 0)

start_idx = num_layers_per_stage_accumulated[stage]
end_idx = num_layers_per_stage_accumulated[stage + 1]

return [start_idx, end_idx]
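Taken together, distribute_layers splits the model and get_stage_index locates one stage's slice. A short worked example (a standalone sketch; it assumes the enclosing class is the Policy base class defined in this file and uses the num_stages-based start_position from the hunk above):

# 26 transformer layers over 4 pipeline stages:
# quotient = 6, remainder = 2, start_position = 4 // 2 - 2 // 2 = 1,
# so the two leftover layers land on the middle stages.
layers_per_stage = Policy.distribute_layers(num_layers=26, num_stages=4)
assert layers_per_stage == [6, 7, 7, 6]

# np.insert(np.cumsum(...), 0, 0) yields [0, 6, 13, 20, 26], so stage 2
# owns layers [13, 20) of the model.
start_idx, end_idx = Policy.get_stage_index(layers_per_stage, stage=2)
assert [start_idx, end_idx] == [13, 20]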