5 changes: 4 additions & 1 deletion colossalai/pipeline/p2p.py
@@ -64,7 +64,10 @@ def _broadcast_object_list(object_list: List[Any],
     my_rank = dist.get_rank()
     # Serialize object_list elements to tensors on src rank.
     if my_rank == src:
-        tensor_list, size_list = zip(*[c10d._object_to_tensor(obj) for obj in object_list])
+        if torch.__version__ >= "1.13.0":
+            tensor_list, size_list = zip(*[c10d._object_to_tensor(obj, device=device) for obj in object_list])
+        else:
+            tensor_list, size_list = zip(*[c10d._object_to_tensor(obj) for obj in object_list])
         object_sizes_tensor = torch.cat(size_list)
     else:
         object_sizes_tensor = torch.empty(len(object_list), dtype=torch.long)
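A note on the version gate above: `torch.__version__ >= "1.13.0"` is a lexicographic string comparison, so a hypothetical `"1.9.0"` would sort *after* `"1.13.0"` and wrongly take the `device=` branch. A minimal sketch of a sturdier check, assuming `packaging` (which ships alongside pip) is an acceptable dependency; the helper name is illustrative:

```python
import torch
from packaging import version

def object_to_tensor_takes_device() -> bool:
    # The PR gates on torch >= 1.13.0, where c10d._object_to_tensor
    # accepts a `device` argument. Strip any local suffix such as
    # "+cu118" before parsing so the comparison stays numeric.
    return version.parse(torch.__version__.split("+")[0]) >= version.parse("1.13.0")
```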
1 change: 0 additions & 1 deletion colossalai/pipeline/schedule/one_f_one_b.py
@@ -205,7 +205,6 @@ def forward_backward_step(self,
         # the backward pass.
         input_obj = input_objs.pop(0)
         output_obj = output_objs.pop(0)
-
         input_obj_grad = self.backward_step(optimizer, input_obj, output_obj, output_obj_grad)

         if last_iteration:
2 changes: 2 additions & 0 deletions colossalai/shardformer/policies/auto_policy.py
@@ -42,6 +42,8 @@ class PolicyLocation:
         PolicyLocation(file_name="bert", class_name="BertForNextSentencePredictionPolicy"),
     "transformers.models.bert.modeling_bert.BertForMultipleChoice":
         PolicyLocation(file_name="bert", class_name="BertForMultipleChoicePolicy"),
+    "transformers.models.bert.modeling_bert.BertForQuestionAnswering":
+        PolicyLocation(file_name="bert", class_name="BertForQuestionAnsweringPolicy"),

     # LLaMA
     "transformers.models.llama.modeling_llama.LlamaModel":
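For context on what this registration buys: the mapping is keyed by the model's fully qualified class name and resolved lazily by import. The sketch below shows the idea under that assumption; `resolve_policy` and `_POLICY_LIST` here are illustrative stand-ins, not the actual ColossalAI API.

```python
import importlib
from dataclasses import dataclass

@dataclass
class PolicyLocation:
    file_name: str   # module under colossalai.shardformer.policies
    class_name: str  # policy class inside that module

# Illustrative registry entry matching the line added in this PR.
_POLICY_LIST = {
    "transformers.models.bert.modeling_bert.BertForQuestionAnswering":
        PolicyLocation(file_name="bert", class_name="BertForQuestionAnsweringPolicy"),
}

def resolve_policy(model) -> type:
    # Key the lookup by the model's fully qualified class name.
    key = f"{model.__class__.__module__}.{model.__class__.__qualname__}"
    location = _POLICY_LIST[key]
    module = importlib.import_module(f"colossalai.shardformer.policies.{location.file_name}")
    return getattr(module, location.class_name)
```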
832 changes: 731 additions & 101 deletions colossalai/shardformer/policies/bert.py

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions colossalai/shardformer/policies/llama.py
@@ -212,11 +212,13 @@ def get_held_layers(self) -> List[Module]:
         return held_layers

     def get_shared_params(self) -> List[Dict[int, Tensor]]:
-        """No shared params in llama model"""
         llama_model = self.model.model
         if id(llama_model.embed_tokens.weight) == id(self.model.lm_head.weight):
             # tie weights
-            return [{0: llama_model.embed_tokens.weight, self.stage_manager.num_stages - 1: self.model.lm_head.weight}]
+            return [{
+                0: llama_model.embed_tokens.weight,
+                self.pipeline_stage_manager.num_stages - 1: self.model.lm_head.weight
+            }]
         return []
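The return format here is worth spelling out: each dict maps a pipeline stage index to the parameter held on that stage, so when the input embedding and the LM head are tied, stage 0 and the last stage can keep their gradients in sync. A rough sketch of how such a mapping could be consumed — illustrative only, not ColossalAI's actual synchronization code, and `stage_to_group` is a hypothetical helper:

```python
from typing import Callable, Dict, List, Tuple

import torch
import torch.distributed as dist

def sync_shared_grads(shared_params: List[Dict[int, torch.Tensor]],
                      my_stage: int,
                      stage_to_group: Callable[[Tuple[int, ...]], dist.ProcessGroup]) -> None:
    for mapping in shared_params:
        if my_stage not in mapping:
            continue  # this rank holds no copy of the tied weight
        grad = mapping[my_stage].grad
        if grad is not None:
            # all-reduce across exactly the stages that hold a tied copy
            dist.all_reduce(grad, group=stage_to_group(tuple(sorted(mapping))))
```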
2 changes: 1 addition & 1 deletion tests/kit/model_zoo/torchrec/__init__.py
@@ -1 +1 @@
-from .torchrec import *
+#from .torchrec import *
17 changes: 17 additions & 0 deletions tests/kit/model_zoo/transformers/bert.py
@@ -87,6 +87,17 @@ def data_gen_for_mcq():
     return dict(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, labels=labels)


+def data_gen_for_qa():
+    # generate data for question answering
+    # no labels are needed; start and end positions are used instead
+    data = data_gen()
+    start_positions = torch.tensor([0], dtype=torch.int64)
+    data['start_positions'] = start_positions
+    end_positions = torch.tensor([1], dtype=torch.int64)
+    data['end_positions'] = end_positions
+    return data
+
+
 # define output transform function
 output_transform_fn = lambda x: x
@@ -150,3 +161,9 @@ def data_gen_for_mcq():
                    output_transform_fn=output_transform_fn,
                    loss_fn=loss_fn,
                    model_attribute=ModelAttribute(has_control_flow=True))
+model_zoo.register(name='transformers_bert_for_question_answering',
+                   model_fn=lambda: transformers.BertForQuestionAnswering(config),
+                   data_gen_fn=data_gen_for_qa,
+                   output_transform_fn=output_transform_fn,
+                   loss_fn=loss_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
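As a sanity check on the generator: when `start_positions`/`end_positions` are present, `BertForQuestionAnswering` computes its loss internally, which is why no `labels` key is needed. A small standalone sketch (the tiny config values are arbitrary, chosen only to keep the example fast):

```python
import torch
import transformers

config = transformers.BertConfig(hidden_size=128, num_hidden_layers=2,
                                 num_attention_heads=4, intermediate_size=256)
model = transformers.BertForQuestionAnswering(config)
batch = dict(input_ids=torch.tensor([[101, 7592, 102]]),
             attention_mask=torch.ones(1, 3, dtype=torch.long),
             start_positions=torch.tensor([0], dtype=torch.int64),
             end_positions=torch.tensor([1], dtype=torch.int64))
outputs = model(**batch)
print(outputs.loss)                # scalar cross-entropy over the start/end spans
print(outputs.start_logits.shape)  # torch.Size([1, 3])
```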
19 changes: 12 additions & 7 deletions tests/test_pipeline/test_policy/test_bert_for_pretraining_model.py
@@ -7,6 +7,7 @@
 import colossalai
 from colossalai.cluster import ProcessGroupMesh
 from colossalai.pipeline.stage_manager import PipelineStageManager
+from colossalai.shardformer.policies.base_policy import Policy
 from colossalai.shardformer.policies.bert import BertForPreTrainingPolicy, bert_for_pretraining_forward
 from colossalai.shardformer.shard import ShardConfig
 from colossalai.testing import rerun_if_address_is_in_use, spawn
@@ -35,25 +36,29 @@ def check_bert_for_pretraining_forward():
     stage_manager = PipelineStageManager(pg_mesh, PP_DIM)
     rank = dist.get_rank()
     # print(rank)
+    layers_per_stage = Policy.distribute_layers(len(model.bert.encoder.layer), 2)
+    stage_index = Policy.get_stage_index(layers_per_stage, stage_manager.stage)

     x = torch.randint(0, 1000, (2, 3))
     hidden_states = torch.randint(0, 1000, (2, 3, 768)).to(torch.float32)
     if stage_manager.stage == 0:
         attention_mask = torch.ones_like(x)
-        output = bert_for_pretraining_forward(self=model,
-                                              input_ids=x,
-                                              attention_mask=attention_mask,
-                                              stage_manager=stage_manager)
-        print(output['hidden_states'].shape)
+        output = bert_for_pretraining_forward(
+            self=model,
+            input_ids=x,
+            attention_mask=attention_mask,
+            stage_manager=stage_manager,
+            stage_index=stage_index,
+        )
         assert output['hidden_states'].shape == (2, 3, 768)

     else:
         attention_mask = torch.ones((2, 3))
         output = bert_for_pretraining_forward(self=model,
                                               hidden_states=hidden_states,
                                               attention_mask=attention_mask,
-                                              stage_manager=stage_manager)
-        print(output[0].shape)
+                                              stage_manager=stage_manager,
+                                              stage_index=stage_index)
         assert output[0].shape == (2, 3, 30522)
         # assert output[1].shape == (2, 768)
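The two `Policy` helpers used by these updated tests split the encoder layers across pipeline stages and give each stage its `[start, end)` slice. A sketch of the semantics, assuming even division with any remainder pushed to later stages (the exact ColossalAI balancing may differ):

```python
from typing import List, Tuple

def distribute_layers(num_layers: int, num_stages: int) -> List[int]:
    # e.g. 12 layers over 2 stages -> [6, 6]; 13 over 2 -> [6, 7] under this assumption
    quotient, remainder = divmod(num_layers, num_stages)
    return [quotient + (1 if stage >= num_stages - remainder else 0)
            for stage in range(num_stages)]

def get_stage_index(layers_per_stage: List[int], stage: int) -> Tuple[int, int]:
    # half-open [start, end) range of layer indices owned by `stage`
    start = sum(layers_per_stage[:stage])
    return (start, start + layers_per_stage[stage])

assert distribute_layers(12, 2) == [6, 6]
assert get_stage_index([6, 6], 1) == (6, 12)
```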
(file header not rendered — next diff: the BertLMHeadModel pipeline test)
@@ -7,12 +7,13 @@
 import colossalai
 from colossalai.cluster import ProcessGroupMesh
 from colossalai.pipeline.stage_manager import PipelineStageManager
-from colossalai.shardformer.policies.bert import BertLMHeadModelPolicy, bert_lmhead_forward
+from colossalai.shardformer.policies.base_policy import Policy
+from colossalai.shardformer.policies.bert import BertLMHeadModelPolicy, bert_lm_head_model_forward
 from colossalai.shardformer.shard import ShardConfig
 from colossalai.testing import rerun_if_address_is_in_use, spawn


-def check_bert_lmhead_forward():
+def check_bert_lm_head_model_forward():
     configuration = BertConfig()
     model = BertLMHeadModel(configuration)
     DP_DIM, PP_DIM = 0, 1
@@ -35,24 +36,28 @@ def check_bert_lmhead_forward():
     stage_manager = PipelineStageManager(pg_mesh, PP_DIM)
     rank = dist.get_rank()
     # print(rank)

+    layers_per_stage = Policy.distribute_layers(len(model.bert.encoder.layer), 2)
+    stage_index = Policy.get_stage_index(layers_per_stage, stage_manager.stage)
     x = torch.randint(0, 1000, (2, 3))
     hidden_states = torch.randint(0, 1000, (2, 3, 768)).to(torch.float32)
     if stage_manager.stage == 0:
         attention_mask = torch.ones_like(x)
-        output = bert_lmhead_forward(self=model,
-                                     input_ids=x,
-                                     attention_mask=attention_mask,
-                                     stage_manager=stage_manager)
-
+        output = bert_lm_head_model_forward(self=model,
+                                            input_ids=x,
+                                            attention_mask=attention_mask,
+                                            stage_manager=stage_manager,
+                                            stage_index=stage_index)
         print(output['hidden_states'].shape)
         assert output['hidden_states'].shape == (2, 3, 768)

     else:
         attention_mask = torch.ones((2, 3))
-        output = bert_lmhead_forward(self=model,
-                                     hidden_states=hidden_states,
-                                     attention_mask=attention_mask,
-                                     stage_manager=stage_manager)
+        output = bert_lm_head_model_forward(self=model,
+                                            hidden_states=hidden_states,
+                                            attention_mask=attention_mask,
+                                            stage_manager=stage_manager,
+                                            stage_index=stage_index)
         print(output[0].shape)
         assert output[0].shape == (2, 3, 30522)

@@ -93,7 +98,7 @@ def check_bert_lmhead_policy():

 def run_dist_model(rank, world_size, port):
     colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host='localhost')
-    check_bert_lmhead_forward()
+    check_bert_lm_head_model_forward()


 def run_dist_policy(rank, world_size, port):
@@ -103,7 +108,7 @@ def run_dist_policy(rank, world_size, port):

 @pytest.mark.dist
 @rerun_if_address_is_in_use()
-def test_bert_lmhead_forward():
+def test_bert_lm_head_model_forward():
     spawn(run_dist_model, 4)

@@ -115,5 +120,5 @@ def test_bert_lmhead_policy():

 if __name__ == "__main__":
     """test the bert for pretraining model forward and bert for pretraining model policy"""
-    test_bert_lmhead_forward()
+    test_bert_lm_head_model_forward()
     test_bert_lmhead_policy()
16 changes: 11 additions & 5 deletions tests/test_pipeline/test_policy/test_bert_model.py
@@ -6,12 +6,14 @@
 import colossalai
 from colossalai.cluster import ProcessGroupMesh
 from colossalai.pipeline.stage_manager import PipelineStageManager
+from colossalai.shardformer.policies.base_policy import Policy
 from colossalai.shardformer.policies.bert import BertModelPolicy, bert_model_forward
 from colossalai.shardformer.shard import ShardConfig
 from colossalai.testing import rerun_if_address_is_in_use, spawn


 def check_bert_model_forward():
+    # this test may crash for internet reasons
     model = BertModel.from_pretrained('bert-base-uncased')
     DP_DIM, PP_DIM = 0, 1
     DP_SIZE, PP_SIZE = 2, 2
@@ -34,20 +36,25 @@ def check_bert_model_forward():
     stage_manager = PipelineStageManager(pg_mesh, PP_DIM)
     rank = dist.get_rank()
     # print(rank)
-
+    layers_per_stage = Policy.distribute_layers(len(model.encoder.layer), 2)
+    stage_index = Policy.get_stage_index(layers_per_stage, stage_manager.stage)
     x = torch.randint(0, 1000, (2, 3))
     hidden_states = torch.randint(0, 1000, (2, 3, 768)).to(torch.float32)
     if stage_manager.stage == 0:
         attention_mask = torch.ones_like(x)
-        output = bert_model_forward(self=model, input_ids=x, attention_mask=attention_mask, stage_manager=stage_manager)
-        print(output['hidden_states'].shape)
+        output = bert_model_forward(self=model,
+                                    input_ids=x,
+                                    attention_mask=attention_mask,
+                                    stage_manager=stage_manager,
+                                    stage_index=stage_index)
         assert output['hidden_states'].shape == (2, 3, 768)
     else:
         attention_mask = torch.ones((2, 3))
         output = bert_model_forward(self=model,
                                     hidden_states=hidden_states,
                                     attention_mask=attention_mask,
-                                    stage_manager=stage_manager)
+                                    stage_manager=stage_manager,
+                                    stage_index=stage_index)
         print(output[0].shape)
         assert output[0].shape == (2, 3, 768)

@@ -112,4 +119,3 @@ def test_bert_model_policy():
     """test the bert model forward and bert model policy"""
     #test_bert_model_forward()
     test_bert_model_policy()
-    # this test need config to run
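On the `# this test may crash for internet reasons` caveat: `from_pretrained('bert-base-uncased')` hits the Hugging Face Hub unless the weights are already cached. One hedged way to make the test hermetic, assuming the checkpoint is present in the local cache:

```python
import os

# Fail fast with a clear error instead of attempting a download mid-test.
os.environ["TRANSFORMERS_OFFLINE"] = "1"

from transformers import BertModel

model = BertModel.from_pretrained('bert-base-uncased')  # served from the local cache
```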
1 change: 0 additions & 1 deletion tests/test_shardformer/test_model/_utils.py
@@ -49,7 +49,6 @@ def run_forward(original_model, sharded_model, data_gen_fn, output_transform_fn,
     # prepare input
     data = data_gen_fn()
     data = {k: v.cuda() for k, v in data.items()}
-
     # switch to train mode
     original_model.train()
     sharded_model.train()