From 5aec80010b2a423822c156fa6510d2cf98f6aa88 Mon Sep 17 00:00:00 2001
From: FoolPlayer <498107402@qq.com>
Date: Tue, 22 Aug 2023 16:43:42 +0800
Subject: [PATCH] opt doesn't support seq-parallel: add warning, and fix a bug
 in gpt2 pp

---
 colossalai/shardformer/modeling/gpt2.py | 2 +-
 colossalai/shardformer/policies/opt.py  | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/colossalai/shardformer/modeling/gpt2.py b/colossalai/shardformer/modeling/gpt2.py
index 722f0f52334b..8ed367b25349 100644
--- a/colossalai/shardformer/modeling/gpt2.py
+++ b/colossalai/shardformer/modeling/gpt2.py
@@ -148,7 +148,7 @@ def gpt2_model_forward(
         if token_type_ids is not None:
             token_type_embeds = self.wte(token_type_ids)
             hidden_states = hidden_states + token_type_embeds
-    hidden_states = self.drop(hidden_states)
+        hidden_states = self.drop(hidden_states)
 
     output_shape = input_shape + (hidden_states.size(-1),)
 
diff --git a/colossalai/shardformer/policies/opt.py b/colossalai/shardformer/policies/opt.py
index ba6036bd0658..58663553b922 100644
--- a/colossalai/shardformer/policies/opt.py
+++ b/colossalai/shardformer/policies/opt.py
@@ -1,3 +1,4 @@
+import warnings
 from functools import partial
 from typing import Callable, Dict, List
 
@@ -39,6 +40,9 @@ def module_policy(self):
         from transformers.models.opt.modeling_opt import OPTAttention, OPTDecoder, OPTDecoderLayer
 
         policy = {}
+        if self.shard_config.enable_sequence_parallelism:
+            self.shard_config.enable_sequence_parallelism = False
+            warnings.warn("OPT doesn't support sequence parallelism now, will ignore the sequence parallelism flag.")
 
         if self.shard_config.enable_tensor_parallelism:
             policy[OPTDecoder] = ModulePolicyDescription(sub_module_replacement=[
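
Reviewer note (not part of the patch): the gpt2.py hunk is whitespace-only; the removed and added lines differ solely in indentation, moving self.drop(hidden_states) inside the first-pipeline-stage block so dropout is not re-applied to hidden states received from an earlier stage. The opt.py hunk follows a downgrade-and-warn pattern: the unsupported flag is reset in place and a warning is emitted, rather than failing later during sharding. Below is a minimal runnable sketch of that behaviour; ShardConfigStub and disable_unsupported_seq_parallel are hypothetical names used only for illustration, standing in for the real shard config and the guard added to module_policy() in policies/opt.py.

import warnings

class ShardConfigStub:
    # Hypothetical stand-in for the real shard config; only the one
    # flag touched by this patch is modelled.
    def __init__(self, enable_sequence_parallelism: bool = False):
        self.enable_sequence_parallelism = enable_sequence_parallelism

def disable_unsupported_seq_parallel(shard_config: ShardConfigStub) -> None:
    # Same logic as the guard added in the opt.py hunk: reset the flag
    # in place, then warn, so later policy code never sees sequence
    # parallelism enabled for OPT.
    if shard_config.enable_sequence_parallelism:
        shard_config.enable_sequence_parallelism = False
        warnings.warn(
            "OPT doesn't support sequence parallelism now, "
            "will ignore the sequence parallelism flag."
        )

config = ShardConfigStub(enable_sequence_parallelism=True)
disable_unsupported_seq_parallel(config)  # emits a UserWarning
assert config.enable_sequence_parallelism is False

Resetting the flag in place, rather than only warning, presumably matters because later checks such as the enable_tensor_parallelism branch read the same config object and should see a consistent state.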