From ec217de814a6b76a3aa6f0bb5f0d9662d82b5961 Mon Sep 17 00:00:00 2001
From: Kun Lin <81014421+klhhhhh@users.noreply.github.com>
Date: Fri, 7 Jul 2023 14:06:46 +0800
Subject: [PATCH 01/21] Feature/vit support (#4182)

* [shardformer] added tests

* [shardformer] vit test finish and support

* fix attention dropout
---
 colossalai/shardformer/policies/autopolicy.py |   8 +
 colossalai/shardformer/policies/vit.py        | 170 +++++++++---------
 tests/kit/model_zoo/transformers/__init__.py  |   1 +
 tests/kit/model_zoo/transformers/vit.py       |  60 +++++++
 .../test_model/test_shard_vit.py              |  62 +++++--
 5 files changed, 201 insertions(+), 100 deletions(-)
 create mode 100644 tests/kit/model_zoo/transformers/vit.py

diff --git a/colossalai/shardformer/policies/autopolicy.py b/colossalai/shardformer/policies/autopolicy.py
index 8051433e8d71..f49a552c82b3 100644
--- a/colossalai/shardformer/policies/autopolicy.py
+++ b/colossalai/shardformer/policies/autopolicy.py
@@ -71,6 +71,14 @@ class PolicyLocation:
     "transformers.models.gpt2.modeling_gpt2.GPT2ForSequenceClassification":
         PolicyLocation(file_name="gpt2", class_name="GPT2ForSequenceClassificationPolicy"),
 
+    # ViT
+    "transformers.models.vit.modeling_vit.ViTModel":
+        PolicyLocation(file_name="vit", class_name="ViTPolicy"),
+    "transformers.models.vit.modeling_vit.ViTForImageClassification":
+        PolicyLocation(file_name="vit", class_name="ViTForImageClassificationPolicy"),
+    "transformers.models.vit.modeling_vit.ViTForMaskedImageModeling":
+        PolicyLocation(file_name="vit", class_name="ViTForMaskedImageModelingPolicy"),
+
     # OPT
     "transformers.models.opt.modeling_opt.OPTModel":
         PolicyLocation(file_name="opt", class_name="OPTModelPolicy"),
diff --git a/colossalai/shardformer/policies/vit.py b/colossalai/shardformer/policies/vit.py
index eaebe2eee0ba..7b035afae22c 100644
--- a/colossalai/shardformer/policies/vit.py
+++ b/colossalai/shardformer/policies/vit.py
@@ -2,11 +2,11 @@
 
 import torch.nn as nn
 
-from colossalai.shardformer.layer import DropoutForReplicatedInput, FusedLayerNorm, Linear1D_Col, Linear1D_Row
+from colossalai.shardformer.layer import DropoutForReplicatedInput, DropoutForParallelInput, FusedLayerNorm, Linear1D_Col, Linear1D_Row
 
 from .basepolicy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
 
-__all__ = ['ViTPolicy']
+__all__ = ['ViTPolicy', 'ViTForImageClassificationPolicy', 'ViTForMaskedImageModelingPolicy']
 
 
 class ViTPolicy(Policy):
@@ -15,96 +15,104 @@ def config_sanity_check(self):
         pass
 
     def preprocess(self):
-        # Resize embedding
-        vocab_size = self.model.config.vocab_size
-        world_size = self.shard_config.tensor_parallel_size
-
-        if vocab_size % world_size != 0:
-            new_vocab_size = vocab_size + world_size - vocab_size % world_size
-            self.model.resize_token_embeddings(new_vocab_size)
-
         return self.model
 
     def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
         from transformers.models.vit.modeling_vit import ViTEmbeddings, ViTLayer
 
-        base_policy = {
-            ViTEmbeddings:
-                ModulePolicyDescription(sub_module_replacement=[
-                    SubModuleReplacementDescription(
-                        suffix="dropout",
-                        target_module=DropoutForReplicatedInput,
-                    )
-                ]),
-            ViTLayer:
-                ModulePolicyDescription(attribute_replacement={
-                    "attention.attention.num_attention_heads":
-                        self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size,
-                    "attention.attention.all_head_size":
-                        self.model.config.hidden_size // self.shard_config.tensor_parallel_size,
-                },
+        policy = {}
+
+        if self.shard_config.enable_tensor_parallelism:
+            policy[ViTEmbeddings] = ModulePolicyDescription(attribute_replacement={},
+                                        param_replacement=[],
                                         sub_module_replacement=[
                                             SubModuleReplacementDescription(
-                                                suffix="attention.attention.query",
-                                                target_module=Linear1D_Col,
-                                            ),
-                                            SubModuleReplacementDescription(
-                                                suffix="attention.attention.key",
-                                                target_module=Linear1D_Col,
-                                            ),
-                                            SubModuleReplacementDescription(
-                                                suffix="attention.attention.value",
-                                                target_module=Linear1D_Col,
-                                            ),
-                                            SubModuleReplacementDescription(
-                                                suffix="attention.attention.dropout",
-                                                target_module=DropoutForParallelInput,
-                                            ),
-                                            SubModuleReplacementDescription(
-                                                suffix="attention.output.dense",
-                                                target_module=Linear1D_Row,
-                                            ),
-                                            SubModuleReplacementDescription(
-                                                suffix="attention.output.dropout",
-                                                target_module=DropoutForParallelInput,
-                                            ),
-                                            SubModuleReplacementDescription(
-                                                suffix="intermediate.dense",
-                                                target_module=Linear1D_Col,
-                                            ),
-                                            SubModuleReplacementDescription(
-                                                suffix="output.dense",
-                                                target_module=Linear1D_Row,
-                                            ),
-                                            SubModuleReplacementDescription(
-                                                suffix="output.dropout",
-                                                target_module=DropoutForParallelInput,
-                                            ),
-                                        ]),
-        }
-
-        # optimization configuration
-        if self.shard_config.enable_fused_normalization:
-            base_policy[ViTAttention].sub_module_replacement.extend([
-                SubModuleReplacementDescription(
-                    suffix="layernorm_before",
-                    target_module=FusedLayerNorm,
-                ),
-                SubModuleReplacementDescription(
-                    suffix="layernorm_after",
-                    target_module=FusedLayerNorm,
+                                                suffix="dropout",
+                                                target_module=DropoutForReplicatedInput,
+                                            )
+                                        ])
+            
+            policy[ViTLayer] = ModulePolicyDescription(
+                    attribute_replacement={
+                        "attention.attention.num_attention_heads":
+                            self.model.config.num_attention_heads//self.shard_config.tensor_parallel_size,
+                        "attention.attention.all_head_size":
+                            self.model.config.hidden_size//self.shard_config.tensor_parallel_size,
+                    },
+                    param_replacement=[],
+                    sub_module_replacement=[
+                        SubModuleReplacementDescription(
+                            suffix="attention.attention.query",
+                            target_module=Linear1D_Col,
+                        ),
+                        SubModuleReplacementDescription(
+                            suffix="attention.attention.key",
+                            target_module=Linear1D_Col,
+                        ),
+                        SubModuleReplacementDescription(
+                            suffix="attention.attention.value",
+                            target_module=Linear1D_Col,
+                        ),
+                        SubModuleReplacementDescription(
+                            suffix="attention.attention.dropout",
+                            target_module=DropoutForParallelInput,
+                        ),
+                        SubModuleReplacementDescription(
+                            suffix="attention.output.dense",
+                            target_module=Linear1D_Row,
+                        ),
+                        SubModuleReplacementDescription(
+                            suffix="attention.output.dropout",
+                            target_module=DropoutForReplicatedInput,
+                        ),
+                        SubModuleReplacementDescription(
+                            suffix="intermediate.dense",
+                            target_module=Linear1D_Col,
+                        ),
+                        SubModuleReplacementDescription(
+                            suffix="output.dense",
+                            target_module=Linear1D_Row,
+                        ),
+                        SubModuleReplacementDescription(
+                            suffix="output.dropout",
+                            target_module=DropoutForReplicatedInput,
+                        ),
+                    ]
                 )
-            ])
-            base_policy[ViTModel].sub_module_replacement.append(
-                SubModuleReplacementDescription(
-                    suffix="layernorm",
-                    target_module=FusedLayerNorm,
-                ))
-
-        return base_policy
 
+        return policy
+  
+    
     def new_model_class(self):
         return None
 
     def postprocess(self):
         return self.model
+
+class ViTForImageClassificationPolicy(ViTPolicy):
+
+     def module_policy(self):
+        from transformers.models.vit.modeling_vit import ViTForImageClassification
+
+        policy = super().module_policy()
+        if self.shard_config.enable_tensor_parallelism:
+            new_item = {
+                ViTForImageClassification:
+                ModulePolicyDescription(sub_module_replacement=[
+                                        SubModuleReplacementDescription(suffix="classifier",
+                                                                            target_module=Linear1D_Col,
+                                                                            kwargs=dict(gather_output=True))
+                                        ])
+            }
+            policy.update(new_item)
+        return policy
+
+class ViTForMaskedImageModelingPolicy(ViTPolicy):
+    
+    def module_policy(self):
+        policy = super().module_policy()
+        return policy
+    
+
+        
+
diff --git a/tests/kit/model_zoo/transformers/__init__.py b/tests/kit/model_zoo/transformers/__init__.py
index 4aa01abe13ee..a298767d12e7 100644
--- a/tests/kit/model_zoo/transformers/__init__.py
+++ b/tests/kit/model_zoo/transformers/__init__.py
@@ -5,3 +5,4 @@
 from .llama import *
 from .opt import *
 from .t5 import *
+from .vit import *
diff --git a/tests/kit/model_zoo/transformers/vit.py b/tests/kit/model_zoo/transformers/vit.py
new file mode 100644
index 000000000000..1c86c7ebc742
--- /dev/null
+++ b/tests/kit/model_zoo/transformers/vit.py
@@ -0,0 +1,60 @@
+import torch
+import transformers
+
+from ..registry import ModelAttribute, model_zoo
+
+# ===============================
+# Register ViT models
+# ===============================
+
+config = transformers.ViTConfig(num_hidden_layers=4,
+                                hidden_size=128,
+                                intermediate_size=256,
+                                num_attention_heads=4)
+
+# define data gen function
+def data_gen():
+    pixel_values = torch.randn(1, 3, 224, 224)
+    return dict(pixel_values = pixel_values)
+
+def data_gen_for_masked_image_modeling():
+    data = data_gen()
+    num_patches = (config.image_size // config.patch_size) ** 2
+    bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()
+    data['bool_masked_pos'] = bool_masked_pos
+    return data
+
+# define output transform function
+output_transform_fn = lambda x: x
+
+# function to get the loss
+loss_fn_for_vit_model = lambda x : x.pooler_output.mean()
+loss_fn_for_image_classification = lambda x : x.logits.mean()
+loss_fn_for_masked_image_modeling = lambda x : x.loss
+
+# register the following models
+# transformers.ViTModel,
+# transformers.ViTForMaskedImageModeling,
+# transformers.ViTForImageClassification,
+model_zoo.register(name = 'transformers_vit',
+                    model_fn = lambda : transformers.ViTModel(config),
+                    data_gen_fn = data_gen,
+                    output_transform_fn = output_transform_fn,
+                    loss_fn = loss_fn_for_vit_model,
+                    model_attribute = ModelAttribute(has_control_flow=True))
+
+model_zoo.register(name = 'transformers_vit_for_masked_image_modeling',
+                    model_fn = lambda : transformers.ViTForMaskedImageModeling(config),
+                    data_gen_fn = data_gen_for_masked_image_modeling,
+                    output_transform_fn = output_transform_fn,
+                    loss_fn = loss_fn_for_masked_image_modeling,
+                    model_attribute = ModelAttribute(has_control_flow=True))
+
+model_zoo.register(name = 'transformers_vit_for_image_classification',
+                    model_fn = lambda : transformers.ViTForImageClassification(config),
+                    data_gen_fn = data_gen,
+                    output_transform_fn = output_transform_fn,
+                    loss_fn = loss_fn_for_image_classification,
+                    model_attribute = ModelAttribute(has_control_flow=True))
+
+
diff --git a/tests/test_shardformer/test_model/test_shard_vit.py b/tests/test_shardformer/test_model/test_shard_vit.py
index af1605b6b659..a96fd02ae746 100644
--- a/tests/test_shardformer/test_model/test_shard_vit.py
+++ b/tests/test_shardformer/test_model/test_shard_vit.py
@@ -1,9 +1,18 @@
+import os
+
 import pytest
 import torch
 
 import colossalai
 from colossalai.logging import disable_existing_loggers
-from colossalai.testing import assert_hf_output_close, clear_cache_before_run, rerun_if_address_is_in_use, spawn
+from colossalai.tensor.d_tensor.api import is_customized_distributed_tensor, is_distributed_tensor
+from colossalai.testing import (
+    assert_hf_output_close,
+    clear_cache_before_run,
+    parameterize,
+    rerun_if_address_is_in_use,
+    spawn,
+)
 from tests.kit.model_zoo import model_zoo
 from tests.test_shardformer.test_model._utils import build_model, run_forward
 
@@ -12,44 +21,59 @@ def check_forward_backward(org_model, sharded_model, data_gen_fn, output_transfo
     # check forward
     org_output, org_loss, shard_output, shard_loss = run_forward(org_model, sharded_model, data_gen_fn,
                                                                  output_transform_fn, loss_fn)
-    assert_hf_output_close(org_output, shard_output)
-
+    assert_hf_output_close(org_output, shard_output, atol=1e-4, rtol=1e-4)
     # do backward
     org_loss.backward()
     shard_loss.backward()
 
-    # check grad
-    org_grad = org_model.encoder.layer[0].attention.attention.query.weight.grad
-    shard_grad = sharded_model.encoder.layer[0].attention.attention.query.weight.grad
-
-    shard_grad_list = [torch.zeros([*shard_grad.shape]).to('cuda') for _ in range(2)]
-    shard_grad = torch.distributed.all_gather(shard_grad_list, shard_grad)
-    all_shard_grad = torch.cat(shard_grad_list, dim=0)
-
     assert torch.allclose(org_loss, shard_loss,
                           atol=1e-5), f"shard model loss is not equal to orgin model loss\n{org_loss}\n{shard_loss}"
-    assert torch.allclose(org_grad, all_shard_grad,
-                          atol=1e-5), f"shard model grad is not equal to orgin model grad\n{org_grad}\n{all_shard_grad}"
 
+    # unwrap model
+    if org_model.__class__.__name__ == 'ViTModel':
+        vit_model = org_model
+        shard_vit_model = sharded_model
+    else:
+        vit_model = org_model.vit
+        shard_vit_model = sharded_model.vit
 
-def check_vit(rank, world_size, port):
-    disable_existing_loggers()
-    colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    # check attention grad
+    org_grad = vit_model.encoder.layer[0].attention.attention.query.weight.grad
+    shard_grad = shard_vit_model.encoder.layer[0].attention.attention.query.weight.grad
+    shard_weight = shard_vit_model.encoder.layer[0].attention.attention.query.weight
 
+    if is_distributed_tensor(shard_weight) or is_customized_distributed_tensor(shard_weight):
+        shard_grad_list = [torch.zeros([*shard_grad.shape]).to('cuda') for _ in range(2)]
+        shard_grad = torch.distributed.all_gather(shard_grad_list, shard_grad)
+        all_shard_grad = torch.cat(shard_grad_list, dim=0)
+    else:
+        all_shard_grad = shard_grad
+    assert torch.allclose(org_grad, all_shard_grad,
+                          atol=1e-5), f"shard model grad is not equal to orgin model grad\n{org_grad}\n{shard_grad}"
+    
+
+@parameterize('enable_fused_normalization', [True, False])
+@parameterize('enable_tensor_parallelism', [True, False])
+def run_vit_test(enable_fused_normalization, enable_tensor_parallelism):
     sub_model_zoo = model_zoo.get_sub_registry('transformers_vit')
     for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
-        org_model, sharded_model = build_model(world_size, model_fn)
+        org_model, sharded_model = build_model(model_fn, enable_fused_normalization, enable_tensor_parallelism)
         check_forward_backward(org_model, sharded_model, data_gen_fn, output_transform_fn, loss_fn)
-
     torch.cuda.empty_cache()
 
 
+def check_vit(rank, world_size, port):
+    disable_existing_loggers()
+    colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    run_vit_test()
+
+
 @pytest.mark.dist
 @pytest.mark.skip
 @rerun_if_address_is_in_use()
 @clear_cache_before_run()
 def test_vit():
-    spawn(check_vit, 4)
+    spawn(check_vit, 2)
 
 
 if __name__ == "__main__":

From ddecf73962201fd75c1207154f3758f519b2e1df Mon Sep 17 00:00:00 2001
From: FoolPlayer <45593998+FoolPlayer@users.noreply.github.com>
Date: Fri, 14 Jul 2023 15:56:59 +0800
Subject: [PATCH 02/21] [shardformer] support SAM (#4231)

* 1.support sam 2.add fused qkv for nn.Linear

* update utils support set element in list

* overwrite SamVisionAttention forward to use DropoutForParallelInput

* remove unused code
---
 colossalai/shardformer/_utils.py              |  44 +++-
 colossalai/shardformer/layer/__init__.py      |   4 +-
 .../shardformer/layer/qkv_fused_linear.py     | 175 ++++++++++++++-
 colossalai/shardformer/modeling/sam.py        |  41 ++++
 colossalai/shardformer/policies/autopolicy.py |   6 +-
 colossalai/shardformer/policies/sam.py        | 209 ++++++++++++++++++
 tests/kit/model_zoo/transformers/__init__.py  |   1 +
 tests/kit/model_zoo/transformers/sam.py       |  52 +++++
 .../test_gpt2_qkv_fused_linear_1d.py          | 120 ++++++++++
 .../test_layer/test_qkv_fused_linear_1d.py    |  65 +-----
 .../test_model/test_shard_sam.py              |  92 ++++++++
 11 files changed, 740 insertions(+), 69 deletions(-)
 create mode 100644 colossalai/shardformer/modeling/sam.py
 create mode 100644 colossalai/shardformer/policies/sam.py
 create mode 100644 tests/kit/model_zoo/transformers/sam.py
 create mode 100644 tests/test_shardformer/test_layer/test_gpt2_qkv_fused_linear_1d.py
 create mode 100644 tests/test_shardformer/test_model/test_shard_sam.py

diff --git a/colossalai/shardformer/_utils.py b/colossalai/shardformer/_utils.py
index 4ad877e72357..c553080de0a0 100644
--- a/colossalai/shardformer/_utils.py
+++ b/colossalai/shardformer/_utils.py
@@ -1,25 +1,57 @@
 import re
 
 
-def get_obj_list_element(obj, a):
+def get_obj_list_element(obj, attr: str):
     r"""
     Get the element of the list in the object
+
+    If the attr is a normal attribute, return the attribute of the object.
+    If the attr is an indexed name like `layers[0]`, return the element at that index in the list.
+
+    Args:
+        obj (Object): The object to get
+        attr (str): The suffix of the attribute to get
+
     """
     re_pattern = r'\[\d+\]'
     prog = re.compile(re_pattern)
-    result = prog.search(a)
+    result = prog.search(attr)
     if result:
         matched_brackets = result.group()
         matched_index = matched_brackets.replace('[', '')
         matched_index = matched_index.replace(']', '')
-        a_ = a.replace(matched_brackets, '')
-        container_obj = getattr(obj, a_)
+        attr_ = attr.replace(matched_brackets, '')
+        container_obj = getattr(obj, attr_)
         obj = container_obj[int(matched_index)]
     else:
-        obj = getattr(obj, a)
+        obj = getattr(obj, attr)
     return obj
 
 
+def set_obj_list_element(obj, attr: str, value):
+    r"""
+    Set an attribute, or one element of a list attribute, of the object to value
+
+    For example, set_obj_list_element(obj, 'layers[0]', new_layer) sets obj.layers[0] to new_layer
+
+    Args:
+        obj (object): The object to set
+        attr (str): the string including a list index like `layers[0]`
+    """
+    re_pattern = r'\[\d+\]'
+    prog = re.compile(re_pattern)
+    result = prog.search(attr)
+    if result:
+        matched_brackets = result.group()
+        matched_index = matched_brackets.replace('[', '')
+        matched_index = matched_index.replace(']', '')
+        attr_ = attr.replace(matched_brackets, '')
+        container_obj = getattr(obj, attr_)
+        container_obj[int(matched_index)] = value
+    else:
+        setattr(obj, attr, value)
+
+
 def hasattr_(obj, attr: str):
     r"""
     Check whether the object has the multi sublevel attr
@@ -56,7 +88,7 @@ def setattr_(obj, attr: str, value, ignore: bool = False):
             if ignore:
                 return
             raise AttributeError(f"Object {obj.__class__.__name__} has no attribute {attr}")
-    setattr(obj, attrs[-1], value)
+    set_obj_list_element(obj, attrs[-1], value)
 
 
 def getattr_(obj, attr: str, ignore: bool = False):
diff --git a/colossalai/shardformer/layer/__init__.py b/colossalai/shardformer/layer/__init__.py
index 7fad4948dfd0..0c44e6621711 100644
--- a/colossalai/shardformer/layer/__init__.py
+++ b/colossalai/shardformer/layer/__init__.py
@@ -3,10 +3,10 @@
 from .linear import Linear1D_Col, Linear1D_Row
 from .loss import cross_entropy_1d
 from .normalization import FusedLayerNorm, FusedRMSNorm
-from .qkv_fused_linear import GPT2FusedLinearConv1D_Col, GPT2FusedLinearConv1D_Row
+from .qkv_fused_linear import FusedLinear1D_Col, GPT2FusedLinearConv1D_Col, GPT2FusedLinearConv1D_Row
 
 __all__ = [
     "Embedding1D", "VocabParallelEmbedding1D", "Linear1D_Col", "Linear1D_Row", 'GPT2FusedLinearConv1D_Col',
     'GPT2FusedLinearConv1D_Row', 'DropoutForParallelInput', 'DropoutForReplicatedInput', "cross_entropy_1d",
-    'FusedLayerNorm', 'FusedRMSNorm'
+    'FusedLayerNorm', 'FusedRMSNorm', 'FusedLinear1D_Col'
 ]
diff --git a/colossalai/shardformer/layer/qkv_fused_linear.py b/colossalai/shardformer/layer/qkv_fused_linear.py
index 9d51670c65dd..cce51bae9e30 100644
--- a/colossalai/shardformer/layer/qkv_fused_linear.py
+++ b/colossalai/shardformer/layer/qkv_fused_linear.py
@@ -23,6 +23,7 @@
 
 from ._operation import (
     gather_forward_split_backward,
+    linear_with_async_comm,
     matmul_with_async_comm,
     reduce_backward,
     reduce_forward,
@@ -31,7 +32,7 @@
 from .parallel_module import ParallelModule
 from .utils import create_randomizer_with_offset
 
-__all__ = ['FusedLinear1D_Col', 'FusedLinear1D_Row']
+__all__ = ['FusedLinear1D_Col', 'FusedLinear1D_Row', 'GPT2FusedLinearConv1D_Col', 'GPT2FusedLinearConv1D_Row']
 
 # ====================================
 # For GPT Only
@@ -471,3 +472,175 @@ def forward(self, input_: Tensor) -> Tensor:
             return output
         else:
             return output, self.bias
+
+
+# ====================================
+# For Fused torch.nn.Linear
+# ====================================
+
+
+class FusedLinear1D_Col(ParallelModule):
+    r"""Fused Linear layer with column parallelism.
+
+    The linear layer is defined as :math:`Y = XA + b`. A is parallelized along
+    its second dimension as :math:`A = [A_1, ..., A_p]`. This layer is used to replace a fused (e.g. QKV) `torch.nn.Linear` layer in Hugging Face models such as SAM.
+
+    Args:
+        in_features (int): size of each input sample.
+        out_features (int): size of each output sample.
+        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
+        dtype (`torch.dtype`): The dtype of parameters, defaults to None.
+        device (`torch.device`): The device of parameters, defaults to None.
+        n_fused (int): The number of items fused, defaults to 3 (QKV).
+        process_group (`torch.distributed.ProcessGroup`): The process group to be used for weight sharding and communication, defaults to None.
+        gather_output (bool, optional): If true, call all-gather on output and make Y available
+                    to all GPUs, otherwise, every GPU will have its output
+                    which is :math:`Y_i = XA_i`, defaults to False
+        skip_bias_add (bool): If set to ``True``, it will skip bias add for linear layer,
+            which is preserved for kernel fusion, defaults to False
+        weight_initializer (`typing.Callable`):
+            The initializer of weight, defaults to kaiming uniform initializer.
+        bias_initializer (`typing.Callable`):
+            The initializer of bias, defaults to xavier uniform initializer.
+
+    More details about ``initializer`` please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
+
+    def __init__(self,
+                 in_features: int,
+                 out_features: int,
+                 bias: bool = True,
+                 dtype: torch.dtype = None,
+                 device: torch.device = None,
+                 process_group: ProcessGroup = None,
+                 async_communication: bool = False,
+                 gather_output: bool = False,
+                 skip_bias_add: bool = False,
+                 n_fused: int = 3,
+                 weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
+                 bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1)):
+        super().__init__()
+
+        # Keep input parameters
+        self.in_features = in_features
+        self.out_features = out_features
+        self.gather_output = gather_output
+        self.skip_bias_add = skip_bias_add
+        self.device = device
+        self.n_fused = n_fused
+        self.process_group = process_group
+        self.async_communication = async_communication
+
+        if skip_bias_add and not bias:
+            raise ValueError('cannot skip bias addition if bias is None')
+
+        # Parameters.
+        # Initialize weight.
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        weight = torch.empty(self.out_features, self.in_features, **factory_kwargs)
+
+        def shard_fn(tensor):
+            return split_fused_qkv_in_gpt2_style(tensor, self.n_fused, self.process_group, False)
+
+        def gather_fn(tensor):    # counterpart of shard_fn: gather with the same n_fused that was used to split
+            return gather_fused_qkv_in_gpt2_style(tensor, self.n_fused, self.process_group, False)
+
+        with torch.no_grad():
+            sharded_weight = distribute_tensor_with_customization(weight, shard_fn, gather_fn)
+        self.weight = customized_distributed_tensor_to_param(sharded_weight)
+
+        if bias:
+            bias = torch.empty(self.out_features, **factory_kwargs)
+
+            with torch.no_grad():
+                sharded_bias = distribute_tensor_with_customization(bias, shard_fn, gather_fn)
+            self.bias = customized_distributed_tensor_to_param(sharded_bias)
+        else:
+            self.bias = None
+
+        # offset the seed with randomizer index and rank
+        seed = torch.random.initial_seed()
+        self.randomizer = create_randomizer_with_offset(seed, process_group=self.process_group)
+
+        # init weights
+        self.reset_parameters(weight_initializer, bias_initializer)
+
+    @staticmethod
+    def from_native_module(module: nn.Module, process_group: Union[ProcessGroup, List[ProcessGroup]], n_fused: int,
+                           *args, **kwargs) -> ParallelModule:
+        r"""
+        Convert a fused `torch.nn.Linear` layer to a parallelized linear layer and return the new layer.
+
+        Args:
+            module (`nn.Linear`): The module to be converted.
+            process_group (`Union[ProcessGroup, List[ProcessGroup]]`): The process group to be used for weight sharding and communication.
+            n_fused (int): The number of sub-weights fused in the module's weight (e.g. 3 when Q, K and V are fused).
+        """
+        # get the attributes
+        in_features = module.in_features
+        out_features = module.out_features
+        bias = module.bias is not None
+        device = module.weight.device
+
+        # ensure only one process group is passed
+        if isinstance(process_group, (list, tuple)):
+            assert len(process_group) == 1, \
+                f'Expected only one process group, got {len(process_group)}.'
+            process_group = process_group[0]
+
+        linear_1d = FusedLinear1D_Col(in_features=in_features,
+                                      out_features=out_features,
+                                      bias=bias,
+                                      device=device,
+                                      process_group=process_group,
+                                      n_fused=n_fused,    # keep the layer's own shard fn consistent with the split below
+                                      *args, **kwargs)
+
+        # copy the sharded slices of the native module's parameters into the new layer
+        with torch.no_grad():
+            sharded_weight = split_fused_qkv_in_gpt2_style(module.weight.data,
+                                                           n_fused=n_fused,
+                                                           process_group=process_group,
+                                                           is_transposed=False)
+            linear_1d.weight.data.copy_(sharded_weight.data)
+
+            if bias:
+                sharded_bias = split_fused_qkv_in_gpt2_style(module.bias.data,
+                                                             n_fused=n_fused,
+                                                             process_group=process_group,
+                                                             is_transposed=False)
+                linear_1d.bias.data.copy_(sharded_bias.data)
+
+        return linear_1d
+
+    def reset_parameters(self, weight_initializer, bias_initializer) -> None:
+        with self.randomizer.fork_rng(enable_cpu=True):
+            fan_in, fan_out = self.in_features, self.out_features
+            weight_initializer(self.weight, fan_in=fan_in, fan_out=fan_out)
+            if self.bias is not None:
+                bias_initializer(self.bias, fan_in=fan_in)
+
+    def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
+        assert input_.shape[-1] == self.weight.shape[-1], \
+            'Invalid shapes in Linear1D_Col forward: input={}, weight={}. Expected last dim of input {}.'.format(
+                input_.shape, self.weight.shape, self.weight.shape[-1])
+        # Set up backprop all-reduce.
+        # input_parallel = reduce_backward(input_, self.process_group)
+        input_parallel = input_
+
+        # Matrix multiply.
+        bias = self.bias if not self.skip_bias_add else None
+
+        output_parallel = linear_with_async_comm(input_parallel, self.weight, bias, self.process_group, True)
+
+        if self.gather_output:
+            # All-gather across the partitions.
+            output = gather_forward_split_backward(output_parallel, dim=-1, process_group=self.process_group)
+        else:
+            output = output_parallel
+
+        if self.skip_bias_add:
+            return output, self.bias
+        else:
+            return output
diff --git a/colossalai/shardformer/modeling/sam.py b/colossalai/shardformer/modeling/sam.py
new file mode 100644
index 000000000000..00e2d744e219
--- /dev/null
+++ b/colossalai/shardformer/modeling/sam.py
@@ -0,0 +1,41 @@
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+
+def forward_fn():
+
+    def forward(self, hidden_states: torch.Tensor, output_attentions=False) -> torch.Tensor:
+        batch_size, height, width, _ = hidden_states.shape
+        # qkv with shape (3, batch_size, nHead, height * width, channel)
+        qkv = (self.qkv(hidden_states).reshape(batch_size, height * width, 3, self.num_attention_heads,
+                                               -1).permute(2, 0, 3, 1, 4))
+        # q, k, v with shape (batch_size * nHead, height * width, channel)
+        query, key, value = qkv.reshape(3, batch_size * self.num_attention_heads, height * width, -1).unbind(0)
+
+        attn_weights = (query * self.scale) @ key.transpose(-2, -1)
+
+        if self.use_rel_pos:
+            attn_weights = self.add_decomposed_rel_pos(attn_weights, query, self.rel_pos_h, self.rel_pos_w,
+                                                       (height, width), (height, width))
+
+        attn_weights = torch.nn.functional.softmax(attn_weights, dtype=torch.float32, dim=-1).to(query.dtype)
+
+        # replace dropout process with added DropoutForParallelInput layer
+        # original code:
+        # attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+        attn_probs = self.dropout_layer(attn_weights)
+
+        attn_output = (attn_probs @ value).reshape(batch_size, self.num_attention_heads, height, width, -1)
+        attn_output = attn_output.permute(0, 2, 3, 1, 4).reshape(batch_size, height, width, -1)
+
+        attn_output = self.proj(attn_output)
+
+        if output_attentions:
+            outputs = (attn_output, attn_weights)
+        else:
+            outputs = (attn_output, None)
+
+        return outputs
+
+    return forward
diff --git a/colossalai/shardformer/policies/autopolicy.py b/colossalai/shardformer/policies/autopolicy.py
index f49a552c82b3..ccf2199bd056 100644
--- a/colossalai/shardformer/policies/autopolicy.py
+++ b/colossalai/shardformer/policies/autopolicy.py
@@ -88,7 +88,7 @@ class PolicyLocation:
         PolicyLocation(file_name="opt", class_name="OPTForSequenceClassificationPolicy"),
     "transformers.models.opt.modeling_opt.OPTForQuestionAnswering":
         PolicyLocation(file_name="opt", class_name="OPTForQuestionAnsweringPolicy"),
-        
+
     # Bloom
     "transformers.models.bloom.modeling_bloom.BloomModel":
         PolicyLocation(file_name="bloom", class_name="BloomModelPolicy"),
@@ -100,6 +100,10 @@ class PolicyLocation:
         PolicyLocation(file_name="bloom", class_name="BloomForTokenClassificationPolicy"),
     "transformers.models.bloom.modeling_bloom.BloomForQuestionAnswering":
         PolicyLocation(file_name="bloom", class_name="BloomForQuestionAnsweringPolicy"),
+
+    # Sam
+    "transformers.models.sam.modeling_sam.SamModel":
+        PolicyLocation(file_name="sam", class_name="SamModelPolicy"),
 }
 
 
diff --git a/colossalai/shardformer/policies/sam.py b/colossalai/shardformer/policies/sam.py
new file mode 100644
index 000000000000..e75d63946260
--- /dev/null
+++ b/colossalai/shardformer/policies/sam.py
@@ -0,0 +1,209 @@
+import torch.nn as nn
+
+import colossalai.shardformer.layer as col_nn
+
+from .._utils import getattr_, setattr_
+from ..modeling.sam import forward_fn
+from .basepolicy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
+
+__all__ = ['SamPolicy', 'SamModelPolicy']
+
+
+class SamPolicy(Policy):
+
+    def config_sanity_check(self):
+        pass
+
+    def preprocess(self):
+        return self.model
+
+    def module_policy(self):
+        from transformers.models.sam.modeling_sam import (
+            SamFeedForward,
+            SamTwoWayAttentionBlock,
+            SamTwoWayTransformer,
+            SamVisionAttention,
+            SamVisionLayer,
+        )
+
+        policy = {}
+
+        if self.shard_config.enable_tensor_parallelism:
+            policy[SamVisionLayer] = ModulePolicyDescription(attribute_replacement={
+                "attn.num_attention_heads":
+                    self.model.config.vision_config.num_attention_heads // self.shard_config.tensor_parallel_size,
+            },
+                                                             sub_module_replacement=[
+                                                                 SubModuleReplacementDescription(
+                                                                     suffix="attn.qkv",
+                                                                     target_module=col_nn.FusedLinear1D_Col,
+                                                                     kwargs={
+                                                                         "n_fused": 3,
+                                                                     },
+                                                                 ),
+                                                                 SubModuleReplacementDescription(
+                                                                     suffix="attn.proj",
+                                                                     target_module=col_nn.Linear1D_Row,
+                                                                 ),
+                                                                 SubModuleReplacementDescription(
+                                                                     suffix="mlp.lin1",
+                                                                     target_module=col_nn.Linear1D_Col,
+                                                                 ),
+                                                                 SubModuleReplacementDescription(
+                                                                     suffix="mlp.lin2",
+                                                                     target_module=col_nn.Linear1D_Row,
+                                                                 )
+                                                             ])
+            policy[SamTwoWayAttentionBlock] = ModulePolicyDescription(
+                attribute_replacement={
+                    "self_attn.num_attention_heads":
+                        self.model.config.mask_decoder_config.num_attention_heads //
+                        self.shard_config.tensor_parallel_size,
+                },
+                sub_module_replacement=[
+                    SubModuleReplacementDescription(
+                        suffix="self_attn.q_proj",
+                        target_module=col_nn.Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="self_attn.k_proj",
+                        target_module=col_nn.Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="self_attn.v_proj",
+                        target_module=col_nn.Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="self_attn.out_proj",
+                        target_module=col_nn.Linear1D_Row,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="cross_attn_token_to_image.q_proj",
+                        target_module=col_nn.Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="cross_attn_token_to_image.k_proj",
+                        target_module=col_nn.Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="cross_attn_token_to_image.v_proj",
+                        target_module=col_nn.Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="cross_attn_token_to_image.out_proj",
+                        target_module=col_nn.Linear1D_Row,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="mlp.lin1",
+                        target_module=col_nn.Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="mlp.lin2",
+                        target_module=col_nn.Linear1D_Row,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="cross_attn_image_to_token.q_proj",
+                        target_module=col_nn.Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="cross_attn_image_to_token.k_proj",
+                        target_module=col_nn.Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="cross_attn_image_to_token.v_proj",
+                        target_module=col_nn.Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="cross_attn_image_to_token.out_proj",
+                        target_module=col_nn.Linear1D_Row,
+                    ),
+                ])
+            policy[SamTwoWayTransformer] = ModulePolicyDescription(attribute_replacement={
+                "final_attn_token_to_image.num_attention_heads":
+                    self.model.config.mask_decoder_config.num_attention_heads // self.shard_config.tensor_parallel_size,
+            },
+                                                                   sub_module_replacement=[
+                                                                       SubModuleReplacementDescription(
+                                                                           suffix="final_attn_token_to_image.q_proj",
+                                                                           target_module=col_nn.Linear1D_Col,
+                                                                       ),
+                                                                       SubModuleReplacementDescription(
+                                                                           suffix="final_attn_token_to_image.k_proj",
+                                                                           target_module=col_nn.Linear1D_Col,
+                                                                       ),
+                                                                       SubModuleReplacementDescription(
+                                                                           suffix="final_attn_token_to_image.v_proj",
+                                                                           target_module=col_nn.Linear1D_Col,
+                                                                       ),
+                                                                       SubModuleReplacementDescription(
+                                                                           suffix="final_attn_token_to_image.out_proj",
+                                                                           target_module=col_nn.Linear1D_Row,
+                                                                       )
+                                                                   ])
+
+            # add `DropoutForParallelInput` layer to replace the usage of `nn.functional.dropout`
+            policy[SamVisionAttention] = ModulePolicyDescription(attribute_replacement={
+                "dropout_layer": col_nn.DropoutForParallelInput(self.model.config.vision_config.attention_dropout)
+            },
+                                                                 method_replacement={"forward": forward_fn()},
+                                                                 sub_module_replacement=[])
+
+        # optimization configuration
+        if self.shard_config.enable_fused_normalization:
+            # Handle SamVisionLayer
+            self.append_or_create_submodule_replacement(description=[
+                SubModuleReplacementDescription(
+                    suffix="layer_norm1",
+                    target_module=col_nn.FusedLayerNorm,
+                ),
+                SubModuleReplacementDescription(
+                    suffix="layer_norm2",
+                    target_module=col_nn.FusedLayerNorm,
+                )
+            ],
+                                                        policy=policy,
+                                                        target_key=SamVisionLayer)
+
+            # Handle SamTwoWayAttentionBlock
+            self.append_or_create_submodule_replacement(description=[
+                SubModuleReplacementDescription(
+                    suffix="layer_norm1",
+                    target_module=col_nn.FusedLayerNorm,
+                ),
+                SubModuleReplacementDescription(
+                    suffix="layer_norm2",
+                    target_module=col_nn.FusedLayerNorm,
+                ),
+                SubModuleReplacementDescription(
+                    suffix="layer_norm3",
+                    target_module=col_nn.FusedLayerNorm,
+                ),
+                SubModuleReplacementDescription(
+                    suffix="layer_norm4",
+                    target_module=col_nn.FusedLayerNorm,
+                )
+            ],
+                                                        policy=policy,
+                                                        target_key=SamTwoWayAttentionBlock)
+
+            # Handle SamTwoWayTransformer
+            self.append_or_create_submodule_replacement(description=[
+                SubModuleReplacementDescription(
+                    suffix="layer_norm_final_attn",
+                    target_module=col_nn.FusedLayerNorm,
+                )
+            ],
+                                                        policy=policy,
+                                                        target_key=SamTwoWayTransformer)
+
+        return policy
+
+    def postprocess(self):
+        return self.model
+
+
+# SamModel
+class SamModelPolicy(SamPolicy):
+
+    def __init__(self) -> None:
+        super().__init__()
diff --git a/tests/kit/model_zoo/transformers/__init__.py b/tests/kit/model_zoo/transformers/__init__.py
index a298767d12e7..a1bcb78ddf6b 100644
--- a/tests/kit/model_zoo/transformers/__init__.py
+++ b/tests/kit/model_zoo/transformers/__init__.py
@@ -4,5 +4,6 @@
 from .gpt import *
 from .llama import *
 from .opt import *
+from .sam import *
 from .t5 import *
 from .vit import *
diff --git a/tests/kit/model_zoo/transformers/sam.py b/tests/kit/model_zoo/transformers/sam.py
new file mode 100644
index 000000000000..d850623f368f
--- /dev/null
+++ b/tests/kit/model_zoo/transformers/sam.py
@@ -0,0 +1,52 @@
+import torch
+import transformers
+
+from ..registry import ModelAttribute, model_zoo
+
+# ===============================
+# Register single-image SAM
+# ===============================
+
+
+# define data gen function
+def data_gen():
+    # Generated from following code snippet
+    #
+    # from PIL import Image
+    # import requests
+    # from transformers import SamModel, SamProcessor
+    #
+    # model = SamModel.from_pretrained("facebook/sam-vit-base")
+    # processor = SamProcessor.from_pretrained("facebook/sam-vit-base")
+    #
+    # img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
+    # raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
+    # input_points = [[[450, 600]]] # 2D localization of a window
+    # inputs = processor(raw_image, input_points=input_points, return_tensors="pt")
+
+    pixel_values = torch.rand(1, 3, 1024, 1024, dtype=torch.float32)
+    original_sizes = torch.tensor([[1764, 2646]], dtype=torch.int64)
+    reshaped_input_sizes = torch.tensor([[683, 1024]], dtype=torch.int64)
+    input_points = torch.tensor([[[[174.1497, 232.3129]]]], dtype=torch.float64)
+    return dict(pixel_values=pixel_values,
+                original_sizes=original_sizes,
+                reshaped_input_sizes=reshaped_input_sizes,
+                input_points=input_points)
+
+
+# define output transform function
+output_transform_fn = lambda x: x
+
+# define loss function
+loss_fn = lambda x: x.iou_scores.mean()
+
+config = transformers.SamConfig()
+config.vision_config.num_hidden_layers = 2
+
+# register the SAM model
+model_zoo.register(name='transformers_sam',
+                   model_fn=lambda: transformers.SamModel(config),
+                   data_gen_fn=data_gen,
+                   output_transform_fn=output_transform_fn,
+                   loss_fn=loss_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
diff --git a/tests/test_shardformer/test_layer/test_gpt2_qkv_fused_linear_1d.py b/tests/test_shardformer/test_layer/test_gpt2_qkv_fused_linear_1d.py
new file mode 100644
index 000000000000..9eeda93afe35
--- /dev/null
+++ b/tests/test_shardformer/test_layer/test_gpt2_qkv_fused_linear_1d.py
@@ -0,0 +1,120 @@
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from torch.testing import assert_close
+
+import colossalai
+from colossalai.shardformer.layer import GPT2FusedLinearConv1D_Col, GPT2FusedLinearConv1D_Row
+from colossalai.shardformer.layer.qkv_fused_linear import split_fused_qkv_in_gpt2_style
+from colossalai.testing import rerun_if_address_is_in_use, spawn
+
+
+# This code is copied from https://github.com/huggingface/transformers
+class Conv1D(nn.Module):
+    """
+    1D-convolutional layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2).
+
+    Basically works like a linear layer but the weights are transposed.
+
+    Args:
+        nf (`int`): The number of output features.
+        nx (`int`): The number of input features.
+    """
+
+    def __init__(self, nf, nx):
+        super().__init__()
+        self.nf = nf
+        self.weight = nn.Parameter(torch.empty(nx, nf))
+        self.bias = nn.Parameter(torch.zeros(nf))
+        nn.init.normal_(self.weight, std=0.02)
+
+    def forward(self, x):
+        size_out = x.size()[:-1] + (self.nf,)
+        x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
+        x = x.view(size_out)
+        return x
+
+
+def rearrange(tensor: torch.Tensor, dim: int):
+    tensor = tensor.clone()
+    world_size = 2
+    order = torch.arange(world_size * 3)
+    new_order = []
+    for i in range(world_size):
+        new_order.append(order[i::world_size])
+    new_order = torch.cat(new_order)
+
+    tensor_chunks = torch.chunk(tensor, world_size * 3, dim=dim)
+    rearanged_tensor_chunks = [tensor_chunks[i] for i in new_order]
+    rearanged_tensor = torch.cat(rearanged_tensor_chunks, dim=dim)
+    return rearanged_tensor
+
+
+def check_gpt2_linear_conv_1d_col():
+    linear = Conv1D(192, 48).cuda()
+    linear_conv_col = GPT2FusedLinearConv1D_Col.from_native_module(linear,
+                                                                   process_group=None,
+                                                                   gather_output=True,
+                                                                   n_fused=3)
+
+    assert linear.weight.shape == torch.Size([48, 192])
+    assert linear.bias.shape == torch.Size([192])
+    assert linear_conv_col.weight.shape == torch.Size([48, 96])
+    assert linear_conv_col.bias.shape == torch.Size([96])
+
+    # ensure weights are reversibly loadable
+    linear_conv_col.load_state_dict(linear.state_dict())
+    linear.load_state_dict(linear_conv_col.state_dict())
+
+    # check computation correctness
+    x = torch.rand(4, 48).cuda()
+    out = linear(x)
+    gather_out = linear_conv_col(x)
+    assert_close(rearrange(out, 1), gather_out)
+
+    # check backward correctness
+    out.sum().backward()
+    gather_out.sum().backward()
+
+    target_grad = split_fused_qkv_in_gpt2_style(linear.weight.grad, 3, None, True)
+    assert_close(target_grad, linear_conv_col.weight.grad)
+
+
+def check_gpt2_linear_conv_1d_row():
+    linear = Conv1D(192, 48).cuda()
+    linear_row = GPT2FusedLinearConv1D_Row.from_native_module(linear, process_group=None, parallel_input=False)
+
+    assert linear.weight.shape == torch.Size([48, 192])
+    assert linear_row.weight.shape == torch.Size([24, 192])
+    assert linear_row.bias.shape == torch.Size([192])
+
+    # check computation correctness
+    x = torch.rand(4, 48).cuda()
+    out = linear(x)
+    gather_out = linear_row(x)
+    assert_close(out, gather_out)
+
+    # check backward correctness
+    out.sum().backward()
+    gather_out.sum().backward()
+
+    rank = dist.get_rank()
+    target_grad = torch.chunk(linear.weight.grad, 2, dim=0)[rank]
+    assert_close(target_grad, linear_row.weight.grad)
+
+
+def run_dist(rank, world_size, port):
+    colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+
+    # test for linear conv
+    check_gpt2_linear_conv_1d_col()
+    check_gpt2_linear_conv_1d_row()
+
+
+@rerun_if_address_is_in_use()
+def test_gpt2_linearconv():
+    spawn(run_dist, nprocs=2)
+
+
+if __name__ == '__main__':
+    test_gpt2_linearconv()
diff --git a/tests/test_shardformer/test_layer/test_qkv_fused_linear_1d.py b/tests/test_shardformer/test_layer/test_qkv_fused_linear_1d.py
index 681c4f6dd9f1..805ef68309e0 100644
--- a/tests/test_shardformer/test_layer/test_qkv_fused_linear_1d.py
+++ b/tests/test_shardformer/test_layer/test_qkv_fused_linear_1d.py
@@ -4,37 +4,11 @@
 from torch.testing import assert_close
 
 import colossalai
-from colossalai.shardformer.layer import GPT2FusedLinearConv1D_Col, GPT2FusedLinearConv1D_Row
+from colossalai.shardformer.layer import FusedLinear1D_Col
 from colossalai.shardformer.layer.qkv_fused_linear import split_fused_qkv_in_gpt2_style
 from colossalai.testing import rerun_if_address_is_in_use, spawn
 
 
-# This code is copied from https://github.com/huggingface/transformers
-class Conv1D(nn.Module):
-    """
-    1D-convolutional layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2).
-
-    Basically works like a linear layer but the weights are transposed.
-
-    Args:
-        nf (`int`): The number of output features.
-        nx (`int`): The number of input features.
-    """
-
-    def __init__(self, nf, nx):
-        super().__init__()
-        self.nf = nf
-        self.weight = nn.Parameter(torch.empty(nx, nf))
-        self.bias = nn.Parameter(torch.zeros(nf))
-        nn.init.normal_(self.weight, std=0.02)
-
-    def forward(self, x):
-        size_out = x.size()[:-1] + (self.nf,)
-        x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
-        x = x.view(size_out)
-        return x
-
-
 def rearrange(tensor: torch.Tensor, dim: int):
     tensor = tensor.clone()
     world_size = 2
@@ -51,15 +25,12 @@ def rearrange(tensor: torch.Tensor, dim: int):
 
 
 def check_linear_conv_1d_col():
-    linear = Conv1D(192, 48).cuda()
-    linear_conv_col = GPT2FusedLinearConv1D_Col.from_native_module(linear,
-                                                                   process_group=None,
-                                                                   gather_output=True,
-                                                                   n_fused=3)
+    linear = nn.Linear(48, 192).cuda()
+    linear_conv_col = FusedLinear1D_Col.from_native_module(linear, process_group=None, gather_output=True, n_fused=3)
 
-    assert linear.weight.shape == torch.Size([48, 192])
+    assert linear.weight.shape == torch.Size([192, 48])
     assert linear.bias.shape == torch.Size([192])
-    assert linear_conv_col.weight.shape == torch.Size([48, 96])
+    assert linear_conv_col.weight.shape == torch.Size([96, 48])
     assert linear_conv_col.bias.shape == torch.Size([96])
 
     # ensure weights are reversibly loadable
@@ -76,39 +47,15 @@ def check_linear_conv_1d_col():
     out.sum().backward()
     gather_out.sum().backward()
 
-    target_grad = split_fused_qkv_in_gpt2_style(linear.weight.grad, 3, None, True)
+    target_grad = split_fused_qkv_in_gpt2_style(linear.weight.grad, 3, None, False)
     assert_close(target_grad, linear_conv_col.weight.grad)
 
 
-def check_linear_conv_1d_row():
-    linear = Conv1D(192, 48).cuda()
-    linear_row = GPT2FusedLinearConv1D_Row.from_native_module(linear, process_group=None, parallel_input=False)
-
-    assert linear.weight.shape == torch.Size([48, 192])
-    assert linear_row.weight.shape == torch.Size([24, 192])
-    assert linear_row.bias.shape == torch.Size([192])
-
-    # check computation correctness
-    x = torch.rand(4, 48).cuda()
-    out = linear(x)
-    gather_out = linear_row(x)
-    assert_close(out, gather_out)
-
-    # check backward correctness
-    out.sum().backward()
-    gather_out.sum().backward()
-
-    rank = dist.get_rank()
-    target_grad = torch.chunk(linear.weight.grad, 2, dim=0)[rank]
-    assert_close(target_grad, linear_row.weight.grad)
-
-
 def run_dist(rank, world_size, port):
     colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
 
     # test for linear conv
     check_linear_conv_1d_col()
-    check_linear_conv_1d_row()
 
 
 @rerun_if_address_is_in_use()
diff --git a/tests/test_shardformer/test_model/test_shard_sam.py b/tests/test_shardformer/test_model/test_shard_sam.py
new file mode 100644
index 000000000000..1d047d8e0c42
--- /dev/null
+++ b/tests/test_shardformer/test_model/test_shard_sam.py
@@ -0,0 +1,92 @@
+import pytest
+import torch
+
+import colossalai
+from colossalai.logging import disable_existing_loggers
+from colossalai.tensor.d_tensor.api import is_customized_distributed_tensor, is_distributed_tensor
+from colossalai.testing import (
+    assert_hf_output_close,
+    clear_cache_before_run,
+    parameterize,
+    rerun_if_address_is_in_use,
+    spawn,
+)
+from tests.kit.model_zoo import model_zoo
+from tests.test_shardformer.test_model._utils import build_model, run_forward
+
+
+def check_forward_backward(org_model, sharded_model, data_gen_fn, output_transform_fn, loss_fn):
+    # check forward
+    org_output, org_loss, shard_output, shard_loss = run_forward(org_model, sharded_model, data_gen_fn,
+                                                                 output_transform_fn, loss_fn)
+    assert_hf_output_close(org_output, shard_output, ignore_keys=['pred_masks'])
+
+    # do backward
+    org_loss.backward()
+    shard_loss.backward()
+
+    assert torch.allclose(org_loss, shard_loss,
+                          atol=1e-5), f"shard model loss is not equal to orgin model loss\n{org_loss}\n{shard_loss}"
+
+    # check grad
+
+    sam = org_model
+    sharded_sam = sharded_model
+
+    # compare mask decoder grad
+
+    org_grad = sam.mask_decoder.transformer.layers[0].self_attn.q_proj.weight.grad
+    shard_grad = sharded_sam.mask_decoder.transformer.layers[0].self_attn.q_proj.weight.grad
+    shard_weight = sharded_sam.mask_decoder.transformer.layers[0].self_attn.q_proj.weight
+
+    if is_distributed_tensor(shard_weight) or is_customized_distributed_tensor(shard_weight):
+        shard_grad_list = [torch.zeros([*shard_grad.shape]).to('cuda') for _ in range(2)]
+        shard_grad = torch.distributed.all_gather(shard_grad_list, shard_grad)
+        all_shard_grad = torch.cat(shard_grad_list, dim=0)
+    else:
+        all_shard_grad = shard_grad
+    assert torch.allclose(org_grad, all_shard_grad,
+                          atol=1e-5), f"shard model grad is not equal to orgin model grad\n{org_grad}\n{all_shard_grad}"
+
+    # compare vision_encoder grad
+    org_grad = sam.vision_encoder.layers[0].mlp.lin1.weight.grad
+    shard_grad = sharded_sam.vision_encoder.layers[0].mlp.lin1.weight.grad
+    shard_weight = sharded_sam.vision_encoder.layers[0].mlp.lin1.weight
+
+    if is_distributed_tensor(shard_weight) or is_customized_distributed_tensor(shard_weight):
+        shard_grad_list = [torch.zeros([*shard_grad.shape]).to('cuda') for _ in range(2)]
+        shard_grad = torch.distributed.all_gather(shard_grad_list, shard_grad)
+        all_shard_grad = torch.cat(shard_grad_list, dim=0)
+    else:
+        all_shard_grad = shard_grad
+
+    assert torch.allclose(org_grad, all_shard_grad,
+                          atol=1e-5), f"shard model grad is not equal to orgin model grad\n{org_grad}\n{all_shard_grad}"
+
+
+@parameterize('enable_fused_normalization', [True, False])
+@parameterize('enable_tensor_parallelism', [True, False])
+def run_sam_test(enable_fused_normalization, enable_tensor_parallelism):
+    sub_model_zoo = model_zoo.get_sub_registry('transformers_sam')
+    for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
+        org_model, sharded_model = build_model(model_fn, enable_fused_normalization, enable_tensor_parallelism)
+        check_forward_backward(org_model, sharded_model, data_gen_fn, output_transform_fn, loss_fn)
+
+    torch.cuda.empty_cache()
+
+
+def check_sam(rank, world_size, port):
+    disable_existing_loggers()
+    colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    run_sam_test()
+
+
+@pytest.mark.dist
+@rerun_if_address_is_in_use()
+@clear_cache_before_run()
+def test_sam():
+    spawn(check_sam, 2)
+
+
+if __name__ == "__main__":
+    test_sam()

From afcf4a054b925e8dcb68f0515b186338c9f8471d Mon Sep 17 00:00:00 2001
From: FoolPlayer <45593998+FoolPlayer@users.noreply.github.com>
Date: Mon, 17 Jul 2023 14:25:32 +0800
Subject: [PATCH 03/21] [shardformer] support whisper (#4212)

* support whisper

* fix bug in vocabembedding

* support downstream model of whisper

* update readme
---
 colossalai/shardformer/README.md              |   2 +-
 colossalai/shardformer/layer/embedding.py     |  13 +-
 colossalai/shardformer/policies/autopolicy.py |   8 +
 colossalai/shardformer/policies/whisper.py    | 232 ++++++++++++++++++
 tests/kit/model_zoo/transformers/__init__.py  |   1 +
 tests/kit/model_zoo/transformers/whisper.py   |  91 +++++++
 .../test_model/test_shard_whisper.py          | 101 ++++++++
 7 files changed, 446 insertions(+), 2 deletions(-)
 create mode 100644 colossalai/shardformer/policies/whisper.py
 create mode 100644 tests/kit/model_zoo/transformers/whisper.py
 create mode 100644 tests/test_shardformer/test_model/test_shard_whisper.py

diff --git a/colossalai/shardformer/README.md b/colossalai/shardformer/README.md
index fca401562be6..21b7bf05f923 100644
--- a/colossalai/shardformer/README.md
+++ b/colossalai/shardformer/README.md
@@ -102,7 +102,7 @@ We will follow this roadmap to develop Shardformer:
       - [ ] SwinTransformer
       - [ ] SwinTransformer V2
     - [ ] Audio
-      - [ ] Whisper
+      - [x] Whisper
     - [ ] Multi-modal
       - [ ] To be added
 
diff --git a/colossalai/shardformer/layer/embedding.py b/colossalai/shardformer/layer/embedding.py
index db39a457b7fd..49ecb63e6c9e 100644
--- a/colossalai/shardformer/layer/embedding.py
+++ b/colossalai/shardformer/layer/embedding.py
@@ -193,7 +193,6 @@ def __init__(self,
         super().__init__()
         self.num_embeddings = num_embeddings
         self.embedding_dim = embedding_dim
-        self.padding_idx = padding_idx
         self.embed_args = args
         self.embed_kwargs = kwargs
         self.process_group = process_group
@@ -206,6 +205,9 @@ def __init__(self,
         self.vocab_start_index = tensor_parallel_rank * self.num_embeddings_per_partition
         self.vocab_end_index = self.vocab_start_index + self.num_embeddings_per_partition
 
+        # padding index
+        self.padding_idx = self._select_padding_idx(padding_idx)
+
         # parameter
         factory_kwargs = {'device': device, 'dtype': dtype}
         weight = torch.empty((num_embeddings, self.embedding_dim), **factory_kwargs)
@@ -263,6 +265,15 @@ def _fill_padding_idx_with_zero(self) -> None:
             with torch.no_grad():
                 self.weight[self.padding_idx - self.vocab_start_index].fill_(0)
 
+    def _select_padding_idx(self, padding_idx: int):
+        """Translate the global ``padding_idx`` into a row index local to this rank's vocab shard.
+
+        Returns ``None`` when ``padding_idx`` is ``None`` or falls outside ``[vocab_start_index, vocab_end_index)``.
+        """
+        if padding_idx is not None and self.vocab_start_index <= padding_idx < self.vocab_end_index:
+            return padding_idx - self.vocab_start_index
+        return None
+
     def forward(self, input_: Tensor) -> Tensor:
         # Build the mask.
         input_mask = (input_ < self.vocab_start_index) | (input_ >= self.vocab_end_index)
diff --git a/colossalai/shardformer/policies/autopolicy.py b/colossalai/shardformer/policies/autopolicy.py
index ccf2199bd056..77583dd77cf0 100644
--- a/colossalai/shardformer/policies/autopolicy.py
+++ b/colossalai/shardformer/policies/autopolicy.py
@@ -101,6 +101,14 @@ class PolicyLocation:
     "transformers.models.bloom.modeling_bloom.BloomForQuestionAnswering":
         PolicyLocation(file_name="bloom", class_name="BloomForQuestionAnsweringPolicy"),
 
+    # Whisper
+    "transformers.models.whisper.modeling_whisper.WhisperModel":
+        PolicyLocation(file_name="whisper", class_name="WhisperModelPolicy"),
+    "transformers.models.whisper.modeling_whisper.WhisperForConditionalGeneration":
+        PolicyLocation(file_name="whisper", class_name="WhisperForConditionalGenerationPolicy"),
+    "transformers.models.whisper.modeling_whisper.WhisperForAudioClassification":
+        PolicyLocation(file_name="whisper", class_name="WhisperForAudioClassificationPolicy"),
+
     # Sam
     "transformers.models.sam.modeling_sam.SamModel":
         PolicyLocation(file_name="sam", class_name="SamModelPolicy"),
diff --git a/colossalai/shardformer/policies/whisper.py b/colossalai/shardformer/policies/whisper.py
new file mode 100644
index 000000000000..7751bbb5de99
--- /dev/null
+++ b/colossalai/shardformer/policies/whisper.py
@@ -0,0 +1,232 @@
+import torch.nn as nn
+
+import colossalai.shardformer.layer as col_nn
+
+from .._utils import getattr_, setattr_
+from .basepolicy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
+
+__all__ = [
+    'WhisperPolicy', 'WhisperModelPolicy', 'WhisperForConditionalGenerationPolicy', 'WhisperForAudioClassificationPolicy'
+]
+
+
+class WhisperPolicy(Policy):
+
+    def config_sanity_check(self):
+        pass
+
+    def preprocess(self):
+        # pad the vocabulary so that vocab_size becomes a multiple of the tensor parallel size
+        r"""
+        Resize the token embeddings so that vocab_size is divisible by the tensor parallel size
+        """
+        # round vocab_size up to the next multiple of tensor_parallel_size when it does not divide evenly
+        vocab_size = self.model.config.vocab_size
+        world_size = self.shard_config.tensor_parallel_size
+        if vocab_size % world_size != 0:
+            new_vocab_size = vocab_size + world_size - vocab_size % world_size
+            self.model.resize_token_embeddings(new_vocab_size)
+        return self.model
+
+    def module_policy(self):
+        """Describe the attribute and sub-module replacements applied to Whisper's encoder/decoder modules."""
+        from transformers.models.whisper.modeling_whisper import (
+            WhisperDecoder,
+            WhisperDecoderLayer,
+            WhisperEncoder,
+            WhisperEncoderLayer,
+        )
+
+        policy = {}
+        if self.shard_config.enable_tensor_parallelism:
+            policy[WhisperEncoderLayer] = ModulePolicyDescription(attribute_replacement={
+                "self_attn.embed_dim":
+                    self.model.config.d_model // self.shard_config.tensor_parallel_size,
+                "self_attn.num_heads":
+                    self.model.config.encoder_attention_heads // self.shard_config.tensor_parallel_size,
+            },
+                                                                  sub_module_replacement=[
+                                                                      SubModuleReplacementDescription(
+                                                                          suffix="self_attn.q_proj",
+                                                                          target_module=col_nn.Linear1D_Col,
+                                                                      ),
+                                                                      SubModuleReplacementDescription(
+                                                                          suffix="self_attn.k_proj",
+                                                                          target_module=col_nn.Linear1D_Col,
+                                                                      ),
+                                                                      SubModuleReplacementDescription(
+                                                                          suffix="self_attn.v_proj",
+                                                                          target_module=col_nn.Linear1D_Col,
+                                                                      ),
+                                                                      SubModuleReplacementDescription(
+                                                                          suffix="self_attn.out_proj",
+                                                                          target_module=col_nn.Linear1D_Row,
+                                                                      ),
+                                                                      SubModuleReplacementDescription(
+                                                                          suffix="fc1",
+                                                                          target_module=col_nn.Linear1D_Col,
+                                                                      ),
+                                                                      SubModuleReplacementDescription(
+                                                                          suffix="fc2",
+                                                                          target_module=col_nn.Linear1D_Row,
+                                                                      ),
+                                                                  ])
+            # NOTE: encoder_attn (cross-attention) lives in the decoder, so it is sized by decoder_attention_heads
+            policy[WhisperDecoderLayer] = ModulePolicyDescription(attribute_replacement={
+                "self_attn.embed_dim":
+                    self.model.config.d_model // self.shard_config.tensor_parallel_size,
+                "self_attn.num_heads":
+                    self.model.config.decoder_attention_heads // self.shard_config.tensor_parallel_size,
+                "encoder_attn.embed_dim":
+                    self.model.config.d_model // self.shard_config.tensor_parallel_size,
+                "encoder_attn.num_heads":
+                    self.model.config.decoder_attention_heads // self.shard_config.tensor_parallel_size,
+            },
+                                                                  sub_module_replacement=[
+                                                                      SubModuleReplacementDescription(
+                                                                          suffix="self_attn.q_proj",
+                                                                          target_module=col_nn.Linear1D_Col,
+                                                                      ),
+                                                                      SubModuleReplacementDescription(
+                                                                          suffix="self_attn.k_proj",
+                                                                          target_module=col_nn.Linear1D_Col,
+                                                                      ),
+                                                                      SubModuleReplacementDescription(
+                                                                          suffix="self_attn.v_proj",
+                                                                          target_module=col_nn.Linear1D_Col,
+                                                                      ),
+                                                                      SubModuleReplacementDescription(
+                                                                          suffix="self_attn.out_proj",
+                                                                          target_module=col_nn.Linear1D_Row,
+                                                                      ),
+                                                                      SubModuleReplacementDescription(
+                                                                          suffix="encoder_attn.q_proj",
+                                                                          target_module=col_nn.Linear1D_Col,
+                                                                      ),
+                                                                      SubModuleReplacementDescription(
+                                                                          suffix="encoder_attn.k_proj",
+                                                                          target_module=col_nn.Linear1D_Col,
+                                                                      ),
+                                                                      SubModuleReplacementDescription(
+                                                                          suffix="encoder_attn.v_proj",
+                                                                          target_module=col_nn.Linear1D_Col,
+                                                                      ),
+                                                                      SubModuleReplacementDescription(
+                                                                          suffix="encoder_attn.out_proj",
+                                                                          target_module=col_nn.Linear1D_Row,
+                                                                      ),
+                                                                      SubModuleReplacementDescription(
+                                                                          suffix="fc1",
+                                                                          target_module=col_nn.Linear1D_Col,
+                                                                      ),
+                                                                      SubModuleReplacementDescription(
+                                                                          suffix="fc2",
+                                                                          target_module=col_nn.Linear1D_Row,
+                                                                      ),
+                                                                  ])
+
+            policy[WhisperDecoder] = ModulePolicyDescription(sub_module_replacement=[
+                SubModuleReplacementDescription(
+                    suffix="embed_tokens",
+                    target_module=col_nn.VocabParallelEmbedding1D,
+                ),
+            ])
+
+        # optimization configuration
+        if self.shard_config.enable_fused_normalization:
+            # replace the self-attention and final layer norms of each encoder layer with FusedLayerNorm
+            self.append_or_create_submodule_replacement(description=[
+                SubModuleReplacementDescription(
+                    suffix="self_attn_layer_norm",
+                    target_module=col_nn.FusedLayerNorm,
+                ),
+                SubModuleReplacementDescription(
+                    suffix="final_layer_norm",
+                    target_module=col_nn.FusedLayerNorm,
+                )
+            ],
+                                                        policy=policy,
+                                                        target_key=WhisperEncoderLayer)
+
+            # replace the self-attention and final layer norms of each decoder layer with FusedLayerNorm
+            self.append_or_create_submodule_replacement(description=[
+                SubModuleReplacementDescription(
+                    suffix="self_attn_layer_norm",
+                    target_module=col_nn.FusedLayerNorm,
+                ),
+                SubModuleReplacementDescription(
+                    suffix="final_layer_norm",
+                    target_module=col_nn.FusedLayerNorm,
+                )
+            ],
+                                                        policy=policy,
+                                                        target_key=WhisperDecoderLayer)
+
+            # replace the layer_norm owned by the encoder module itself with FusedLayerNorm
+            self.append_or_create_submodule_replacement(description=[
+                SubModuleReplacementDescription(
+                    suffix="layer_norm",
+                    target_module=col_nn.FusedLayerNorm,
+                )
+            ],
+                                                        policy=policy,
+                                                        target_key=WhisperEncoder)
+
+            # replace the layer_norm owned by the decoder module itself with FusedLayerNorm
+            self.append_or_create_submodule_replacement(description=[
+                SubModuleReplacementDescription(
+                    suffix="layer_norm",
+                    target_module=col_nn.FusedLayerNorm,
+                )
+            ],
+                                                        policy=policy,
+                                                        target_key=WhisperDecoder)
+        return policy
+
+    def add_lm_head_policy(self, base_policy):
+        from transformers.models.whisper.modeling_whisper import WhisperForConditionalGeneration
+
+        # optimize for tensor parallelism
+        if self.shard_config.enable_tensor_parallelism:
+            self.append_or_create_submodule_replacement(description=SubModuleReplacementDescription(
+                suffix="proj_out", target_module=col_nn.Linear1D_Col, kwargs={"gather_output": True}),
+                                                        policy=base_policy,
+                                                        target_key=WhisperForConditionalGeneration)
+
+        return base_policy
+
+    def postprocess(self):
+        return self.model
+
+
+# WhisperModel
+class WhisperModelPolicy(WhisperPolicy):
+
+    def __init__(self) -> None:
+        super().__init__()
+
+
+# WhisperForConditionalGeneration
+class WhisperForConditionalGenerationPolicy(WhisperPolicy):
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def module_policy(self):
+        module_policy = super().module_policy()
+        module_policy = self.add_lm_head_policy(module_policy)
+        return module_policy
+
+    def postprocess(self):
+        binding_map = {"model.decoder.embed_tokens.weight": "proj_out.weight"}
+        for k, v in binding_map.items():
+            param = getattr_(self.model, k)
+            setattr_(self.model, v, param)
+        return self.model
+
+
+# WhisperForAudioClassification
+class WhisperForAudioClassificationPolicy(WhisperPolicy):
+
+    def __init__(self) -> None:
+        super().__init__()
diff --git a/tests/kit/model_zoo/transformers/__init__.py b/tests/kit/model_zoo/transformers/__init__.py
index a1bcb78ddf6b..39e5ef411f32 100644
--- a/tests/kit/model_zoo/transformers/__init__.py
+++ b/tests/kit/model_zoo/transformers/__init__.py
@@ -7,3 +7,4 @@
 from .sam import *
 from .t5 import *
 from .vit import *
+from .whisper import *
diff --git a/tests/kit/model_zoo/transformers/whisper.py b/tests/kit/model_zoo/transformers/whisper.py
new file mode 100644
index 000000000000..b58716217cb5
--- /dev/null
+++ b/tests/kit/model_zoo/transformers/whisper.py
@@ -0,0 +1,91 @@
+import torch
+import transformers
+
+from ..registry import ModelAttribute, model_zoo
+
+# ===============================
+# Register Whisper model variants
+# ===============================
+
+
+# define data gen function
+def data_gen():
+    # Generated from following code snippet
+    #
+    # from transformers import AutoFeatureExtractor, WhisperModel
+    # from datasets import load_dataset
+
+    # model = WhisperModel.from_pretrained("openai/whisper-base")
+    # feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-base")
+    # ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    # inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
+    # input_features = inputs.input_features
+    # decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
+
+    input_features = torch.randn(1, 80, 3000)
+    decoder_input_ids = torch.tensor([[1, 1]]) * 50258
+    return dict(input_features=input_features, decoder_input_ids=decoder_input_ids)
+
+
+def data_gen_for_conditional_generation():
+    # labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+    #         Labels for computing the language modeling loss. Indices should either be in `[0, ..., config.vocab_size]`
+    #         or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is
+    #         only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+    data = data_gen()
+    data['labels'] = torch.tensor([[0, 1]], dtype=torch.int64)
+    return data
+
+
+def data_gen_for_audio_classification():
+    # labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+    #         Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+    #         config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+    #         `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+    # `WhisperForAudioClassification` does not need `decoder_input_ids`
+    data = data_gen()
+    data.pop('decoder_input_ids')
+    data['labels'] = torch.tensor([1], dtype=torch.int64)
+    return data
+
+
+# define output transform function
+output_transform_fn = lambda x: x
+
+# define loss function
+loss_fn = lambda x: x.last_hidden_state.mean()
+loss_fn_attr = lambda x: x.loss
+
+config = transformers.WhisperConfig(
+    classifier_proj_size=256,
+    d_model=256,
+    decoder_attention_heads=4,
+    decoder_ffn_dim=1536,
+    decoder_layers=2,
+    encoder_attention_heads=4,
+    encoder_ffn_dim=1536,
+    encoder_layers=2,
+    vocab_size=51866,
+)
+
+# register the Whisper variants
+model_zoo.register(name='transformers_whisper',
+                   model_fn=lambda: transformers.WhisperModel(config),
+                   data_gen_fn=data_gen,
+                   output_transform_fn=output_transform_fn,
+                   loss_fn=loss_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
+
+model_zoo.register(name='transformers_whisperForConditionalGeneration',
+                   model_fn=lambda: transformers.WhisperForConditionalGeneration(config),
+                   data_gen_fn=data_gen_for_conditional_generation,
+                   output_transform_fn=output_transform_fn,
+                   loss_fn=loss_fn_attr,
+                   model_attribute=ModelAttribute(has_control_flow=True))
+
+model_zoo.register(name='transformers_whisperWhisperForAudioClassification',
+                   model_fn=lambda: transformers.WhisperForAudioClassification(config),
+                   data_gen_fn=data_gen_for_audio_classification,
+                   output_transform_fn=output_transform_fn,
+                   loss_fn=loss_fn_attr,
+                   model_attribute=ModelAttribute(has_control_flow=True))
diff --git a/tests/test_shardformer/test_model/test_shard_whisper.py b/tests/test_shardformer/test_model/test_shard_whisper.py
new file mode 100644
index 000000000000..8932a4ab902c
--- /dev/null
+++ b/tests/test_shardformer/test_model/test_shard_whisper.py
@@ -0,0 +1,101 @@
+import pytest
+import torch
+
+import colossalai
+from colossalai.logging import disable_existing_loggers
+from colossalai.tensor.d_tensor.api import is_customized_distributed_tensor, is_distributed_tensor
+from colossalai.testing import (
+    assert_hf_output_close,
+    clear_cache_before_run,
+    parameterize,
+    rerun_if_address_is_in_use,
+    spawn,
+)
+from tests.kit.model_zoo import model_zoo
+from tests.test_shardformer.test_model._utils import build_model, run_forward
+
+
+def check_forward_backward(org_model, sharded_model, data_gen_fn, output_transform_fn, loss_fn):
+    # check forward
+    org_output, org_loss, shard_output, shard_loss = run_forward(org_model, sharded_model, data_gen_fn,
+                                                                 output_transform_fn, loss_fn)
+    assert_hf_output_close(org_output, shard_output, ignore_keys='past_key_values')
+
+    # do backward
+    org_loss.backward()
+    shard_loss.backward()
+
+    assert torch.allclose(org_loss, shard_loss,
+                          atol=1e-5), f"shard model loss is not equal to orgin model loss\n{org_loss}\n{shard_loss}"
+
+    # check grad
+
+    if org_model.__class__.__name__ == 'WhisperForConditionalGeneration':
+        whisper = org_model.model
+        sharded_whisper = sharded_model.model
+    else:
+        whisper = org_model
+        sharded_whisper = sharded_model
+
+    # compare self attention grad
+    org_grad = whisper.encoder.layers[0].self_attn.q_proj.weight.grad
+    shard_grad = sharded_whisper.encoder.layers[0].self_attn.q_proj.weight.grad
+    shard_weight = sharded_whisper.encoder.layers[0].self_attn.q_proj.weight
+
+    if is_distributed_tensor(shard_weight) or is_customized_distributed_tensor(shard_weight):
+        shard_grad_list = [torch.zeros([*shard_grad.shape]).to('cuda') for _ in range(2)]
+        shard_grad = torch.distributed.all_gather(shard_grad_list, shard_grad)
+        all_shard_grad = torch.cat(shard_grad_list, dim=0)
+    else:
+        all_shard_grad = shard_grad
+    assert torch.allclose(org_grad, all_shard_grad,
+                          atol=1e-5), f"shard model grad is not equal to orgin model grad\n{org_grad}\n{all_shard_grad}"
+
+    # WhisperForAudioClassification does not have decoder and embedding layer
+    if org_model.__class__.__name__ == 'WhisperForAudioClassification':
+        return
+
+    # compare embedding grad
+    org_grad = whisper.decoder.embed_tokens.weight.grad
+    shard_grad = sharded_whisper.decoder.embed_tokens.weight.grad
+    shard_weight = sharded_whisper.decoder.embed_tokens.weight
+
+    if is_distributed_tensor(shard_weight) or is_customized_distributed_tensor(shard_weight):
+        shard_grad_list = [torch.zeros([*shard_grad.shape]).to('cuda') for _ in range(2)]
+        shard_grad = torch.distributed.all_gather(shard_grad_list, shard_grad)
+        all_shard_grad = torch.cat(shard_grad_list, dim=0)
+    else:
+        all_shard_grad = shard_grad
+
+    assert torch.allclose(org_grad, all_shard_grad,
+                          atol=1e-5), f"shard model grad is not equal to orgin model grad\n{org_grad}\n{all_shard_grad}"
+
+
+@parameterize('enable_fused_normalization', [True, False])
+@parameterize('enable_tensor_parallelism', [True, False])
+def run_whisper_test(enable_fused_normalization, enable_tensor_parallelism):
+    sub_model_zoo = model_zoo.get_sub_registry('transformers_whisper')
+    for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
+        org_model, sharded_model = build_model(model_fn,
+                                               enable_fused_normalization=enable_fused_normalization,
+                                               enable_tensor_parallelism=enable_tensor_parallelism)
+        check_forward_backward(org_model, sharded_model, data_gen_fn, output_transform_fn, loss_fn)
+
+    torch.cuda.empty_cache()
+
+
+def check_whisper(rank, world_size, port):
+    disable_existing_loggers()
+    colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    run_whisper_test()
+
+
+@pytest.mark.dist
+@rerun_if_address_is_in_use()
+@clear_cache_before_run()
+def test_whisper():
+    spawn(check_whisper, 2)
+
+
+if __name__ == "__main__":
+    test_whisper()

From 77cc087a81c2ffb7dca71f4f3242a4f2c129f219 Mon Sep 17 00:00:00 2001
From: Kun Lin <81014421+klhhhhh@users.noreply.github.com>
Date: Thu, 20 Jul 2023 17:28:00 +0800
Subject: [PATCH 04/21] Feature/chatglm (#4240)

* [shardformer] added tests

* [shardformer] vit test finish and support

* [shardformer] chatglm ready

* import chatglm

* [shardformer] add test kit in model zoo for chatglm

* [shardformer] add first version of policy for chatglm

* [shardformer] polish chatglm code

* [shardformer] polish code

* [shardformer] support chatglm without layernorm

* [shardformer] chatglm shard without mlp sharding

* [shardformer] delete some file

* [shardformer] ChatGLM support layernorm sharding

* [shardformer] register without auto policy

* [shardformer] pre-commit check files

* [shardformer] fix chatglm configuration with pre-commit
---
 colossalai/shardformer/policies/chatglm.py    |   96 ++
 tests/kit/model_zoo/transformers/__init__.py  |    1 +
 tests/kit/model_zoo/transformers/chatglm.py   |   38 +
 .../chatglm2_6b/configuration_chatglm.py      |   58 +
 .../chatglm2_6b/modeling_chatglm.py           | 1372 +++++++++++++++++
 .../test_model/test_shard_chatglm.py          |  107 ++
 .../test_model/test_shard_vit.py              |    1 -
 7 files changed, 1672 insertions(+), 1 deletion(-)
 create mode 100644 colossalai/shardformer/policies/chatglm.py
 create mode 100644 tests/kit/model_zoo/transformers/chatglm.py
 create mode 100644 tests/kit/model_zoo/transformers/chatglm2_6b/configuration_chatglm.py
 create mode 100644 tests/kit/model_zoo/transformers/chatglm2_6b/modeling_chatglm.py
 create mode 100644 tests/test_shardformer/test_model/test_shard_chatglm.py

diff --git a/colossalai/shardformer/policies/chatglm.py b/colossalai/shardformer/policies/chatglm.py
new file mode 100644
index 000000000000..934b99b83ea1
--- /dev/null
+++ b/colossalai/shardformer/policies/chatglm.py
@@ -0,0 +1,96 @@
+from typing import Dict, Union
+
+import torch.nn as nn
+
+import colossalai.shardformer.layer as col_nn
+
+from .basepolicy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
+
+__all__ = ['ChatGLMModelPolicy', 'ChatGLMForConditionalGenerationPolicy']
+
+
+class ChatGLMModelPolicy(Policy):
+
+    def config_sanity_check(self):
+        pass
+
+    def preprocess(self):
+        # Resize embedding
+        vocab_size = self.model.config.padded_vocab_size
+        world_size = self.shard_config.tensor_parallel_size
+
+        if vocab_size % world_size != 0:
+            new_vocab_size = vocab_size + world_size - vocab_size % world_size
+            self.model.resize_token_embeddings(new_vocab_size)
+
+        return self.model
+
+    def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
+        from tests.kit.model_zoo.transformers.chatglm2_6b.modeling_chatglm import ChatGLMModel, GLMBlock
+
+        policy = {}
+
+        if self.shard_config.enable_tensor_parallelism:
+
+            policy[ChatGLMModel] = ModulePolicyDescription(attribute_replacement={},
+                                                           sub_module_replacement=[
+                                                               SubModuleReplacementDescription(
+                                                                   suffix="embedding.word_embeddings",
+                                                                   target_module=col_nn.VocabParallelEmbedding1D,
+                                                               )
+                                                           ])
+
+            policy[GLMBlock] = ModulePolicyDescription(attribute_replacement={
+                "self_attention.num_attention_heads_per_partition":
+                    self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size,
+                "self_attention.projection_size":
+                    (self.model.config.kv_channels * self.model.config.num_attention_heads) //
+                    self.shard_config.tensor_parallel_size,
+                "self_attention.qkv_hidden_size":
+                    (self.model.config.kv_channels * self.model.config.num_attention_heads * 3) //
+                    self.shard_config.tensor_parallel_size,
+                "self_attention.core_attention.num_attention_heads_per_partition":
+                    self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size,
+                "self_attention.core_attention.hidden_size_per_partition":
+                    self.model.config.kv_channels * self.model.config.num_attention_heads //
+                    self.shard_config.tensor_parallel_size,
+            },
+                                                       param_replacement=[],
+                                                       sub_module_replacement=[
+                                                           SubModuleReplacementDescription(
+                                                               suffix="self_attention.query_key_value",
+                                                               target_module=col_nn.Linear1D_Col,
+                                                           ),
+                                                           SubModuleReplacementDescription(
+                                                               suffix="self_attention.dense",
+                                                               target_module=col_nn.Linear1D_Row,
+                                                           ),
+                                                           SubModuleReplacementDescription(
+                                                               suffix="self_attention.core_attention.attention_dropout",
+                                                               target_module=col_nn.DropoutForParallelInput,
+                                                           ),
+                                                       ])
+        # optimization configuration
+        if self.shard_config.enable_fused_normalization:
+            if not self.model.config.rmsnorm:
+
+                self.append_or_create_submodule_replacement(description=[
+                    SubModuleReplacementDescription(suffix="input_layernorm", target_module=col_nn.FusedLayerNorm),
+                    SubModuleReplacementDescription(suffix="post_attention_layernorm",
+                                                    target_module=col_nn.FusedLayerNorm)
+                ],
+                                                            policy=policy,
+                                                            target_key=GLMBlock)
+
+                if self.model.config.post_layer_norm:
+                    self.append_or_create_submodule_replacement(description=[
+                        SubModuleReplacementDescription(suffix="encoder.final_layernorm",
+                                                        target_module=col_nn.FusedLayerNorm)
+                    ],
+                                                                policy=policy,
+                                                                target_key=ChatGLMModel)
+
+        return policy
+
+    def postprocess(self):
+        return self.model
diff --git a/tests/kit/model_zoo/transformers/__init__.py b/tests/kit/model_zoo/transformers/__init__.py
index 39e5ef411f32..08a118e5783d 100644
--- a/tests/kit/model_zoo/transformers/__init__.py
+++ b/tests/kit/model_zoo/transformers/__init__.py
@@ -1,6 +1,7 @@
 from .albert import *
 from .bert import *
 from .bloom import *
+from .chatglm import *
 from .gpt import *
 from .llama import *
 from .opt import *
diff --git a/tests/kit/model_zoo/transformers/chatglm.py b/tests/kit/model_zoo/transformers/chatglm.py
new file mode 100644
index 000000000000..1408babede64
--- /dev/null
+++ b/tests/kit/model_zoo/transformers/chatglm.py
@@ -0,0 +1,38 @@
+import torch
+import transformers
+
+from ..registry import ModelAttribute, model_zoo
+from .chatglm2_6b.configuration_chatglm import ChatGLMConfig
+from .chatglm2_6b.modeling_chatglm import ChatGLMModel
+
+# ================================
+# Register single-sentence ChatGLM
+# ================================
+
+
+def data_gen():
+    input_ids = torch.tensor([[5941, 15, 2670, 3543, 632, 2075]], dtype=torch.int64)
+    attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1]])
+    return dict(input_ids=input_ids, attention_mask=attention_mask)
+
+
+# define output transform function
+output_transform_fn = lambda x: x
+
+# define loss function
+loss_fn_for_chatglm_model = lambda x: x.last_hidden_state.mean()
+loss_fn = lambda x: x.loss
+config = ChatGLMConfig(num_layers=1,
+                       padded_vocab_size=65024,
+                       hidden_size=64,
+                       num_attention_heads=8,
+                       rmsnorm=False,
+                       original_rope=True,
+                       use_cache=True)
+
+model_zoo.register(name='transformers_chatglm',
+                   model_fn=lambda: ChatGLMModel(config, empty_init=False),
+                   data_gen_fn=data_gen,
+                   output_transform_fn=output_transform_fn,
+                   loss_fn=loss_fn_for_chatglm_model,
+                   model_attribute=ModelAttribute(has_control_flow=True))
diff --git a/tests/kit/model_zoo/transformers/chatglm2_6b/configuration_chatglm.py b/tests/kit/model_zoo/transformers/chatglm2_6b/configuration_chatglm.py
new file mode 100644
index 000000000000..3e78732be2da
--- /dev/null
+++ b/tests/kit/model_zoo/transformers/chatglm2_6b/configuration_chatglm.py
@@ -0,0 +1,58 @@
+from transformers import PretrainedConfig
+
+
+class ChatGLMConfig(PretrainedConfig):
+    model_type = "chatglm"
+
+    def __init__(self,
+                 num_layers=28,
+                 padded_vocab_size=65024,
+                 hidden_size=4096,
+                 ffn_hidden_size=13696,
+                 kv_channels=128,
+                 num_attention_heads=32,
+                 seq_length=2048,
+                 hidden_dropout=0.0,
+                 attention_dropout=0.0,
+                 layernorm_epsilon=1e-5,
+                 rmsnorm=True,
+                 apply_residual_connection_post_layernorm=False,
+                 post_layer_norm=True,
+                 add_bias_linear=False,
+                 add_qkv_bias=False,
+                 bias_dropout_fusion=True,
+                 multi_query_attention=False,
+                 multi_query_group_num=1,
+                 apply_query_key_layer_scaling=True,
+                 attention_softmax_in_fp32=True,
+                 fp32_residual_connection=False,
+                 quantization_bit=0,
+                 pre_seq_len=None,
+                 prefix_projection=False,
+                 **kwargs):
+        self.num_layers = num_layers
+        self.vocab_size = padded_vocab_size
+        self.padded_vocab_size = padded_vocab_size
+        self.hidden_size = hidden_size
+        self.ffn_hidden_size = ffn_hidden_size
+        self.kv_channels = kv_channels
+        self.num_attention_heads = num_attention_heads
+        self.seq_length = seq_length
+        self.hidden_dropout = hidden_dropout
+        self.attention_dropout = attention_dropout
+        self.layernorm_epsilon = layernorm_epsilon
+        self.rmsnorm = rmsnorm
+        self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm
+        self.post_layer_norm = post_layer_norm
+        self.add_bias_linear = add_bias_linear
+        self.add_qkv_bias = add_qkv_bias
+        self.bias_dropout_fusion = bias_dropout_fusion
+        self.multi_query_attention = multi_query_attention
+        self.multi_query_group_num = multi_query_group_num
+        self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
+        self.attention_softmax_in_fp32 = attention_softmax_in_fp32
+        self.fp32_residual_connection = fp32_residual_connection
+        self.quantization_bit = quantization_bit
+        self.pre_seq_len = pre_seq_len
+        self.prefix_projection = prefix_projection
+        super().__init__(**kwargs)
diff --git a/tests/kit/model_zoo/transformers/chatglm2_6b/modeling_chatglm.py b/tests/kit/model_zoo/transformers/chatglm2_6b/modeling_chatglm.py
new file mode 100644
index 000000000000..bae6d425878d
--- /dev/null
+++ b/tests/kit/model_zoo/transformers/chatglm2_6b/modeling_chatglm.py
@@ -0,0 +1,1372 @@
+"""
+The ChatGLM2-6B License
+
+1. Definitions
+
+“Licensor” means the ChatGLM2-6B Model Team that distributes its Software.
+
+“Software” means the ChatGLM2-6B model parameters made available under this license.
+
+2. License Grant
+
+Subject to the terms and conditions of this License, the Licensor hereby grants to you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty-free copyright license to use the Software solely for your non-commercial research purposes.
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+3. Restriction
+
+You will not use, copy, modify, merge, publish, distribute, reproduce, or create derivative works of the Software, in whole or in part, for any commercial, military, or illegal purposes.
+
+You will not use the Software for any act that may undermine China's national security and national unity, harm the public interest of society, or infringe upon the rights and interests of human beings.
+
+4. Disclaimer
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+5. Limitation of Liability
+
+EXCEPT TO THE EXTENT PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER BASED IN TORT, NEGLIGENCE, CONTRACT, LIABILITY, OR OTHERWISE WILL ANY LICENSOR BE LIABLE TO YOU FOR ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES, OR ANY OTHER COMMERCIAL LOSSES, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+
+6. Dispute Resolution
+
+This license shall be governed and construed in accordance with the laws of People’s Republic of China. Any dispute arising from or in connection with this License shall be submitted to Haidian District People's Court in Beijing.
+
+Note that the license is subject to update to a more comprehensive version.  For any questions related to the license and copyright, please contact us at glm-130b@googlegroups.com.
+"""
+""" PyTorch ChatGLM model. """
+
+import copy
+import math
+import re
+import sys
+import warnings
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import CrossEntropyLoss, LayerNorm
+from torch.nn.utils import skip_init
+from transformers.generation.logits_process import LogitsProcessor
+from transformers.generation.utils import GenerationConfig, LogitsProcessorList, ModelOutput, StoppingCriteriaList
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import logging
+
+from .configuration_chatglm import ChatGLMConfig
+
+# flags required to enable jit fusion kernels
+
+if sys.platform != "darwin":
+    torch._C._jit_set_profiling_mode(False)
+    torch._C._jit_set_profiling_executor(False)
+    torch._C._jit_override_can_fuse_on_cpu(True)
+    torch._C._jit_override_can_fuse_on_gpu(True)
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "THUDM/ChatGLM2-6B"
+_CONFIG_FOR_DOC = "ChatGLM6BConfig"
+
+CHATGLM_6B_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "THUDM/chatglm2-6b",
+    # See all ChatGLM models at https://huggingface.co/models?filter=chatglm
+]
+
+
+def default_init(cls, *args, **kwargs):
+    return cls(*args, **kwargs)
+
+
+class InvalidScoreLogitsProcessor(LogitsProcessor):
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        if torch.isnan(scores).any() or torch.isinf(scores).any():
+            scores.zero_()
+            scores[..., 5] = 5e4
+        return scores
+
+
+class PrefixEncoder(torch.nn.Module):
+    """
+    The torch.nn model to encode the prefix
+    Input shape: (batch-size, prefix-length)
+    Output shape: (batch-size, prefix-length, 2*layers*kv_channels*multi_query_group_num)
+    """
+
+    def __init__(self, config: ChatGLMConfig):
+        super().__init__()
+        self.prefix_projection = config.prefix_projection
+        if self.prefix_projection:
+            # Use a two-layer MLP to encode the prefix
+            kv_size = (config.num_layers * config.kv_channels * config.multi_query_group_num * 2)
+            self.embedding = torch.nn.Embedding(config.pre_seq_len, kv_size)
+            self.trans = torch.nn.Sequential(
+                torch.nn.Linear(kv_size, config.hidden_size),
+                torch.nn.Tanh(),
+                torch.nn.Linear(config.hidden_size, kv_size),
+            )
+        else:
+            self.embedding = torch.nn.Embedding(
+                config.pre_seq_len,
+                config.num_layers * config.kv_channels * config.multi_query_group_num * 2,
+            )
+
+    def forward(self, prefix: torch.Tensor):
+        if self.prefix_projection:
+            prefix_tokens = self.embedding(prefix)
+            past_key_values = self.trans(prefix_tokens)
+        else:
+            past_key_values = self.embedding(prefix)
+        return past_key_values
+
+
+def split_tensor_along_last_dim(
+    tensor: torch.Tensor,
+    num_partitions: int,
+    contiguous_split_chunks: bool = False,
+) -> List[torch.Tensor]:
+    """Split a tensor along its last dimension.
+
+    Arguments:
+        tensor: input tensor.
+        num_partitions: number of partitions to split the tensor
+        contiguous_split_chunks: If True, make each chunk contiguous
+                                 in memory.
+
+    Returns:
+        A tuple of Tensors
+    """
+    # Get the size and dimension.
+    last_dim = tensor.dim() - 1
+    last_dim_size = tensor.size()[last_dim] // num_partitions
+    # Split.
+    tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
+    # Note: torch.split does not create contiguous tensors by default.
+    if contiguous_split_chunks:
+        return tuple(chunk.contiguous() for chunk in tensor_list)
+
+    return tensor_list
+
+
+class RotaryEmbedding(nn.Module):
+
+    def __init__(self, dim, original_impl=False, device=None, dtype=None):
+        super().__init__()
+        inv_freq = 1.0 / (10000**(torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim))
+        self.register_buffer("inv_freq", inv_freq)
+        self.dim = dim
+        self.original_impl = original_impl
+
+    def forward_impl(
+        self,
+        seq_len: int,
+        n_elem: int,
+        dtype: torch.dtype,
+        device: torch.device,
+        base: int = 10000,
+    ):
+        """Enhanced Transformer with Rotary Position Embedding.
+
+        Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/
+        transformers/rope/__init__.py. MIT License:
+        https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license.
+        """
+        # $\Theta = {\theta_i = 10000^{-\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$
+        theta = 1.0 / (base**(torch.arange(0, n_elem, 2, dtype=dtype, device=device) / n_elem))
+
+        # Create position indexes `[0, 1, ..., seq_len - 1]`
+        seq_idx = torch.arange(seq_len, dtype=dtype, device=device)
+
+        # Calculate the product of position index and $\theta_i$
+        idx_theta = torch.outer(seq_idx, theta).float()
+
+        cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1)
+
+        # this is to mimic the behaviour of complex32, else we will get different results
+        if dtype in (torch.float16, torch.bfloat16, torch.int8):
+            cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half()
+        return cache
+
+    def forward(self, max_seq_len, offset=0):
+        return self.forward_impl(
+            max_seq_len,
+            self.dim,
+            dtype=self.inv_freq.dtype,
+            device=self.inv_freq.device,
+        )
+
+
+@torch.jit.script
+def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor:
+    # x: [sq, b, np, hn]
+    sq, b, np, hn = x.size(0), x.size(1), x.size(2), x.size(3)
+    rot_dim = rope_cache.shape[-2] * 2
+    x, x_pass = x[..., :rot_dim], x[..., rot_dim:]
+    # truncate to support variable sizes
+    rope_cache = rope_cache[:sq]
+    xshaped = x.reshape(sq, -1, np, rot_dim // 2, 2)
+    rope_cache = rope_cache.view(sq, -1, 1, xshaped.size(3), 2)
+    x_out2 = torch.stack(
+        [
+            xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1],
+            xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1],
+        ],
+        -1,
+    )
+    x_out2 = x_out2.flatten(3)
+    return torch.cat((x_out2, x_pass), dim=-1)
+
+
+class RMSNorm(torch.nn.Module):
+
+    def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs):
+        super().__init__()
+        self.weight = torch.nn.Parameter(torch.empty(normalized_shape, device=device, dtype=dtype))
+        self.eps = eps
+
+    def forward(self, hidden_states: torch.Tensor):
+        input_dtype = hidden_states.dtype
+        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
+
+        return (self.weight * hidden_states).to(input_dtype)
+
+
+class CoreAttention(torch.nn.Module):
+
+    def __init__(self, config: ChatGLMConfig, layer_number):
+        super(CoreAttention, self).__init__()
+
+        self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling
+        self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32
+        if self.apply_query_key_layer_scaling:
+            self.attention_softmax_in_fp32 = True
+        self.layer_number = max(1, layer_number)
+
+        projection_size = config.kv_channels * config.num_attention_heads
+
+        # Per attention head and per partition values.
+        self.hidden_size_per_partition = projection_size
+        self.hidden_size_per_attention_head = (projection_size // config.num_attention_heads)
+        self.num_attention_heads_per_partition = config.num_attention_heads
+
+        coeff = None
+        self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
+        if self.apply_query_key_layer_scaling:
+            coeff = self.layer_number
+            self.norm_factor *= coeff
+        self.coeff = coeff
+
+        self.attention_dropout = torch.nn.Dropout(config.attention_dropout)
+
+    def forward(self, query_layer, key_layer, value_layer, attention_mask):
+        pytorch_major_version = int(torch.__version__.split(".")[0])
+        if pytorch_major_version >= 2:
+            query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]]
+            if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]:
+                context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer,
+                                                                                 key_layer,
+                                                                                 value_layer,
+                                                                                 is_causal=True)
+            else:
+                if attention_mask is not None:
+                    attention_mask = ~attention_mask
+                context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
+                                                                                 attention_mask)
+            context_layer = context_layer.permute(2, 0, 1, 3)
+            new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
+            context_layer = context_layer.reshape(*new_context_layer_shape)
+        else:
+            # Raw attention scores
+
+            # [b, np, sq, sk]
+            output_size = (
+                query_layer.size(1),
+                query_layer.size(2),
+                query_layer.size(0),
+                key_layer.size(0),
+            )
+
+            # [sq, b, np, hn] -> [sq, b * np, hn]
+            query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1)
+            # [sk, b, np, hn] -> [sk, b * np, hn]
+            key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1)
+
+            # preallocating input tensor: [b * np, sq, sk]
+            matmul_input_buffer = torch.empty(
+                output_size[0] * output_size[1],
+                output_size[2],
+                output_size[3],
+                dtype=query_layer.dtype,
+                device=query_layer.device,
+            )
+
+            # Raw attention scores. [b * np, sq, sk]
+            matmul_result = torch.baddbmm(
+                matmul_input_buffer,
+                query_layer.transpose(0, 1),    # [b * np, sq, hn]
+                key_layer.transpose(0, 1).transpose(1, 2),    # [b * np, hn, sk]
+                beta=0.0,
+                alpha=(1.0 / self.norm_factor),
+            )
+
+            # change view to [b, np, sq, sk]
+            attention_scores = matmul_result.view(*output_size)
+
+            # ===========================
+            # Attention probs and dropout
+            # ===========================
+
+            # attention scores and attention mask [b, np, sq, sk]
+            if self.attention_softmax_in_fp32:
+                attention_scores = attention_scores.float()
+            if self.coeff is not None:
+                attention_scores = attention_scores * self.coeff
+            if (attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]):
+                attention_mask = torch.ones(
+                    output_size[0],
+                    1,
+                    output_size[2],
+                    output_size[3],
+                    device=attention_scores.device,
+                    dtype=torch.bool,
+                )
+                attention_mask.tril_()
+                attention_mask = ~attention_mask
+            if attention_mask is not None:
+                attention_scores = attention_scores.masked_fill(attention_mask, float("-inf"))
+            attention_probs = F.softmax(attention_scores, dim=-1)
+            attention_probs = attention_probs.type_as(value_layer)
+
+            # This is actually dropping out entire tokens to attend to, which might
+            # seem a bit unusual, but is taken from the original Transformer paper.
+            attention_probs = self.attention_dropout(attention_probs)
+            # =========================
+            # Context layer. [sq, b, hp]
+            # =========================
+
+            # value_layer -> context layer.
+            # [sk, b, np, hn] --> [b, np, sq, hn]
+
+            # context layer shape: [b, np, sq, hn]
+            output_size = (
+                value_layer.size(1),
+                value_layer.size(2),
+                query_layer.size(0),
+                value_layer.size(3),
+            )
+            # change view [sk, b * np, hn]
+            value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1)
+            # change view [b * np, sq, sk]
+            attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1)
+            # matmul: [b * np, sq, hn]
+            context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))
+            # change view [b, np, sq, hn]
+            context_layer = context_layer.view(*output_size)
+            # [b, np, sq, hn] --> [sq, b, np, hn]
+            context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
+            # [sq, b, np, hn] --> [sq, b, hp]
+            new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
+            context_layer = context_layer.view(*new_context_layer_shape)
+
+        return context_layer
+
+
+class SelfAttention(torch.nn.Module):
+    """Self-attention layer.
+
+    Self-attention layer takes input with size [s, b, h]
+    and returns output of the same size.
+    """
+
+    def __init__(self, config: ChatGLMConfig, layer_number, device=None):
+        super(SelfAttention, self).__init__()
+        self.layer_number = max(1, layer_number)
+
+        self.projection_size = config.kv_channels * config.num_attention_heads
+        # Per attention head and per partition values.
+        self.hidden_size_per_attention_head = (self.projection_size // config.num_attention_heads)
+        self.num_attention_heads_per_partition = config.num_attention_heads
+
+        self.multi_query_attention = config.multi_query_attention
+        self.qkv_hidden_size = 3 * self.projection_size
+        if self.multi_query_attention:
+            self.num_multi_query_groups_per_partition = config.multi_query_group_num
+            self.qkv_hidden_size = (self.projection_size +
+                                    2 * self.hidden_size_per_attention_head * config.multi_query_group_num)
+        self.query_key_value = nn.Linear(
+            config.hidden_size,
+            self.qkv_hidden_size,
+            bias=config.add_bias_linear or config.add_qkv_bias,
+            device=device,
+            **_config_to_kwargs(config),
+        )
+
+        self.core_attention = CoreAttention(config, self.layer_number)
+
+        # Output.
+        self.dense = nn.Linear(
+            self.projection_size,
+            config.hidden_size,
+            bias=config.add_bias_linear,
+            device=device,
+            **_config_to_kwargs(config),
+        )
+
+    def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None):
+        if self.multi_query_attention:
+            num_attention_heads = self.num_multi_query_groups_per_partition
+        else:
+            num_attention_heads = self.num_attention_heads_per_partition
+        return torch.empty(
+            inference_max_sequence_len,
+            batch_size,
+            num_attention_heads,
+            self.hidden_size_per_attention_head,
+            dtype=dtype,
+            device=device,
+        )
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask,
+        rotary_pos_emb,
+        kv_cache=None,
+        use_cache=True,
+    ):
+        # hidden_states: [sq, b, h]
+
+        # =================================================
+        # Pre-allocate memory for key-values for inference.
+        # =================================================
+        # =====================
+        # Query, Key, and Value
+        # =====================
+
+        # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)]
+        mixed_x_layer = self.query_key_value(hidden_states)
+
+        if self.multi_query_attention:
+            (query_layer, key_layer, value_layer) = mixed_x_layer.split(
+                [
+                    self.num_attention_heads_per_partition * self.hidden_size_per_attention_head,
+                    self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
+                    self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
+                ],
+                dim=-1,
+            )
+            query_layer = query_layer.view(query_layer.size()[:-1] + (
+                self.num_attention_heads_per_partition,
+                self.hidden_size_per_attention_head,
+            ))
+            key_layer = key_layer.view(key_layer.size()[:-1] + (
+                self.num_multi_query_groups_per_partition,
+                self.hidden_size_per_attention_head,
+            ))
+            value_layer = value_layer.view(value_layer.size()[:-1] + (
+                self.num_multi_query_groups_per_partition,
+                self.hidden_size_per_attention_head,
+            ))
+        else:
+            new_tensor_shape = mixed_x_layer.size()[:-1] + (
+                self.num_attention_heads_per_partition,
+                3 * self.hidden_size_per_attention_head,
+            )
+            mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
+            # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
+            (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)
+
+        # apply relative positional encoding (rotary embedding)
+        if rotary_pos_emb is not None:
+            query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb)
+            key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb)
+
+        # adjust key and value for inference
+        if kv_cache is not None:
+            cache_k, cache_v = kv_cache
+            key_layer = torch.cat((cache_k, key_layer), dim=0)
+            value_layer = torch.cat((cache_v, value_layer), dim=0)
+        if use_cache:
+            kv_cache = (key_layer, value_layer)
+        else:
+            kv_cache = None
+
+        if self.multi_query_attention:
+            key_layer = key_layer.unsqueeze(-2)
+            key_layer = key_layer.expand(
+                -1,
+                -1,
+                -1,
+                self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition,
+                -1,
+            )
+            key_layer = key_layer.contiguous().view(key_layer.size()[:2] + (
+                self.num_attention_heads_per_partition,
+                self.hidden_size_per_attention_head,
+            ))
+            value_layer = value_layer.unsqueeze(-2)
+            value_layer = value_layer.expand(
+                -1,
+                -1,
+                -1,
+                self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition,
+                -1,
+            )
+            value_layer = value_layer.contiguous().view(value_layer.size()[:2] + (
+                self.num_attention_heads_per_partition,
+                self.hidden_size_per_attention_head,
+            ))
+
+        # ==================================
+        # core attention computation
+        # ==================================
+
+        context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask)
+
+        # =================
+        # Output. [sq, b, h]
+        # =================
+
+        output = self.dense(context_layer)
+
+        return output, kv_cache
+
+
+def _config_to_kwargs(args):
+    common_kwargs = {
+        "dtype": args.torch_dtype,
+    }
+    return common_kwargs
+
+
+class MLP(torch.nn.Module):
+    """MLP.
+
+    MLP will take the input with h hidden state, project it to
+    2 * ffn_hidden_size (the two SwiGLU halves), apply the gated nonlinearity,
+    and project the resulting ffn_hidden_size state back into h hidden dimension.
+    """
+
+    def __init__(self, config: ChatGLMConfig, device=None):
+        super(MLP, self).__init__()
+
+        self.add_bias = config.add_bias_linear
+
+        # Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf
+        self.dense_h_to_4h = nn.Linear(
+            config.hidden_size,
+            config.ffn_hidden_size * 2,
+            bias=self.add_bias,
+            device=device,
+            **_config_to_kwargs(config),
+        )
+
+        def swiglu(x):
+            x = torch.chunk(x, 2, dim=-1)
+            return F.silu(x[0]) * x[1]
+
+        self.activation_func = swiglu
+
+        # Project back to h.
+        self.dense_4h_to_h = nn.Linear(
+            config.ffn_hidden_size,
+            config.hidden_size,
+            bias=self.add_bias,
+            device=device,
+            **_config_to_kwargs(config),
+        )
+
+    def forward(self, hidden_states):
+        # [s, b, 2 * ffn_hidden_size]
+        intermediate_parallel = self.dense_h_to_4h(hidden_states)
+        intermediate_parallel = self.activation_func(intermediate_parallel)
+        # [s, b, h]
+        output = self.dense_4h_to_h(intermediate_parallel)
+        return output
+
+
+class GLMBlock(torch.nn.Module):
+    """A single transformer layer.
+
+    Transformer layer takes input with size [s, b, h] and returns an
+    output of the same size.
+    """
+
+    def __init__(self, config: ChatGLMConfig, layer_number, device=None):
+        super(GLMBlock, self).__init__()
+        self.layer_number = layer_number
+
+        self.apply_residual_connection_post_layernorm = (config.apply_residual_connection_post_layernorm)
+
+        self.fp32_residual_connection = config.fp32_residual_connection
+
+        LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
+        # Layernorm on the input data.
+        self.input_layernorm = LayerNormFunc(
+            config.hidden_size,
+            eps=config.layernorm_epsilon,
+            device=device,
+            dtype=config.torch_dtype,
+        )
+
+        # Self attention.
+        self.self_attention = SelfAttention(config, layer_number, device=device)
+        self.hidden_dropout = config.hidden_dropout
+
+        # Layernorm on the attention output
+        self.post_attention_layernorm = LayerNormFunc(
+            config.hidden_size,
+            eps=config.layernorm_epsilon,
+            device=device,
+            dtype=config.torch_dtype,
+        )
+
+        # MLP
+        self.mlp = MLP(config, device=device)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask,
+        rotary_pos_emb,
+        kv_cache=None,
+        use_cache=True,
+    ):
+        # hidden_states: [s, b, h]
+
+        # Layer norm at the beginning of the transformer layer.
+        layernorm_output = self.input_layernorm(hidden_states)
+        # Self attention.
+        attention_output, kv_cache = self.self_attention(
+            layernorm_output,
+            attention_mask,
+            rotary_pos_emb,
+            kv_cache=kv_cache,
+            use_cache=use_cache,
+        )
+
+        # Residual connection.
+        if self.apply_residual_connection_post_layernorm:
+            residual = layernorm_output
+        else:
+            residual = hidden_states
+
+        layernorm_input = torch.nn.functional.dropout(attention_output, p=self.hidden_dropout, training=self.training)
+        layernorm_input = residual + layernorm_input
+
+        # Layer norm post the self attention.
+        layernorm_output = self.post_attention_layernorm(layernorm_input)
+
+        # MLP.
+        mlp_output = self.mlp(layernorm_output)
+
+        # Second residual connection.
+        if self.apply_residual_connection_post_layernorm:
+            residual = layernorm_output
+        else:
+            residual = layernorm_input
+
+        output = torch.nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=self.training)
+        output = residual + output
+
+        return output, kv_cache
+
+
+class GLMTransformer(torch.nn.Module):
+    """Stack of ``GLMBlock`` layers with an optional final layer norm."""
+
+    def __init__(self, config: ChatGLMConfig, device=None):
+        """Build ``config.num_layers`` blocks and, if requested, the final norm.
+
+        Args:
+            config: Model configuration; reads ``fp32_residual_connection``,
+                ``post_layer_norm``, ``num_layers``, ``rmsnorm``, ``hidden_size``,
+                ``layernorm_epsilon`` and ``torch_dtype``.
+            device: Forwarded to every sub-module constructor.
+        """
+        super(GLMTransformer, self).__init__()
+
+        self.fp32_residual_connection = config.fp32_residual_connection
+        self.post_layer_norm = config.post_layer_norm
+
+        # Number of layers.
+        self.num_layers = config.num_layers
+
+        # Transformer layers; blocks are numbered starting from 1.
+        def build_layer(layer_number):
+            return GLMBlock(config, layer_number, device=device)
+
+        self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)])
+
+        if self.post_layer_norm:
+            LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
+            # Final layer norm before output.
+            self.final_layernorm = LayerNormFunc(
+                config.hidden_size,
+                eps=config.layernorm_epsilon,
+                device=device,
+                dtype=config.torch_dtype,
+            )
+
+        self.gradient_checkpointing = False
+
+    def _get_layer(self, layer_number):
+        """Return the block stored at zero-based index ``layer_number``."""
+        return self.layers[layer_number]
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask,
+        rotary_pos_emb,
+        kv_caches=None,
+        use_cache: Optional[bool] = True,
+        output_hidden_states: Optional[bool] = False,
+    ):
+        """Run every block in order and apply the final norm when configured.
+
+        Args:
+            hidden_states: Input to the first block.
+            attention_mask: Forwarded to every block.
+            rotary_pos_emb: Forwarded to every block.
+            kv_caches: Optional per-layer cache entries; a falsy value means "no cache".
+            use_cache: Whether to collect the per-layer caches returned by the blocks.
+                Forced to ``False`` while gradient checkpointing is active in training.
+            output_hidden_states: Whether to collect the input of every block plus the
+                output of the last block (taken before the final norm).
+
+        Returns:
+            ``(hidden_states, presents, all_hidden_states, all_self_attentions)``.
+            ``presents`` is ``None`` when caching is off, ``all_hidden_states`` is
+            ``None`` unless requested, and ``all_self_attentions`` is always ``None``.
+        """
+        if not kv_caches:
+            kv_caches = [None for _ in range(self.num_layers)]
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...")
+                use_cache = False
+        # Create the cache container only once `use_cache` is final; otherwise a stale
+        # empty tuple would be returned although caching was just disabled.
+        presents = () if use_cache else None
+
+        all_self_attentions = None
+        all_hidden_states = () if output_hidden_states else None
+        for index in range(self.num_layers):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer = self._get_layer(index)
+            if self.gradient_checkpointing and self.training:
+                # checkpoint() only takes positional arguments for the wrapped module.
+                layer_ret = torch.utils.checkpoint.checkpoint(
+                    layer,
+                    hidden_states,
+                    attention_mask,
+                    rotary_pos_emb,
+                    kv_caches[index],
+                    use_cache,
+                )
+            else:
+                layer_ret = layer(
+                    hidden_states,
+                    attention_mask,
+                    rotary_pos_emb,
+                    kv_cache=kv_caches[index],
+                    use_cache=use_cache,
+                )
+            hidden_states, kv_cache = layer_ret
+            if use_cache:
+                presents = presents + (kv_cache,)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        # Final layer norm.
+        if self.post_layer_norm:
+            hidden_states = self.final_layernorm(hidden_states)
+
+        return hidden_states, presents, all_hidden_states, all_self_attentions
+
+
+class ChatGLMPreTrainedModel(PreTrainedModel):
+    """
+    Common base for the ChatGLM models: class-level configuration plus helpers
+    for building attention masks and position ids.
+    """
+
+    is_parallelizable = False
+    supports_gradient_checkpointing = True
+    config_class = ChatGLMConfig
+    base_model_prefix = "transformer"
+    _no_split_modules = ["GLMBlock"]
+
+    def _init_weights(self, module: nn.Module):
+        """Intentionally a no-op: no weight initialisation is performed here."""
+        return
+
+    def get_masks(self, input_ids, past_key_values, padding_mask=None):
+        """Build a boolean attention mask of shape ``[batch, 1, seq, past + seq]``.
+
+        A lower-triangular 0/1 matrix is created, optionally prefixed with ones for
+        cached positions and multiplied by ``padding_mask``. The result is ``True``
+        wherever that 0/1 matrix ends up below 0.5.
+        """
+        batch_size, seq_length = input_ids.shape
+        device = input_ids.device
+
+        mask = torch.ones(batch_size, seq_length, seq_length, device=device)
+        mask.tril_()
+
+        # Cached length is read from dim 0 of the first cached tensor of the first layer.
+        past_length = past_key_values[0][0].shape[0] if past_key_values else 0
+        if past_length:
+            cached_columns = torch.ones(batch_size, seq_length, past_length, device=device)
+            mask = torch.cat((cached_columns, mask), dim=-1)
+
+        if padding_mask is not None:
+            mask = mask * padding_mask.unsqueeze(1)
+            if not past_length:
+                # Adds 1 to every row whose own padding_mask entry is 0.
+                mask -= padding_mask.unsqueeze(-1) - 1
+
+        mask = (mask < 0.5).bool()
+        mask.unsqueeze_(1)
+        return mask
+
+    def get_position_ids(self, input_ids, device):
+        """Return ``0..seq_length-1`` repeated for every row of ``input_ids``."""
+        batch_size, seq_length = input_ids.shape
+        single_row = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0)
+        return single_row.repeat(batch_size, 1)
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        """Toggle ``gradient_checkpointing`` on ``module`` if it is a ``GLMTransformer``."""
+        if not isinstance(module, GLMTransformer):
+            return
+        module.gradient_checkpointing = value
+
+
+class Embedding(torch.nn.Module):
+    """Word-embedding layer that emits sequence-first activations."""
+
+    def __init__(self, config: ChatGLMConfig, device=None):
+        """Create the embedding table from ``config``.
+
+        Args:
+            config: Reads ``hidden_size``, ``padded_vocab_size``, ``torch_dtype`` and
+                ``fp32_residual_connection``.
+            device: Device on which the embedding table is created.
+        """
+        super(Embedding, self).__init__()
+
+        self.hidden_size = config.hidden_size
+        self.fp32_residual_connection = config.fp32_residual_connection
+        # Word embeddings (parallel).
+        self.word_embeddings = nn.Embedding(
+            config.padded_vocab_size,
+            self.hidden_size,
+            dtype=config.torch_dtype,
+            device=device,
+        )
+
+    def forward(self, input_ids):
+        """Look up ``input_ids`` and return the embeddings with dims 0 and 1 swapped."""
+        # Data format change to avoid explicit transposes: [b s h] --> [s b h].
+        embeddings = self.word_embeddings(input_ids).transpose(0, 1).contiguous()
+        # Up-cast to fp32 when the fp32 residual connection flag is set.
+        return embeddings.float() if self.fp32_residual_connection else embeddings
+
+
+class ChatGLMModel(ChatGLMPreTrainedModel):
+
+    def __init__(self, config: ChatGLMConfig, device=None, empty_init=True):
+        """Assemble embedding, rotary embedding, encoder stack and output projection.
+
+        Args:
+            config: Model configuration.
+            device: Optional device forwarded to the sub-module constructors.
+            empty_init: Build sub-modules through ``skip_init`` when true, otherwise
+                through ``default_init``.
+        """
+        super().__init__(config)
+        init_method = skip_init if empty_init else default_init
+        init_kwargs = {} if device is None else {"device": device}
+
+        self.embedding = init_method(Embedding, config, **init_kwargs)
+        self.num_layers = config.num_layers
+        self.multi_query_group_num = config.multi_query_group_num
+        self.kv_channels = config.kv_channels
+
+        # Rotary positional embeddings
+        self.seq_length = config.seq_length
+        if config.kv_channels is None:
+            rotary_dim = config.hidden_size // config.num_attention_heads
+        else:
+            rotary_dim = config.kv_channels
+
+        self.rotary_pos_emb = RotaryEmbedding(
+            rotary_dim // 2,
+            original_impl=config.original_rope,
+            device=device,
+            dtype=config.torch_dtype,
+        )
+        self.encoder = init_method(GLMTransformer, config, **init_kwargs)
+        self.output_layer = init_method(
+            nn.Linear,
+            config.hidden_size,
+            config.padded_vocab_size,
+            bias=False,
+            dtype=config.torch_dtype,
+            **init_kwargs,
+        )
+
+        self.pre_seq_len = config.pre_seq_len
+        self.prefix_projection = config.prefix_projection
+        if self.pre_seq_len is None:
+            return
+
+        # Prefix mode: freeze everything created so far. The prefix encoder is built
+        # afterwards, so its parameters are not touched by this loop.
+        for frozen_param in self.parameters():
+            frozen_param.requires_grad = False
+        self.prefix_tokens = torch.arange(self.pre_seq_len).long()
+        self.prefix_encoder = PrefixEncoder(config)
+        self.dropout = torch.nn.Dropout(0.1)
+
+    def get_input_embeddings(self):
+        """Return the word-embedding module held by ``self.embedding``."""
+        embedding_layer = self.embedding
+        return embedding_layer.word_embeddings
+
+    def get_prompt(self, batch_size, device, dtype=torch.half):
+        """Encode the prefix tokens into per-layer cache tensors.
+
+        Returns:
+            The encoder output reshaped to
+            ``[batch, pre_seq_len, num_layers * 2, multi_query_group_num, kv_channels]``,
+            passed through dropout, permuted with ``[2, 1, 0, 3, 4]`` and split into
+            chunks of two along the first dimension.
+        """
+        tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1)
+        tokens = tokens.to(device)
+        encoded = self.prefix_encoder(tokens).type(dtype)
+        target_shape = (
+            batch_size,
+            self.pre_seq_len,
+            self.num_layers * 2,
+            self.multi_query_group_num,
+            self.kv_channels,
+        )
+        encoded = self.dropout(encoded.view(*target_shape))
+        # seq_len, b, nh, hidden_size
+        return encoded.permute([2, 1, 0, 3, 4]).split(2)
+
+    def forward(
+        self,
+        input_ids,
+        position_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.BoolTensor] = None,
+        full_attention_mask: Optional[torch.BoolTensor] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        """Embed the input, build mask and rotary embeddings, and run the encoder.
+
+        Args:
+            input_ids: Token ids, unpacked as ``(batch_size, seq_length)``.
+            position_ids: Optional indices used to select rotary embeddings; when
+                omitted the first ``seq_length`` positions are used.
+            attention_mask: Optional padding mask; extended with ones for the prefix
+                when ``self.pre_seq_len`` is set.
+            full_attention_mask: Optional ready-made mask that skips ``get_masks``.
+            past_key_values: Optional per-layer caches handed to the encoder.
+            inputs_embeds: Optional precomputed embeddings used instead of
+                ``self.embedding(input_ids)``.
+            use_cache, output_hidden_states, return_dict: Fall back to the values in
+                ``self.config`` when ``None``.
+
+        Returns:
+            ``BaseModelOutputWithPast`` when ``return_dict`` is true, otherwise a tuple
+            of the non-``None`` encoder outputs.
+        """
+        output_hidden_states = (output_hidden_states
+                                if output_hidden_states is not None else self.config.output_hidden_states)
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = (return_dict if return_dict is not None else self.config.use_return_dict)
+
+        # NOTE(review): input_ids is dereferenced unconditionally here (and below for
+        # its device), so passing only inputs_embeds is not supported as written.
+        batch_size, seq_length = input_ids.shape
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embedding(input_ids)
+
+        # Prefix mode: generate prefix caches if none were supplied and make the
+        # padding mask cover the prefix positions.
+        if self.pre_seq_len is not None:
+            if past_key_values is None:
+                past_key_values = self.get_prompt(
+                    batch_size=batch_size,
+                    device=input_ids.device,
+                    dtype=inputs_embeds.dtype,
+                )
+            if attention_mask is not None:
+                attention_mask = torch.cat(
+                    [
+                        attention_mask.new_ones((batch_size, self.pre_seq_len)),
+                        attention_mask,
+                    ],
+                    dim=-1,
+                )
+
+        # An explicit mask is only built when there is padding, or when a cache is
+        # combined with more than one new token; otherwise None is passed on.
+        if full_attention_mask is None:
+            if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1):
+                full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask)
+
+        # Rotary positional embeddings
+        rotary_pos_emb = self.rotary_pos_emb(self.seq_length)
+        if position_ids is not None:
+            rotary_pos_emb = rotary_pos_emb[position_ids]
+        else:
+            rotary_pos_emb = rotary_pos_emb[None, :seq_length]
+        rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous()
+
+        # Run encoder.
+        hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder(
+            inputs_embeds,
+            full_attention_mask,
+            rotary_pos_emb=rotary_pos_emb,
+            kv_caches=past_key_values,
+            use_cache=use_cache,
+            output_hidden_states=output_hidden_states,
+        )
+
+        # Tuple output drops the entries that are None, so positions are not fixed.
+        if not return_dict:
+            return tuple(v for v in [
+                hidden_states,
+                presents,
+                all_hidden_states,
+                all_self_attentions,
+            ] if v is not None)
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=presents,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+    def quantize(self, weight_bit_width: int):
+        """Quantize ``self.encoder`` to ``weight_bit_width`` bits and return ``self``."""
+        # Imported lazily so the quantization module is only loaded on demand.
+        from .quantization import quantize as quantize_encoder
+
+        quantize_encoder(self.encoder, weight_bit_width)
+        return self
+
+
+class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
+
+    def __init__(self, config: ChatGLMConfig, empty_init=True, device=None):
+        """Wrap a ``ChatGLMModel`` and quantize it when the config asks for it.
+
+        Args:
+            config: Model configuration; ``max_length`` and ``quantization_bit`` are read here.
+            empty_init: Forwarded to ``ChatGLMModel``.
+            device: Forwarded to ``ChatGLMModel``.
+        """
+        super().__init__(config)
+
+        self.max_sequence_length = config.max_length
+        self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device)
+        self.config = config
+        self.quantized = False
+
+        requested_bits = self.config.quantization_bit
+        if requested_bits:
+            self.quantize(requested_bits, empty_init=True)
+
+    def _update_model_kwargs_for_generation(
+        self,
+        outputs: ModelOutput,
+        model_kwargs: Dict[str, Any],
+        is_encoder_decoder: bool = False,
+        standardize_cache_format: bool = False,
+    ) -> Dict[str, Any]:
+        """Advance ``model_kwargs`` by one generated token.
+
+        Stores the new cache, appends a column of ones to ``attention_mask``, appends
+        ``last position + 1`` to ``position_ids`` (each only if the key is present), and
+        marks that the first forward pass is over. ``model_kwargs`` is updated in place
+        and returned.
+        """
+        # update past_key_values
+        model_kwargs["past_key_values"] = self._extract_past_from_model_output(
+            outputs, standardize_cache_format=standardize_cache_format)
+
+        # update attention mask
+        if "attention_mask" in model_kwargs:
+            mask = model_kwargs["attention_mask"]
+            new_column = mask.new_ones((mask.shape[0], 1))
+            model_kwargs["attention_mask"] = torch.cat([mask, new_column], dim=-1)
+
+        # update position ids
+        if "position_ids" in model_kwargs:
+            positions = model_kwargs["position_ids"]
+            next_position = positions[..., -1:] + 1
+            model_kwargs["position_ids"] = torch.cat([positions, next_position], dim=-1)
+
+        model_kwargs["is_first_forward"] = False
+        return model_kwargs
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids: torch.LongTensor,
+        past_key_values: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        is_first_forward: bool = True,
+        **kwargs,
+    ) -> dict:
+        """Build the keyword arguments for one forward pass during generation.
+
+        Position ids are derived from ``input_ids`` when not supplied. After the first
+        forward pass only the last token and its position id are kept.
+        """
+        if position_ids is None:
+            position_ids = self.get_position_ids(input_ids, device=input_ids.device)
+
+        if is_first_forward:
+            step_ids, step_positions = input_ids, position_ids
+        else:
+            # Later steps only feed the newest token.
+            step_ids, step_positions = input_ids[:, -1:], position_ids[..., -1:]
+
+        return {
+            "input_ids": step_ids,
+            "past_key_values": past_key_values,
+            "position_ids": step_positions,
+            "attention_mask": attention_mask,
+            "return_last_logit": True,
+        }
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        return_last_logit: Optional[bool] = False,
+    ):
+        """Run the transformer, project to vocabulary logits and optionally compute the LM loss.
+
+        Args:
+            input_ids, position_ids, attention_mask, past_key_values, inputs_embeds,
+                use_cache, output_hidden_states, return_dict: Forwarded to
+                ``self.transformer``.
+            labels: Optional target ids; when given, a shifted cross-entropy loss
+                (``ignore_index=-100``) is computed.
+            output_attentions: Accepted but not used by this method.
+            return_last_logit: When true, only the last slice along dim 0 of the hidden
+                states is projected to logits.
+
+        Returns:
+            ``CausalLMOutputWithPast`` when ``return_dict`` is true, otherwise a tuple
+            ``(loss?, logits, *remaining transformer outputs)``.
+        """
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = (return_dict if return_dict is not None else self.config.use_return_dict)
+
+        transformer_outputs = self.transformer(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = transformer_outputs[0]
+        if return_last_logit:
+            hidden_states = hidden_states[-1:]
+        lm_logits = self.transformer.output_layer(hidden_states)
+        # Swap dims 0 and 1 of the logits before they are used for the loss / returned.
+        lm_logits = lm_logits.transpose(0, 1).contiguous()
+
+        loss = None
+        if labels is not None:
+            # The loss is computed in fp32; logits and loss are cast back afterwards.
+            lm_logits = lm_logits.to(torch.float32)
+
+            # Shift so that tokens < n predict n
+            shift_logits = lm_logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss(ignore_index=-100)
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+
+            lm_logits = lm_logits.to(hidden_states.dtype)
+            loss = loss.to(hidden_states.dtype)
+
+        if not return_dict:
+            output = (lm_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=lm_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+    @staticmethod
+    def _reorder_cache(past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...],
+                       beam_idx: torch.LongTensor) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]:
+        """
+        Re-order the `past_key_values` cache when [`~PreTrainedModel.beam_search`] or
+        [`~PreTrainedModel.beam_sample`] is called, so that every cached entry lines up
+        with the beam selected by `beam_idx` at the current generation step.
+
+        Both tensors of every layer are indexed along dimension 1 with `beam_idx`.
+        """
+        reordered = []
+        for layer_past in past:
+            first, second = layer_past[0], layer_past[1]
+            reordered.append((
+                first.index_select(1, beam_idx.to(first.device)),
+                second.index_select(1, beam_idx.to(second.device)),
+            ))
+        return tuple(reordered)
+
+    def process_response(self, response):
+        """Strip surrounding whitespace and substitute the training-time placeholder."""
+        return response.strip().replace("[[训练时间]]", "2023年")
+
+    def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = None):
+        """Tokenize the prompt built from ``query`` and ``history`` and move it to ``self.device``."""
+        full_prompt = tokenizer.build_prompt(query, history=history)
+        return tokenizer([full_prompt], return_tensors="pt").to(self.device)
+
+    def build_stream_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = None):
+        """Tokenize only the newest round of a streamed conversation.
+
+        Args:
+            tokenizer: Tokenizer providing ``encode``, ``batch_encode_plus`` and ``__call__``.
+            query: The user's new message.
+            history: Earlier ``(query, response)`` rounds; ``None`` or empty means this is
+                the first round.
+
+        Returns:
+            The tokenizer output moved to ``self.device``.
+        """
+        if history:
+            # Continuation round: the prompt starts with a separator, the first encoded
+            # token is dropped and no special tokens are added.
+            prompt = "\n\n[Round {}]\n\n问：{}\n\n答：".format(len(history) + 1, query)
+            input_ids = tokenizer.encode(prompt, add_special_tokens=False)
+            input_ids = input_ids[1:]
+            inputs = tokenizer.batch_encode_plus([(input_ids, None)], return_tensors="pt", add_special_tokens=False)
+        else:
+            # First round. `history` may be None here, so len(history) must not be used;
+            # with no earlier rounds the round number is always 1.
+            prompt = "[Round {}]\n\n问：{}\n\n答：".format(1, query)
+            inputs = tokenizer([prompt], return_tensors="pt")
+        inputs = inputs.to(self.device)
+        return inputs
+
+    @torch.no_grad()
+    def chat(
+        self,
+        tokenizer,
+        query: str,
+        history: List[Tuple[str, str]] = None,
+        max_length: int = 8192,
+        num_beams=1,
+        do_sample=True,
+        top_p=0.8,
+        temperature=0.8,
+        logits_processor=None,
+        **kwargs,
+    ):
+        """Generate a single reply to ``query``.
+
+        Args:
+            tokenizer: Tokenizer used to build the prompt and decode the output.
+            query: The user's new message.
+            history: Earlier ``(query, response)`` rounds; ``None`` means no history.
+            max_length, num_beams, do_sample, top_p, temperature: Forwarded to ``generate``.
+            logits_processor: Optional processors; an ``InvalidScoreLogitsProcessor`` is
+                always added. The caller's list is left untouched.
+            **kwargs: Extra generation arguments forwarded to ``generate``.
+
+        Returns:
+            ``(response, history)`` where ``history`` is a new list that ends with
+            ``(query, response)``.
+        """
+        if history is None:
+            history = []
+        # Copy before appending so repeated calls with the same list do not keep
+        # accumulating processors in the caller's object.
+        if logits_processor is None:
+            logits_processor = LogitsProcessorList()
+        else:
+            logits_processor = LogitsProcessorList(logits_processor)
+        logits_processor.append(InvalidScoreLogitsProcessor())
+        gen_kwargs = {
+            "max_length": max_length,
+            "num_beams": num_beams,
+            "do_sample": do_sample,
+            "top_p": top_p,
+            "temperature": temperature,
+            "logits_processor": logits_processor,
+            **kwargs,
+        }
+        inputs = self.build_inputs(tokenizer, query, history=history)
+        outputs = self.generate(**inputs, **gen_kwargs)
+        # Keep only the tokens generated after the prompt.
+        outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):]
+        response = tokenizer.decode(outputs)
+        response = self.process_response(response)
+        history = history + [(query, response)]
+        return response, history
+
+    @torch.no_grad()
+    def stream_chat(
+        self,
+        tokenizer,
+        query: str,
+        history: List[Tuple[str, str]] = None,
+        past_key_values=None,
+        max_length: int = 8192,
+        do_sample=True,
+        top_p=0.8,
+        temperature=0.8,
+        logits_processor=None,
+        return_past_key_values=False,
+        **kwargs,
+    ):
+        """Generate a reply to ``query`` incrementally.
+
+        Args:
+            tokenizer: Tokenizer used to build the prompt and decode partial output.
+            query: The user's new message.
+            history: Earlier ``(query, response)`` rounds; ``None`` means no history.
+            past_key_values: Optional cache from an earlier call to continue from.
+            max_length, do_sample, top_p, temperature: Forwarded to ``stream_generate``.
+            logits_processor: Optional processors; an ``InvalidScoreLogitsProcessor`` is
+                always added. The caller's list is left untouched.
+            return_past_key_values: Also yield the current cache with every update.
+            **kwargs: Extra generation arguments forwarded to ``stream_generate``.
+
+        Yields:
+            ``(response, new_history)``, or ``(response, new_history, past_key_values)``
+            when ``return_past_key_values`` is true.
+        """
+        if history is None:
+            history = []
+        # Copy before appending so repeated calls with the same list do not keep
+        # accumulating processors in the caller's object.
+        if logits_processor is None:
+            logits_processor = LogitsProcessorList()
+        else:
+            logits_processor = LogitsProcessorList(logits_processor)
+        logits_processor.append(InvalidScoreLogitsProcessor())
+        gen_kwargs = {
+            "max_length": max_length,
+            "do_sample": do_sample,
+            "top_p": top_p,
+            "temperature": temperature,
+            "logits_processor": logits_processor,
+            **kwargs,
+        }
+        # The full prompt is only rebuilt when no cache is involved at all.
+        if past_key_values is None and not return_past_key_values:
+            inputs = self.build_inputs(tokenizer, query, history=history)
+        else:
+            inputs = self.build_stream_inputs(tokenizer, query, history=history)
+        if past_key_values is not None:
+            past_length = past_key_values[0][0].shape[0]
+            if self.transformer.pre_seq_len is not None:
+                past_length -= self.transformer.pre_seq_len
+            # Shift positions past the cached tokens and extend the mask over them.
+            inputs.position_ids += past_length
+            attention_mask = inputs.attention_mask
+            attention_mask = torch.cat((attention_mask.new_ones(1, past_length), attention_mask), dim=1)
+            inputs["attention_mask"] = attention_mask
+        for outputs in self.stream_generate(
+                **inputs,
+                past_key_values=past_key_values,
+                return_past_key_values=return_past_key_values,
+                **gen_kwargs,
+        ):
+            if return_past_key_values:
+                outputs, past_key_values = outputs
+            outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):]
+            response = tokenizer.decode(outputs)
+            # Skip updates that are empty or end in the replacement character.
+            if response and response[-1] != "�":
+                response = self.process_response(response)
+                new_history = history + [(query, response)]
+                if return_past_key_values:
+                    yield response, new_history, past_key_values
+                else:
+                    yield response, new_history
+
+    @torch.no_grad()
+    def stream_generate(
+        self,
+        input_ids,
+        generation_config: Optional[GenerationConfig] = None,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        stopping_criteria: Optional[StoppingCriteriaList] = None,
+        prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
+        return_past_key_values=False,
+        **kwargs,
+    ):
+        """Generate tokens one at a time, yielding the growing sequence after every step.
+
+        Args:
+            input_ids: Prompt token ids; the last dimension is the sequence length.
+            generation_config: Optional config; defaults to ``self.generation_config``.
+                A deep copy is updated with ``kwargs``, the original is not modified.
+            logits_processor: Optional extra logits processors.
+            stopping_criteria: Optional extra stopping criteria.
+            prefix_allowed_tokens_fn: Forwarded to ``self._get_logits_processor``.
+            return_past_key_values: Also yield ``outputs.past_key_values`` every step.
+            **kwargs: Generation-config overrides; entries not consumed by the config
+                are used as model keyword arguments.
+
+        Yields:
+            ``input_ids`` including all tokens generated so far, or
+            ``(input_ids, past_key_values)`` when ``return_past_key_values`` is true.
+        """
+        input_ids_seq_length = input_ids.shape[-1]
+
+        if generation_config is None:
+            generation_config = self.generation_config
+        generation_config = copy.deepcopy(generation_config)
+        model_kwargs = generation_config.update(**kwargs)
+
+        # Normalise the EOS ids to a list so the loop below can treat them uniformly.
+        eos_token_id = generation_config.eos_token_id
+        if eos_token_id is None:
+            eos_token_id = []
+        elif isinstance(eos_token_id, int):
+            eos_token_id = [eos_token_id]
+
+        has_default_max_length = (kwargs.get("max_length") is None and generation_config.max_length is not None)
+        if has_default_max_length and generation_config.max_new_tokens is None:
+            warnings.warn(
+                f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. "
+                "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we"
+                " recommend using `max_new_tokens` to control the maximum length of the generation.",
+                UserWarning,
+            )
+        elif generation_config.max_new_tokens is not None:
+            generation_config.max_length = (generation_config.max_new_tokens + input_ids_seq_length)
+            if not has_default_max_length:
+                # logger.warning takes %-format args, not a warning category; passing
+                # UserWarning here would break the formatting of the log record.
+                logger.warning(
+                    f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
+                    f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
+                    "Please refer to the documentation for more information. "
+                    "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)")
+
+        if input_ids_seq_length >= generation_config.max_length:
+            input_ids_string = ("decoder_input_ids" if self.config.is_encoder_decoder else "input_ids")
+            logger.warning(f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to"
+                           f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
+                           " increasing `max_new_tokens`.")
+
+        # 2. Set generation parameters if not already defined
+        logits_processor = (logits_processor if logits_processor is not None else LogitsProcessorList())
+        stopping_criteria = (stopping_criteria if stopping_criteria is not None else StoppingCriteriaList())
+
+        logits_processor = self._get_logits_processor(
+            generation_config=generation_config,
+            input_ids_seq_length=input_ids_seq_length,
+            encoder_input_ids=input_ids,
+            prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
+            logits_processor=logits_processor,
+        )
+
+        stopping_criteria = self._get_stopping_criteria(generation_config=generation_config,
+                                                        stopping_criteria=stopping_criteria)
+        logits_warper = self._get_logits_warper(generation_config)
+
+        unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
+        scores = None
+        while True:
+            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
+            # forward pass to get next token
+            outputs = self(
+                **model_inputs,
+                return_dict=True,
+                output_attentions=False,
+                output_hidden_states=False,
+            )
+
+            next_token_logits = outputs.logits[:, -1, :]
+
+            # pre-process distribution
+            next_token_scores = logits_processor(input_ids, next_token_logits)
+            next_token_scores = logits_warper(input_ids, next_token_scores)
+
+            # sample
+            probs = nn.functional.softmax(next_token_scores, dim=-1)
+            if generation_config.do_sample:
+                next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
+            else:
+                next_tokens = torch.argmax(probs, dim=-1)
+
+            # update generated ids, model inputs, and length for next step
+            input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
+            model_kwargs = self._update_model_kwargs_for_generation(outputs,
+                                                                    model_kwargs,
+                                                                    is_encoder_decoder=self.config.is_encoder_decoder)
+            # A sequence stays unfinished only if its new token differs from EVERY EOS
+            # id. Summing the per-id comparisons would stay non-zero with several ids.
+            still_running = torch.ones_like(next_tokens, dtype=torch.bool)
+            for eos in eos_token_id:
+                still_running &= next_tokens != eos
+            unfinished_sequences = unfinished_sequences.mul(still_running.long())
+            if return_past_key_values:
+                yield input_ids, outputs.past_key_values
+            else:
+                yield input_ids
+            # stop when each sentence is finished, or if we exceed the maximum length
+            if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
+                break
+
+    def quantize(self, bits: int, empty_init=False, device=None, **kwargs):
+        """Quantize the transformer encoder to ``bits`` bits.
+
+        Args:
+            bits: Target bit width; ``0`` means "do not quantize".
+            empty_init: Forwarded to the quantization routine.
+            device: Forwarded to the quantization routine.
+            **kwargs: Extra arguments forwarded to the quantization routine.
+
+        Returns:
+            ``self`` on every path, so calls can be chained.
+        """
+        if bits == 0:
+            # Return self here too, consistent with the other exit paths.
+            return self
+
+        if self.quantized:
+            logger.info("Already quantized.")
+            return self
+
+        # Imported lazily so the quantization module is only loaded when needed.
+        from .quantization import quantize
+
+        self.quantized = True
+
+        self.config.quantization_bit = bits
+
+        self.transformer.encoder = quantize(
+            self.transformer.encoder,
+            bits,
+            empty_init=empty_init,
+            device=device,
+            **kwargs,
+        )
+        return self
diff --git a/tests/test_shardformer/test_model/test_shard_chatglm.py b/tests/test_shardformer/test_model/test_shard_chatglm.py
new file mode 100644
index 000000000000..2cdf5da2e6da
--- /dev/null
+++ b/tests/test_shardformer/test_model/test_shard_chatglm.py
@@ -0,0 +1,107 @@
+import copy
+import os
+
+import pytest
+import torch
+
+import colossalai
+from colossalai.logging import disable_existing_loggers
+from colossalai.shardformer import ShardConfig, ShardFormer
+from colossalai.shardformer.policies.chatglm import ChatGLMModelPolicy
+from colossalai.tensor.d_tensor.api import is_customized_distributed_tensor, is_distributed_tensor
+from colossalai.testing import (
+    assert_hf_output_close,
+    clear_cache_before_run,
+    parameterize,
+    rerun_if_address_is_in_use,
+    spawn,
+)
+from tests.kit.model_zoo import model_zoo
+from tests.test_shardformer.test_model._utils import build_model, run_forward
+
+
+def _gather_full_grad(shard_weight, shard_grad):
+    """Return the gradient of ``shard_weight`` reassembled across all ranks.
+
+    When the weight is not a distributed tensor the local gradient is returned as is;
+    otherwise the shards of every rank are gathered and concatenated along dim 0.
+    """
+    if not (is_distributed_tensor(shard_weight) or is_customized_distributed_tensor(shard_weight)):
+        return shard_grad
+    # Size the receive list from the actual process group instead of hard-coding 2.
+    world_size = torch.distributed.get_world_size()
+    shard_grad_list = [torch.zeros_like(shard_grad) for _ in range(world_size)]
+    # all_gather fills the list in place; its return value is not the gathered data.
+    torch.distributed.all_gather(shard_grad_list, shard_grad)
+    return torch.cat(shard_grad_list, dim=0)
+
+
+def check_forward_backward(org_model, sharded_model, data_gen_fn, output_transform_fn, loss_fn):
+    """Compare outputs, loss and selected gradients of the original and sharded model.
+
+    Args:
+        org_model: Reference (unsharded) model.
+        sharded_model: Model produced by ShardFormer from a copy of ``org_model``.
+        data_gen_fn, output_transform_fn, loss_fn: Forwarded to ``run_forward``.
+
+    Raises:
+        AssertionError: If outputs, losses or the checked gradients differ by more
+            than the tolerance.
+    """
+    # check forward
+    org_output, org_loss, shard_output, shard_loss = run_forward(org_model, sharded_model, data_gen_fn,
+                                                                 output_transform_fn, loss_fn)
+    assert_hf_output_close(org_output, shard_output, ignore_keys=['past_key_values'])
+    # do backward
+    org_loss.backward()
+    shard_loss.backward()
+
+    assert torch.allclose(org_loss, shard_loss,
+                          atol=1e-5), f"shard model loss is not equal to orgin model loss\n{org_loss}\n{shard_loss}"
+
+    # unwrap model
+    if org_model.__class__.__name__ == 'ChatGLMModel':
+        chatglm_model = org_model
+        shard_chatglm_model = sharded_model
+    else:
+        chatglm_model = org_model.transformer
+        shard_chatglm_model = sharded_model.transformer
+
+    # check attention grad
+    org_grad = chatglm_model.encoder.layers[0].self_attention.query_key_value.weight.grad
+    shard_weight = shard_chatglm_model.encoder.layers[0].self_attention.query_key_value.weight
+    all_shard_grad = _gather_full_grad(shard_weight, shard_weight.grad)
+    assert torch.allclose(org_grad, all_shard_grad,
+                          atol=1e-5), f"shard model grad is not equal to orgin model grad\n{org_grad}\n{all_shard_grad}"
+
+    # check embedding weights
+    org_grad = chatglm_model.embedding.word_embeddings.weight.grad
+    shard_weight = shard_chatglm_model.embedding.word_embeddings.weight
+    all_shard_grad = _gather_full_grad(shard_weight, shard_weight.grad)
+    assert torch.allclose(org_grad, all_shard_grad,
+                          atol=1e-5), f"shard model grad is not equal to orgin model grad\n{org_grad}\n{all_shard_grad}"
+
+
+@parameterize('enable_fused_normalization', [True, False])
+@parameterize('enable_tensor_parallelism', [True, False])
+def run_chatglm_test(enable_fused_normalization, enable_tensor_parallelism):
+    """Shard every registered ChatGLM model with the given options and verify it.
+
+    Args:
+        enable_fused_normalization: Forwarded to ``ShardConfig``.
+        enable_tensor_parallelism: Forwarded to ``ShardConfig``.
+    """
+    sub_model_zoo = model_zoo.get_sub_registry('transformers_chatglm')
+    for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
+        if name != "transformers_chatglm":
+            # Only this entry has an explicit policy here. Skip anything else instead
+            # of checking against an unbound or stale `sharded_model`.
+            continue
+
+        # create new model
+        org_model = model_fn().cuda()
+
+        # shard a copy so the original stays available as the reference
+        shard_config = ShardConfig(enable_fused_normalization=enable_fused_normalization,
+                                   enable_tensor_parallelism=enable_tensor_parallelism)
+        model_copy = copy.deepcopy(org_model)
+        shard_former = ShardFormer(shard_config=shard_config)
+        sharded_model = shard_former.optimize(model_copy, ChatGLMModelPolicy()).cuda()
+
+        check_forward_backward(org_model, sharded_model, data_gen_fn, output_transform_fn, loss_fn)
+    torch.cuda.empty_cache()
+
+
+def check_chatglm(rank, world_size, port):
+    """Per-process entry point: initialise the distributed runtime, then run the test.
+
+    Args:
+        rank: Rank of this process.
+        world_size: Total number of processes.
+        port: Port used for the rendezvous on localhost.
+    """
+    disable_existing_loggers()
+    colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    # The @parameterize decorators on run_chatglm_test supply its arguments.
+    run_chatglm_test()
+
+
+@pytest.mark.dist
+@rerun_if_address_is_in_use()
+@clear_cache_before_run()
+def test_chatglm():
+    """Pytest entry point: run ``check_chatglm`` through ``spawn`` with 2 as its second argument."""
+    spawn(check_chatglm, 2)
+
+
+# Allow running this test file directly without pytest.
+if __name__ == "__main__":
+    test_chatglm()
diff --git a/tests/test_shardformer/test_model/test_shard_vit.py b/tests/test_shardformer/test_model/test_shard_vit.py
index a96fd02ae746..a8048a9bdd12 100644
--- a/tests/test_shardformer/test_model/test_shard_vit.py
+++ b/tests/test_shardformer/test_model/test_shard_vit.py
@@ -69,7 +69,6 @@ def check_vit(rank, world_size, port):
 
 
 @pytest.mark.dist
-@pytest.mark.skip
 @rerun_if_address_is_in_use()
 @clear_cache_before_run()
 def test_vit():

From 6c2acf05d62b680262b33b6e8eca829e49c729af Mon Sep 17 00:00:00 2001
From: klhhhhh <1412841649@qq.com>
Date: Tue, 4 Jul 2023 14:35:55 +0800
Subject: [PATCH 05/21] [shardformer] added tests

---
 tests/test_shardformer/test_model/test_shard_vit.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_shardformer/test_model/test_shard_vit.py b/tests/test_shardformer/test_model/test_shard_vit.py
index a8048a9bdd12..fdda8c505e9f 100644
--- a/tests/test_shardformer/test_model/test_shard_vit.py
+++ b/tests/test_shardformer/test_model/test_shard_vit.py
@@ -56,6 +56,7 @@ def check_forward_backward(org_model, sharded_model, data_gen_fn, output_transfo
 @parameterize('enable_tensor_parallelism', [True, False])
 def run_vit_test(enable_fused_normalization, enable_tensor_parallelism):
     sub_model_zoo = model_zoo.get_sub_registry('transformers_vit')
+    print(sub_model_zoo)
     for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
         org_model, sharded_model = build_model(model_fn, enable_fused_normalization, enable_tensor_parallelism)
         check_forward_backward(org_model, sharded_model, data_gen_fn, output_transform_fn, loss_fn)

From 7668b2468a41ed591665b8d40524e456a16b8543 Mon Sep 17 00:00:00 2001
From: klhhhhh <1412841649@qq.com>
Date: Thu, 6 Jul 2023 10:59:42 +0800
Subject: [PATCH 06/21] [shardformer] vit test finish and support

---
 tests/test_shardformer/test_model/test_shard_vit.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_shardformer/test_model/test_shard_vit.py b/tests/test_shardformer/test_model/test_shard_vit.py
index fdda8c505e9f..a8048a9bdd12 100644
--- a/tests/test_shardformer/test_model/test_shard_vit.py
+++ b/tests/test_shardformer/test_model/test_shard_vit.py
@@ -56,7 +56,6 @@ def check_forward_backward(org_model, sharded_model, data_gen_fn, output_transfo
 @parameterize('enable_tensor_parallelism', [True, False])
 def run_vit_test(enable_fused_normalization, enable_tensor_parallelism):
     sub_model_zoo = model_zoo.get_sub_registry('transformers_vit')
-    print(sub_model_zoo)
     for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
         org_model, sharded_model = build_model(model_fn, enable_fused_normalization, enable_tensor_parallelism)
         check_forward_backward(org_model, sharded_model, data_gen_fn, output_transform_fn, loss_fn)

From b135b75b70b64db3b2020af1e2b926e3444371be Mon Sep 17 00:00:00 2001
From: klhhhhh <1412841649@qq.com>
Date: Fri, 7 Jul 2023 19:16:35 +0800
Subject: [PATCH 07/21] import chatglm

---
 =2.0                                          |  134 ++
 .../chatglm2-6b/modeling_chatglm.py           | 1193 +++++++++++++++++
 2 files changed, 1327 insertions(+)
 create mode 100644 =2.0
 create mode 100644 tests/kit/model_zoo/transformers/chatglm2-6b/modeling_chatglm.py

diff --git a/=2.0 b/=2.0
new file mode 100644
index 000000000000..af47ce17aa8e
--- /dev/null
+++ b/=2.0
@@ -0,0 +1,134 @@
+Defaulting to user installation because normal site-packages is not writeable
+Collecting protobuf
+  Using cached protobuf-4.23.4-cp37-abi3-manylinux2014_x86_64.whl (304 kB)
+Requirement already satisfied: transformers==4.30.2 in /home/lclk/.local/lib/python3.9/site-packages (4.30.2)
+Collecting cpm_kernels
+  Using cached cpm_kernels-1.0.11-py3-none-any.whl (416 kB)
+Requirement already satisfied: torch in /home/lclk/.local/lib/python3.9/site-packages (2.0.0+cu118)
+Collecting gradio
+  Using cached gradio-3.36.0-py3-none-any.whl (19.8 MB)
+Collecting mdtex2html
+  Using cached mdtex2html-1.2.0-py3-none-any.whl (13 kB)
+Collecting sentencepiece
+  Using cached sentencepiece-0.1.99-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
+Collecting accelerate
+  Using cached accelerate-0.20.3-py3-none-any.whl (227 kB)
+Requirement already satisfied: pyyaml>=5.1 in /home/lclk/.local/lib/python3.9/site-packages (from transformers==4.30.2) (6.0)
+Requirement already satisfied: regex!=2019.12.17 in /home/lclk/.local/lib/python3.9/site-packages (from transformers==4.30.2) (2023.6.3)
+Requirement already satisfied: huggingface-hub<1.0,>=0.14.1 in /home/lclk/.local/lib/python3.9/site-packages (from transformers==4.30.2) (0.15.1)
+Requirement already satisfied: packaging>=20.0 in /home/lclk/.local/lib/python3.9/site-packages (from transformers==4.30.2) (23.1)
+Requirement already satisfied: requests in /opt/lcsoftware/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.3.0/miniconda3-4.10.3-u6p3tgreee7aigtnvuhr44yqo7vcg6r6/lib/python3.9/site-packages (from transformers==4.30.2) (2.25.1)
+Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /home/lclk/.local/lib/python3.9/site-packages (from transformers==4.30.2) (0.13.3)
+Requirement already satisfied: safetensors>=0.3.1 in /home/lclk/.local/lib/python3.9/site-packages (from transformers==4.30.2) (0.3.1)
+Requirement already satisfied: filelock in /home/lclk/.local/lib/python3.9/site-packages (from transformers==4.30.2) (3.12.0)
+Requirement already satisfied: numpy>=1.17 in /home/lclk/.local/lib/python3.9/site-packages (from transformers==4.30.2) (1.24.3)
+Requirement already satisfied: tqdm>=4.27 in /home/lclk/.local/lib/python3.9/site-packages (from transformers==4.30.2) (4.65.0)
+Requirement already satisfied: fsspec in /home/lclk/.local/lib/python3.9/site-packages (from huggingface-hub<1.0,>=0.14.1->transformers==4.30.2) (2023.6.0)
+Requirement already satisfied: typing-extensions>=3.7.4.3 in /home/lclk/.local/lib/python3.9/site-packages (from huggingface-hub<1.0,>=0.14.1->transformers==4.30.2) (4.6.3)
+Requirement already satisfied: networkx in /home/lclk/.local/lib/python3.9/site-packages (from torch) (3.1)
+Requirement already satisfied: sympy in /home/lclk/.local/lib/python3.9/site-packages (from torch) (1.12)
+Requirement already satisfied: triton==2.0.0 in /home/lclk/.local/lib/python3.9/site-packages (from torch) (2.0.0)
+Requirement already satisfied: jinja2 in /home/lclk/.local/lib/python3.9/site-packages (from torch) (3.1.2)
+Requirement already satisfied: lit in /home/lclk/.local/lib/python3.9/site-packages (from triton==2.0.0->torch) (16.0.5.post0)
+Requirement already satisfied: cmake in /home/lclk/.local/lib/python3.9/site-packages (from triton==2.0.0->torch) (3.26.3)
+Collecting aiofiles
+  Using cached aiofiles-23.1.0-py3-none-any.whl (14 kB)
+Collecting ffmpy
+  Using cached ffmpy-0.3.0.tar.gz (4.8 kB)
+Requirement already satisfied: pillow in /home/lclk/.local/lib/python3.9/site-packages (from gradio) (9.5.0)
+Collecting pydub
+  Using cached pydub-0.25.1-py2.py3-none-any.whl (32 kB)
+Requirement already satisfied: pandas in /home/lclk/.local/lib/python3.9/site-packages (from gradio) (2.0.2)
+Collecting python-multipart
+  Using cached python_multipart-0.0.6-py3-none-any.whl (45 kB)
+Collecting semantic-version
+  Using cached semantic_version-2.10.0-py2.py3-none-any.whl (15 kB)
+Collecting pydantic
+  Using cached pydantic-2.0.2-py3-none-any.whl (359 kB)
+Collecting uvicorn>=0.14.0
+  Using cached uvicorn-0.22.0-py3-none-any.whl (58 kB)
+Collecting mdit-py-plugins<=0.3.3
+  Using cached mdit_py_plugins-0.3.3-py3-none-any.whl (50 kB)
+Requirement already satisfied: pygments>=2.12.0 in /home/lclk/.local/lib/python3.9/site-packages (from gradio) (2.15.1)
+Collecting httpx
+  Using cached httpx-0.24.1-py3-none-any.whl (75 kB)
+Collecting orjson
+  Using cached orjson-3.9.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (136 kB)
+Collecting fastapi
+  Using cached fastapi-0.99.1-py3-none-any.whl (58 kB)
+Collecting altair>=4.2.0
+  Using cached altair-5.0.1-py3-none-any.whl (471 kB)
+Collecting gradio-client>=0.2.7
+  Using cached gradio_client-0.2.7-py3-none-any.whl (288 kB)
+Requirement already satisfied: aiohttp in /home/lclk/.local/lib/python3.9/site-packages (from gradio) (3.8.4)
+Requirement already satisfied: matplotlib in /home/lclk/.local/lib/python3.9/site-packages (from gradio) (3.7.1)
+Collecting websockets>=10.0
+  Using cached websockets-11.0.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (129 kB)
+Requirement already satisfied: markdown-it-py[linkify]>=2.0.0 in /home/lclk/.local/lib/python3.9/site-packages (from gradio) (2.2.0)
+Requirement already satisfied: markupsafe in /home/lclk/.local/lib/python3.9/site-packages (from gradio) (2.1.3)
+Collecting toolz
+  Using cached toolz-0.12.0-py3-none-any.whl (55 kB)
+Collecting jsonschema>=3.0
+  Using cached jsonschema-4.18.0-py3-none-any.whl (81 kB)
+Collecting rpds-py>=0.7.1
+  Downloading rpds_py-0.8.8-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
+Collecting referencing>=0.28.4
+  Using cached referencing-0.29.1-py3-none-any.whl (25 kB)
+Collecting jsonschema-specifications>=2023.03.6
+  Using cached jsonschema_specifications-2023.6.1-py3-none-any.whl (17 kB)
+Requirement already satisfied: attrs>=22.2.0 in /home/lclk/.local/lib/python3.9/site-packages (from jsonschema>=3.0->altair>=4.2.0->gradio) (23.1.0)
+Requirement already satisfied: mdurl~=0.1 in /home/lclk/.local/lib/python3.9/site-packages (from markdown-it-py[linkify]>=2.0.0->gradio) (0.1.2)
+Collecting linkify-it-py<3,>=1
+  Downloading linkify_it_py-2.0.2-py3-none-any.whl (19 kB)
+Collecting uc-micro-py
+  Downloading uc_micro_py-1.0.2-py3-none-any.whl (6.2 kB)
+Requirement already satisfied: pytz>=2020.1 in /home/lclk/.local/lib/python3.9/site-packages (from pandas->gradio) (2023.3)
+Requirement already satisfied: tzdata>=2022.1 in /home/lclk/.local/lib/python3.9/site-packages (from pandas->gradio) (2023.3)
+Requirement already satisfied: python-dateutil>=2.8.2 in /home/lclk/.local/lib/python3.9/site-packages (from pandas->gradio) (2.8.2)
+Requirement already satisfied: six>=1.5 in /opt/lcsoftware/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.3.0/miniconda3-4.10.3-u6p3tgreee7aigtnvuhr44yqo7vcg6r6/lib/python3.9/site-packages (from python-dateutil>=2.8.2->pandas->gradio) (1.16.0)
+Requirement already satisfied: click>=7.0 in /home/lclk/.local/lib/python3.9/site-packages (from uvicorn>=0.14.0->gradio) (8.1.3)
+Collecting h11>=0.8
+  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
+Collecting latex2mathml
+  Downloading latex2mathml-3.76.0-py3-none-any.whl (73 kB)
+Collecting markdown
+  Downloading Markdown-3.4.3-py3-none-any.whl (93 kB)
+Requirement already satisfied: psutil in /home/lclk/.local/lib/python3.9/site-packages (from accelerate) (5.9.5)
+Requirement already satisfied: multidict<7.0,>=4.5 in /home/lclk/.local/lib/python3.9/site-packages (from aiohttp->gradio) (6.0.4)
+Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /home/lclk/.local/lib/python3.9/site-packages (from aiohttp->gradio) (4.0.2)
+Requirement already satisfied: aiosignal>=1.1.2 in /home/lclk/.local/lib/python3.9/site-packages (from aiohttp->gradio) (1.3.1)
+Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /home/lclk/.local/lib/python3.9/site-packages (from aiohttp->gradio) (3.1.0)
+Requirement already satisfied: frozenlist>=1.1.1 in /home/lclk/.local/lib/python3.9/site-packages (from aiohttp->gradio) (1.3.3)
+Requirement already satisfied: yarl<2.0,>=1.0 in /home/lclk/.local/lib/python3.9/site-packages (from aiohttp->gradio) (1.9.2)
+Requirement already satisfied: idna>=2.0 in /opt/lcsoftware/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.3.0/miniconda3-4.10.3-u6p3tgreee7aigtnvuhr44yqo7vcg6r6/lib/python3.9/site-packages (from yarl<2.0,>=1.0->aiohttp->gradio) (2.10)
+Collecting pydantic
+  Downloading pydantic-1.10.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
+Collecting starlette<0.28.0,>=0.27.0
+  Downloading starlette-0.27.0-py3-none-any.whl (66 kB)
+Collecting anyio<5,>=3.4.0
+  Downloading anyio-3.7.1-py3-none-any.whl (80 kB)
+Collecting sniffio>=1.1
+  Downloading sniffio-1.3.0-py3-none-any.whl (10 kB)
+Requirement already satisfied: exceptiongroup in /home/lclk/.local/lib/python3.9/site-packages (from anyio<5,>=3.4.0->starlette<0.28.0,>=0.27.0->fastapi->gradio) (1.1.1)
+Collecting httpcore<0.18.0,>=0.15.0
+  Downloading httpcore-0.17.3-py3-none-any.whl (74 kB)
+Requirement already satisfied: certifi in /opt/lcsoftware/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.3.0/miniconda3-4.10.3-u6p3tgreee7aigtnvuhr44yqo7vcg6r6/lib/python3.9/site-packages (from httpx->gradio) (2021.5.30)
+Requirement already satisfied: importlib-metadata>=4.4 in /home/lclk/.local/lib/python3.9/site-packages (from markdown->mdtex2html) (6.7.0)
+Requirement already satisfied: zipp>=0.5 in /home/lclk/.local/lib/python3.9/site-packages (from importlib-metadata>=4.4->markdown->mdtex2html) (3.15.0)
+Requirement already satisfied: contourpy>=1.0.1 in /home/lclk/.local/lib/python3.9/site-packages (from matplotlib->gradio) (1.1.0)
+Requirement already satisfied: fonttools>=4.22.0 in /home/lclk/.local/lib/python3.9/site-packages (from matplotlib->gradio) (4.40.0)
+Requirement already satisfied: pyparsing>=2.3.1 in /home/lclk/.local/lib/python3.9/site-packages (from matplotlib->gradio) (3.1.0)
+Requirement already satisfied: kiwisolver>=1.0.1 in /home/lclk/.local/lib/python3.9/site-packages (from matplotlib->gradio) (1.4.4)
+Requirement already satisfied: importlib-resources>=3.2.0 in /home/lclk/.local/lib/python3.9/site-packages (from matplotlib->gradio) (5.12.0)
+Requirement already satisfied: cycler>=0.10 in /home/lclk/.local/lib/python3.9/site-packages (from matplotlib->gradio) (0.11.0)
+Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/lcsoftware/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.3.0/miniconda3-4.10.3-u6p3tgreee7aigtnvuhr44yqo7vcg6r6/lib/python3.9/site-packages (from requests->transformers==4.30.2) (1.26.6)
+Requirement already satisfied: chardet<5,>=3.0.2 in /opt/lcsoftware/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.3.0/miniconda3-4.10.3-u6p3tgreee7aigtnvuhr44yqo7vcg6r6/lib/python3.9/site-packages (from requests->transformers==4.30.2) (4.0.0)
+Requirement already satisfied: mpmath>=0.19 in /home/lclk/.local/lib/python3.9/site-packages (from sympy->torch) (1.3.0)
+Building wheels for collected packages: ffmpy
+  Building wheel for ffmpy (setup.py): started
+  Building wheel for ffmpy (setup.py): finished with status 'done'
+  Created wheel for ffmpy: filename=ffmpy-0.3.0-py3-none-any.whl size=4709 sha256=071cebb58ca6c6947fbc669e1d94509d6f53d1ed45d9d7fb9f060d1a342cfc18
+  Stored in directory: /home/lclk/.cache/pip/wheels/91/e2/96/f676aa08bfd789328c6576cd0f1fde4a3d686703bb0c247697
+Successfully built ffmpy
+Installing collected packages: sniffio, rpds-py, referencing, h11, anyio, uc-micro-py, jsonschema-specifications, httpcore, websockets, toolz, starlette, pydantic, linkify-it-py, jsonschema, httpx, uvicorn, semantic-version, python-multipart, pydub, orjson, mdit-py-plugins, markdown, latex2mathml, gradio-client, ffmpy, fastapi, altair, aiofiles, sentencepiece, protobuf, mdtex2html, gradio, cpm-kernels, accelerate
+Successfully installed accelerate-0.20.3 aiofiles-23.1.0 altair-5.0.1 anyio-3.7.1 cpm-kernels-1.0.11 fastapi-0.99.1 ffmpy-0.3.0 gradio-3.36.0 gradio-client-0.2.7 h11-0.14.0 httpcore-0.17.3 httpx-0.24.1 jsonschema-4.18.0 jsonschema-specifications-2023.6.1 latex2mathml-3.76.0 linkify-it-py-2.0.2 markdown-3.4.3 mdit-py-plugins-0.3.3 mdtex2html-1.2.0 orjson-3.9.1 protobuf-4.23.4 pydantic-1.10.11 pydub-0.25.1 python-multipart-0.0.6 referencing-0.29.1 rpds-py-0.8.8 semantic-version-2.10.0 sentencepiece-0.1.99 sniffio-1.3.0 starlette-0.27.0 toolz-0.12.0 uc-micro-py-1.0.2 uvicorn-0.22.0 websockets-11.0.3
diff --git a/tests/kit/model_zoo/transformers/chatglm2-6b/modeling_chatglm.py b/tests/kit/model_zoo/transformers/chatglm2-6b/modeling_chatglm.py
new file mode 100644
index 000000000000..82163c46190f
--- /dev/null
+++ b/tests/kit/model_zoo/transformers/chatglm2-6b/modeling_chatglm.py
@@ -0,0 +1,1193 @@
+""" PyTorch ChatGLM model. """
+
+import math
+import copy
+import warnings
+import re
+import sys
+
+import torch
+import torch.utils.checkpoint
+import torch.nn.functional as F
+from torch import nn
+from torch.nn import CrossEntropyLoss, LayerNorm
+from torch.nn.utils import skip_init
+from typing import Optional, Tuple, Union, List, Callable, Dict, Any
+
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import logging
+from transformers.generation.logits_process import LogitsProcessor
+from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput
+
+from .configuration_chatglm import ChatGLMConfig
+
+# flags required to enable jit fusion kernels
+
+if sys.platform != 'darwin':
+    torch._C._jit_set_profiling_mode(False)
+    torch._C._jit_set_profiling_executor(False)
+    torch._C._jit_override_can_fuse_on_cpu(True)
+    torch._C._jit_override_can_fuse_on_gpu(True)
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "THUDM/ChatGLM2-6B"
+_CONFIG_FOR_DOC = "ChatGLM6BConfig"
+
+CHATGLM_6B_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "THUDM/chatglm2-6b",
+    # See all ChatGLM models at https://huggingface.co/models?filter=chatglm
+]
+
+
+def default_init(cls, *args, **kwargs):
+    return cls(*args, **kwargs)  # construct `cls` directly, forwarding every argument unchanged
+
+
+class InvalidScoreLogitsProcessor(LogitsProcessor):
+    """Logits processor that, on any NaN/Inf score, zeroes the scores in place and sets index 5 to 5e4."""
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        if not torch.isfinite(scores).all():
+            scores.zero_()[..., 5] = 5e4
+        return scores
+
+
+class PrefixEncoder(torch.nn.Module):
+    """
+    Embeds prefix token ids into flattened key/value prefix vectors.
+
+    Input shape: (batch-size, prefix-length)
+    Output shape: (batch-size, prefix-length, 2*layers*kv_channels*multi_query_group_num)
+    """
+
+    def __init__(self, config: ChatGLMConfig):
+        super().__init__()
+        self.prefix_projection = config.prefix_projection
+        # Width of one prefix vector; identical with and without the projection.
+        kv_size = config.num_layers * config.kv_channels * config.multi_query_group_num * 2
+        # Created before the MLP so parameter registration order is embedding, then trans.
+        self.embedding = torch.nn.Embedding(config.pre_seq_len, kv_size)
+        if self.prefix_projection:
+            # Use a two-layer MLP to encode the prefix
+            self.trans = torch.nn.Sequential(
+                torch.nn.Linear(kv_size, config.hidden_size),
+                torch.nn.Tanh(),
+                torch.nn.Linear(config.hidden_size, kv_size)
+            )
+
+    def forward(self, prefix: torch.Tensor):
+        """Look up the prefix embeddings and, when enabled, pass them through the MLP."""
+        embedded = self.embedding(prefix)
+        if not self.prefix_projection:
+            return embedded
+        # Projection mode: the MLP maps kv_size -> hidden_size -> kv_size.
+        return self.trans(embedded)
+
+
+def split_tensor_along_last_dim(
+        tensor: torch.Tensor,
+        num_partitions: int,
+        contiguous_split_chunks: bool = False,
+) -> List[torch.Tensor]:
+    """Partition a tensor into chunks along its trailing dimension.
+
+    Arguments:
+        tensor: tensor to partition.
+        num_partitions: each chunk spans ``last_dim_size // num_partitions``
+                        elements of the last dimension.
+        contiguous_split_chunks: If True, make every chunk contiguous
+                                 in memory.
+
+    Returns:
+        A sequence of Tensors (a tuple when contiguous chunks are requested)
+    """
+    # Chunk width; integer division rounds down.
+    chunk_width = tensor.shape[-1] // num_partitions
+
+    chunks = torch.split(tensor, chunk_width, dim=-1)
+    if not contiguous_split_chunks:
+        # Note: torch.split does not create contiguous tensors by default.
+        return chunks
+
+    return tuple(piece.contiguous() for piece in chunks)
+
+
+class RotaryEmbedding(nn.Module):
+    """Produces the cos/sin table consumed by rotary position embeddings."""
+    def __init__(self, dim, original_impl=False, device=None, dtype=None):
+        super().__init__()
+        exponents = torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim
+        self.register_buffer("inv_freq", 1.0 / (10000 ** exponents))
+        self.dim = dim
+        self.original_impl = original_impl
+
+    def forward_impl(
+            self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000
+    ):
+        """Enhanced Transformer with Rotary Position Embedding.
+
+        Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/
+        transformers/rope/__init__.py. MIT License:
+        https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license.
+        """
+        # $\Theta = {\theta_i = 10000^{-\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$
+        theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=dtype, device=device) / n_elem))
+        # Position indexes `[0, 1, ..., seq_len - 1]`
+        positions = torch.arange(seq_len, dtype=dtype, device=device)
+        # Product of every position index with every $\theta_i$, converted to fp32
+        angles = torch.outer(positions, theta).float()
+        # Trailing axis of size 2 holds (cos, sin) for each angle
+        table = torch.stack([torch.cos(angles), torch.sin(angles)], dim=-1)
+
+        # this is to mimic the behaviour of complex32, else we will get different results
+        if dtype == torch.bfloat16:
+            return table.bfloat16()
+        if dtype in (torch.float16, torch.int8):
+            return table.half()
+        return table
+
+    def forward(self, max_seq_len, offset=0):
+        # `offset` is accepted but not used by this implementation
+        return self.forward_impl(max_seq_len, self.dim, dtype=self.inv_freq.dtype, device=self.inv_freq.device)
+
+
+@torch.jit.script
+def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor:
+    """Rotate the leading ``rot_dim`` features of ``x`` pairwise; the remaining features pass through."""
+    # x: [sq, b, np, hn]
+    sq, np = x.size(0), x.size(2)
+    rot_dim = rope_cache.shape[-2] * 2
+    x_rot, x_pass = x[..., :rot_dim], x[..., rot_dim:]
+    # view the rotated part as (first, second) feature pairs
+    pairs = x_rot.reshape(sq, -1, np, rot_dim // 2, 2)
+    # truncate to support variable sizes
+    cache = rope_cache[:sq].view(sq, -1, 1, pairs.size(3), 2)
+    first, second = pairs[..., 0], pairs[..., 1]
+    # the cache's last axis is read as (cos, sin), the layout RotaryEmbedding.forward_impl stacks
+    cos, sin = cache[..., 0], cache[..., 1]
+    rotated = torch.stack(
+        [first * cos - second * sin, second * cos + first * sin],
+        -1,
+    )
+    return torch.cat((rotated.flatten(3), x_pass), dim=-1)
+
+
+class RMSNorm(torch.nn.Module):
+    """RMS normalization over the last axis with a learnable scale created via ``torch.empty``."""
+    def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs):
+        super().__init__()
+        self.weight = torch.nn.Parameter(torch.empty(normalized_shape, device=device, dtype=dtype))
+        self.eps = eps
+
+    def forward(self, hidden_states: torch.Tensor):
+        # mean of squares over the last axis, computed in fp32
+        mean_square = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
+        scaled = self.weight * (hidden_states * torch.rsqrt(mean_square + self.eps))
+        return scaled.to(hidden_states.dtype)
+
+
+class CoreAttention(torch.nn.Module):  # attention core applied to already-projected query/key/value tensors
+    def __init__(self, config: ChatGLMConfig, layer_number):
+        super(CoreAttention, self).__init__()
+        # Query-key layer scaling forces the softmax to run in fp32 (set just below).
+        self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling
+        self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32
+        if self.apply_query_key_layer_scaling:
+            self.attention_softmax_in_fp32 = True
+        self.layer_number = max(1, layer_number)
+
+        projection_size = config.kv_channels * config.num_attention_heads
+
+        # Per attention head and per partition values.
+        self.hidden_size_per_partition = projection_size
+        self.hidden_size_per_attention_head = projection_size // config.num_attention_heads
+        self.num_attention_heads_per_partition = config.num_attention_heads
+        # With layer scaling, norm_factor is multiplied by layer_number; forward's fallback path rescales by coeff.
+        coeff = None
+        self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
+        if self.apply_query_key_layer_scaling:
+            coeff = self.layer_number
+            self.norm_factor *= coeff
+        self.coeff = coeff
+
+        self.attention_dropout = torch.nn.Dropout(config.attention_dropout)
+
+    def forward(self, query_layer, key_layer, value_layer, attention_mask):
+        pytorch_major_version = int(torch.__version__.split('.')[0])
+        if pytorch_major_version >= 2:  # NOTE(review): this branch never applies self.attention_dropout
+            query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]]
+            if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]:
+                context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
+                                                                                 is_causal=True)
+            else:
+                if attention_mask is not None:
+                    attention_mask = ~attention_mask  # True means "masked out" on input (see masked_fill below); flipped for SDPA
+                context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
+                                                                                 attention_mask)
+            context_layer = context_layer.permute(2, 0, 1, 3)  # undo the permute above: sequence axis first again
+            new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
+            context_layer = context_layer.reshape(*new_context_layer_shape)
+        else:
+            # Raw attention scores
+
+            # [b, np, sq, sk]
+            output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0))
+
+            # [sq, b, np, hn] -> [sq, b * np, hn]
+            query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1)
+            # [sk, b, np, hn] -> [sk, b * np, hn]
+            key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1)
+
+            # preallocating input tensor: [b * np, sq, sk]; its contents are ignored because beta=0.0 below
+            matmul_input_buffer = torch.empty(
+                output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype,
+                device=query_layer.device
+            )
+
+            # Raw attention scores. [b * np, sq, sk]
+            matmul_result = torch.baddbmm(
+                matmul_input_buffer,
+                query_layer.transpose(0, 1),  # [b * np, sq, hn]
+                key_layer.transpose(0, 1).transpose(1, 2),  # [b * np, hn, sk]
+                beta=0.0,
+                alpha=(1.0 / self.norm_factor),
+            )
+
+            # change view to [b, np, sq, sk]
+            attention_scores = matmul_result.view(*output_size)
+
+            # ===========================
+            # Attention probs and dropout
+            # ===========================
+
+            # attention scores and attention mask [b, np, sq, sk]
+            if self.attention_softmax_in_fp32:
+                attention_scores = attention_scores.float()
+            if self.coeff is not None:
+                attention_scores = attention_scores * self.coeff
+            if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]:
+                attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3],
+                                            device=attention_scores.device, dtype=torch.bool)
+                attention_mask.tril_()
+                attention_mask = ~attention_mask
+            if attention_mask is not None:
+                attention_scores = attention_scores.masked_fill(attention_mask, float("-inf"))
+            attention_probs = F.softmax(attention_scores, dim=-1)
+            attention_probs = attention_probs.type_as(value_layer)
+
+            # This is actually dropping out entire tokens to attend to, which might
+            # seem a bit unusual, but is taken from the original Transformer paper.
+            attention_probs = self.attention_dropout(attention_probs)
+            # =========================
+            # Context layer. [sq, b, hp]
+            # =========================
+
+            # value_layer -> context layer.
+            # [sk, b, np, hn] --> [b, np, sq, hn]
+
+            # context layer shape: [b, np, sq, hn]
+            output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3))
+            # change view [sk, b * np, hn]
+            value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1)
+            # change view [b * np, sq, sk]
+            attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1)
+            # matmul: [b * np, sq, hn]
+            context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))
+            # change view [b, np, sq, hn]
+            context_layer = context_layer.view(*output_size)
+            # [b, np, sq, hn] --> [sq, b, np, hn]
+            context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
+            # [sq, b, np, hn] --> [sq, b, hp]
+            new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
+            context_layer = context_layer.view(*new_context_layer_shape)
+
+        return context_layer
+
+
+class SelfAttention(torch.nn.Module):
+    """Parallel self-attention layer abstract class.
+
+    Self-attention layer takes input with size [s, b, h]
+    and returns output of the same size.
+    """
+
+    def __init__(self, config: ChatGLMConfig, layer_number, device=None):
+        super(SelfAttention, self).__init__()
+        self.layer_number = max(1, layer_number)
+
+        self.projection_size = config.kv_channels * config.num_attention_heads
+
+        # Per attention head and per partition values.
+        self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads
+        self.num_attention_heads_per_partition = config.num_attention_heads
+
+        self.multi_query_attention = config.multi_query_attention
+        self.qkv_hidden_size = 3 * self.projection_size
+        if self.multi_query_attention:
+            self.num_multi_query_groups_per_partition = config.multi_query_group_num
+            self.qkv_hidden_size = (
+                    self.projection_size + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num
+            )
+        self.query_key_value = nn.Linear(config.hidden_size, self.qkv_hidden_size,
+                                         bias=config.add_bias_linear or config.add_qkv_bias,
+                                         device=device, **_config_to_kwargs(config)
+                                         )
+
+        self.core_attention = CoreAttention(config, self.layer_number)
+
+        # Output.
+        self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear,
+                               device=device, **_config_to_kwargs(config)
+                               )
+
+    def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None):
+        if self.multi_query_attention:
+            num_attention_heads = self.num_multi_query_groups_per_partition
+        else:
+            num_attention_heads = self.num_attention_heads_per_partition
+        return torch.empty(
+            inference_max_sequence_len,
+            batch_size,
+            num_attention_heads,
+            self.hidden_size_per_attention_head,
+            dtype=dtype,
+            device=device,
+        )
+
+    def forward(
+            self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True
+    ):
+        """Run self-attention over ``hidden_states``.
+
+        Args:
+            hidden_states: input activations; the comments below treat them as [sq, b, h].
+            attention_mask: forwarded unchanged to ``self.core_attention``.
+            rotary_pos_emb: applied to the query and key tensors when not ``None``.
+            kv_cache: optional ``(key, value)`` pair that is prepended to the newly
+                computed keys/values along dim 0.
+            use_cache: when truthy, the (concatenated) keys/values are returned as
+                the new cache; otherwise ``None`` is returned in its place.
+
+        Returns:
+            ``(output, kv_cache)`` where ``output`` is the result of ``self.dense``.
+        """
+        # hidden_states: [sq, b, h]
+
+        # NOTE: no key/value memory is pre-allocated in this method; the cache
+        # only grows through the torch.cat calls further down.
+
+        # =====================
+        # Query, Key, and Value
+        # =====================
+
+        # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)]
+        # (with multi-query attention the last dim is np * hn + 2 * groups * hn instead)
+        mixed_x_layer = self.query_key_value(hidden_states)
+
+        if self.multi_query_attention:
+            # The fused projection is laid out as [all query heads | key groups | value groups].
+            (query_layer, key_layer, value_layer) = mixed_x_layer.split(
+                [
+                    self.num_attention_heads_per_partition * self.hidden_size_per_attention_head,
+                    self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
+                    self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
+                ],
+                dim=-1,
+            )
+            # Unflatten the last dim into (heads, hn) for queries and (groups, hn) for keys/values.
+            query_layer = query_layer.view(
+                query_layer.size()[:-1] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
+            )
+            key_layer = key_layer.view(
+                key_layer.size()[:-1] + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head)
+            )
+            value_layer = value_layer.view(
+                value_layer.size()[:-1]
+                + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head)
+            )
+        else:
+            # [sq, b, np * 3 * hn] --> [sq, b, np, 3 * hn]
+            new_tensor_shape = mixed_x_layer.size()[:-1] + \
+                               (self.num_attention_heads_per_partition,
+                                3 * self.hidden_size_per_attention_head)
+            mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
+
+            # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
+            (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)
+
+        # apply relative positional encoding (rotary embedding) to queries and keys only
+        if rotary_pos_emb is not None:
+            query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb)
+            key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb)
+
+        # adjust key and value for inference: prepend the cached entries along dim 0
+        if kv_cache is not None:
+            cache_k, cache_v = kv_cache
+            key_layer = torch.cat((cache_k, key_layer), dim=0)
+            value_layer = torch.cat((cache_v, value_layer), dim=0)
+        # The cache is captured here, before the multi-query expansion below, so it
+        # stores one entry per key/value group rather than one per attention head.
+        if use_cache:
+            kv_cache = (key_layer, value_layer)
+        else:
+            kv_cache = None
+
+        if self.multi_query_attention:
+            # Repeat each key/value group so every query head has a matching entry:
+            # [s, b, groups, hn] -> [s, b, groups, 1, hn] -> [s, b, groups, np // groups, hn] -> [s, b, np, hn]
+            key_layer = key_layer.unsqueeze(-2)
+            key_layer = key_layer.expand(
+                -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1
+            )
+            # expand() yields a non-contiguous view, hence contiguous() before view().
+            key_layer = key_layer.contiguous().view(
+                key_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
+            )
+            value_layer = value_layer.unsqueeze(-2)
+            value_layer = value_layer.expand(
+                -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1
+            )
+            value_layer = value_layer.contiguous().view(
+                value_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
+            )
+
+        # ==================================
+        # core attention computation
+        # ==================================
+
+        context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask)
+
+        # =================
+        # Output. [sq, b, h]
+        # =================
+
+        output = self.dense(context_layer)
+
+        return output, kv_cache
+
+
+def _config_to_kwargs(args):
+    """Build the keyword arguments passed to the ``nn.Linear`` layers in this file.
+
+    At present the only entry is ``dtype``, taken from ``args.torch_dtype``.
+    """
+    return {"dtype": args.torch_dtype}
+
+
+class MLP(torch.nn.Module):
+    """Gated feed-forward block.
+
+    The input is projected from ``config.hidden_size`` to
+    ``2 * config.ffn_hidden_size`` features, reduced to ``config.ffn_hidden_size``
+    by a SwiGLU gate, and projected back to ``config.hidden_size``.
+    """
+
+    def __init__(self, config: ChatGLMConfig, device=None):
+        super().__init__()
+
+        self.add_bias = config.add_bias_linear
+        # Options common to both projections.
+        linear_kwargs = dict(bias=self.add_bias, device=device, **_config_to_kwargs(config))
+
+        # Up-projection. The width is doubled because SwiGLU consumes one half as
+        # the gate and the other half as the value, see https://arxiv.org/pdf/2002.05202.pdf
+        self.dense_h_to_4h = nn.Linear(config.hidden_size, config.ffn_hidden_size * 2, **linear_kwargs)
+
+        def swiglu(x):
+            halves = x.chunk(2, dim=-1)
+            return F.silu(halves[0]) * halves[1]
+
+        self.activation_func = swiglu
+
+        # Down-projection back to the hidden size.
+        self.dense_4h_to_h = nn.Linear(config.ffn_hidden_size, config.hidden_size, **linear_kwargs)
+
+    def forward(self, hidden_states):
+        """Apply up-projection, SwiGLU gate and down-projection in that order."""
+        gated = self.activation_func(self.dense_h_to_4h(hidden_states))
+        return self.dense_4h_to_h(gated)
+
+
+class GLMBlock(torch.nn.Module):
+    """One transformer layer: norm -> self-attention -> residual, then norm -> MLP -> residual.
+
+    ``forward`` returns the layer output together with the key/value cache
+    handed back by the attention module.
+    """
+
+    def __init__(self, config: ChatGLMConfig, layer_number, device=None):
+        super().__init__()
+        self.layer_number = layer_number
+        self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm
+        self.fp32_residual_connection = config.fp32_residual_connection
+        self.hidden_dropout = config.hidden_dropout
+
+        norm_cls = RMSNorm if config.rmsnorm else LayerNorm
+        norm_kwargs = dict(eps=config.layernorm_epsilon, device=device, dtype=config.torch_dtype)
+
+        # Sub-modules are created in the order in which forward() applies them.
+        self.input_layernorm = norm_cls(config.hidden_size, **norm_kwargs)
+        self.self_attention = SelfAttention(config, layer_number, device=device)
+        self.post_attention_layernorm = norm_cls(config.hidden_size, **norm_kwargs)
+        self.mlp = MLP(config, device=device)
+
+    def forward(
+            self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True,
+    ):
+        """Apply the attention and MLP sub-layers and return ``(output, kv_cache)``."""
+        dropout = torch.nn.functional.dropout
+        # When set, the residual branch starts from the normalised tensor instead of the raw one.
+        residual_from_norm = self.apply_residual_connection_post_layernorm
+
+        # ---- attention sub-layer ----
+        normed_input = self.input_layernorm(hidden_states)
+        attention_output, kv_cache = self.self_attention(
+            normed_input, attention_mask, rotary_pos_emb, kv_cache=kv_cache, use_cache=use_cache
+        )
+        skip = normed_input if residual_from_norm else hidden_states
+        attention_sum = skip + dropout(attention_output, p=self.hidden_dropout, training=self.training)
+
+        # ---- MLP sub-layer ----
+        normed_attention = self.post_attention_layernorm(attention_sum)
+        mlp_output = self.mlp(normed_attention)
+        skip = normed_attention if residual_from_norm else attention_sum
+        block_output = skip + dropout(mlp_output, p=self.hidden_dropout, training=self.training)
+
+        return block_output, kv_cache
+
+
+class GLMTransformer(torch.nn.Module):
+    """Stack of ``config.num_layers`` GLMBlock layers, optionally followed by a final norm."""
+
+    def __init__(self, config: ChatGLMConfig, device=None):
+        super(GLMTransformer, self).__init__()
+
+        self.fp32_residual_connection = config.fp32_residual_connection
+        self.post_layer_norm = config.post_layer_norm
+
+        # Number of layers.
+        self.num_layers = config.num_layers
+
+        # Transformer layers; the layer number handed to each GLMBlock is 1-based.
+        def build_layer(layer_number):
+            return GLMBlock(config, layer_number, device=device)
+
+        self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)])
+
+        if self.post_layer_norm:
+            LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
+            # Final layer norm before output.
+            self.final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device,
+                                                 dtype=config.torch_dtype)
+
+        self.gradient_checkpointing = False
+
+    def _get_layer(self, layer_number):
+        """Return the layer stored at 0-based index ``layer_number``."""
+        return self.layers[layer_number]
+
+    def forward(
+            self, hidden_states, attention_mask, rotary_pos_emb, kv_caches=None,
+            use_cache: Optional[bool] = True,
+            output_hidden_states: Optional[bool] = False,
+    ):
+        """Run every layer in order.
+
+        Args:
+            hidden_states: input to the first layer.
+            attention_mask: forwarded unchanged to every layer.
+            rotary_pos_emb: forwarded unchanged to every layer.
+            kv_caches: per-layer caches indexed by layer position; a falsy value
+                means "no cache for any layer".
+            use_cache: whether to collect the per-layer caches. Forced to ``False``
+                while training with gradient checkpointing enabled.
+            output_hidden_states: whether to collect the input of every layer plus
+                the output of the last one.
+
+        Returns:
+            ``(hidden_states, presents, all_hidden_states, all_self_attentions)``.
+            ``presents`` is a tuple of per-layer caches, or ``None`` when caching is
+            off; ``all_hidden_states`` is a tuple or ``None``; ``all_self_attentions``
+            is always ``None``.
+        """
+        if not kv_caches:
+            kv_caches = [None] * self.num_layers
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        # Choose the cache container only after use_cache has been finalised above;
+        # otherwise a disabled cache would be reported as an empty tuple, not None.
+        presents = () if use_cache else None
+
+        all_self_attentions = None
+        all_hidden_states = () if output_hidden_states else None
+        for index in range(self.num_layers):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer = self._get_layer(index)
+            if self.gradient_checkpointing and self.training:
+                # checkpoint() is given positional arguments only, in the order of GLMBlock.forward.
+                layer_ret = torch.utils.checkpoint.checkpoint(
+                    layer,
+                    hidden_states,
+                    attention_mask,
+                    rotary_pos_emb,
+                    kv_caches[index],
+                    use_cache
+                )
+            else:
+                layer_ret = layer(
+                    hidden_states,
+                    attention_mask,
+                    rotary_pos_emb,
+                    kv_cache=kv_caches[index],
+                    use_cache=use_cache
+                )
+            hidden_states, kv_cache = layer_ret
+            if use_cache:
+                presents = presents + (kv_cache,)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        # Final layer norm.
+        if self.post_layer_norm:
+            hidden_states = self.final_layernorm(hidden_states)
+
+        return hidden_states, presents, all_hidden_states, all_self_attentions
+
+
+class ChatGLMPreTrainedModel(PreTrainedModel):
+    """Common base class of the ChatGLM models.
+
+    Sets the class-level attributes below and provides helpers that build
+    attention masks and position ids from ``input_ids``.
+    """
+
+    is_parallelizable = False
+    supports_gradient_checkpointing = True
+    config_class = ChatGLMConfig
+    base_model_prefix = "transformer"
+    _no_split_modules = ["GLMBlock"]
+
+    def _init_weights(self, module: nn.Module):
+        """Do nothing; this class performs no weight initialisation."""
+        return
+
+    def get_masks(self, input_ids, past_key_values, padding_mask=None):
+        """Build a boolean attention mask in which ``True`` marks a blocked position.
+
+        The result has shape ``[batch, 1, seq, past + seq]``, where ``past`` is
+        the length of dim 0 of the first cached tensor (0 without a cache).
+        """
+        batch_size, seq_length = input_ids.shape
+        device = input_ids.device
+        past_length = past_key_values[0][0].shape[0] if past_key_values else 0
+
+        # Causal (lower-triangular) block over the current tokens; 1 means "may attend".
+        mask = torch.tril(torch.ones(batch_size, seq_length, seq_length, device=device))
+        if past_length:
+            # Every current token may attend to every cached position.
+            cached_block = torch.ones(batch_size, seq_length, past_length, device=device)
+            mask = torch.cat((cached_block, mask), dim=-1)
+        if padding_mask is not None:
+            # Zero the columns that belong to padded key positions.
+            mask = mask * padding_mask.unsqueeze(1)
+            if not past_length:
+                # Adds 1 to every entry of a row whose query position is padding,
+                # so those rows end up fully unblocked after the threshold below.
+                mask -= padding_mask.unsqueeze(-1) - 1
+        # Entries below 0.5 are the disallowed ones; add the broadcast head dim.
+        return (mask < 0.5).bool().unsqueeze(1)
+
+    def get_position_ids(self, input_ids, device):
+        """Return ``0 .. seq_length - 1`` repeated for every row of ``input_ids``."""
+        batch_size, seq_length = input_ids.shape
+        positions = torch.arange(seq_length, dtype=torch.long, device=device)
+        return positions.unsqueeze(0).repeat(batch_size, 1)
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        """Set the checkpointing flag on ``module`` if it is a GLMTransformer."""
+        if not isinstance(module, GLMTransformer):
+            return
+        module.gradient_checkpointing = value
+
+
+class Embedding(torch.nn.Module):
+    """Token embedding layer whose output has its first two axes swapped."""
+
+    def __init__(self, config: ChatGLMConfig, device=None):
+        super().__init__()
+
+        self.hidden_size = config.hidden_size
+        self.fp32_residual_connection = config.fp32_residual_connection
+        # Lookup table with one row per entry of the padded vocabulary.
+        self.word_embeddings = nn.Embedding(
+            config.padded_vocab_size, self.hidden_size, dtype=config.torch_dtype, device=device
+        )
+
+    def forward(self, input_ids):
+        """Embed ``input_ids`` and return the result transposed on dims 0 and 1."""
+        # Swap once here ([b, s, h] -> [s, b, h]) so later layers need no explicit transposes.
+        embedded = self.word_embeddings(input_ids).transpose(0, 1).contiguous()
+        # Promote to float32 when fp32 residual connections are requested.
+        return embedded.float() if self.fp32_residual_connection else embedded
+
+
+class ChatGLMModel(ChatGLMPreTrainedModel):
+    """Base ChatGLM model: embedding, rotary position table, GLMTransformer encoder and output projection.
+
+    When ``config.pre_seq_len`` is set, a prefix encoder is added and the
+    parameters created before it are frozen.
+    """
+
+    def __init__(self, config: ChatGLMConfig, device=None, empty_init=True):
+        super().__init__(config)
+        # skip_init / default_init are defined outside this block; presumably the
+        # former constructs modules without initialising their weights -- TODO confirm.
+        if empty_init:
+            init_method = skip_init
+        else:
+            init_method = default_init
+        init_kwargs = {}
+        if device is not None:
+            init_kwargs["device"] = device
+        self.embedding = init_method(Embedding, config, **init_kwargs)
+        self.num_layers = config.num_layers
+        self.multi_query_group_num = config.multi_query_group_num
+        self.kv_channels = config.kv_channels
+
+        # Rotary positional embeddings
+        self.seq_length = config.seq_length
+        # Per-head width: kv_channels when given, otherwise hidden_size / num_attention_heads.
+        rotary_dim = (
+            config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels
+        )
+
+        # Only half of the per-head width is handed to the rotary embedding.
+        self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, original_impl=config.original_rope, device=device,
+                                              dtype=config.torch_dtype)
+        self.encoder = init_method(GLMTransformer, config, **init_kwargs)
+        self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False,
+                                        dtype=config.torch_dtype, **init_kwargs)
+        self.pre_seq_len = config.pre_seq_len
+        self.prefix_projection = config.prefix_projection
+        if self.pre_seq_len is not None:
+            # Freeze everything registered so far. The prefix encoder is created
+            # afterwards, so its parameters are not touched by this loop.
+            for param in self.parameters():
+                param.requires_grad = False
+            self.prefix_tokens = torch.arange(self.pre_seq_len).long()
+            self.prefix_encoder = PrefixEncoder(config)
+            self.dropout = torch.nn.Dropout(0.1)
+
+    def get_input_embeddings(self):
+        """Return the word-embedding module of ``self.embedding``."""
+        return self.embedding.word_embeddings
+
+    def get_prompt(self, batch_size, device, dtype=torch.half):
+        """Build per-layer prefix key/value tensors from the prefix encoder.
+
+        Returns the result of splitting the permuted prefix tensor into chunks
+        of two along dim 0 (one chunk per layer).
+        """
+        prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(device)
+        past_key_values = self.prefix_encoder(prefix_tokens).type(dtype)
+        # [batch, pre_seq_len, num_layers * 2, groups, kv_channels]
+        past_key_values = past_key_values.view(
+            batch_size,
+            self.pre_seq_len,
+            self.num_layers * 2,
+            self.multi_query_group_num,
+            self.kv_channels
+        )
+        past_key_values = self.dropout(past_key_values)
+        # permute -> [num_layers * 2, pre_seq_len, batch, groups, kv_channels];
+        # split(2) -> one [2, pre_seq_len, batch, groups, kv_channels] chunk per layer.
+        past_key_values = past_key_values.permute([2, 1, 0, 3, 4]).split(2)
+        return past_key_values
+
+    def forward(
+            self,
+            input_ids,
+            position_ids: Optional[torch.Tensor] = None,
+            attention_mask: Optional[torch.BoolTensor] = None,
+            full_attention_mask: Optional[torch.BoolTensor] = None,
+            past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
+            inputs_embeds: Optional[torch.Tensor] = None,
+            use_cache: Optional[bool] = None,
+            output_hidden_states: Optional[bool] = None,
+            return_dict: Optional[bool] = None,
+    ):
+        """Embed the inputs, build mask and rotary table, and run the encoder.
+
+        ``output_hidden_states``, ``use_cache`` and ``return_dict`` fall back to
+        the values stored on ``self.config`` when passed as ``None``.
+
+        Returns a ``BaseModelOutputWithPast`` when ``return_dict`` is truthy,
+        otherwise a tuple of the non-``None`` encoder outputs.
+        """
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # NOTE(review): input_ids.shape is read unconditionally, so input_ids is
+        # required even when inputs_embeds is supplied.
+        batch_size, seq_length = input_ids.shape
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embedding(input_ids)
+
+        if self.pre_seq_len is not None:
+            # Prefix mode: use the learned prefix as the cache when none is given,
+            # and extend the padding mask with ones covering the prefix positions.
+            if past_key_values is None:
+                past_key_values = self.get_prompt(batch_size=batch_size, device=input_ids.device,
+                                                  dtype=inputs_embeds.dtype)
+            if attention_mask is not None:
+                attention_mask = torch.cat([attention_mask.new_ones((batch_size, self.pre_seq_len)),
+                                            attention_mask], dim=-1)
+
+        if full_attention_mask is None:
+            # An explicit mask is only built when the padding mask contains a zero,
+            # or when a cache is combined with more than one new token; otherwise
+            # full_attention_mask stays None.
+            if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1):
+                full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask)
+
+        # Rotary positional embeddings
+        rotary_pos_emb = self.rotary_pos_emb(self.seq_length)
+        if position_ids is not None:
+            # Gather the table rows for the requested positions.
+            rotary_pos_emb = rotary_pos_emb[position_ids]
+        else:
+            # Take the first seq_length rows and add a leading axis.
+            rotary_pos_emb = rotary_pos_emb[None, :seq_length]
+        # Swap the first two axes before handing the table to the encoder.
+        rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous()
+
+        # Run encoder.
+        hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder(
+            inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb,
+            kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states
+        )
+
+        if not return_dict:
+            # Drop the outputs that were not produced.
+            return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=presents,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+    def quantize(self, weight_bit_width: int):
+        """Quantize ``self.encoder`` to ``weight_bit_width`` bits and return ``self``."""
+        # Imported here rather than at module level, so the quantization module is
+        # only loaded when this method is called.
+        from .quantization import quantize
+        quantize(self.encoder, weight_bit_width)
+        return self
+
+
+class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
+    def __init__(self, config: ChatGLMConfig, empty_init=True, device=None):
+        """Wrap a ChatGLMModel and quantize it immediately if the config asks for it."""
+        super().__init__(config)
+
+        self.config = config
+        self.quantized = False
+        self.max_sequence_length = config.max_length
+        self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device)
+
+        # A truthy quantization_bit requests quantization at construction time.
+        if config.quantization_bit:
+            self.quantize(config.quantization_bit, empty_init=True)
+
+    def _update_model_kwargs_for_generation(
+            self,
+            outputs: ModelOutput,
+            model_kwargs: Dict[str, Any],
+            is_encoder_decoder: bool = False,
+            standardize_cache_format: bool = False,
+    ) -> Dict[str, Any]:
+        """Advance ``model_kwargs`` by one generated token and return it.
+
+        Stores the cache extracted from ``outputs``, appends one column of ones
+        to the attention mask, appends ``last position + 1`` to the position
+        ids, and clears the ``is_first_forward`` flag. ``model_kwargs`` is
+        modified in place.
+        """
+        model_kwargs["past_key_values"] = self._extract_past_from_model_output(
+            outputs, standardize_cache_format=standardize_cache_format
+        )
+
+        if "attention_mask" in model_kwargs:
+            mask = model_kwargs["attention_mask"]
+            # One extra "attend" column for the token that was just produced.
+            extra_column = mask.new_ones((mask.shape[0], 1))
+            model_kwargs["attention_mask"] = torch.cat([mask, extra_column], dim=-1)
+
+        if "position_ids" in model_kwargs:
+            ids = model_kwargs["position_ids"]
+            # The new token sits one position after the current last one.
+            next_id = ids[..., -1:] + 1
+            model_kwargs["position_ids"] = torch.cat([ids, next_id], dim=-1)
+
+        model_kwargs["is_first_forward"] = False
+        return model_kwargs
+
+    def prepare_inputs_for_generation(
+            self,
+            input_ids: torch.LongTensor,
+            past_key_values: Optional[torch.Tensor] = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            position_ids: Optional[torch.Tensor] = None,
+            is_first_forward: bool = True,
+            **kwargs
+    ) -> dict:
+        """Assemble the keyword arguments for the next forward pass.
+
+        Position ids are generated from ``input_ids`` when missing. On every
+        pass after the first, only the last token and its position id are kept.
+        """
+        if position_ids is None:
+            position_ids = self.get_position_ids(input_ids, device=input_ids.device)
+        if not is_first_forward:
+            # Keep just the newest token and its position id.
+            position_ids, input_ids = position_ids[..., -1:], input_ids[:, -1:]
+        return dict(
+            input_ids=input_ids,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            return_last_logit=True,
+        )
+
+    def forward(
+            self,
+            input_ids: Optional[torch.Tensor] = None,
+            position_ids: Optional[torch.Tensor] = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
+            inputs_embeds: Optional[torch.Tensor] = None,
+            labels: Optional[torch.Tensor] = None,
+            use_cache: Optional[bool] = None,
+            output_attentions: Optional[bool] = None,
+            output_hidden_states: Optional[bool] = None,
+            return_dict: Optional[bool] = None,
+            return_last_logit: Optional[bool] = False,
+    ):
+        """Run the base model, project to vocabulary logits and optionally compute the loss.
+
+        ``use_cache`` and ``return_dict`` default to the config values when
+        ``None``. ``output_attentions`` is accepted but not used here. With
+        ``return_last_logit`` only the last entry along dim 0 of the hidden
+        states is projected. When ``labels`` is given, a shifted cross-entropy
+        loss (ignoring label ``-100``) is computed in float32.
+
+        Returns a ``CausalLMOutputWithPast`` when ``return_dict`` is truthy,
+        otherwise a tuple ``([loss,] logits, *remaining transformer outputs)``.
+        """
+        if use_cache is None:
+            use_cache = self.config.use_cache
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        transformer_outputs = self.transformer(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = transformer_outputs[0]
+        if return_last_logit:
+            hidden_states = hidden_states[-1:]
+        # Project to the vocabulary, then swap dims 0 and 1 of the logits.
+        lm_logits = self.transformer.output_layer(hidden_states).transpose(0, 1).contiguous()
+
+        loss = None
+        if labels is not None:
+            # The loss is evaluated in float32 and cast back afterwards.
+            lm_logits = lm_logits.to(torch.float32)
+
+            # Position i predicts token i + 1: drop the last logit and the first label.
+            shifted_logits = lm_logits[..., :-1, :].contiguous()
+            shifted_labels = labels[..., 1:].contiguous()
+            criterion = CrossEntropyLoss(ignore_index=-100)
+            loss = criterion(shifted_logits.view(-1, shifted_logits.size(-1)), shifted_labels.view(-1))
+
+            lm_logits = lm_logits.to(hidden_states.dtype)
+            loss = loss.to(hidden_states.dtype)
+
+        if return_dict:
+            return CausalLMOutputWithPast(
+                loss=loss,
+                logits=lm_logits,
+                past_key_values=transformer_outputs.past_key_values,
+                hidden_states=transformer_outputs.hidden_states,
+                attentions=transformer_outputs.attentions,
+            )
+
+        output = (lm_logits,) + transformer_outputs[1:]
+        return output if loss is None else (loss,) + output
+
+    @staticmethod
+    def _reorder_cache(
+            past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor
+    ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]:
+        """
+        Re-order the `past_key_values` cache for [`~PreTrainedModel.beam_search`] or
+        [`~PreTrainedModel.beam_sample`], so that every cached tensor matches the beam
+        selected by `beam_idx` at the current generation step.
+
+        For each layer, the key and value tensors are gathered along dim 1 with
+        `beam_idx`, which is first moved to the device of the tensor it indexes.
+        """
+
+        def _pick_beams(tensor):
+            return tensor.index_select(1, beam_idx.to(tensor.device))
+
+        reordered = []
+        for layer_past in past:
+            reordered.append((_pick_beams(layer_past[0]), _pick_beams(layer_past[1])))
+        return tuple(reordered)
+
+    def process_response(self, response):
+        """Trim surrounding whitespace and fill in the training-time placeholder."""
+        return response.strip().replace("[[训练时间]]", "2023年")
+
+    def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = None):
+        """Tokenize the prompt built from ``query`` and ``history`` and move it to ``self.device``."""
+        prompt_text = tokenizer.build_prompt(query, history=history)
+        return tokenizer([prompt_text], return_tensors="pt").to(self.device)
+
+    def build_stream_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = None):
+        """Tokenize only the newest round of a streamed conversation.
+
+        With a non-empty ``history`` the prompt is the continuation round
+        ``len(history) + 1``; its first encoded token is dropped before the ids
+        are batched. With no history (``None`` or empty) the prompt is round 1
+        and is tokenized directly. The result is moved to ``self.device``.
+        """
+        if history:
+            prompt = "\n\n[Round {}]\n\n问：{}\n\n答：".format(len(history) + 1, query)
+            input_ids = tokenizer.encode(prompt, add_special_tokens=False)
+            input_ids = input_ids[1:]
+            inputs = tokenizer.batch_encode_plus([(input_ids, None)], return_tensors="pt", add_special_tokens=False)
+        else:
+            # history is None or empty here, so this is always the first round.
+            # Calling len(history) would raise TypeError for the default None.
+            prompt = "[Round {}]\n\n问：{}\n\n答：".format(1, query)
+            inputs = tokenizer([prompt], return_tensors="pt")
+        inputs = inputs.to(self.device)
+        return inputs
+
+    @torch.no_grad()
+    def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 8192, num_beams=1,
+             do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None, **kwargs):
+        """Generate one complete reply to ``query``.
+
+        Args:
+            tokenizer: tokenizer used to build the prompt and decode the reply.
+            query: the new user message.
+            history: earlier ``(query, response)`` pairs; ``None`` means no history.
+            max_length, num_beams, do_sample, top_p, temperature: forwarded to ``self.generate``.
+            logits_processor: optional extra processors; the passed list is not modified.
+            **kwargs: further generation arguments forwarded to ``self.generate``.
+
+        Returns:
+            ``(response, history)`` where ``history`` is a new list ending with
+            ``(query, response)``.
+        """
+        if history is None:
+            history = []
+        # Copy the processors so the caller's list does not gain another
+        # InvalidScoreLogitsProcessor on every call.
+        if logits_processor is None:
+            logits_processor = LogitsProcessorList()
+        else:
+            logits_processor = LogitsProcessorList(logits_processor)
+        logits_processor.append(InvalidScoreLogitsProcessor())
+        gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p,
+                      "temperature": temperature, "logits_processor": logits_processor, **kwargs}
+        inputs = self.build_inputs(tokenizer, query, history=history)
+        outputs = self.generate(**inputs, **gen_kwargs)
+        # Keep only the tokens generated after the prompt of the first sequence.
+        outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):]
+        response = tokenizer.decode(outputs)
+        response = self.process_response(response)
+        history = history + [(query, response)]
+        return response, history
+
+    @torch.no_grad()
+    def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, past_key_values=None,
+                    max_length: int = 8192, do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None,
+                    return_past_key_values=False, **kwargs):
+        """Generate a reply to ``query`` incrementally.
+
+        Yields ``(response, new_history)`` for each usable partial reply, or
+        ``(response, new_history, past_key_values)`` when
+        ``return_past_key_values`` is true. Partial replies that are empty or end
+        in the Unicode replacement character are skipped. The passed
+        ``logits_processor`` list is not modified.
+        """
+        if history is None:
+            history = []
+        # Copy the processors so the caller's list does not gain another
+        # InvalidScoreLogitsProcessor on every call.
+        if logits_processor is None:
+            logits_processor = LogitsProcessorList()
+        else:
+            logits_processor = LogitsProcessorList(logits_processor)
+        logits_processor.append(InvalidScoreLogitsProcessor())
+        gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p,
+                      "temperature": temperature, "logits_processor": logits_processor, **kwargs}
+        # The full prompt is used only when no cache is involved in either direction;
+        # otherwise just the newest round is encoded.
+        if past_key_values is None and not return_past_key_values:
+            inputs = self.build_inputs(tokenizer, query, history=history)
+        else:
+            inputs = self.build_stream_inputs(tokenizer, query, history=history)
+        if past_key_values is not None:
+            past_length = past_key_values[0][0].shape[0]
+            # Prefix positions are part of the cache but not of the conversation.
+            if self.transformer.pre_seq_len is not None:
+                past_length -= self.transformer.pre_seq_len
+            # Shift the new positions past the cached ones and extend the mask over them.
+            inputs.position_ids += past_length
+            attention_mask = inputs.attention_mask
+            attention_mask = torch.cat((attention_mask.new_ones(1, past_length), attention_mask), dim=1)
+            inputs['attention_mask'] = attention_mask
+        for outputs in self.stream_generate(**inputs, past_key_values=past_key_values,
+                                            return_past_key_values=return_past_key_values, **gen_kwargs):
+            if return_past_key_values:
+                outputs, past_key_values = outputs
+            # Keep only the tokens generated after the prompt of the first sequence.
+            outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):]
+            response = tokenizer.decode(outputs)
+            if response and response[-1] != "�":
+                response = self.process_response(response)
+                new_history = history + [(query, response)]
+                if return_past_key_values:
+                    yield response, new_history, past_key_values
+                else:
+                    yield response, new_history
+
+    @torch.no_grad()
+    def stream_generate(
+            self,
+            input_ids,
+            generation_config: Optional[GenerationConfig] = None,
+            logits_processor: Optional[LogitsProcessorList] = None,
+            stopping_criteria: Optional[StoppingCriteriaList] = None,
+            prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
+            return_past_key_values=False,
+            **kwargs,
+    ):
+        batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1]
+
+        if generation_config is None:
+            generation_config = self.generation_config
+        generation_config = copy.deepcopy(generation_config)
+        model_kwargs = generation_config.update(**kwargs)
+        bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id
+
+        if isinstance(eos_token_id, int):
+            eos_token_id = [eos_token_id]
+
+        has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
+        if has_default_max_length and generation_config.max_new_tokens is None:
+            warnings.warn(
+                f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. "
+                "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we"
+                " recommend using `max_new_tokens` to control the maximum length of the generation.",
+                UserWarning,
+            )
+        elif generation_config.max_new_tokens is not None:
+            generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length
+            if not has_default_max_length:
+                logger.warn(
+                    f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
+                    f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
+                    "Please refer to the documentation for more information. "
+                    "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)",
+                    UserWarning,
+                )
+
+        if input_ids_seq_length >= generation_config.max_length:
+            input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
+            logger.warning(
+                f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to"
+                f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
+                " increasing `max_new_tokens`."
+            )
+
+        # 2. Set generation parameters if not already defined
+        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
+        stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
+
+        logits_processor = self._get_logits_processor(
+            generation_config=generation_config,
+            input_ids_seq_length=input_ids_seq_length,
+            encoder_input_ids=input_ids,
+            prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
+            logits_processor=logits_processor,
+        )
+
+        stopping_criteria = self._get_stopping_criteria(
+            generation_config=generation_config, stopping_criteria=stopping_criteria
+        )
+        logits_warper = self._get_logits_warper(generation_config)
+
+        unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
+        scores = None
+        while True:
+            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
+            # forward pass to get next token
+            outputs = self(
+                **model_inputs,
+                return_dict=True,
+                output_attentions=False,
+                output_hidden_states=False,
+            )
+
+            next_token_logits = outputs.logits[:, -1, :]
+
+            # pre-process distribution
+            next_token_scores = logits_processor(input_ids, next_token_logits)
+            next_token_scores = logits_warper(input_ids, next_token_scores)
+
+            # sample
+            probs = nn.functional.softmax(next_token_scores, dim=-1)
+            if generation_config.do_sample:
+                next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
+            else:
+                next_tokens = torch.argmax(probs, dim=-1)
+
+            # update generated ids, model inputs, and length for next step
+            input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
+            model_kwargs = self._update_model_kwargs_for_generation(
+                outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
+            )
+            unfinished_sequences = unfinished_sequences.mul((sum(next_tokens != i for i in eos_token_id)).long())
+            if return_past_key_values:
+                yield input_ids, outputs.past_key_values
+            else:
+                yield input_ids
+            # stop when each sentence is finished, or if we exceed the maximum length
+            if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
+                break
+
+    def quantize(self, bits: int, empty_init=False, device=None, **kwargs):
+        if bits == 0:
+            return
+
+        from .quantization import quantize
+
+        if self.quantized:
+            logger.info("Already quantized.")
+            return self
+
+        self.quantized = True
+
+        self.config.quantization_bit = bits
+
+        self.transformer.encoder = quantize(self.transformer.encoder, bits, empty_init=empty_init, device=device,
+                                            **kwargs)
+        return self
\ No newline at end of file

From e3cd5cb2620e6908504b9ac9a2d6bd23a729c7a0 Mon Sep 17 00:00:00 2001
From: klhhhhh <1412841649@qq.com>
Date: Fri, 7 Jul 2023 19:56:22 +0800
Subject: [PATCH 08/21] [shardformer] add test kit in model zoo for chatglm

---
 .../chatglm2-6b/modeling_chatglm.py           | 1193 -----------------
 .../transformers/chatglm2_6b/MODEL_LICENSE    |   33 +
 .../chatglm2_6b/modeling_chatglm.py           |    6 -
 .../transformers/chatglm2_6b/quantization.py  |  188 +++
 4 files changed, 221 insertions(+), 1199 deletions(-)
 delete mode 100644 tests/kit/model_zoo/transformers/chatglm2-6b/modeling_chatglm.py
 create mode 100644 tests/kit/model_zoo/transformers/chatglm2_6b/MODEL_LICENSE
 create mode 100644 tests/kit/model_zoo/transformers/chatglm2_6b/quantization.py

diff --git a/tests/kit/model_zoo/transformers/chatglm2-6b/modeling_chatglm.py b/tests/kit/model_zoo/transformers/chatglm2-6b/modeling_chatglm.py
deleted file mode 100644
index 82163c46190f..000000000000
--- a/tests/kit/model_zoo/transformers/chatglm2-6b/modeling_chatglm.py
+++ /dev/null
@@ -1,1193 +0,0 @@
-""" PyTorch ChatGLM model. """
-
-import math
-import copy
-import warnings
-import re
-import sys
-
-import torch
-import torch.utils.checkpoint
-import torch.nn.functional as F
-from torch import nn
-from torch.nn import CrossEntropyLoss, LayerNorm
-from torch.nn.utils import skip_init
-from typing import Optional, Tuple, Union, List, Callable, Dict, Any
-
-from transformers.modeling_outputs import (
-    BaseModelOutputWithPast,
-    CausalLMOutputWithPast,
-)
-from transformers.modeling_utils import PreTrainedModel
-from transformers.utils import logging
-from transformers.generation.logits_process import LogitsProcessor
-from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput
-
-from .configuration_chatglm import ChatGLMConfig
-
-# flags required to enable jit fusion kernels
-
-if sys.platform != 'darwin':
-    torch._C._jit_set_profiling_mode(False)
-    torch._C._jit_set_profiling_executor(False)
-    torch._C._jit_override_can_fuse_on_cpu(True)
-    torch._C._jit_override_can_fuse_on_gpu(True)
-
-logger = logging.get_logger(__name__)
-
-_CHECKPOINT_FOR_DOC = "THUDM/ChatGLM2-6B"
-_CONFIG_FOR_DOC = "ChatGLM6BConfig"
-
-CHATGLM_6B_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "THUDM/chatglm2-6b",
-    # See all ChatGLM models at https://huggingface.co/models?filter=chatglm
-]
-
-
-def default_init(cls, *args, **kwargs):
-    return cls(*args, **kwargs)
-
-
-class InvalidScoreLogitsProcessor(LogitsProcessor):
-    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
-        if torch.isnan(scores).any() or torch.isinf(scores).any():
-            scores.zero_()
-            scores[..., 5] = 5e4
-        return scores
-
-
-class PrefixEncoder(torch.nn.Module):
-    """
-    The torch.nn model to encode the prefix
-    Input shape: (batch-size, prefix-length)
-    Output shape: (batch-size, prefix-length, 2*layers*hidden)
-    """
-
-    def __init__(self, config: ChatGLMConfig):
-        super().__init__()
-        self.prefix_projection = config.prefix_projection
-        if self.prefix_projection:
-            # Use a two-layer MLP to encode the prefix
-            kv_size = config.num_layers * config.kv_channels * config.multi_query_group_num * 2
-            self.embedding = torch.nn.Embedding(config.pre_seq_len, kv_size)
-            self.trans = torch.nn.Sequential(
-                torch.nn.Linear(kv_size, config.hidden_size),
-                torch.nn.Tanh(),
-                torch.nn.Linear(config.hidden_size, kv_size)
-            )
-        else:
-            self.embedding = torch.nn.Embedding(config.pre_seq_len,
-                                                config.num_layers * config.kv_channels * config.multi_query_group_num * 2)
-
-    def forward(self, prefix: torch.Tensor):
-        if self.prefix_projection:
-            prefix_tokens = self.embedding(prefix)
-            past_key_values = self.trans(prefix_tokens)
-        else:
-            past_key_values = self.embedding(prefix)
-        return past_key_values
-
-
-def split_tensor_along_last_dim(
-        tensor: torch.Tensor,
-        num_partitions: int,
-        contiguous_split_chunks: bool = False,
-) -> List[torch.Tensor]:
-    """Split a tensor along its last dimension.
-
-    Arguments:
-        tensor: input tensor.
-        num_partitions: number of partitions to split the tensor
-        contiguous_split_chunks: If True, make each chunk contiguous
-                                 in memory.
-
-    Returns:
-        A list of Tensors
-    """
-    # Get the size and dimension.
-    last_dim = tensor.dim() - 1
-    last_dim_size = tensor.size()[last_dim] // num_partitions
-    # Split.
-    tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
-    # Note: torch.split does not create contiguous tensors by default.
-    if contiguous_split_chunks:
-        return tuple(chunk.contiguous() for chunk in tensor_list)
-
-    return tensor_list
-
-
-class RotaryEmbedding(nn.Module):
-    def __init__(self, dim, original_impl=False, device=None, dtype=None):
-        super().__init__()
-        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim))
-        self.register_buffer("inv_freq", inv_freq)
-        self.dim = dim
-        self.original_impl = original_impl
-
-    def forward_impl(
-            self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000
-    ):
-        """Enhanced Transformer with Rotary Position Embedding.
-
-        Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/
-        transformers/rope/__init__.py. MIT License:
-        https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license.
-        """
-        # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$
-        theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=dtype, device=device) / n_elem))
-
-        # Create position indexes `[0, 1, ..., seq_len - 1]`
-        seq_idx = torch.arange(seq_len, dtype=dtype, device=device)
-
-        # Calculate the product of position index and $\theta_i$
-        idx_theta = torch.outer(seq_idx, theta).float()
-
-        cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1)
-
-        # this is to mimic the behaviour of complex32, else we will get different results
-        if dtype in (torch.float16, torch.bfloat16, torch.int8):
-            cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half()
-        return cache
-
-    def forward(self, max_seq_len, offset=0):
-        return self.forward_impl(
-            max_seq_len, self.dim, dtype=self.inv_freq.dtype, device=self.inv_freq.device
-        )
-
-
-@torch.jit.script
-def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor:
-    # x: [sq, b, np, hn]
-    sq, b, np, hn = x.size(0), x.size(1), x.size(2), x.size(3)
-    rot_dim = rope_cache.shape[-2] * 2
-    x, x_pass = x[..., :rot_dim], x[..., rot_dim:]
-    # truncate to support variable sizes
-    rope_cache = rope_cache[:sq]
-    xshaped = x.reshape(sq, -1, np, rot_dim // 2, 2)
-    rope_cache = rope_cache.view(sq, -1, 1, xshaped.size(3), 2)
-    x_out2 = torch.stack(
-        [
-            xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1],
-            xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1],
-        ],
-        -1,
-    )
-    x_out2 = x_out2.flatten(3)
-    return torch.cat((x_out2, x_pass), dim=-1)
-
-
-class RMSNorm(torch.nn.Module):
-    def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs):
-        super().__init__()
-        self.weight = torch.nn.Parameter(torch.empty(normalized_shape, device=device, dtype=dtype))
-        self.eps = eps
-
-    def forward(self, hidden_states: torch.Tensor):
-        input_dtype = hidden_states.dtype
-        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
-        hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
-
-        return (self.weight * hidden_states).to(input_dtype)
-
-
-class CoreAttention(torch.nn.Module):
-    def __init__(self, config: ChatGLMConfig, layer_number):
-        super(CoreAttention, self).__init__()
-
-        self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling
-        self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32
-        if self.apply_query_key_layer_scaling:
-            self.attention_softmax_in_fp32 = True
-        self.layer_number = max(1, layer_number)
-
-        projection_size = config.kv_channels * config.num_attention_heads
-
-        # Per attention head and per partition values.
-        self.hidden_size_per_partition = projection_size
-        self.hidden_size_per_attention_head = projection_size // config.num_attention_heads
-        self.num_attention_heads_per_partition = config.num_attention_heads
-
-        coeff = None
-        self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
-        if self.apply_query_key_layer_scaling:
-            coeff = self.layer_number
-            self.norm_factor *= coeff
-        self.coeff = coeff
-
-        self.attention_dropout = torch.nn.Dropout(config.attention_dropout)
-
-    def forward(self, query_layer, key_layer, value_layer, attention_mask):
-        pytorch_major_version = int(torch.__version__.split('.')[0])
-        if pytorch_major_version >= 2:
-            query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]]
-            if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]:
-                context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
-                                                                                 is_causal=True)
-            else:
-                if attention_mask is not None:
-                    attention_mask = ~attention_mask
-                context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
-                                                                                 attention_mask)
-            context_layer = context_layer.permute(2, 0, 1, 3)
-            new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
-            context_layer = context_layer.reshape(*new_context_layer_shape)
-        else:
-            # Raw attention scores
-
-            # [b, np, sq, sk]
-            output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0))
-
-            # [sq, b, np, hn] -> [sq, b * np, hn]
-            query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1)
-            # [sk, b, np, hn] -> [sk, b * np, hn]
-            key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1)
-
-            # preallocting input tensor: [b * np, sq, sk]
-            matmul_input_buffer = torch.empty(
-                output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype,
-                device=query_layer.device
-            )
-
-            # Raw attention scores. [b * np, sq, sk]
-            matmul_result = torch.baddbmm(
-                matmul_input_buffer,
-                query_layer.transpose(0, 1),  # [b * np, sq, hn]
-                key_layer.transpose(0, 1).transpose(1, 2),  # [b * np, hn, sk]
-                beta=0.0,
-                alpha=(1.0 / self.norm_factor),
-            )
-
-            # change view to [b, np, sq, sk]
-            attention_scores = matmul_result.view(*output_size)
-
-            # ===========================
-            # Attention probs and dropout
-            # ===========================
-
-            # attention scores and attention mask [b, np, sq, sk]
-            if self.attention_softmax_in_fp32:
-                attention_scores = attention_scores.float()
-            if self.coeff is not None:
-                attention_scores = attention_scores * self.coeff
-            if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]:
-                attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3],
-                                            device=attention_scores.device, dtype=torch.bool)
-                attention_mask.tril_()
-                attention_mask = ~attention_mask
-            if attention_mask is not None:
-                attention_scores = attention_scores.masked_fill(attention_mask, float("-inf"))
-            attention_probs = F.softmax(attention_scores, dim=-1)
-            attention_probs = attention_probs.type_as(value_layer)
-
-            # This is actually dropping out entire tokens to attend to, which might
-            # seem a bit unusual, but is taken from the original Transformer paper.
-            attention_probs = self.attention_dropout(attention_probs)
-            # =========================
-            # Context layer. [sq, b, hp]
-            # =========================
-
-            # value_layer -> context layer.
-            # [sk, b, np, hn] --> [b, np, sq, hn]
-
-            # context layer shape: [b, np, sq, hn]
-            output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3))
-            # change view [sk, b * np, hn]
-            value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1)
-            # change view [b * np, sq, sk]
-            attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1)
-            # matmul: [b * np, sq, hn]
-            context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))
-            # change view [b, np, sq, hn]
-            context_layer = context_layer.view(*output_size)
-            # [b, np, sq, hn] --> [sq, b, np, hn]
-            context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
-            # [sq, b, np, hn] --> [sq, b, hp]
-            new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
-            context_layer = context_layer.view(*new_context_layer_shape)
-
-        return context_layer
-
-
-class SelfAttention(torch.nn.Module):
-    """Parallel self-attention layer abstract class.
-
-    Self-attention layer takes input with size [s, b, h]
-    and returns output of the same size.
-    """
-
-    def __init__(self, config: ChatGLMConfig, layer_number, device=None):
-        super(SelfAttention, self).__init__()
-        self.layer_number = max(1, layer_number)
-
-        self.projection_size = config.kv_channels * config.num_attention_heads
-
-        # Per attention head and per partition values.
-        self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads
-        self.num_attention_heads_per_partition = config.num_attention_heads
-
-        self.multi_query_attention = config.multi_query_attention
-        self.qkv_hidden_size = 3 * self.projection_size
-        if self.multi_query_attention:
-            self.num_multi_query_groups_per_partition = config.multi_query_group_num
-            self.qkv_hidden_size = (
-                    self.projection_size + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num
-            )
-        self.query_key_value = nn.Linear(config.hidden_size, self.qkv_hidden_size,
-                                         bias=config.add_bias_linear or config.add_qkv_bias,
-                                         device=device, **_config_to_kwargs(config)
-                                         )
-
-        self.core_attention = CoreAttention(config, self.layer_number)
-
-        # Output.
-        self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear,
-                               device=device, **_config_to_kwargs(config)
-                               )
-
-    def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None):
-        if self.multi_query_attention:
-            num_attention_heads = self.num_multi_query_groups_per_partition
-        else:
-            num_attention_heads = self.num_attention_heads_per_partition
-        return torch.empty(
-            inference_max_sequence_len,
-            batch_size,
-            num_attention_heads,
-            self.hidden_size_per_attention_head,
-            dtype=dtype,
-            device=device,
-        )
-
-    def forward(
-            self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True
-    ):
-        # hidden_states: [sq, b, h]
-
-        # =================================================
-        # Pre-allocate memory for key-values for inference.
-        # =================================================
-        # =====================
-        # Query, Key, and Value
-        # =====================
-
-        # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)]
-        mixed_x_layer = self.query_key_value(hidden_states)
-
-        if self.multi_query_attention:
-            (query_layer, key_layer, value_layer) = mixed_x_layer.split(
-                [
-                    self.num_attention_heads_per_partition * self.hidden_size_per_attention_head,
-                    self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
-                    self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
-                ],
-                dim=-1,
-            )
-            query_layer = query_layer.view(
-                query_layer.size()[:-1] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
-            )
-            key_layer = key_layer.view(
-                key_layer.size()[:-1] + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head)
-            )
-            value_layer = value_layer.view(
-                value_layer.size()[:-1]
-                + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head)
-            )
-        else:
-            new_tensor_shape = mixed_x_layer.size()[:-1] + \
-                               (self.num_attention_heads_per_partition,
-                                3 * self.hidden_size_per_attention_head)
-            mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
-
-            # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
-            (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)
-
-        # apply relative positional encoding (rotary embedding)
-        if rotary_pos_emb is not None:
-            query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb)
-            key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb)
-
-        # adjust key and value for inference
-        if kv_cache is not None:
-            cache_k, cache_v = kv_cache
-            key_layer = torch.cat((cache_k, key_layer), dim=0)
-            value_layer = torch.cat((cache_v, value_layer), dim=0)
-        if use_cache:
-            kv_cache = (key_layer, value_layer)
-        else:
-            kv_cache = None
-
-        if self.multi_query_attention:
-            key_layer = key_layer.unsqueeze(-2)
-            key_layer = key_layer.expand(
-                -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1
-            )
-            key_layer = key_layer.contiguous().view(
-                key_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
-            )
-            value_layer = value_layer.unsqueeze(-2)
-            value_layer = value_layer.expand(
-                -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1
-            )
-            value_layer = value_layer.contiguous().view(
-                value_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
-            )
-
-        # ==================================
-        # core attention computation
-        # ==================================
-
-        context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask)
-
-        # =================
-        # Output. [sq, b, h]
-        # =================
-
-        output = self.dense(context_layer)
-
-        return output, kv_cache
-
-
-def _config_to_kwargs(args):
-    common_kwargs = {
-        "dtype": args.torch_dtype,
-    }
-    return common_kwargs
-
-
-class MLP(torch.nn.Module):
-    """MLP.
-
-    MLP will take the input with h hidden state, project it to 4*h
-    hidden dimension, perform nonlinear transformation, and project the
-    state back into h hidden dimension.
-    """
-
-    def __init__(self, config: ChatGLMConfig, device=None):
-        super(MLP, self).__init__()
-
-        self.add_bias = config.add_bias_linear
-
-        # Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf
-        self.dense_h_to_4h = nn.Linear(
-            config.hidden_size,
-            config.ffn_hidden_size * 2,
-            bias=self.add_bias,
-            device=device,
-            **_config_to_kwargs(config)
-        )
-
-        def swiglu(x):
-            x = torch.chunk(x, 2, dim=-1)
-            return F.silu(x[0]) * x[1]
-
-        self.activation_func = swiglu
-
-        # Project back to h.
-        self.dense_4h_to_h = nn.Linear(
-            config.ffn_hidden_size,
-            config.hidden_size,
-            bias=self.add_bias,
-            device=device,
-            **_config_to_kwargs(config)
-        )
-
-    def forward(self, hidden_states):
-        # [s, b, 4hp]
-        intermediate_parallel = self.dense_h_to_4h(hidden_states)
-        intermediate_parallel = self.activation_func(intermediate_parallel)
-        # [s, b, h]
-        output = self.dense_4h_to_h(intermediate_parallel)
-        return output
-
-
-class GLMBlock(torch.nn.Module):
-    """A single transformer layer.
-
-    Transformer layer takes input with size [s, b, h] and returns an
-    output of the same size.
-    """
-
-    def __init__(self, config: ChatGLMConfig, layer_number, device=None):
-        super(GLMBlock, self).__init__()
-        self.layer_number = layer_number
-
-        self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm
-
-        self.fp32_residual_connection = config.fp32_residual_connection
-
-        LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
-        # Layernorm on the input data.
-        self.input_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device,
-                                             dtype=config.torch_dtype)
-
-        # Self attention.
-        self.self_attention = SelfAttention(config, layer_number, device=device)
-        self.hidden_dropout = config.hidden_dropout
-
-        # Layernorm on the attention output
-        self.post_attention_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device,
-                                                      dtype=config.torch_dtype)
-
-        # MLP
-        self.mlp = MLP(config, device=device)
-
-    def forward(
-            self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True,
-    ):
-        # hidden_states: [s, b, h]
-
-        # Layer norm at the beginning of the transformer layer.
-        layernorm_output = self.input_layernorm(hidden_states)
-        # Self attention.
-        attention_output, kv_cache = self.self_attention(
-            layernorm_output,
-            attention_mask,
-            rotary_pos_emb,
-            kv_cache=kv_cache,
-            use_cache=use_cache
-        )
-
-        # Residual connection.
-        if self.apply_residual_connection_post_layernorm:
-            residual = layernorm_output
-        else:
-            residual = hidden_states
-
-        layernorm_input = torch.nn.functional.dropout(attention_output, p=self.hidden_dropout, training=self.training)
-        layernorm_input = residual + layernorm_input
-
-        # Layer norm post the self attention.
-        layernorm_output = self.post_attention_layernorm(layernorm_input)
-
-        # MLP.
-        mlp_output = self.mlp(layernorm_output)
-
-        # Second residual connection.
-        if self.apply_residual_connection_post_layernorm:
-            residual = layernorm_output
-        else:
-            residual = layernorm_input
-
-        output = torch.nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=self.training)
-        output = residual + output
-
-        return output, kv_cache
-
-
-class GLMTransformer(torch.nn.Module):
-    """Transformer class."""
-
-    def __init__(self, config: ChatGLMConfig, device=None):
-        super(GLMTransformer, self).__init__()
-
-        self.fp32_residual_connection = config.fp32_residual_connection
-        self.post_layer_norm = config.post_layer_norm
-
-        # Number of layers.
-        self.num_layers = config.num_layers
-
-        # Transformer layers.
-        def build_layer(layer_number):
-            return GLMBlock(config, layer_number, device=device)
-
-        self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)])
-
-        if self.post_layer_norm:
-            LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
-            # Final layer norm before output.
-            self.final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device,
-                                                 dtype=config.torch_dtype)
-
-        self.gradient_checkpointing = False
-
-    def _get_layer(self, layer_number):
-        return self.layers[layer_number]
-
-    def forward(
-            self, hidden_states, attention_mask, rotary_pos_emb, kv_caches=None,
-            use_cache: Optional[bool] = True,
-            output_hidden_states: Optional[bool] = False,
-    ):
-        if not kv_caches:
-            kv_caches = [None for _ in range(self.num_layers)]
-        presents = () if use_cache else None
-        if self.gradient_checkpointing and self.training:
-            if use_cache:
-                logger.warning_once(
-                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
-                )
-                use_cache = False
-
-        all_self_attentions = None
-        all_hidden_states = () if output_hidden_states else None
-        for index in range(self.num_layers):
-            if output_hidden_states:
-                all_hidden_states = all_hidden_states + (hidden_states,)
-
-            layer = self._get_layer(index)
-            if self.gradient_checkpointing and self.training:
-                layer_ret = torch.utils.checkpoint.checkpoint(
-                    layer,
-                    hidden_states,
-                    attention_mask,
-                    rotary_pos_emb,
-                    kv_caches[index],
-                    use_cache
-                )
-            else:
-                layer_ret = layer(
-                    hidden_states,
-                    attention_mask,
-                    rotary_pos_emb,
-                    kv_cache=kv_caches[index],
-                    use_cache=use_cache
-                )
-            hidden_states, kv_cache = layer_ret
-            if use_cache:
-                presents = presents + (kv_cache,)
-
-        if output_hidden_states:
-            all_hidden_states = all_hidden_states + (hidden_states,)
-
-        # Final layer norm.
-        if self.post_layer_norm:
-            hidden_states = self.final_layernorm(hidden_states)
-
-        return hidden_states, presents, all_hidden_states, all_self_attentions
-
-
-class ChatGLMPreTrainedModel(PreTrainedModel):
-    """
-    An abstract class to handle weights initialization and
-    a simple interface for downloading and loading pretrained models.
-    """
-
-    is_parallelizable = False
-    supports_gradient_checkpointing = True
-    config_class = ChatGLMConfig
-    base_model_prefix = "transformer"
-    _no_split_modules = ["GLMBlock"]
-
-    def _init_weights(self, module: nn.Module):
-        """Initialize the weights."""
-        return
-
-    def get_masks(self, input_ids, past_key_values, padding_mask=None):
-        batch_size, seq_length = input_ids.shape
-        full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device)
-        full_attention_mask.tril_()
-        past_length = 0
-        if past_key_values:
-            past_length = past_key_values[0][0].shape[0]
-        if past_length:
-            full_attention_mask = torch.cat((torch.ones(batch_size, seq_length, past_length,
-                                                        device=input_ids.device), full_attention_mask), dim=-1)
-        if padding_mask is not None:
-            full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1)
-        if not past_length and padding_mask is not None:
-            full_attention_mask -= padding_mask.unsqueeze(-1) - 1
-        full_attention_mask = (full_attention_mask < 0.5).bool()
-        full_attention_mask.unsqueeze_(1)
-        return full_attention_mask
-
-    def get_position_ids(self, input_ids, device):
-        batch_size, seq_length = input_ids.shape
-        position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
-        return position_ids
-
-    def _set_gradient_checkpointing(self, module, value=False):
-        if isinstance(module, GLMTransformer):
-            module.gradient_checkpointing = value
-
-
-class Embedding(torch.nn.Module):
-    """Language model embeddings."""
-
-    def __init__(self, config: ChatGLMConfig, device=None):
-        super(Embedding, self).__init__()
-
-        self.hidden_size = config.hidden_size
-        # Word embeddings (parallel).
-        self.word_embeddings = nn.Embedding(
-            config.padded_vocab_size,
-            self.hidden_size,
-            dtype=config.torch_dtype,
-            device=device
-        )
-        self.fp32_residual_connection = config.fp32_residual_connection
-
-    def forward(self, input_ids):
-        # Embeddings.
-        words_embeddings = self.word_embeddings(input_ids)
-        embeddings = words_embeddings
-        # Data format change to avoid explicit tranposes : [b s h] --> [s b h].
-        embeddings = embeddings.transpose(0, 1).contiguous()
-        # If the input flag for fp32 residual connection is set, convert for float.
-        if self.fp32_residual_connection:
-            embeddings = embeddings.float()
-        return embeddings
-
-
-class ChatGLMModel(ChatGLMPreTrainedModel):
-    def __init__(self, config: ChatGLMConfig, device=None, empty_init=True):
-        super().__init__(config)
-        if empty_init:
-            init_method = skip_init
-        else:
-            init_method = default_init
-        init_kwargs = {}
-        if device is not None:
-            init_kwargs["device"] = device
-        self.embedding = init_method(Embedding, config, **init_kwargs)
-        self.num_layers = config.num_layers
-        self.multi_query_group_num = config.multi_query_group_num
-        self.kv_channels = config.kv_channels
-
-        # Rotary positional embeddings
-        self.seq_length = config.seq_length
-        rotary_dim = (
-            config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels
-        )
-
-        self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, original_impl=config.original_rope, device=device,
-                                              dtype=config.torch_dtype)
-        self.encoder = init_method(GLMTransformer, config, **init_kwargs)
-        self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False,
-                                        dtype=config.torch_dtype, **init_kwargs)
-        self.pre_seq_len = config.pre_seq_len
-        self.prefix_projection = config.prefix_projection
-        if self.pre_seq_len is not None:
-            for param in self.parameters():
-                param.requires_grad = False
-            self.prefix_tokens = torch.arange(self.pre_seq_len).long()
-            self.prefix_encoder = PrefixEncoder(config)
-            self.dropout = torch.nn.Dropout(0.1)
-
-    def get_input_embeddings(self):
-        return self.embedding.word_embeddings
-
-    def get_prompt(self, batch_size, device, dtype=torch.half):
-        prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(device)
-        past_key_values = self.prefix_encoder(prefix_tokens).type(dtype)
-        past_key_values = past_key_values.view(
-            batch_size,
-            self.pre_seq_len,
-            self.num_layers * 2,
-            self.multi_query_group_num,
-            self.kv_channels
-        )
-        # seq_len, b, nh, hidden_size
-        past_key_values = self.dropout(past_key_values)
-        past_key_values = past_key_values.permute([2, 1, 0, 3, 4]).split(2)
-        return past_key_values
-
-    def forward(
-            self,
-            input_ids,
-            position_ids: Optional[torch.Tensor] = None,
-            attention_mask: Optional[torch.BoolTensor] = None,
-            full_attention_mask: Optional[torch.BoolTensor] = None,
-            past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
-            inputs_embeds: Optional[torch.Tensor] = None,
-            use_cache: Optional[bool] = None,
-            output_hidden_states: Optional[bool] = None,
-            return_dict: Optional[bool] = None,
-    ):
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        batch_size, seq_length = input_ids.shape
-
-        if inputs_embeds is None:
-            inputs_embeds = self.embedding(input_ids)
-
-        if self.pre_seq_len is not None:
-            if past_key_values is None:
-                past_key_values = self.get_prompt(batch_size=batch_size, device=input_ids.device,
-                                                  dtype=inputs_embeds.dtype)
-            if attention_mask is not None:
-                attention_mask = torch.cat([attention_mask.new_ones((batch_size, self.pre_seq_len)),
-                                            attention_mask], dim=-1)
-
-        if full_attention_mask is None:
-            if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1):
-                full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask)
-
-        # Rotary positional embeddings
-        rotary_pos_emb = self.rotary_pos_emb(self.seq_length)
-        if position_ids is not None:
-            rotary_pos_emb = rotary_pos_emb[position_ids]
-        else:
-            rotary_pos_emb = rotary_pos_emb[None, :seq_length]
-        rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous()
-
-        # Run encoder.
-        hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder(
-            inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb,
-            kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states
-        )
-
-        if not return_dict:
-            return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)
-
-        return BaseModelOutputWithPast(
-            last_hidden_state=hidden_states,
-            past_key_values=presents,
-            hidden_states=all_hidden_states,
-            attentions=all_self_attentions,
-        )
-
-    def quantize(self, weight_bit_width: int):
-        from .quantization import quantize
-        quantize(self.encoder, weight_bit_width)
-        return self
-
-
-class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
-    def __init__(self, config: ChatGLMConfig, empty_init=True, device=None):
-        super().__init__(config)
-
-        self.max_sequence_length = config.max_length
-        self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device)
-        self.config = config
-        self.quantized = False
-
-        if self.config.quantization_bit:
-            self.quantize(self.config.quantization_bit, empty_init=True)
-
-    def _update_model_kwargs_for_generation(
-            self,
-            outputs: ModelOutput,
-            model_kwargs: Dict[str, Any],
-            is_encoder_decoder: bool = False,
-            standardize_cache_format: bool = False,
-    ) -> Dict[str, Any]:
-        # update past_key_values
-        model_kwargs["past_key_values"] = self._extract_past_from_model_output(
-            outputs, standardize_cache_format=standardize_cache_format
-        )
-
-        # update attention mask
-        if "attention_mask" in model_kwargs:
-            attention_mask = model_kwargs["attention_mask"]
-            model_kwargs["attention_mask"] = torch.cat(
-                [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
-            )
-
-        # update position ids
-        if "position_ids" in model_kwargs:
-            position_ids = model_kwargs["position_ids"]
-            new_position_id = position_ids[..., -1:].clone()
-            new_position_id += 1
-            model_kwargs["position_ids"] = torch.cat(
-                [position_ids, new_position_id], dim=-1
-            )
-
-        model_kwargs["is_first_forward"] = False
-        return model_kwargs
-
-    def prepare_inputs_for_generation(
-            self,
-            input_ids: torch.LongTensor,
-            past_key_values: Optional[torch.Tensor] = None,
-            attention_mask: Optional[torch.Tensor] = None,
-            position_ids: Optional[torch.Tensor] = None,
-            is_first_forward: bool = True,
-            **kwargs
-    ) -> dict:
-        # only last token for input_ids if past is not None
-        if position_ids is None:
-            position_ids = self.get_position_ids(input_ids, device=input_ids.device)
-        if not is_first_forward:
-            position_ids = position_ids[..., -1:]
-            input_ids = input_ids[:, -1:]
-        return {
-            "input_ids": input_ids,
-            "past_key_values": past_key_values,
-            "position_ids": position_ids,
-            "attention_mask": attention_mask,
-            "return_last_logit": True
-        }
-
-    def forward(
-            self,
-            input_ids: Optional[torch.Tensor] = None,
-            position_ids: Optional[torch.Tensor] = None,
-            attention_mask: Optional[torch.Tensor] = None,
-            past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
-            inputs_embeds: Optional[torch.Tensor] = None,
-            labels: Optional[torch.Tensor] = None,
-            use_cache: Optional[bool] = None,
-            output_attentions: Optional[bool] = None,
-            output_hidden_states: Optional[bool] = None,
-            return_dict: Optional[bool] = None,
-            return_last_logit: Optional[bool] = False,
-    ):
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        transformer_outputs = self.transformer(
-            input_ids=input_ids,
-            position_ids=position_ids,
-            attention_mask=attention_mask,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-        hidden_states = transformer_outputs[0]
-        if return_last_logit:
-            hidden_states = hidden_states[-1:]
-        lm_logits = self.transformer.output_layer(hidden_states)
-        lm_logits = lm_logits.transpose(0, 1).contiguous()
-
-        loss = None
-        if labels is not None:
-            lm_logits = lm_logits.to(torch.float32)
-
-            # Shift so that tokens < n predict n
-            shift_logits = lm_logits[..., :-1, :].contiguous()
-            shift_labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
-            loss_fct = CrossEntropyLoss(ignore_index=-100)
-            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
-
-            lm_logits = lm_logits.to(hidden_states.dtype)
-            loss = loss.to(hidden_states.dtype)
-
-        if not return_dict:
-            output = (lm_logits,) + transformer_outputs[1:]
-            return ((loss,) + output) if loss is not None else output
-
-        return CausalLMOutputWithPast(
-            loss=loss,
-            logits=lm_logits,
-            past_key_values=transformer_outputs.past_key_values,
-            hidden_states=transformer_outputs.hidden_states,
-            attentions=transformer_outputs.attentions,
-        )
-
-    @staticmethod
-    def _reorder_cache(
-            past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor
-    ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]:
-        """
-        This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
-        [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
-        beam_idx at every generation step.
-
-        Output shares the same memory storage as `past`.
-        """
-        return tuple(
-            (
-                layer_past[0].index_select(1, beam_idx.to(layer_past[0].device)),
-                layer_past[1].index_select(1, beam_idx.to(layer_past[1].device)),
-            )
-            for layer_past in past
-        )
-
-    def process_response(self, response):
-        response = response.strip()
-        response = response.replace("[[训练时间]]", "2023年")
-        return response
-
-    def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = None):
-        prompt = tokenizer.build_prompt(query, history=history)
-        inputs = tokenizer([prompt], return_tensors="pt")
-        inputs = inputs.to(self.device)
-        return inputs
-
-    def build_stream_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = None):
-        if history:
-            prompt = "\n\n[Round {}]\n\n问：{}\n\n答：".format(len(history) + 1, query)
-            input_ids = tokenizer.encode(prompt, add_special_tokens=False)
-            input_ids = input_ids[1:]
-            inputs = tokenizer.batch_encode_plus([(input_ids, None)], return_tensors="pt", add_special_tokens=False)
-        else:
-            prompt = "[Round {}]\n\n问：{}\n\n答：".format(len(history) + 1, query)
-            inputs = tokenizer([prompt], return_tensors="pt")
-        inputs = inputs.to(self.device)
-        return inputs
-
-    @torch.no_grad()
-    def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 8192, num_beams=1,
-             do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None, **kwargs):
-        if history is None:
-            history = []
-        if logits_processor is None:
-            logits_processor = LogitsProcessorList()
-        logits_processor.append(InvalidScoreLogitsProcessor())
-        gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p,
-                      "temperature": temperature, "logits_processor": logits_processor, **kwargs}
-        inputs = self.build_inputs(tokenizer, query, history=history)
-        outputs = self.generate(**inputs, **gen_kwargs)
-        outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):]
-        response = tokenizer.decode(outputs)
-        response = self.process_response(response)
-        history = history + [(query, response)]
-        return response, history
-
-    @torch.no_grad()
-    def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, past_key_values=None,
-                    max_length: int = 8192, do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None,
-                    return_past_key_values=False, **kwargs):
-        if history is None:
-            history = []
-        if logits_processor is None:
-            logits_processor = LogitsProcessorList()
-        logits_processor.append(InvalidScoreLogitsProcessor())
-        gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p,
-                      "temperature": temperature, "logits_processor": logits_processor, **kwargs}
-        if past_key_values is None and not return_past_key_values:
-            inputs = self.build_inputs(tokenizer, query, history=history)
-        else:
-            inputs = self.build_stream_inputs(tokenizer, query, history=history)
-        if past_key_values is not None:
-            past_length = past_key_values[0][0].shape[0]
-            if self.transformer.pre_seq_len is not None:
-                past_length -= self.transformer.pre_seq_len
-            inputs.position_ids += past_length
-            attention_mask = inputs.attention_mask
-            attention_mask = torch.cat((attention_mask.new_ones(1, past_length), attention_mask), dim=1)
-            inputs['attention_mask'] = attention_mask
-        for outputs in self.stream_generate(**inputs, past_key_values=past_key_values,
-                                            return_past_key_values=return_past_key_values, **gen_kwargs):
-            if return_past_key_values:
-                outputs, past_key_values = outputs
-            outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):]
-            response = tokenizer.decode(outputs)
-            if response and response[-1] != "�":
-                response = self.process_response(response)
-                new_history = history + [(query, response)]
-                if return_past_key_values:
-                    yield response, new_history, past_key_values
-                else:
-                    yield response, new_history
-
-    @torch.no_grad()
-    def stream_generate(
-            self,
-            input_ids,
-            generation_config: Optional[GenerationConfig] = None,
-            logits_processor: Optional[LogitsProcessorList] = None,
-            stopping_criteria: Optional[StoppingCriteriaList] = None,
-            prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
-            return_past_key_values=False,
-            **kwargs,
-    ):
-        batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1]
-
-        if generation_config is None:
-            generation_config = self.generation_config
-        generation_config = copy.deepcopy(generation_config)
-        model_kwargs = generation_config.update(**kwargs)
-        bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id
-
-        if isinstance(eos_token_id, int):
-            eos_token_id = [eos_token_id]
-
-        has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
-        if has_default_max_length and generation_config.max_new_tokens is None:
-            warnings.warn(
-                f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. "
-                "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we"
-                " recommend using `max_new_tokens` to control the maximum length of the generation.",
-                UserWarning,
-            )
-        elif generation_config.max_new_tokens is not None:
-            generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length
-            if not has_default_max_length:
-                logger.warn(
-                    f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
-                    f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
-                    "Please refer to the documentation for more information. "
-                    "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)",
-                    UserWarning,
-                )
-
-        if input_ids_seq_length >= generation_config.max_length:
-            input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
-            logger.warning(
-                f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to"
-                f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
-                " increasing `max_new_tokens`."
-            )
-
-        # 2. Set generation parameters if not already defined
-        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
-        stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
-
-        logits_processor = self._get_logits_processor(
-            generation_config=generation_config,
-            input_ids_seq_length=input_ids_seq_length,
-            encoder_input_ids=input_ids,
-            prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
-            logits_processor=logits_processor,
-        )
-
-        stopping_criteria = self._get_stopping_criteria(
-            generation_config=generation_config, stopping_criteria=stopping_criteria
-        )
-        logits_warper = self._get_logits_warper(generation_config)
-
-        unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
-        scores = None
-        while True:
-            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
-            # forward pass to get next token
-            outputs = self(
-                **model_inputs,
-                return_dict=True,
-                output_attentions=False,
-                output_hidden_states=False,
-            )
-
-            next_token_logits = outputs.logits[:, -1, :]
-
-            # pre-process distribution
-            next_token_scores = logits_processor(input_ids, next_token_logits)
-            next_token_scores = logits_warper(input_ids, next_token_scores)
-
-            # sample
-            probs = nn.functional.softmax(next_token_scores, dim=-1)
-            if generation_config.do_sample:
-                next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
-            else:
-                next_tokens = torch.argmax(probs, dim=-1)
-
-            # update generated ids, model inputs, and length for next step
-            input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
-            model_kwargs = self._update_model_kwargs_for_generation(
-                outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
-            )
-            unfinished_sequences = unfinished_sequences.mul((sum(next_tokens != i for i in eos_token_id)).long())
-            if return_past_key_values:
-                yield input_ids, outputs.past_key_values
-            else:
-                yield input_ids
-            # stop when each sentence is finished, or if we exceed the maximum length
-            if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
-                break
-
-    def quantize(self, bits: int, empty_init=False, device=None, **kwargs):
-        if bits == 0:
-            return
-
-        from .quantization import quantize
-
-        if self.quantized:
-            logger.info("Already quantized.")
-            return self
-
-        self.quantized = True
-
-        self.config.quantization_bit = bits
-
-        self.transformer.encoder = quantize(self.transformer.encoder, bits, empty_init=empty_init, device=device,
-                                            **kwargs)
-        return self
\ No newline at end of file
diff --git a/tests/kit/model_zoo/transformers/chatglm2_6b/MODEL_LICENSE b/tests/kit/model_zoo/transformers/chatglm2_6b/MODEL_LICENSE
new file mode 100644
index 000000000000..26198b21b6b2
--- /dev/null
+++ b/tests/kit/model_zoo/transformers/chatglm2_6b/MODEL_LICENSE
@@ -0,0 +1,33 @@
+The ChatGLM2-6B License
+
+1. Definitions
+
+“Licensor” means the ChatGLM2-6B Model Team that distributes its Software.
+
+“Software” means the ChatGLM2-6B model parameters made available under this license.
+
+2. License Grant
+
+Subject to the terms and conditions of this License, the Licensor hereby grants to you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty-free copyright license to use the Software solely for your non-commercial research purposes.
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+3. Restriction
+
+You will not use, copy, modify, merge, publish, distribute, reproduce, or create derivative works of the Software, in whole or in part, for any commercial, military, or illegal purposes.
+
+You will not use the Software for any act that may undermine China's national security and national unity, harm the public interest of society, or infringe upon the rights and interests of human beings.
+
+4. Disclaimer
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+5. Limitation of Liability
+
+EXCEPT TO THE EXTENT PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER BASED IN TORT, NEGLIGENCE, CONTRACT, LIABILITY, OR OTHERWISE WILL ANY LICENSOR BE LIABLE TO YOU FOR ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES, OR ANY OTHER COMMERCIAL LOSSES, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+
+6. Dispute Resolution
+
+This license shall be governed and construed in accordance with the laws of People’s Republic of China. Any dispute arising from or in connection with this License shall be submitted to Haidian District People's Court in Beijing.
+
+Note that the license is subject to update to a more comprehensive version.  For any questions related to the license and copyright, please contact us at glm-130b@googlegroups.com.
\ No newline at end of file
diff --git a/tests/kit/model_zoo/transformers/chatglm2_6b/modeling_chatglm.py b/tests/kit/model_zoo/transformers/chatglm2_6b/modeling_chatglm.py
index bae6d425878d..488f24c5fcb9 100644
--- a/tests/kit/model_zoo/transformers/chatglm2_6b/modeling_chatglm.py
+++ b/tests/kit/model_zoo/transformers/chatglm2_6b/modeling_chatglm.py
@@ -80,7 +80,6 @@ def default_init(cls, *args, **kwargs):
 
 
 class InvalidScoreLogitsProcessor(LogitsProcessor):
-
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
         if torch.isnan(scores).any() or torch.isinf(scores).any():
             scores.zero_()
@@ -220,7 +219,6 @@ def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Ten
 
 
 class RMSNorm(torch.nn.Module):
-
     def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.empty(normalized_shape, device=device, dtype=dtype))
@@ -235,7 +233,6 @@ def forward(self, hidden_states: torch.Tensor):
 
 
 class CoreAttention(torch.nn.Module):
-
     def __init__(self, config: ChatGLMConfig, layer_number):
         super(CoreAttention, self).__init__()
 
@@ -842,7 +839,6 @@ def forward(self, input_ids):
 
 
 class ChatGLMModel(ChatGLMPreTrainedModel):
-
     def __init__(self, config: ChatGLMConfig, device=None, empty_init=True):
         super().__init__(config)
         if empty_init:
@@ -981,13 +977,11 @@ def forward(
 
     def quantize(self, weight_bit_width: int):
         from .quantization import quantize
-
         quantize(self.encoder, weight_bit_width)
         return self
 
 
 class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
-
     def __init__(self, config: ChatGLMConfig, empty_init=True, device=None):
         super().__init__(config)
 
diff --git a/tests/kit/model_zoo/transformers/chatglm2_6b/quantization.py b/tests/kit/model_zoo/transformers/chatglm2_6b/quantization.py
new file mode 100644
index 000000000000..cb95bfe82b20
--- /dev/null
+++ b/tests/kit/model_zoo/transformers/chatglm2_6b/quantization.py
@@ -0,0 +1,188 @@
+from torch.nn import Linear
+from torch.nn.parameter import Parameter
+
+import bz2
+import torch
+import base64
+import ctypes
+from transformers.utils import logging
+
+from typing import List
+from functools import partial
+
+logger = logging.get_logger(__name__)
+
+try:
+    from cpm_kernels.kernels.base import LazyKernelCModule, KernelFunction, round_up
+
+    class Kernel:
+        def __init__(self, code: bytes, function_names: List[str]):
+            self.code = code
+            self._function_names = function_names
+            self._cmodule = LazyKernelCModule(self.code)
+
+            for name in self._function_names:
+                setattr(self, name, KernelFunction(self._cmodule, name))
+
+    quantization_code = "$QlpoOTFBWSZTWU9yuJUAQHN//////////f/n/8/n///n//bt4dTidcVx8X3V9FV/92/v4B7/AD5FBQFAAAChSgKpFCFAFVSigUAAAEKhSgUUqgFBKigqVREQAABQBQIANDTTIGI00BkZBkNGE0A0BkBkGQGRkaNAaAGQNBoGgDIAAYIGTI0DQAQAaGmmQMRpoDIyDIaMJoBoDIDIMgMjI0aA0AMgaDQNAGQAAwQMmRoGgAgA0NNMgYjTQGRkGQ0YTQDQGQGQZAZGRo0BoAZA0GgaAMgABggZMjQNABABoaaZAxGmgMjIMhowmgGgMgMgyAyMjRoDQAyBoNA0AZAADBAyZGgaAAmqU1NEgJqnptU/Sn4jRR6J6epk2pqb1Q/SgAPUGgyNNGjQ2SBpoAZAAGg0NB6mgDIAAAAA2oaApSREBNAARhGiYEaEwU8pvImlP0k2aam1GaGqbFNM1MHpTwmkepmyU9R6nqPKekHqNNPUxNGhp6n6p6QaZ6o9TG1GMqcoV9ly6nRanHlq6zPNbnGZNi6HSug+2nPiZ13XcnFYZW+45W11CumhzYhchOJ2GLLV1OBjBjGf4TptOddTSOcVxhqYZMYwZXZZY00zI1paX5X9J+b+f4e+x43RXSxXPOdquiGpduatGyXneN696M9t4HU2eR5XX/kPhP261NTx3JO1Ow7LyuDmeo9a7d351T1ZxnvnrvYnrXv/hXxPCeuYx2XsNmO003eg9J3Z6U7b23meJ4ri01OdzTk9BNO96brz+qT5nuvvH3ds/G+m/JcG/F2XYuhXlvO+jP7U3XgrzPN/lr8Sf1n6j4j7jZs+s/T0tNaNNYzTs12rxjwztHlnire3Nzc3N1wuBwOBwXBvZfoHpD7rFmR99V5vj3aXza3xdBbXMalubTg/jIv5dfAi54Pdc75j4z412n3Npj3Ld/ENm7a3b/Cod6h/ret1/5vn/C+l+gdslMvgPSLJ8d8q+U66fevYn/tW1chleEtNTGlcHCbLRlq0tHzF5tsbbZZfHjjLgZu42XCuC3NrdjTasZGNzgxPIrGqp7r3p7L2p5XjnpPSmTd5XtzqnB6U87zzg1Ol0zd0zsLszxR6lkxp35u6/teL0L0W922cR7Lu1lpL9CsHirzuM2T+BgsyViT6LHcm0/Vr6U/7LGGyJeqTEjt0PHWhF5mCT7R9mtlDwriYv0Tyr/OxYt6qp5r0mPVT0608TqnqMZaarU2nFwrTzzlrs1ed7z1ux60wyr4ydCaTi3enW8x68x0zU7tXSlcmPSW1mGpWJMg4zmPC2lK96tp0OE80y4MfEvnZj8zGluR6b22ki1Ou9V2nCd9xovcPvcYMZYy0lvN60ScZ45vN6yeCeeXFb1lVjnnCar5fwXwE2bzJ4HI1XVPXfXZMm44GUsMpYsmLB65TuVdm0cl0b+i/wGNN66XjeV7zuPpHcnK/juhhjdfId5jMdE5nN0dGmmm2zZs2cexD5n9p/dY352XsvXHaZNWWsmmS1atjR452nYudzvqv2HMRyvNNnlMcDl3R2+yx2uVrBubTW9icHDVtbNXlZm7jma1rM4VurZZd2y6nUau7ZXZ7bVU+mnoOVxZGMrVmvX60605JwmzGZhhhjTWtaaaMaaGTGmNMZasY0iX8VMUl8eepaIrzGSpemWOQyZORk2bNpjUybMmxqYmknCGCFynutfksaZpjTNMaaatM0xsxcGR0sociNqxNSmhhR1ZJPbsn8qyF0t2qH6iYBclclalbtTTcHTDsPaX6rlnElph2Jyumumtynv2Kk8GI7rsvXbIcJgHJOSaSXnnGaI3m87RtVXJOZ/YtgdTE6Wpha6ZlE8ayXkef1fh602r2WwvfMXtMdLlkfnLFdYYwYso+bWqm7yJqHXZGw2nrS5ZanSYnWlxBxMF1V940K2wdrI7R6OYf7DGGamMmTSbRhlS45xmVOumF1EyPCmHrrN8ww
ZOOrdNtLeMtzFzDlWnfTBxMk2NaXIZHBYxYLD4w8yju0ao65Vz1OIXoS9dLanwCe1PWrYuWMqf1if1z2k2yYfKJ741PDgno1ZQ8DRqvUny3mNoWTzGO6m1DkrJI8JiR5cSd+vZdGOO8nrMoc5+NDUFsMSXaZJeNlMmGLtJsovOsUp7I9S5VojKxF6bTVEelXqlfJobQr3LozSh2Jk7VcrVMfhXqszGWMzNqGhqZY0OadxkyyMssKugZR0KNFXBHlqwmJgTE/BNVMk6ItJXZMR0H47GpXv/DMOvNkmVuaV1PRfEdxuqc7Hcd+ZV/zTLaRxWk0nl9CdCeM6mn5rstHIBcpiuwmUZXeq81DacHI2rmrZ5SuE5mOZd6LQrZg9mx32TprA8BMo5jKN6yLTCi3WzQaZSuhzTtM1fUTGVpG8Tw+KXI0tjEpiWxtLYynOlktSbVlaI5kxP8TDH8kx50xoxi5KcA4pcja8KWLRlO/Ks6q06ergnvm1ca3Tq8Uw7LTUsmWyctXPWmpitl/uvGcWTGXGuAXDfhqazGmjkxcJW5hMMMMpYsXl2TZYtVOddG3XCarUt6Ptq9CZXSNzyuRzqRZOjsxdBbFVz6OA5HI43r1jityVlVpVkxmOsyaYWE1NTGq1sOVh36mHMcxtSvcy70edG0ZGR3I1Go1GRlV7mWWo1G0ZGRqlvH40l7o4m5xMWLLLYyNjnqc8556mdPqLJ31n/1nWOncxzG1tizrHs/Z+d2vP/B/l8wdJ6rHUn2nbbDq4p6htFtYzMMMTaZis1K5GKzGNmxhmUx2DDlZ/qNnIx41xnaMfCZWYaZWtNLTNW8ND4Fw1MyZOCdM428suKG1ehW8TesOydg7J+YYcD4cYR+8dFK6M4E3HM9ZfRNNL+Sn6rsl4DsrDl2HpPCnfxjGXtbZtYys1ttlyJ4T+BvexjGWRjMszK4Jpc77D3GyuVD7q0+G8m9G+2+rGm7cOR2y7FdtY2XUYx/oNlfRYxhMYyYZkyyg55enna9Kt/FFi6GMMwYwdwxWgxGMLKYmUyGExTKMZkMFhkymKuh0NOBNnBu+23LdwDoZYYzGGMxtORaTU1pjTGWTTGGtMrNWUsyyTTLLG1qy2ZjbK2DBllWqxMtBMaYZQmcE7zvvRcTkclUwdkxTaSdyySt/7fpL+T1v516Ji97fwr5JbLu305zMn5+GMTTZ9F+y7ExwmGVfG44yxn3dLv6l5i+Wth1jCrDq21nW9LqvvDzz3Vf3LLH/O/32TJ/erx3bXftO4eF+G956D952K/An4NfvOpjFjExjevP/UmE0fIoZXx6/w6lX/no3D0bLt+ixjieBM6ksRd0yB4Lt2SwYNE+gd1detlZWUnpiZfGfFaK+4PyCa/v18V8X75pe9fLXzp7l3VjF76vWZmHwGz1IZNWT7b8yddJ4q5kyrVdfru6atWc7bVYztL9Jf4GXvT+Y8m9/YsXP6H018a8D4XVOqvfzqeR+6yZOD8dPv0+U7/q5Pl+2dNb0MjzGVH5p6MNQ7cOWvw62U9aHE8DprDek+McLyvDz+te+9Zhq5+YTruufMcWMabqysTmZVWjKPfnK0wyVcrsuhjZRdLkHNvD72b9abriOSGIxiLixMOoalNPXzy+wT/tf+U6HHONfsz+xe8ufHBdQWWGWLA9if0rsnmrxK5LvRZQeWsTCsrmOYy8VteVfuRfcVTtDLItLIsMYxZLdU/DbtSemxF6Z6Zo5WBXE4tFdCyVMMXMTEMZXVlS6Xec2T4e0tHsRcEuWshcJ2YsNF5rUx1E8ifCq6Z+ZP7qdCeu/aTwFd53l16/o0NOw6O3dLavP4Hbi4RdmuDk6DoYaninC0+o4uZjbJ7Rxeu0/FbuFg+q7DVS6fQe0rZ6NDGUNNU6DEqOaLTicKnYZMnBWruljQxoaS3dZhocDge0bSTyOvdAbG5hxe2xji7E/L55xX13wWNDi6HCekcFxfCPGxY0MXC+s7afWaMdDyjyr
+o8Rudm/NabOZvdl274zH4f5XK9z6On1Pe/K5TdPAslg77BjuO6Y3eO7GqvOPG/stknp1leyvLL0Z7bl9I4noMvLkzytLhWYzrOZzLXCORe028rORzOg4N/L0HlMOQ3Pgmnbb6KczlabORpu980q37TBqRu0/p3PO6234Bl03Ynuz+9W7gnsEcmvYaYY3aMYY0wx3pYd+ujsXauWdaY5Xkbtl23fPzFHiDB/QMo0yFjBllYxTQYYyxkrwn7JufwJ/PfgJ+C83X69ni6zvXcnyXabv0ncbLwsceS+RNlyN2mnneJtX0ngYO0+e+0+UnA+Wch3ji8hj5an4h+i6XBySU4n+R0roVcbw5yvHrmr4Yw8Y7x6c+9POPYHI5HI5HI5HI5HGXGww4nE4nrVyOR8XeqPEO7PLOiukYa3Novk5hV4cdtYZLI93e+uxff2jRo0aNGjRo0aNG1bVtW1dy3m83m8+tQ5ZzHw3nObwOu8La9Rc1dtkdS8A3eTk823tnktXWlxN6Oixe06zrN70Isd9jiOgZFq9yfkPqP/SLhN2Myl8jDM43bl1nbcb4cO57jlh8Jow6pzXZdL4dyODTuuhu77FyO27DdwdRxmvO+O+3N2+BdqyTwLHVczDVY4UPE4O66/ZO2cx1LFzVdSXtF7G4HMbrauOHRw6c8FdZ5m9fHZHYZXfTlZquyynSyTTKke6vcffSD9pzPA/G7n7jxPmuhc1DHMynPMrGL6AdewYmwu5ko+UUyTwrMv27rPH1v1nGqd87+p6N6LU8k3NEng53xXyHS97+44OSg/sy/hn+Se6yfYNjW0/uTgP+PvWYzLMmjhcLB/gGpri6H83/84eUXWT6T9Hsv7785z/7z4icpW+zfXypuR7rx/gMdZb1/wC678pcs8/2a3mDitGHxl9mfPlll5MafWWqxk/eYuTDgcNMzDGWLWvsuglNxs53GtN6uWpktlW1tZZYcuinMMWmnNnJydze3b2Y1McBxrBkXw799izLMZZYyy0TkbsGM4p03S2uVu5s/XXUdSdec6smVxZYYGpVmT8A+8ajuEyV5FatkvVru2x6uxGXXbH4A+jvgP4GMYy3iPLXzq/6z65+E005ey+cwMZD3fZcqc6xpjTFjQ0P3U+e++cPYmTIwj0nrK5NPTfl3WvpfLtXDcb2HQMudYOxFXQBor4L4T6vrOauFctYXJQ++NUWmJe5bmx1jDiZS1dTqWxo4GR8jm3fttpmPHppk9PEyv4/y8/sO07XacOmcqc0x2Vi9BvNJvN5oW8x4mOsydpidRxMYJPx06m1bqPzq9KtK8sxXNXFodD/+MYYaJTLwOhc9brCsV18oOR1i4tXChyTkq4lf4y1Ke+9axjDHqs1mfBbMXuP4Hzi+X7t8vzv7bHerrUPgPCxhjre4fXdfLNtNM+Jd+Zdh8xd8wP87uNPoPgv4W7/5P2BuxfsMabNnMnza+54Pdi5U671GPZY8CehX8Voeoo7FHpkeEc6715FwHZrIrUrHaviPUbPZHND+IhczrP6FcYvhOZ0Di/ETt0OI+YwNWR9r7tpf6WDeZKZDB1+z2IthOl1mPyb5FluvEx9h9d0NnM0Y1XPFkWIsk1WotJ0PBMmkvjvQTd0e71tfeV+8r8lQ/tpzpsmxJ+InrI/dj2UajUajVTUajatRqNRtGo1Go1Go4wjeMpZFMVV9CHbofPraLsJ3JpWV2XOoanCuFky4y3PPNxucK2uKC1Lbdb1eo+m5XomN6HfeZsabHLHRX/K+offtNGGmHWctcVcG44MdSqsOLY9VzX+Zxfxn2HPdWTpzWvkrtJ8M5zorrKcquRytJ5N5DZmcaW02l76nWO+BqPXm1A2Ry/0q71dH/mqrqeFjkYxjEXtsX8qubTk67rGycyqsdm4tZx5D6D5hhi0waaWmiaMP81Yjii5qxPlPuU/GfTL1Y5E6Jyfiq63qTa39A4J0sOGDgO9WF9bOXl0XfPRbsY2bPNKPy1YrF
YrFYmRhhlTIyMjJWJYZHXuCXI8OoXsvfljGLFicNifpp2XunoPiG1wtx3p1Tah+/DD66OnVtVXP9rKbVxOnL0tR/rHtqB5UDErUVcl11D4qqvjpOcxX7armUNJB3LpW6bxVvD08e8h3odKKvyCFZBdSh2FVcST9xV3n3T8t1j7Kr9qgrqXg+13Pt5U7JCvFXVIV1YG5lRhkVYZJYYDDD4KOIMoHCp26WS8GB7uBh2zIdgq/PKyInjV2STShuoapUdCpX1yTwqq/z1VvET7Kh5nVPkO8YyxjLt2MaaMmWTLQvx3qnzltnXW0p2jxgbEtSny/Osv8Y9pLMXYoHVPAhkVdWVeODhR6q9/Sxe2liwwZWMVvFXfRkeIDxAePUPIrdJ4ey6yquzH+PD/bUOWAu05qVHtFd8rrKHSoeNIOUqrYr3FXyToqfYJgwmJdKpXXOwYYegNNGMzfZPp/t3t/DVs4zjNTN61rRqaWaa4NYbRjTa0tWwy2Y2tGN8ZO8ofNKq4j9SL7I+cSm4/6ovLV5HNXLI0jJidwrtk6ynCaP6Z++GjRlWS3tLeW129Mi9evxU9mtz6s5J3Z7M2ngTgnKvmpomxpaLCzPfmx0JWE+m3NLDDGOX47RctdYYNK5jakdqLkRlI39n590T5zctGSwwZZDJj6kW8XSi6ot2MmWWJ0DUT3nuvebBudScjZ79g8cWJ8av0k+/bE5WKd5MdbFpbDVMxu1DVMmtNZGJvq1mtRbn6M+g/kP0FwDwr7quZs7xosNGpbscyxhhd9TyJyFwbLcxlTasg75vW7TsV5K7ji44XPMMrdoj+Y3rT0Hie62nlYV/pwczzOmdLqLhYkzGMzCZWGMQzGMSsZYY6Di1t4nlJ+Em63mJxrVLxPbYxNEdgc1dU2iOKyoYYWjNrEeHTYybVk0atSa7ehuwsWMWTqn1TrnS6hYsi71d1+s+k+ic70e20fzE/VaTdxT9ZtU4GIXdeNx3X77guYYfpHeTQjaMX6brOu4OY4K7Y2d9mbHarI5ox3p4GpJ2Vd/Tst60f7j999pppjR+Q/Qf8J/VaORs3cji7FfFuN61+ui9s8hix1OCh5KGVV23BPXvZfz3CLyHpix+exi8z/KnCnosY2eunor+cxyPO/xJ0vKey9OvE9VjqaYu0x3Z3jd6o2b1T12D+F8l232lwaaacD5LE8LBxu7WTlbWraWpew8Xexjel3E+wWD4APITdNqR8F3R3T0lunCQ4GaE9R37DxeCYfcHi4xci5ovKfxVs55y2hf+65E/Xdp6jR5nrebTmi5incpkyOjs50JvrZwstbbW6kfuuQw+2mykf/EXNFzxfKTrxew929TR6bWnGL//F3JFOFCQT3K4lQ"
+
+    kernels = Kernel(
+        bz2.decompress(base64.b64decode(quantization_code)),
+        [
+            "int4WeightCompression",
+            "int4WeightExtractionFloat",
+            "int4WeightExtractionHalf",
+            "int8WeightExtractionFloat",
+            "int8WeightExtractionHalf",
+        ],
+    )
+except Exception as exception:
+    kernels = None
+    logger.warning("Failed to load cpm_kernels:" + str(exception))
+
+
+class W8A16Linear(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, inp: torch.Tensor, quant_w: torch.Tensor, scale_w: torch.Tensor, weight_bit_width):
+        ctx.inp_shape = inp.size()
+        ctx.weight_bit_width = weight_bit_width
+        out_features = quant_w.size(0)
+        inp = inp.contiguous().view(-1, inp.size(-1))
+        weight = extract_weight_to_half(quant_w, scale_w, weight_bit_width)
+        ctx.weight_shape = weight.size()
+        output = inp.mm(weight.t())
+        ctx.save_for_backward(inp, quant_w, scale_w)
+        return output.view(*(ctx.inp_shape[:-1] + (out_features,)))
+
+    @staticmethod
+    def backward(ctx, grad_output: torch.Tensor):
+        inp, quant_w, scale_w = ctx.saved_tensors
+        weight = extract_weight_to_half(quant_w, scale_w, ctx.weight_bit_width)
+        grad_output = grad_output.contiguous().view(-1, weight.size(0))
+        grad_input = grad_output.mm(weight)
+        grad_weight = grad_output.t().mm(inp)
+        return grad_input.view(ctx.inp_shape), grad_weight.view(ctx.weight_shape), None, None
+
+
+def compress_int4_weight(weight: torch.Tensor):  # (n, m)
+    with torch.cuda.device(weight.device):
+        n, m = weight.size(0), weight.size(1)
+        assert m % 2 == 0
+        m = m // 2
+        out = torch.empty(n, m, dtype=torch.int8, device="cuda")
+        stream = torch.cuda.current_stream()
+
+        gridDim = (n, 1, 1)
+        blockDim = (min(round_up(m, 32), 1024), 1, 1)
+
+        kernels.int4WeightCompression(
+            gridDim,
+            blockDim,
+            0,
+            stream,
+            [ctypes.c_void_p(weight.data_ptr()), ctypes.c_void_p(out.data_ptr()), ctypes.c_int32(n), ctypes.c_int32(m)],
+        )
+        return out
+
+
+def extract_weight_to_half(weight: torch.Tensor, scale_list: torch.Tensor, source_bit_width: int):
+    assert scale_list.dtype in [torch.half, torch.bfloat16]
+    assert weight.dtype in [torch.int8]
+    if source_bit_width == 8:
+        return weight.to(scale_list.dtype) * scale_list[:, None]
+    elif source_bit_width == 4:
+        func = (
+            kernels.int4WeightExtractionHalf if scale_list.dtype == torch.half else kernels.int4WeightExtractionBFloat16
+        )
+    else:
+        assert False, "Unsupported bit-width"
+
+    with torch.cuda.device(weight.device):
+        n, m = weight.size(0), weight.size(1)
+        out = torch.empty(n, m * (8 // source_bit_width), dtype=scale_list.dtype, device="cuda")
+        stream = torch.cuda.current_stream()
+
+        gridDim = (n, 1, 1)
+        blockDim = (min(round_up(m, 32), 1024), 1, 1)
+
+        func(
+            gridDim,
+            blockDim,
+            0,
+            stream,
+            [
+                ctypes.c_void_p(weight.data_ptr()),
+                ctypes.c_void_p(scale_list.data_ptr()),
+                ctypes.c_void_p(out.data_ptr()),
+                ctypes.c_int32(n),
+                ctypes.c_int32(m),
+            ],
+        )
+        return out
+
+
+class QuantizedLinear(torch.nn.Module):
+    def __init__(self, weight_bit_width: int, weight, bias=None, device="cpu", dtype=None, empty_init=False, *args,
+                 **kwargs):
+        super().__init__()
+        self.weight_bit_width = weight_bit_width
+
+        shape = weight.shape
+
+        if weight is None or empty_init:
+            self.weight = torch.empty(shape[0], shape[1] * weight_bit_width // 8, dtype=torch.int8, device=device)
+            self.weight_scale = torch.empty(shape[0], dtype=dtype, device=device)
+        else:
+            self.weight_scale = weight.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)
+            self.weight = torch.round(weight / self.weight_scale[:, None]).to(torch.int8)
+            if weight_bit_width == 4:
+                self.weight = compress_int4_weight(self.weight)
+
+        self.weight = Parameter(self.weight.to(device), requires_grad=False)
+        self.weight_scale = Parameter(self.weight_scale.to(device), requires_grad=False)
+        self.bias = Parameter(bias.to(device), requires_grad=False) if bias is not None else None
+
+    def forward(self, input):
+        output = W8A16Linear.apply(input, self.weight, self.weight_scale, self.weight_bit_width)
+        if self.bias is not None:
+            output = output + self.bias
+        return output
+
+
+def quantize(model, weight_bit_width, empty_init=False, device=None):
+    """Replace fp16 linear with quantized linear"""
+    for layer in model.layers:
+        layer.self_attention.query_key_value = QuantizedLinear(
+            weight_bit_width=weight_bit_width,
+            weight=layer.self_attention.query_key_value.weight.to(torch.cuda.current_device()),
+            bias=layer.self_attention.query_key_value.bias,
+            dtype=layer.self_attention.query_key_value.weight.dtype,
+            device=layer.self_attention.query_key_value.weight.device if device is None else device,
+            empty_init=empty_init
+        )
+        layer.self_attention.dense = QuantizedLinear(
+            weight_bit_width=weight_bit_width,
+            weight=layer.self_attention.dense.weight.to(torch.cuda.current_device()),
+            bias=layer.self_attention.dense.bias,
+            dtype=layer.self_attention.dense.weight.dtype,
+            device=layer.self_attention.dense.weight.device if device is None else device,
+            empty_init=empty_init
+        )
+        layer.mlp.dense_h_to_4h = QuantizedLinear(
+            weight_bit_width=weight_bit_width,
+            weight=layer.mlp.dense_h_to_4h.weight.to(torch.cuda.current_device()),
+            bias=layer.mlp.dense_h_to_4h.bias,
+            dtype=layer.mlp.dense_h_to_4h.weight.dtype,
+            device=layer.mlp.dense_h_to_4h.weight.device if device is None else device,
+            empty_init=empty_init
+        )
+        layer.mlp.dense_4h_to_h = QuantizedLinear(
+            weight_bit_width=weight_bit_width,
+            weight=layer.mlp.dense_4h_to_h.weight.to(torch.cuda.current_device()),
+            bias=layer.mlp.dense_4h_to_h.bias,
+            dtype=layer.mlp.dense_4h_to_h.weight.dtype,
+            device=layer.mlp.dense_4h_to_h.weight.device if device is None else device,
+            empty_init=empty_init
+        )
+
+    return model

From 30574a70ebae39bcdf2e8f610c463cfff6f0f356 Mon Sep 17 00:00:00 2001
From: klhhhhh <1412841649@qq.com>
Date: Mon, 10 Jul 2023 18:55:33 +0800
Subject: [PATCH 09/21] [shardformer] add first version of policy of chatglm

---
 colossalai/shardformer/policies/chatglm.py    | 44 +++++++++++++++++++
 .../test_model/test_shard_chatglm.py          |  1 -
 2 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/colossalai/shardformer/policies/chatglm.py b/colossalai/shardformer/policies/chatglm.py
index 934b99b83ea1..c17b92c8dc81 100644
--- a/colossalai/shardformer/policies/chatglm.py
+++ b/colossalai/shardformer/policies/chatglm.py
@@ -1,6 +1,7 @@
 from typing import Dict, Union
 
 import torch.nn as nn
+from ....tests.kit.model_zoo.transformers.chatglm2_6b.modeling_chatglm import ChatGLMModel, GLMBlock
 
 import colossalai.shardformer.layer as col_nn
 
@@ -8,6 +9,49 @@
 
 __all__ = ['ChatGLMModelPolicy', 'ChatGLMForConditionalGenerationPolicy']
 
+class ChatGLMModelPolicy(Policy):
+
+    def config_sanity_check(self):
+        pass
+    
+    def preprocess(self):
+        # Resize embedding
+        vocab_size = self.model.config.vocab_size
+        world_size = self.shard_config.tensor_parallel_size
+
+        if vocab_size % world_size != 0:
+            new_vocab_size = vocab_size + world_size - vocab_size % world_size
+            self.model.resize_token_embeddings(new_vocab_size)
+
+        return self.model
+    
+    def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
+        from ....tests.kit.model_zoo.transformers.chatglm2_6b.modeling_chatglm import ChatGLMModel, GLMBlock
+
+        policy = {}
+
+        if self.shard_config.enable_tensor_parallelism:
+
+            policy[GLMBlock] = ModulePolicyDescription(
+                attribute_replacement = {},
+                sub_module_replacement = [
+                        # SubModuleReplacementDescription(
+                        #     suffix = "self_attention.query_key_value",
+                        #     target_module = col_nn.Linear1D_Col,
+                        # ),
+                        # SubModuleReplacementDescription(
+                        #     suffix = "self_attention.dense",
+                        #     target_module = col_nn.Linear1D_Row,
+                        # )
+                        # SubModuleReplacementDescription(
+                        #     suffix = "self_attention.core_attention.attention_dropout",
+                        #     target_module = col_nn.DropoutForParallelInput,
+                        # )
+                    ],)
+
+
+    def postprocess(self):
+        return self.model
 
 class ChatGLMModelPolicy(Policy):
 
diff --git a/tests/test_shardformer/test_model/test_shard_chatglm.py b/tests/test_shardformer/test_model/test_shard_chatglm.py
index 2cdf5da2e6da..f05649fcb9a0 100644
--- a/tests/test_shardformer/test_model/test_shard_chatglm.py
+++ b/tests/test_shardformer/test_model/test_shard_chatglm.py
@@ -19,7 +19,6 @@
 from tests.kit.model_zoo import model_zoo
 from tests.test_shardformer.test_model._utils import build_model, run_forward
 
-
 def check_forward_backward(org_model, sharded_model, data_gen_fn, output_transform_fn, loss_fn):
     # check forward
     org_output, org_loss, shard_output, shard_loss = run_forward(org_model, sharded_model, data_gen_fn,

From 28677d43060cfd681eec605dab97c79ac068e1d8 Mon Sep 17 00:00:00 2001
From: klhhhhh <1412841649@qq.com>
Date: Wed, 12 Jul 2023 15:25:07 +0800
Subject: [PATCH 10/21] [shardformer] polish chatglm code

---
 colossalai/shardformer/policies/autopolicy.py |  3 ++
 colossalai/shardformer/policies/chatglm.py    | 44 -------------------
 .../test_model/test_shard_chatglm.py          |  1 +
 3 files changed, 4 insertions(+), 44 deletions(-)

diff --git a/colossalai/shardformer/policies/autopolicy.py b/colossalai/shardformer/policies/autopolicy.py
index 77583dd77cf0..52d9bd5ebeae 100644
--- a/colossalai/shardformer/policies/autopolicy.py
+++ b/colossalai/shardformer/policies/autopolicy.py
@@ -112,6 +112,9 @@ class PolicyLocation:
     # Sam
     "transformers.models.sam.modeling_sam.SamModel":
         PolicyLocation(file_name="sam", class_name="SamModelPolicy"),
+    # ChatGLM
+    "tests.kit.model_zoo.transformers.chatglm2_6b.modeling_chatglm.ChatGLMModel":
+        PolicyLocation(file_name="chatglm", class_name="ChatGLMModelPolicy"),
 }
 
 
diff --git a/colossalai/shardformer/policies/chatglm.py b/colossalai/shardformer/policies/chatglm.py
index c17b92c8dc81..934b99b83ea1 100644
--- a/colossalai/shardformer/policies/chatglm.py
+++ b/colossalai/shardformer/policies/chatglm.py
@@ -1,7 +1,6 @@
 from typing import Dict, Union
 
 import torch.nn as nn
-from ....tests.kit.model_zoo.transformers.chatglm2_6b.modeling_chatglm import ChatGLMModel, GLMBlock
 
 import colossalai.shardformer.layer as col_nn
 
@@ -9,49 +8,6 @@
 
 __all__ = ['ChatGLMModelPolicy', 'ChatGLMForConditionalGenerationPolicy']
 
-class ChatGLMModelPolicy(Policy):
-
-    def config_sanity_check(self):
-        pass
-    
-    def preprocess(self):
-        # Resize embedding
-        vocab_size = self.model.config.vocab_size
-        world_size = self.shard_config.tensor_parallel_size
-
-        if vocab_size % world_size != 0:
-            new_vocab_size = vocab_size + world_size - vocab_size % world_size
-            self.model.resize_token_embeddings(new_vocab_size)
-
-        return self.model
-    
-    def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
-        from ....tests.kit.model_zoo.transformers.chatglm2_6b.modeling_chatglm import ChatGLMModel, GLMBlock
-
-        policy = {}
-
-        if self.shard_config.enable_tensor_parallelism:
-
-            policy[GLMBlock] = ModulePolicyDescription(
-                attribute_replacement = {},
-                sub_module_replacement = [
-                        # SubModuleReplacementDescription(
-                        #     suffix = "self_attention.query_key_value",
-                        #     target_module = col_nn.Linear1D_Col,
-                        # ),
-                        # SubModuleReplacementDescription(
-                        #     suffix = "self_attention.dense",
-                        #     target_module = col_nn.Linear1D_Row,
-                        # )
-                        # SubModuleReplacementDescription(
-                        #     suffix = "self_attention.core_attention.attention_dropout",
-                        #     target_module = col_nn.DropoutForParallelInput,
-                        # )
-                    ],)
-
-
-    def postprocess(self):
-        return self.model
 
 class ChatGLMModelPolicy(Policy):
 
diff --git a/tests/test_shardformer/test_model/test_shard_chatglm.py b/tests/test_shardformer/test_model/test_shard_chatglm.py
index f05649fcb9a0..2cdf5da2e6da 100644
--- a/tests/test_shardformer/test_model/test_shard_chatglm.py
+++ b/tests/test_shardformer/test_model/test_shard_chatglm.py
@@ -19,6 +19,7 @@
 from tests.kit.model_zoo import model_zoo
 from tests.test_shardformer.test_model._utils import build_model, run_forward
 
+
 def check_forward_backward(org_model, sharded_model, data_gen_fn, output_transform_fn, loss_fn):
     # check forward
     org_output, org_loss, shard_output, shard_loss = run_forward(org_model, sharded_model, data_gen_fn,

From 28319c2c8a130e54447c6bbebe6b0955ba7a7063 Mon Sep 17 00:00:00 2001
From: klhhhhh <1412841649@qq.com>
Date: Thu, 13 Jul 2023 19:51:25 +0800
Subject: [PATCH 11/21] [shardformer] polish code

---
 .../model_zoo/transformers/chatglm2_6b/modeling_chatglm.py  | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/kit/model_zoo/transformers/chatglm2_6b/modeling_chatglm.py b/tests/kit/model_zoo/transformers/chatglm2_6b/modeling_chatglm.py
index 488f24c5fcb9..f704715e1245 100644
--- a/tests/kit/model_zoo/transformers/chatglm2_6b/modeling_chatglm.py
+++ b/tests/kit/model_zoo/transformers/chatglm2_6b/modeling_chatglm.py
@@ -80,6 +80,7 @@ def default_init(cls, *args, **kwargs):
 
 
 class InvalidScoreLogitsProcessor(LogitsProcessor):
+
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
         if torch.isnan(scores).any() or torch.isinf(scores).any():
             scores.zero_()
@@ -219,6 +220,7 @@ def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Ten
 
 
 class RMSNorm(torch.nn.Module):
+
     def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs):
         super().__init__()
         self.weight = torch.nn.Parameter(torch.empty(normalized_shape, device=device, dtype=dtype))
@@ -233,6 +235,7 @@ def forward(self, hidden_states: torch.Tensor):
 
 
 class CoreAttention(torch.nn.Module):
+
     def __init__(self, config: ChatGLMConfig, layer_number):
         super(CoreAttention, self).__init__()
 
@@ -839,6 +842,7 @@ def forward(self, input_ids):
 
 
 class ChatGLMModel(ChatGLMPreTrainedModel):
+
     def __init__(self, config: ChatGLMConfig, device=None, empty_init=True):
         super().__init__(config)
         if empty_init:
@@ -921,6 +925,7 @@ def forward(
 
         if inputs_embeds is None:
             inputs_embeds = self.embedding(input_ids)
+        print(inputs_embeds)
 
         if self.pre_seq_len is not None:
             if past_key_values is None:
@@ -982,6 +987,7 @@ def quantize(self, weight_bit_width: int):
 
 
 class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
+
     def __init__(self, config: ChatGLMConfig, empty_init=True, device=None):
         super().__init__(config)
 

From 3f19de9d6c0035c017513bea12686ce484ca5ee8 Mon Sep 17 00:00:00 2001
From: klhhhhh <1412841649@qq.com>
Date: Fri, 14 Jul 2023 18:10:52 +0800
Subject: [PATCH 12/21] [shardformer] support chatglm without layernorm

---
 .../chatglm2_6b/modeling_chatglm.py           | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/tests/kit/model_zoo/transformers/chatglm2_6b/modeling_chatglm.py b/tests/kit/model_zoo/transformers/chatglm2_6b/modeling_chatglm.py
index f704715e1245..46078f441523 100644
--- a/tests/kit/model_zoo/transformers/chatglm2_6b/modeling_chatglm.py
+++ b/tests/kit/model_zoo/transformers/chatglm2_6b/modeling_chatglm.py
@@ -396,17 +396,18 @@ def __init__(self, config: ChatGLMConfig, layer_number, device=None):
             self.num_multi_query_groups_per_partition = config.multi_query_group_num
             self.qkv_hidden_size = (self.projection_size +
                                     2 * self.hidden_size_per_attention_head * config.multi_query_group_num)
+<<<<<<< HEAD
         self.query_key_value = nn.Linear(
             config.hidden_size,
             self.qkv_hidden_size,
-            bias=config.add_bias_linear or config.add_qkv_bias,
             device=device,
             **_config_to_kwargs(config),
         )
-
-        self.core_attention = CoreAttention(config, self.layer_number)
-
-        # Output.
+=======
+        self.query_key_value = nn.Linear(self.hidden_size,
+                                         self.qkv_hidden_size,
+                                         bias=config.add_bias_linear or config.add_qkv_bias,
+<<<<<<< HEAD
         self.dense = nn.Linear(
             self.projection_size,
             config.hidden_size,
@@ -414,6 +415,13 @@ def __init__(self, config: ChatGLMConfig, layer_number, device=None):
             device=device,
             **_config_to_kwargs(config),
         )
+=======
+        self.dense = nn.Linear(self.projection_size,
+                               self.hidden_size,
+                               bias=config.add_bias_linear,
+                               device=device,
+                               **_config_to_kwargs(config))
+>>>>>>> [shardformer] support chatglm without layernorm
 
     def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None):
         if self.multi_query_attention:
@@ -925,7 +933,6 @@ def forward(
 
         if inputs_embeds is None:
             inputs_embeds = self.embedding(input_ids)
-        print(inputs_embeds)
 
         if self.pre_seq_len is not None:
             if past_key_values is None:

From 2a4bbcf9625c8f758bbb0cc9e97370ed30dbf1c2 Mon Sep 17 00:00:00 2001
From: klhhhhh <1412841649@qq.com>
Date: Mon, 17 Jul 2023 15:10:15 +0800
Subject: [PATCH 13/21] [shardformer] delete some files

---
 =2.0                                          | 134 -------------
 .../transformers/chatglm2_6b/MODEL_LICENSE    |  33 ---
 .../transformers/chatglm2_6b/quantization.py  | 188 ------------------
 3 files changed, 355 deletions(-)
 delete mode 100644 =2.0
 delete mode 100644 tests/kit/model_zoo/transformers/chatglm2_6b/MODEL_LICENSE
 delete mode 100644 tests/kit/model_zoo/transformers/chatglm2_6b/quantization.py

diff --git a/=2.0 b/=2.0
deleted file mode 100644
index af47ce17aa8e..000000000000
--- a/=2.0
+++ /dev/null
@@ -1,134 +0,0 @@
-Defaulting to user installation because normal site-packages is not writeable
-Collecting protobuf
-  Using cached protobuf-4.23.4-cp37-abi3-manylinux2014_x86_64.whl (304 kB)
-Requirement already satisfied: transformers==4.30.2 in /home/lclk/.local/lib/python3.9/site-packages (4.30.2)
-Collecting cpm_kernels
-  Using cached cpm_kernels-1.0.11-py3-none-any.whl (416 kB)
-Requirement already satisfied: torch in /home/lclk/.local/lib/python3.9/site-packages (2.0.0+cu118)
-Collecting gradio
-  Using cached gradio-3.36.0-py3-none-any.whl (19.8 MB)
-Collecting mdtex2html
-  Using cached mdtex2html-1.2.0-py3-none-any.whl (13 kB)
-Collecting sentencepiece
-  Using cached sentencepiece-0.1.99-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
-Collecting accelerate
-  Using cached accelerate-0.20.3-py3-none-any.whl (227 kB)
-Requirement already satisfied: pyyaml>=5.1 in /home/lclk/.local/lib/python3.9/site-packages (from transformers==4.30.2) (6.0)
-Requirement already satisfied: regex!=2019.12.17 in /home/lclk/.local/lib/python3.9/site-packages (from transformers==4.30.2) (2023.6.3)
-Requirement already satisfied: huggingface-hub<1.0,>=0.14.1 in /home/lclk/.local/lib/python3.9/site-packages (from transformers==4.30.2) (0.15.1)
-Requirement already satisfied: packaging>=20.0 in /home/lclk/.local/lib/python3.9/site-packages (from transformers==4.30.2) (23.1)
-Requirement already satisfied: requests in /opt/lcsoftware/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.3.0/miniconda3-4.10.3-u6p3tgreee7aigtnvuhr44yqo7vcg6r6/lib/python3.9/site-packages (from transformers==4.30.2) (2.25.1)
-Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /home/lclk/.local/lib/python3.9/site-packages (from transformers==4.30.2) (0.13.3)
-Requirement already satisfied: safetensors>=0.3.1 in /home/lclk/.local/lib/python3.9/site-packages (from transformers==4.30.2) (0.3.1)
-Requirement already satisfied: filelock in /home/lclk/.local/lib/python3.9/site-packages (from transformers==4.30.2) (3.12.0)
-Requirement already satisfied: numpy>=1.17 in /home/lclk/.local/lib/python3.9/site-packages (from transformers==4.30.2) (1.24.3)
-Requirement already satisfied: tqdm>=4.27 in /home/lclk/.local/lib/python3.9/site-packages (from transformers==4.30.2) (4.65.0)
-Requirement already satisfied: fsspec in /home/lclk/.local/lib/python3.9/site-packages (from huggingface-hub<1.0,>=0.14.1->transformers==4.30.2) (2023.6.0)
-Requirement already satisfied: typing-extensions>=3.7.4.3 in /home/lclk/.local/lib/python3.9/site-packages (from huggingface-hub<1.0,>=0.14.1->transformers==4.30.2) (4.6.3)
-Requirement already satisfied: networkx in /home/lclk/.local/lib/python3.9/site-packages (from torch) (3.1)
-Requirement already satisfied: sympy in /home/lclk/.local/lib/python3.9/site-packages (from torch) (1.12)
-Requirement already satisfied: triton==2.0.0 in /home/lclk/.local/lib/python3.9/site-packages (from torch) (2.0.0)
-Requirement already satisfied: jinja2 in /home/lclk/.local/lib/python3.9/site-packages (from torch) (3.1.2)
-Requirement already satisfied: lit in /home/lclk/.local/lib/python3.9/site-packages (from triton==2.0.0->torch) (16.0.5.post0)
-Requirement already satisfied: cmake in /home/lclk/.local/lib/python3.9/site-packages (from triton==2.0.0->torch) (3.26.3)
-Collecting aiofiles
-  Using cached aiofiles-23.1.0-py3-none-any.whl (14 kB)
-Collecting ffmpy
-  Using cached ffmpy-0.3.0.tar.gz (4.8 kB)
-Requirement already satisfied: pillow in /home/lclk/.local/lib/python3.9/site-packages (from gradio) (9.5.0)
-Collecting pydub
-  Using cached pydub-0.25.1-py2.py3-none-any.whl (32 kB)
-Requirement already satisfied: pandas in /home/lclk/.local/lib/python3.9/site-packages (from gradio) (2.0.2)
-Collecting python-multipart
-  Using cached python_multipart-0.0.6-py3-none-any.whl (45 kB)
-Collecting semantic-version
-  Using cached semantic_version-2.10.0-py2.py3-none-any.whl (15 kB)
-Collecting pydantic
-  Using cached pydantic-2.0.2-py3-none-any.whl (359 kB)
-Collecting uvicorn>=0.14.0
-  Using cached uvicorn-0.22.0-py3-none-any.whl (58 kB)
-Collecting mdit-py-plugins<=0.3.3
-  Using cached mdit_py_plugins-0.3.3-py3-none-any.whl (50 kB)
-Requirement already satisfied: pygments>=2.12.0 in /home/lclk/.local/lib/python3.9/site-packages (from gradio) (2.15.1)
-Collecting httpx
-  Using cached httpx-0.24.1-py3-none-any.whl (75 kB)
-Collecting orjson
-  Using cached orjson-3.9.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (136 kB)
-Collecting fastapi
-  Using cached fastapi-0.99.1-py3-none-any.whl (58 kB)
-Collecting altair>=4.2.0
-  Using cached altair-5.0.1-py3-none-any.whl (471 kB)
-Collecting gradio-client>=0.2.7
-  Using cached gradio_client-0.2.7-py3-none-any.whl (288 kB)
-Requirement already satisfied: aiohttp in /home/lclk/.local/lib/python3.9/site-packages (from gradio) (3.8.4)
-Requirement already satisfied: matplotlib in /home/lclk/.local/lib/python3.9/site-packages (from gradio) (3.7.1)
-Collecting websockets>=10.0
-  Using cached websockets-11.0.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (129 kB)
-Requirement already satisfied: markdown-it-py[linkify]>=2.0.0 in /home/lclk/.local/lib/python3.9/site-packages (from gradio) (2.2.0)
-Requirement already satisfied: markupsafe in /home/lclk/.local/lib/python3.9/site-packages (from gradio) (2.1.3)
-Collecting toolz
-  Using cached toolz-0.12.0-py3-none-any.whl (55 kB)
-Collecting jsonschema>=3.0
-  Using cached jsonschema-4.18.0-py3-none-any.whl (81 kB)
-Collecting rpds-py>=0.7.1
-  Downloading rpds_py-0.8.8-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
-Collecting referencing>=0.28.4
-  Using cached referencing-0.29.1-py3-none-any.whl (25 kB)
-Collecting jsonschema-specifications>=2023.03.6
-  Using cached jsonschema_specifications-2023.6.1-py3-none-any.whl (17 kB)
-Requirement already satisfied: attrs>=22.2.0 in /home/lclk/.local/lib/python3.9/site-packages (from jsonschema>=3.0->altair>=4.2.0->gradio) (23.1.0)
-Requirement already satisfied: mdurl~=0.1 in /home/lclk/.local/lib/python3.9/site-packages (from markdown-it-py[linkify]>=2.0.0->gradio) (0.1.2)
-Collecting linkify-it-py<3,>=1
-  Downloading linkify_it_py-2.0.2-py3-none-any.whl (19 kB)
-Collecting uc-micro-py
-  Downloading uc_micro_py-1.0.2-py3-none-any.whl (6.2 kB)
-Requirement already satisfied: pytz>=2020.1 in /home/lclk/.local/lib/python3.9/site-packages (from pandas->gradio) (2023.3)
-Requirement already satisfied: tzdata>=2022.1 in /home/lclk/.local/lib/python3.9/site-packages (from pandas->gradio) (2023.3)
-Requirement already satisfied: python-dateutil>=2.8.2 in /home/lclk/.local/lib/python3.9/site-packages (from pandas->gradio) (2.8.2)
-Requirement already satisfied: six>=1.5 in /opt/lcsoftware/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.3.0/miniconda3-4.10.3-u6p3tgreee7aigtnvuhr44yqo7vcg6r6/lib/python3.9/site-packages (from python-dateutil>=2.8.2->pandas->gradio) (1.16.0)
-Requirement already satisfied: click>=7.0 in /home/lclk/.local/lib/python3.9/site-packages (from uvicorn>=0.14.0->gradio) (8.1.3)
-Collecting h11>=0.8
-  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
-Collecting latex2mathml
-  Downloading latex2mathml-3.76.0-py3-none-any.whl (73 kB)
-Collecting markdown
-  Downloading Markdown-3.4.3-py3-none-any.whl (93 kB)
-Requirement already satisfied: psutil in /home/lclk/.local/lib/python3.9/site-packages (from accelerate) (5.9.5)
-Requirement already satisfied: multidict<7.0,>=4.5 in /home/lclk/.local/lib/python3.9/site-packages (from aiohttp->gradio) (6.0.4)
-Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /home/lclk/.local/lib/python3.9/site-packages (from aiohttp->gradio) (4.0.2)
-Requirement already satisfied: aiosignal>=1.1.2 in /home/lclk/.local/lib/python3.9/site-packages (from aiohttp->gradio) (1.3.1)
-Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /home/lclk/.local/lib/python3.9/site-packages (from aiohttp->gradio) (3.1.0)
-Requirement already satisfied: frozenlist>=1.1.1 in /home/lclk/.local/lib/python3.9/site-packages (from aiohttp->gradio) (1.3.3)
-Requirement already satisfied: yarl<2.0,>=1.0 in /home/lclk/.local/lib/python3.9/site-packages (from aiohttp->gradio) (1.9.2)
-Requirement already satisfied: idna>=2.0 in /opt/lcsoftware/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.3.0/miniconda3-4.10.3-u6p3tgreee7aigtnvuhr44yqo7vcg6r6/lib/python3.9/site-packages (from yarl<2.0,>=1.0->aiohttp->gradio) (2.10)
-Collecting pydantic
-  Downloading pydantic-1.10.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
-Collecting starlette<0.28.0,>=0.27.0
-  Downloading starlette-0.27.0-py3-none-any.whl (66 kB)
-Collecting anyio<5,>=3.4.0
-  Downloading anyio-3.7.1-py3-none-any.whl (80 kB)
-Collecting sniffio>=1.1
-  Downloading sniffio-1.3.0-py3-none-any.whl (10 kB)
-Requirement already satisfied: exceptiongroup in /home/lclk/.local/lib/python3.9/site-packages (from anyio<5,>=3.4.0->starlette<0.28.0,>=0.27.0->fastapi->gradio) (1.1.1)
-Collecting httpcore<0.18.0,>=0.15.0
-  Downloading httpcore-0.17.3-py3-none-any.whl (74 kB)
-Requirement already satisfied: certifi in /opt/lcsoftware/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.3.0/miniconda3-4.10.3-u6p3tgreee7aigtnvuhr44yqo7vcg6r6/lib/python3.9/site-packages (from httpx->gradio) (2021.5.30)
-Requirement already satisfied: importlib-metadata>=4.4 in /home/lclk/.local/lib/python3.9/site-packages (from markdown->mdtex2html) (6.7.0)
-Requirement already satisfied: zipp>=0.5 in /home/lclk/.local/lib/python3.9/site-packages (from importlib-metadata>=4.4->markdown->mdtex2html) (3.15.0)
-Requirement already satisfied: contourpy>=1.0.1 in /home/lclk/.local/lib/python3.9/site-packages (from matplotlib->gradio) (1.1.0)
-Requirement already satisfied: fonttools>=4.22.0 in /home/lclk/.local/lib/python3.9/site-packages (from matplotlib->gradio) (4.40.0)
-Requirement already satisfied: pyparsing>=2.3.1 in /home/lclk/.local/lib/python3.9/site-packages (from matplotlib->gradio) (3.1.0)
-Requirement already satisfied: kiwisolver>=1.0.1 in /home/lclk/.local/lib/python3.9/site-packages (from matplotlib->gradio) (1.4.4)
-Requirement already satisfied: importlib-resources>=3.2.0 in /home/lclk/.local/lib/python3.9/site-packages (from matplotlib->gradio) (5.12.0)
-Requirement already satisfied: cycler>=0.10 in /home/lclk/.local/lib/python3.9/site-packages (from matplotlib->gradio) (0.11.0)
-Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/lcsoftware/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.3.0/miniconda3-4.10.3-u6p3tgreee7aigtnvuhr44yqo7vcg6r6/lib/python3.9/site-packages (from requests->transformers==4.30.2) (1.26.6)
-Requirement already satisfied: chardet<5,>=3.0.2 in /opt/lcsoftware/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.3.0/miniconda3-4.10.3-u6p3tgreee7aigtnvuhr44yqo7vcg6r6/lib/python3.9/site-packages (from requests->transformers==4.30.2) (4.0.0)
-Requirement already satisfied: mpmath>=0.19 in /home/lclk/.local/lib/python3.9/site-packages (from sympy->torch) (1.3.0)
-Building wheels for collected packages: ffmpy
-  Building wheel for ffmpy (setup.py): started
-  Building wheel for ffmpy (setup.py): finished with status 'done'
-  Created wheel for ffmpy: filename=ffmpy-0.3.0-py3-none-any.whl size=4709 sha256=071cebb58ca6c6947fbc669e1d94509d6f53d1ed45d9d7fb9f060d1a342cfc18
-  Stored in directory: /home/lclk/.cache/pip/wheels/91/e2/96/f676aa08bfd789328c6576cd0f1fde4a3d686703bb0c247697
-Successfully built ffmpy
-Installing collected packages: sniffio, rpds-py, referencing, h11, anyio, uc-micro-py, jsonschema-specifications, httpcore, websockets, toolz, starlette, pydantic, linkify-it-py, jsonschema, httpx, uvicorn, semantic-version, python-multipart, pydub, orjson, mdit-py-plugins, markdown, latex2mathml, gradio-client, ffmpy, fastapi, altair, aiofiles, sentencepiece, protobuf, mdtex2html, gradio, cpm-kernels, accelerate
-Successfully installed accelerate-0.20.3 aiofiles-23.1.0 altair-5.0.1 anyio-3.7.1 cpm-kernels-1.0.11 fastapi-0.99.1 ffmpy-0.3.0 gradio-3.36.0 gradio-client-0.2.7 h11-0.14.0 httpcore-0.17.3 httpx-0.24.1 jsonschema-4.18.0 jsonschema-specifications-2023.6.1 latex2mathml-3.76.0 linkify-it-py-2.0.2 markdown-3.4.3 mdit-py-plugins-0.3.3 mdtex2html-1.2.0 orjson-3.9.1 protobuf-4.23.4 pydantic-1.10.11 pydub-0.25.1 python-multipart-0.0.6 referencing-0.29.1 rpds-py-0.8.8 semantic-version-2.10.0 sentencepiece-0.1.99 sniffio-1.3.0 starlette-0.27.0 toolz-0.12.0 uc-micro-py-1.0.2 uvicorn-0.22.0 websockets-11.0.3
diff --git a/tests/kit/model_zoo/transformers/chatglm2_6b/MODEL_LICENSE b/tests/kit/model_zoo/transformers/chatglm2_6b/MODEL_LICENSE
deleted file mode 100644
index 26198b21b6b2..000000000000
--- a/tests/kit/model_zoo/transformers/chatglm2_6b/MODEL_LICENSE
+++ /dev/null
@@ -1,33 +0,0 @@
-The ChatGLM2-6B License
-
-1. Definitions
-
-“Licensor” means the ChatGLM2-6B Model Team that distributes its Software.
-
-“Software” means the ChatGLM2-6B model parameters made available under this license.
-
-2. License Grant
-
-Subject to the terms and conditions of this License, the Licensor hereby grants to you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty-free copyright license to use the Software solely for your non-commercial research purposes.
-
-The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-3. Restriction
-
-You will not use, copy, modify, merge, publish, distribute, reproduce, or create derivative works of the Software, in whole or in part, for any commercial, military, or illegal purposes.
-
-You will not use the Software for any act that may undermine China's national security and national unity, harm the public interest of society, or infringe upon the rights and interests of human beings.
-
-4. Disclaimer
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-5. Limitation of Liability
-
-EXCEPT TO THE EXTENT PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER BASED IN TORT, NEGLIGENCE, CONTRACT, LIABILITY, OR OTHERWISE WILL ANY LICENSOR BE LIABLE TO YOU FOR ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES, OR ANY OTHER COMMERCIAL LOSSES, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
-
-6. Dispute Resolution
-
-This license shall be governed and construed in accordance with the laws of People’s Republic of China. Any dispute arising from or in connection with this License shall be submitted to Haidian District People's Court in Beijing.
-
-Note that the license is subject to update to a more comprehensive version.  For any questions related to the license and copyright, please contact us at glm-130b@googlegroups.com.
\ No newline at end of file
diff --git a/tests/kit/model_zoo/transformers/chatglm2_6b/quantization.py b/tests/kit/model_zoo/transformers/chatglm2_6b/quantization.py
deleted file mode 100644
index cb95bfe82b20..000000000000
--- a/tests/kit/model_zoo/transformers/chatglm2_6b/quantization.py
+++ /dev/null
@@ -1,188 +0,0 @@
-from torch.nn import Linear
-from torch.nn.parameter import Parameter
-
-import bz2
-import torch
-import base64
-import ctypes
-from transformers.utils import logging
-
-from typing import List
-from functools import partial
-
-logger = logging.get_logger(__name__)
-
-try:
-    from cpm_kernels.kernels.base import LazyKernelCModule, KernelFunction, round_up
-
-    class Kernel:
-        def __init__(self, code: bytes, function_names: List[str]):
-            self.code = code
-            self._function_names = function_names
-            self._cmodule = LazyKernelCModule(self.code)
-
-            for name in self._function_names:
-                setattr(self, name, KernelFunction(self._cmodule, name))
-
-    quantization_code = "$QlpoOTFBWSZTWU9yuJUAQHN//////////f/n/8/n///n//bt4dTidcVx8X3V9FV/92/v4B7/AD5FBQFAAAChSgKpFCFAFVSigUAAAEKhSgUUqgFBKigqVREQAABQBQIANDTTIGI00BkZBkNGE0A0BkBkGQGRkaNAaAGQNBoGgDIAAYIGTI0DQAQAaGmmQMRpoDIyDIaMJoBoDIDIMgMjI0aA0AMgaDQNAGQAAwQMmRoGgAgA0NNMgYjTQGRkGQ0YTQDQGQGQZAZGRo0BoAZA0GgaAMgABggZMjQNABABoaaZAxGmgMjIMhowmgGgMgMgyAyMjRoDQAyBoNA0AZAADBAyZGgaAAmqU1NEgJqnptU/Sn4jRR6J6epk2pqb1Q/SgAPUGgyNNGjQ2SBpoAZAAGg0NB6mgDIAAAAA2oaApSREBNAARhGiYEaEwU8pvImlP0k2aam1GaGqbFNM1MHpTwmkepmyU9R6nqPKekHqNNPUxNGhp6n6p6QaZ6o9TG1GMqcoV9ly6nRanHlq6zPNbnGZNi6HSug+2nPiZ13XcnFYZW+45W11CumhzYhchOJ2GLLV1OBjBjGf4TptOddTSOcVxhqYZMYwZXZZY00zI1paX5X9J+b+f4e+x43RXSxXPOdquiGpduatGyXneN696M9t4HU2eR5XX/kPhP261NTx3JO1Ow7LyuDmeo9a7d351T1ZxnvnrvYnrXv/hXxPCeuYx2XsNmO003eg9J3Z6U7b23meJ4ri01OdzTk9BNO96brz+qT5nuvvH3ds/G+m/JcG/F2XYuhXlvO+jP7U3XgrzPN/lr8Sf1n6j4j7jZs+s/T0tNaNNYzTs12rxjwztHlnire3Nzc3N1wuBwOBwXBvZfoHpD7rFmR99V5vj3aXza3xdBbXMalubTg/jIv5dfAi54Pdc75j4z412n3Npj3Ld/ENm7a3b/Cod6h/ret1/5vn/C+l+gdslMvgPSLJ8d8q+U66fevYn/tW1chleEtNTGlcHCbLRlq0tHzF5tsbbZZfHjjLgZu42XCuC3NrdjTasZGNzgxPIrGqp7r3p7L2p5XjnpPSmTd5XtzqnB6U87zzg1Ol0zd0zsLszxR6lkxp35u6/teL0L0W922cR7Lu1lpL9CsHirzuM2T+BgsyViT6LHcm0/Vr6U/7LGGyJeqTEjt0PHWhF5mCT7R9mtlDwriYv0Tyr/OxYt6qp5r0mPVT0608TqnqMZaarU2nFwrTzzlrs1ed7z1ux60wyr4ydCaTi3enW8x68x0zU7tXSlcmPSW1mGpWJMg4zmPC2lK96tp0OE80y4MfEvnZj8zGluR6b22ki1Ou9V2nCd9xovcPvcYMZYy0lvN60ScZ45vN6yeCeeXFb1lVjnnCar5fwXwE2bzJ4HI1XVPXfXZMm44GUsMpYsmLB65TuVdm0cl0b+i/wGNN66XjeV7zuPpHcnK/juhhjdfId5jMdE5nN0dGmmm2zZs2cexD5n9p/dY352XsvXHaZNWWsmmS1atjR452nYudzvqv2HMRyvNNnlMcDl3R2+yx2uVrBubTW9icHDVtbNXlZm7jma1rM4VurZZd2y6nUau7ZXZ7bVU+mnoOVxZGMrVmvX60605JwmzGZhhhjTWtaaaMaaGTGmNMZasY0iX8VMUl8eepaIrzGSpemWOQyZORk2bNpjUybMmxqYmknCGCFynutfksaZpjTNMaaatM0xsxcGR0sociNqxNSmhhR1ZJPbsn8qyF0t2qH6iYBclclalbtTTcHTDsPaX6rlnElph2Jyumumtynv2Kk8GI7rsvXbIcJgHJOSaSXnnGaI3m87RtVXJOZ/YtgdTE6Wpha6ZlE8ayXkef1fh602r2WwvfMXtMdLlkfnLFdYYwYso+bWqm7yJqHXZGw2nrS5ZanSYnWlxBxMF1V940K2wdrI7R6OYf7DGGamMmTSbRhlS45xmVOumF1EyPCmHrrN8ww
ZOOrdNtLeMtzFzDlWnfTBxMk2NaXIZHBYxYLD4w8yju0ao65Vz1OIXoS9dLanwCe1PWrYuWMqf1if1z2k2yYfKJ741PDgno1ZQ8DRqvUny3mNoWTzGO6m1DkrJI8JiR5cSd+vZdGOO8nrMoc5+NDUFsMSXaZJeNlMmGLtJsovOsUp7I9S5VojKxF6bTVEelXqlfJobQr3LozSh2Jk7VcrVMfhXqszGWMzNqGhqZY0OadxkyyMssKugZR0KNFXBHlqwmJgTE/BNVMk6ItJXZMR0H47GpXv/DMOvNkmVuaV1PRfEdxuqc7Hcd+ZV/zTLaRxWk0nl9CdCeM6mn5rstHIBcpiuwmUZXeq81DacHI2rmrZ5SuE5mOZd6LQrZg9mx32TprA8BMo5jKN6yLTCi3WzQaZSuhzTtM1fUTGVpG8Tw+KXI0tjEpiWxtLYynOlktSbVlaI5kxP8TDH8kx50xoxi5KcA4pcja8KWLRlO/Ks6q06ergnvm1ca3Tq8Uw7LTUsmWyctXPWmpitl/uvGcWTGXGuAXDfhqazGmjkxcJW5hMMMMpYsXl2TZYtVOddG3XCarUt6Ptq9CZXSNzyuRzqRZOjsxdBbFVz6OA5HI43r1jityVlVpVkxmOsyaYWE1NTGq1sOVh36mHMcxtSvcy70edG0ZGR3I1Go1GRlV7mWWo1G0ZGRqlvH40l7o4m5xMWLLLYyNjnqc8556mdPqLJ31n/1nWOncxzG1tizrHs/Z+d2vP/B/l8wdJ6rHUn2nbbDq4p6htFtYzMMMTaZis1K5GKzGNmxhmUx2DDlZ/qNnIx41xnaMfCZWYaZWtNLTNW8ND4Fw1MyZOCdM428suKG1ehW8TesOydg7J+YYcD4cYR+8dFK6M4E3HM9ZfRNNL+Sn6rsl4DsrDl2HpPCnfxjGXtbZtYys1ttlyJ4T+BvexjGWRjMszK4Jpc77D3GyuVD7q0+G8m9G+2+rGm7cOR2y7FdtY2XUYx/oNlfRYxhMYyYZkyyg55enna9Kt/FFi6GMMwYwdwxWgxGMLKYmUyGExTKMZkMFhkymKuh0NOBNnBu+23LdwDoZYYzGGMxtORaTU1pjTGWTTGGtMrNWUsyyTTLLG1qy2ZjbK2DBllWqxMtBMaYZQmcE7zvvRcTkclUwdkxTaSdyySt/7fpL+T1v516Ji97fwr5JbLu305zMn5+GMTTZ9F+y7ExwmGVfG44yxn3dLv6l5i+Wth1jCrDq21nW9LqvvDzz3Vf3LLH/O/32TJ/erx3bXftO4eF+G956D952K/An4NfvOpjFjExjevP/UmE0fIoZXx6/w6lX/no3D0bLt+ixjieBM6ksRd0yB4Lt2SwYNE+gd1detlZWUnpiZfGfFaK+4PyCa/v18V8X75pe9fLXzp7l3VjF76vWZmHwGz1IZNWT7b8yddJ4q5kyrVdfru6atWc7bVYztL9Jf4GXvT+Y8m9/YsXP6H018a8D4XVOqvfzqeR+6yZOD8dPv0+U7/q5Pl+2dNb0MjzGVH5p6MNQ7cOWvw62U9aHE8DprDek+McLyvDz+te+9Zhq5+YTruufMcWMabqysTmZVWjKPfnK0wyVcrsuhjZRdLkHNvD72b9abriOSGIxiLixMOoalNPXzy+wT/tf+U6HHONfsz+xe8ufHBdQWWGWLA9if0rsnmrxK5LvRZQeWsTCsrmOYy8VteVfuRfcVTtDLItLIsMYxZLdU/DbtSemxF6Z6Zo5WBXE4tFdCyVMMXMTEMZXVlS6Xec2T4e0tHsRcEuWshcJ2YsNF5rUx1E8ifCq6Z+ZP7qdCeu/aTwFd53l16/o0NOw6O3dLavP4Hbi4RdmuDk6DoYaninC0+o4uZjbJ7Rxeu0/FbuFg+q7DVS6fQe0rZ6NDGUNNU6DEqOaLTicKnYZMnBWruljQxoaS3dZhocDge0bSTyOvdAbG5hxe2xji7E/L55xX13wWNDi6HCekcFxfCPGxY0MXC+s7afWaMdDyjyr
+o8Rudm/NabOZvdl274zH4f5XK9z6On1Pe/K5TdPAslg77BjuO6Y3eO7GqvOPG/stknp1leyvLL0Z7bl9I4noMvLkzytLhWYzrOZzLXCORe028rORzOg4N/L0HlMOQ3Pgmnbb6KczlabORpu980q37TBqRu0/p3PO6234Bl03Ynuz+9W7gnsEcmvYaYY3aMYY0wx3pYd+ujsXauWdaY5Xkbtl23fPzFHiDB/QMo0yFjBllYxTQYYyxkrwn7JufwJ/PfgJ+C83X69ni6zvXcnyXabv0ncbLwsceS+RNlyN2mnneJtX0ngYO0+e+0+UnA+Wch3ji8hj5an4h+i6XBySU4n+R0roVcbw5yvHrmr4Yw8Y7x6c+9POPYHI5HI5HI5HI5HGXGww4nE4nrVyOR8XeqPEO7PLOiukYa3Novk5hV4cdtYZLI93e+uxff2jRo0aNGjRo0aNG1bVtW1dy3m83m8+tQ5ZzHw3nObwOu8La9Rc1dtkdS8A3eTk823tnktXWlxN6Oixe06zrN70Isd9jiOgZFq9yfkPqP/SLhN2Myl8jDM43bl1nbcb4cO57jlh8Jow6pzXZdL4dyODTuuhu77FyO27DdwdRxmvO+O+3N2+BdqyTwLHVczDVY4UPE4O66/ZO2cx1LFzVdSXtF7G4HMbrauOHRw6c8FdZ5m9fHZHYZXfTlZquyynSyTTKke6vcffSD9pzPA/G7n7jxPmuhc1DHMynPMrGL6AdewYmwu5ko+UUyTwrMv27rPH1v1nGqd87+p6N6LU8k3NEng53xXyHS97+44OSg/sy/hn+Se6yfYNjW0/uTgP+PvWYzLMmjhcLB/gGpri6H83/84eUXWT6T9Hsv7785z/7z4icpW+zfXypuR7rx/gMdZb1/wC678pcs8/2a3mDitGHxl9mfPlll5MafWWqxk/eYuTDgcNMzDGWLWvsuglNxs53GtN6uWpktlW1tZZYcuinMMWmnNnJydze3b2Y1McBxrBkXw799izLMZZYyy0TkbsGM4p03S2uVu5s/XXUdSdec6smVxZYYGpVmT8A+8ajuEyV5FatkvVru2x6uxGXXbH4A+jvgP4GMYy3iPLXzq/6z65+E005ey+cwMZD3fZcqc6xpjTFjQ0P3U+e++cPYmTIwj0nrK5NPTfl3WvpfLtXDcb2HQMudYOxFXQBor4L4T6vrOauFctYXJQ++NUWmJe5bmx1jDiZS1dTqWxo4GR8jm3fttpmPHppk9PEyv4/y8/sO07XacOmcqc0x2Vi9BvNJvN5oW8x4mOsydpidRxMYJPx06m1bqPzq9KtK8sxXNXFodD/+MYYaJTLwOhc9brCsV18oOR1i4tXChyTkq4lf4y1Ke+9axjDHqs1mfBbMXuP4Hzi+X7t8vzv7bHerrUPgPCxhjre4fXdfLNtNM+Jd+Zdh8xd8wP87uNPoPgv4W7/5P2BuxfsMabNnMnza+54Pdi5U671GPZY8CehX8Voeoo7FHpkeEc6715FwHZrIrUrHaviPUbPZHND+IhczrP6FcYvhOZ0Di/ETt0OI+YwNWR9r7tpf6WDeZKZDB1+z2IthOl1mPyb5FluvEx9h9d0NnM0Y1XPFkWIsk1WotJ0PBMmkvjvQTd0e71tfeV+8r8lQ/tpzpsmxJ+InrI/dj2UajUajVTUajatRqNRtGo1Go1Go4wjeMpZFMVV9CHbofPraLsJ3JpWV2XOoanCuFky4y3PPNxucK2uKC1Lbdb1eo+m5XomN6HfeZsabHLHRX/K+offtNGGmHWctcVcG44MdSqsOLY9VzX+Zxfxn2HPdWTpzWvkrtJ8M5zorrKcquRytJ5N5DZmcaW02l76nWO+BqPXm1A2Ry/0q71dH/mqrqeFjkYxjEXtsX8qubTk67rGycyqsdm4tZx5D6D5hhi0waaWmiaMP81Yjii5qxPlPuU/GfTL1Y5E6Jyfiq63qTa39A4J0sOGDgO9WF9bOXl0XfPRbsY2bPNKPy1YrF
YrFYmRhhlTIyMjJWJYZHXuCXI8OoXsvfljGLFicNifpp2XunoPiG1wtx3p1Tah+/DD66OnVtVXP9rKbVxOnL0tR/rHtqB5UDErUVcl11D4qqvjpOcxX7armUNJB3LpW6bxVvD08e8h3odKKvyCFZBdSh2FVcST9xV3n3T8t1j7Kr9qgrqXg+13Pt5U7JCvFXVIV1YG5lRhkVYZJYYDDD4KOIMoHCp26WS8GB7uBh2zIdgq/PKyInjV2STShuoapUdCpX1yTwqq/z1VvET7Kh5nVPkO8YyxjLt2MaaMmWTLQvx3qnzltnXW0p2jxgbEtSny/Osv8Y9pLMXYoHVPAhkVdWVeODhR6q9/Sxe2liwwZWMVvFXfRkeIDxAePUPIrdJ4ey6yquzH+PD/bUOWAu05qVHtFd8rrKHSoeNIOUqrYr3FXyToqfYJgwmJdKpXXOwYYegNNGMzfZPp/t3t/DVs4zjNTN61rRqaWaa4NYbRjTa0tWwy2Y2tGN8ZO8ofNKq4j9SL7I+cSm4/6ovLV5HNXLI0jJidwrtk6ynCaP6Z++GjRlWS3tLeW129Mi9evxU9mtz6s5J3Z7M2ngTgnKvmpomxpaLCzPfmx0JWE+m3NLDDGOX47RctdYYNK5jakdqLkRlI39n590T5zctGSwwZZDJj6kW8XSi6ot2MmWWJ0DUT3nuvebBudScjZ79g8cWJ8av0k+/bE5WKd5MdbFpbDVMxu1DVMmtNZGJvq1mtRbn6M+g/kP0FwDwr7quZs7xosNGpbscyxhhd9TyJyFwbLcxlTasg75vW7TsV5K7ji44XPMMrdoj+Y3rT0Hie62nlYV/pwczzOmdLqLhYkzGMzCZWGMQzGMSsZYY6Di1t4nlJ+Em63mJxrVLxPbYxNEdgc1dU2iOKyoYYWjNrEeHTYybVk0atSa7ehuwsWMWTqn1TrnS6hYsi71d1+s+k+ic70e20fzE/VaTdxT9ZtU4GIXdeNx3X77guYYfpHeTQjaMX6brOu4OY4K7Y2d9mbHarI5ox3p4GpJ2Vd/Tst60f7j999pppjR+Q/Qf8J/VaORs3cji7FfFuN61+ui9s8hix1OCh5KGVV23BPXvZfz3CLyHpix+exi8z/KnCnosY2eunor+cxyPO/xJ0vKey9OvE9VjqaYu0x3Z3jd6o2b1T12D+F8l232lwaaacD5LE8LBxu7WTlbWraWpew8Xexjel3E+wWD4APITdNqR8F3R3T0lunCQ4GaE9R37DxeCYfcHi4xci5ovKfxVs55y2hf+65E/Xdp6jR5nrebTmi5incpkyOjs50JvrZwstbbW6kfuuQw+2mykf/EXNFzxfKTrxew929TR6bWnGL//F3JFOFCQT3K4lQ"
-
-    kernels = Kernel(
-        bz2.decompress(base64.b64decode(quantization_code)),
-        [
-            "int4WeightCompression",
-            "int4WeightExtractionFloat",
-            "int4WeightExtractionHalf",
-            "int8WeightExtractionFloat",
-            "int8WeightExtractionHalf",
-        ],
-    )
-except Exception as exception:
-    kernels = None
-    logger.warning("Failed to load cpm_kernels:" + str(exception))
-
-
-class W8A16Linear(torch.autograd.Function):
-    @staticmethod
-    def forward(ctx, inp: torch.Tensor, quant_w: torch.Tensor, scale_w: torch.Tensor, weight_bit_width):
-        ctx.inp_shape = inp.size()
-        ctx.weight_bit_width = weight_bit_width
-        out_features = quant_w.size(0)
-        inp = inp.contiguous().view(-1, inp.size(-1))
-        weight = extract_weight_to_half(quant_w, scale_w, weight_bit_width)
-        ctx.weight_shape = weight.size()
-        output = inp.mm(weight.t())
-        ctx.save_for_backward(inp, quant_w, scale_w)
-        return output.view(*(ctx.inp_shape[:-1] + (out_features,)))
-
-    @staticmethod
-    def backward(ctx, grad_output: torch.Tensor):
-        inp, quant_w, scale_w = ctx.saved_tensors
-        weight = extract_weight_to_half(quant_w, scale_w, ctx.weight_bit_width)
-        grad_output = grad_output.contiguous().view(-1, weight.size(0))
-        grad_input = grad_output.mm(weight)
-        grad_weight = grad_output.t().mm(inp)
-        return grad_input.view(ctx.inp_shape), grad_weight.view(ctx.weight_shape), None, None
-
-
-def compress_int4_weight(weight: torch.Tensor):  # (n, m)
-    with torch.cuda.device(weight.device):
-        n, m = weight.size(0), weight.size(1)
-        assert m % 2 == 0
-        m = m // 2
-        out = torch.empty(n, m, dtype=torch.int8, device="cuda")
-        stream = torch.cuda.current_stream()
-
-        gridDim = (n, 1, 1)
-        blockDim = (min(round_up(m, 32), 1024), 1, 1)
-
-        kernels.int4WeightCompression(
-            gridDim,
-            blockDim,
-            0,
-            stream,
-            [ctypes.c_void_p(weight.data_ptr()), ctypes.c_void_p(out.data_ptr()), ctypes.c_int32(n), ctypes.c_int32(m)],
-        )
-        return out
-
-
-def extract_weight_to_half(weight: torch.Tensor, scale_list: torch.Tensor, source_bit_width: int):
-    assert scale_list.dtype in [torch.half, torch.bfloat16]
-    assert weight.dtype in [torch.int8]
-    if source_bit_width == 8:
-        return weight.to(scale_list.dtype) * scale_list[:, None]
-    elif source_bit_width == 4:
-        func = (
-            kernels.int4WeightExtractionHalf if scale_list.dtype == torch.half else kernels.int4WeightExtractionBFloat16
-        )
-    else:
-        assert False, "Unsupported bit-width"
-
-    with torch.cuda.device(weight.device):
-        n, m = weight.size(0), weight.size(1)
-        out = torch.empty(n, m * (8 // source_bit_width), dtype=scale_list.dtype, device="cuda")
-        stream = torch.cuda.current_stream()
-
-        gridDim = (n, 1, 1)
-        blockDim = (min(round_up(m, 32), 1024), 1, 1)
-
-        func(
-            gridDim,
-            blockDim,
-            0,
-            stream,
-            [
-                ctypes.c_void_p(weight.data_ptr()),
-                ctypes.c_void_p(scale_list.data_ptr()),
-                ctypes.c_void_p(out.data_ptr()),
-                ctypes.c_int32(n),
-                ctypes.c_int32(m),
-            ],
-        )
-        return out
-
-
-class QuantizedLinear(torch.nn.Module):
-    def __init__(self, weight_bit_width: int, weight, bias=None, device="cpu", dtype=None, empty_init=False, *args,
-                 **kwargs):
-        super().__init__()
-        self.weight_bit_width = weight_bit_width
-
-        shape = weight.shape
-
-        if weight is None or empty_init:
-            self.weight = torch.empty(shape[0], shape[1] * weight_bit_width // 8, dtype=torch.int8, device=device)
-            self.weight_scale = torch.empty(shape[0], dtype=dtype, device=device)
-        else:
-            self.weight_scale = weight.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)
-            self.weight = torch.round(weight / self.weight_scale[:, None]).to(torch.int8)
-            if weight_bit_width == 4:
-                self.weight = compress_int4_weight(self.weight)
-
-        self.weight = Parameter(self.weight.to(device), requires_grad=False)
-        self.weight_scale = Parameter(self.weight_scale.to(device), requires_grad=False)
-        self.bias = Parameter(bias.to(device), requires_grad=False) if bias is not None else None
-
-    def forward(self, input):
-        output = W8A16Linear.apply(input, self.weight, self.weight_scale, self.weight_bit_width)
-        if self.bias is not None:
-            output = output + self.bias
-        return output
-
-
-def quantize(model, weight_bit_width, empty_init=False, device=None):
-    """Replace fp16 linear with quantized linear"""
-    for layer in model.layers:
-        layer.self_attention.query_key_value = QuantizedLinear(
-            weight_bit_width=weight_bit_width,
-            weight=layer.self_attention.query_key_value.weight.to(torch.cuda.current_device()),
-            bias=layer.self_attention.query_key_value.bias,
-            dtype=layer.self_attention.query_key_value.weight.dtype,
-            device=layer.self_attention.query_key_value.weight.device if device is None else device,
-            empty_init=empty_init
-        )
-        layer.self_attention.dense = QuantizedLinear(
-            weight_bit_width=weight_bit_width,
-            weight=layer.self_attention.dense.weight.to(torch.cuda.current_device()),
-            bias=layer.self_attention.dense.bias,
-            dtype=layer.self_attention.dense.weight.dtype,
-            device=layer.self_attention.dense.weight.device if device is None else device,
-            empty_init=empty_init
-        )
-        layer.mlp.dense_h_to_4h = QuantizedLinear(
-            weight_bit_width=weight_bit_width,
-            weight=layer.mlp.dense_h_to_4h.weight.to(torch.cuda.current_device()),
-            bias=layer.mlp.dense_h_to_4h.bias,
-            dtype=layer.mlp.dense_h_to_4h.weight.dtype,
-            device=layer.mlp.dense_h_to_4h.weight.device if device is None else device,
-            empty_init=empty_init
-        )
-        layer.mlp.dense_4h_to_h = QuantizedLinear(
-            weight_bit_width=weight_bit_width,
-            weight=layer.mlp.dense_4h_to_h.weight.to(torch.cuda.current_device()),
-            bias=layer.mlp.dense_4h_to_h.bias,
-            dtype=layer.mlp.dense_4h_to_h.weight.dtype,
-            device=layer.mlp.dense_4h_to_h.weight.device if device is None else device,
-            empty_init=empty_init
-        )
-
-    return model

From 32448e32ce2276956ad2dee9bd84b31b79709d04 Mon Sep 17 00:00:00 2001
From: klhhhhh <1412841649@qq.com>
Date: Mon, 17 Jul 2023 19:47:57 +0800
Subject: [PATCH 14/21] [shardformer] ChatGLM support layernorm sharding

---
 .../kit/model_zoo/transformers/chatglm2_6b/modeling_chatglm.py  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/kit/model_zoo/transformers/chatglm2_6b/modeling_chatglm.py b/tests/kit/model_zoo/transformers/chatglm2_6b/modeling_chatglm.py
index 46078f441523..04d318d47868 100644
--- a/tests/kit/model_zoo/transformers/chatglm2_6b/modeling_chatglm.py
+++ b/tests/kit/model_zoo/transformers/chatglm2_6b/modeling_chatglm.py
@@ -417,7 +417,7 @@ def __init__(self, config: ChatGLMConfig, layer_number, device=None):
         )
 =======
         self.dense = nn.Linear(self.projection_size,
-                               self.hidden_size,
+                               config.hidden_size,
                                bias=config.add_bias_linear,
                                device=device,
                                **_config_to_kwargs(config))

From eb1c71abaa40f09bc9d9369cf67d717adad3b5e8 Mon Sep 17 00:00:00 2001
From: klhhhhh <1412841649@qq.com>
Date: Tue, 18 Jul 2023 12:33:12 +0800
Subject: [PATCH 15/21] [shardformer] register without auto policy

---
 colossalai/shardformer/policies/autopolicy.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/colossalai/shardformer/policies/autopolicy.py b/colossalai/shardformer/policies/autopolicy.py
index 52d9bd5ebeae..77583dd77cf0 100644
--- a/colossalai/shardformer/policies/autopolicy.py
+++ b/colossalai/shardformer/policies/autopolicy.py
@@ -112,9 +112,6 @@ class PolicyLocation:
     # Sam
     "transformers.models.sam.modeling_sam.SamModel":
         PolicyLocation(file_name="sam", class_name="SamModelPolicy"),
-    # ChatGLM
-    "tests.kit.model_zoo.transformers.chatglm2_6b.modeling_chatglm.ChatGLMModel":
-        PolicyLocation(file_name="chatglm", class_name="ChatGLMModelPolicy"),
 }
 
 

From 127e38539a7ce756e2f35cc545b8f8b199c50eef Mon Sep 17 00:00:00 2001
From: klhhhhh <1412841649@qq.com>
Date: Wed, 19 Jul 2023 11:39:59 +0800
Subject: [PATCH 16/21] [shardformer] pre-commit check files

---
 .../chatglm2_6b/modeling_chatglm.py           | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/tests/kit/model_zoo/transformers/chatglm2_6b/modeling_chatglm.py b/tests/kit/model_zoo/transformers/chatglm2_6b/modeling_chatglm.py
index 04d318d47868..bae6d425878d 100644
--- a/tests/kit/model_zoo/transformers/chatglm2_6b/modeling_chatglm.py
+++ b/tests/kit/model_zoo/transformers/chatglm2_6b/modeling_chatglm.py
@@ -396,18 +396,17 @@ def __init__(self, config: ChatGLMConfig, layer_number, device=None):
             self.num_multi_query_groups_per_partition = config.multi_query_group_num
             self.qkv_hidden_size = (self.projection_size +
                                     2 * self.hidden_size_per_attention_head * config.multi_query_group_num)
-<<<<<<< HEAD
         self.query_key_value = nn.Linear(
             config.hidden_size,
             self.qkv_hidden_size,
+            bias=config.add_bias_linear or config.add_qkv_bias,
             device=device,
             **_config_to_kwargs(config),
         )
-=======
-        self.query_key_value = nn.Linear(self.hidden_size,
-                                         self.qkv_hidden_size,
-                                         bias=config.add_bias_linear or config.add_qkv_bias,
-<<<<<<< HEAD
+
+        self.core_attention = CoreAttention(config, self.layer_number)
+
+        # Output.
         self.dense = nn.Linear(
             self.projection_size,
             config.hidden_size,
@@ -415,13 +414,6 @@ def __init__(self, config: ChatGLMConfig, layer_number, device=None):
             device=device,
             **_config_to_kwargs(config),
         )
-=======
-        self.dense = nn.Linear(self.projection_size,
-                               config.hidden_size,
-                               bias=config.add_bias_linear,
-                               device=device,
-                               **_config_to_kwargs(config))
->>>>>>> [shardformer] support chatglm without layernorm
 
     def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None):
         if self.multi_query_attention:
@@ -989,6 +981,7 @@ def forward(
 
     def quantize(self, weight_bit_width: int):
         from .quantization import quantize
+
         quantize(self.encoder, weight_bit_width)
         return self
 

From 9d5b141df329d584675f2e4b31576e6337d9f274 Mon Sep 17 00:00:00 2001
From: klhhhhh <1412841649@qq.com>
Date: Thu, 20 Jul 2023 19:14:04 +0800
Subject: [PATCH 17/21] [shardformer] support ChatGLMForConditionalGeneration &
 add fusedlayernorm for vit

---
 colossalai/shardformer/policies/chatglm.py    |  24 +++
 colossalai/shardformer/policies/vit.py        | 157 ++++++++++--------
 tests/kit/model_zoo/transformers/chatglm.py   |  11 +-
 .../test_model/test_shard_chatglm.py          |   4 +-
 4 files changed, 123 insertions(+), 73 deletions(-)

diff --git a/colossalai/shardformer/policies/chatglm.py b/colossalai/shardformer/policies/chatglm.py
index 934b99b83ea1..46aa3b52af8f 100644
--- a/colossalai/shardformer/policies/chatglm.py
+++ b/colossalai/shardformer/policies/chatglm.py
@@ -90,7 +90,31 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                                                                 policy=policy,
                                                                 target_key=ChatGLMModel)
 
+            else:
+                self.append_or_create_submodule_replacement(description=[
+                    SubModuleReplacementDescription(suffix="input_layernorm", target_module=col_nn.FusedRMSNorm),
+                    SubModuleReplacementDescription(suffix="post_attention_layernorm",
+                                                    target_module=col_nn.FusedRMSNorm)
+                ],
+                                                            policy=policy,
+                                                            target_key=GLMBlock)
+
+                if self.model.config.post_layer_norm:
+                    self.append_or_create_submodule_replacement(description=[
+                        SubModuleReplacementDescription(suffix="encoder.final_layernorm",
+                                                        target_module=col_nn.FusedRMSNorm)
+                    ],
+                                                                policy=policy,
+                                                                target_key=ChatGLMModel)
+
         return policy
 
     def postprocess(self):
         return self.model
+
+
+class ChatGLMForConditionalGenerationPolicy(ChatGLMModelPolicy):
+
+    def module_policy(self):
+        policy = super().module_policy()
+        return policy
diff --git a/colossalai/shardformer/policies/vit.py b/colossalai/shardformer/policies/vit.py
index 7b035afae22c..d45055bc8beb 100644
--- a/colossalai/shardformer/policies/vit.py
+++ b/colossalai/shardformer/policies/vit.py
@@ -2,7 +2,13 @@
 
 import torch.nn as nn
 
-from colossalai.shardformer.layer import DropoutForReplicatedInput, DropoutForParallelInput, FusedLayerNorm, Linear1D_Col, Linear1D_Row
+from colossalai.shardformer.layer import (
+    DropoutForParallelInput,
+    DropoutForReplicatedInput,
+    FusedLayerNorm,
+    Linear1D_Col,
+    Linear1D_Row,
+)
 
 from .basepolicy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
 
@@ -18,101 +24,112 @@ def preprocess(self):
         return self.model
 
     def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
-        from transformers.models.vit.modeling_vit import ViTEmbeddings, ViTLayer
+        from transformers.models.vit.modeling_vit import ViTEmbeddings, ViTLayer, ViTModel
 
         policy = {}
 
         if self.shard_config.enable_tensor_parallelism:
             policy[ViTEmbeddings] = ModulePolicyDescription(attribute_replacement={},
-                                        param_replacement=[],
-                                        sub_module_replacement=[
-                                            SubModuleReplacementDescription(
-                                                suffix="dropout",
-                                                target_module=DropoutForReplicatedInput,
-                                            )
-                                        ])
-            
-            policy[ViTLayer] = ModulePolicyDescription(
-                    attribute_replacement={
-                        "attention.attention.num_attention_heads":
-                            self.model.config.num_attention_heads//self.shard_config.tensor_parallel_size,
-                        "attention.attention.all_head_size":
-                            self.model.config.hidden_size//self.shard_config.tensor_parallel_size,
-                    },
-                    param_replacement=[],
-                    sub_module_replacement=[
-                        SubModuleReplacementDescription(
-                            suffix="attention.attention.query",
-                            target_module=Linear1D_Col,
-                        ),
-                        SubModuleReplacementDescription(
-                            suffix="attention.attention.key",
-                            target_module=Linear1D_Col,
-                        ),
-                        SubModuleReplacementDescription(
-                            suffix="attention.attention.value",
-                            target_module=Linear1D_Col,
-                        ),
-                        SubModuleReplacementDescription(
-                            suffix="attention.attention.dropout",
-                            target_module=DropoutForParallelInput,
-                        ),
-                        SubModuleReplacementDescription(
-                            suffix="attention.output.dense",
-                            target_module=Linear1D_Row,
-                        ),
-                        SubModuleReplacementDescription(
-                            suffix="attention.output.dropout",
-                            target_module=DropoutForReplicatedInput,
-                        ),
-                        SubModuleReplacementDescription(
-                            suffix="intermediate.dense",
-                            target_module=Linear1D_Col,
-                        ),
-                        SubModuleReplacementDescription(
-                            suffix="output.dense",
-                            target_module=Linear1D_Row,
-                        ),
-                        SubModuleReplacementDescription(
-                            suffix="output.dropout",
-                            target_module=DropoutForReplicatedInput,
-                        ),
-                    ]
-                )
+                                                            param_replacement=[],
+                                                            sub_module_replacement=[
+                                                                SubModuleReplacementDescription(
+                                                                    suffix="dropout",
+                                                                    target_module=DropoutForReplicatedInput,
+                                                                )
+                                                            ])
+
+            policy[ViTLayer] = ModulePolicyDescription(attribute_replacement={
+                "attention.attention.num_attention_heads":
+                    self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size,
+                "attention.attention.all_head_size":
+                    self.model.config.hidden_size // self.shard_config.tensor_parallel_size,
+            },
+                                                       param_replacement=[],
+                                                       sub_module_replacement=[
+                                                           SubModuleReplacementDescription(
+                                                               suffix="attention.attention.query",
+                                                               target_module=Linear1D_Col,
+                                                           ),
+                                                           SubModuleReplacementDescription(
+                                                               suffix="attention.attention.key",
+                                                               target_module=Linear1D_Col,
+                                                           ),
+                                                           SubModuleReplacementDescription(
+                                                               suffix="attention.attention.value",
+                                                               target_module=Linear1D_Col,
+                                                           ),
+                                                           SubModuleReplacementDescription(
+                                                               suffix="attention.attention.dropout",
+                                                               target_module=DropoutForParallelInput,
+                                                           ),
+                                                           SubModuleReplacementDescription(
+                                                               suffix="attention.output.dense",
+                                                               target_module=Linear1D_Row,
+                                                           ),
+                                                           SubModuleReplacementDescription(
+                                                               suffix="attention.output.dropout",
+                                                               target_module=DropoutForReplicatedInput,
+                                                           ),
+                                                           SubModuleReplacementDescription(
+                                                               suffix="intermediate.dense",
+                                                               target_module=Linear1D_Col,
+                                                           ),
+                                                           SubModuleReplacementDescription(
+                                                               suffix="output.dense",
+                                                               target_module=Linear1D_Row,
+                                                           ),
+                                                           SubModuleReplacementDescription(
+                                                               suffix="output.dropout",
+                                                               target_module=DropoutForReplicatedInput,
+                                                           ),
+                                                       ])
+
+        if self.shard_config.enable_fused_normalization:
+            policy[ViTModel] = ModulePolicyDescription(attribute_replacement={},
+                                                       param_replacement=[],
+                                                       sub_module_replacement=[
+                                                           SubModuleReplacementDescription(
+                                                               suffix="layernorm",
+                                                               target_module=FusedLayerNorm,
+                                                           )
+                                                       ])
+
+            self.append_or_create_submodule_replacement(description=[
+                SubModuleReplacementDescription(suffix="layernorm_before", target_module=FusedLayerNorm),
+                SubModuleReplacementDescription(suffix="layernorm_after", target_module=FusedLayerNorm)
+            ],
+                                                        policy=policy,
+                                                        target_key=ViTLayer)
 
         return policy
-  
-    
+
     def new_model_class(self):
         return None
 
     def postprocess(self):
         return self.model
 
+
 class ViTForImageClassificationPolicy(ViTPolicy):
 
-     def module_policy(self):
+    def module_policy(self):
         from transformers.models.vit.modeling_vit import ViTForImageClassification
 
         policy = super().module_policy()
         if self.shard_config.enable_tensor_parallelism:
             new_item = {
                 ViTForImageClassification:
-                ModulePolicyDescription(sub_module_replacement=[
-                                        SubModuleReplacementDescription(suffix="classifier",
-                                                                            target_module=Linear1D_Col,
-                                                                            kwargs=dict(gather_output=True))
-                                        ])
+                    ModulePolicyDescription(sub_module_replacement=[
+                        SubModuleReplacementDescription(
+                            suffix="classifier", target_module=Linear1D_Col, kwargs=dict(gather_output=True))
+                    ])
             }
             policy.update(new_item)
         return policy
 
+
 class ViTForMaskedImageModelingPolicy(ViTPolicy):
-    
+
     def module_policy(self):
         policy = super().module_policy()
         return policy
-    
-
-        
-
diff --git a/tests/kit/model_zoo/transformers/chatglm.py b/tests/kit/model_zoo/transformers/chatglm.py
index 1408babede64..04e73a832abe 100644
--- a/tests/kit/model_zoo/transformers/chatglm.py
+++ b/tests/kit/model_zoo/transformers/chatglm.py
@@ -3,7 +3,7 @@
 
 from ..registry import ModelAttribute, model_zoo
 from .chatglm2_6b.configuration_chatglm import ChatGLMConfig
-from .chatglm2_6b.modeling_chatglm import ChatGLMModel
+from .chatglm2_6b.modeling_chatglm import ChatGLMForConditionalGeneration, ChatGLMModel
 
 # ================================
 # Register single-sentence ChatGLM
@@ -21,7 +21,7 @@ def data_gen():
 
 # define loss function
 loss_fn_for_chatglm_model = lambda x: x.last_hidden_state.mean()
-loss_fn = lambda x: x.loss
+loss_fn = lambda x: x.logits.mean()
 config = ChatGLMConfig(num_layers=1,
                        padded_vocab_size=65024,
                        hidden_size=64,
@@ -36,3 +36,10 @@ def data_gen():
                    output_transform_fn=output_transform_fn,
                    loss_fn=loss_fn_for_chatglm_model,
                    model_attribute=ModelAttribute(has_control_flow=True))
+
+model_zoo.register(name="transformers_chatglm_for_conditional_generation",
+                   model_fn=lambda: ChatGLMForConditionalGeneration(config, empty_init=False),
+                   data_gen_fn=data_gen,
+                   output_transform_fn=output_transform_fn,
+                   loss_fn=loss_fn,
+                   model_attribute=ModelAttribute(has_control_flow=True))
diff --git a/tests/test_shardformer/test_model/test_shard_chatglm.py b/tests/test_shardformer/test_model/test_shard_chatglm.py
index 2cdf5da2e6da..a0fa4bd82e74 100644
--- a/tests/test_shardformer/test_model/test_shard_chatglm.py
+++ b/tests/test_shardformer/test_model/test_shard_chatglm.py
@@ -7,7 +7,7 @@
 import colossalai
 from colossalai.logging import disable_existing_loggers
 from colossalai.shardformer import ShardConfig, ShardFormer
-from colossalai.shardformer.policies.chatglm import ChatGLMModelPolicy
+from colossalai.shardformer.policies.chatglm import ChatGLMForConditionalGenerationPolicy, ChatGLMModelPolicy
 from colossalai.tensor.d_tensor.api import is_customized_distributed_tensor, is_distributed_tensor
 from colossalai.testing import (
     assert_hf_output_close,
@@ -85,6 +85,8 @@ def run_chatglm_test(enable_fused_normalization, enable_tensor_parallelism):
         shard_former = ShardFormer(shard_config=shard_config)
         if name == "transformers_chatglm":
             sharded_model = shard_former.optimize(model_copy, ChatGLMModelPolicy()).cuda()
+        else:
+            sharded_model = shard_former.optimize(model_copy, ChatGLMForConditionalGenerationPolicy()).cuda()
 
         check_forward_backward(org_model, sharded_model, data_gen_fn, output_transform_fn, loss_fn)
     torch.cuda.empty_cache()

From f48a8bbd7995944796c52050039541903708b8f1 Mon Sep 17 00:00:00 2001
From: FoolPlayer <45593998+FoolPlayer@users.noreply.github.com>
Date: Tue, 25 Jul 2023 14:29:10 +0800
Subject: [PATCH 18/21] [shardformer] support Blip2 (#4243)

* support base blip2

* add support for downstream blip2 model

* update readme

* add forward injection

* skip not compatible models test

* fix test for gemini and low_level_zero_plugin
---
 colossalai/shardformer/README.md              |   3 +-
 colossalai/shardformer/modeling/blip2.py      |  60 ++++
 colossalai/shardformer/modeling/sam.py        |   2 -
 colossalai/shardformer/policies/autopolicy.py |   6 +
 colossalai/shardformer/policies/blip2.py      | 304 ++++++++++++++++++
 tests/kit/model_zoo/transformers/__init__.py  |   1 +
 tests/kit/model_zoo/transformers/blip2.py     |  61 ++++
 .../test_plugin/test_gemini_plugin.py         |   3 +-
 .../test_plugin/test_low_level_zero_plugin.py |   3 +-
 tests/test_lazy/test_distribute.py            |   3 +-
 .../test_model/test_shard_blip2.py            | 107 ++++++
 11 files changed, 547 insertions(+), 6 deletions(-)
 create mode 100644 colossalai/shardformer/modeling/blip2.py
 create mode 100644 colossalai/shardformer/policies/blip2.py
 create mode 100644 tests/kit/model_zoo/transformers/blip2.py
 create mode 100644 tests/test_shardformer/test_model/test_shard_blip2.py

diff --git a/colossalai/shardformer/README.md b/colossalai/shardformer/README.md
index 21b7bf05f923..aa1d553d8d66 100644
--- a/colossalai/shardformer/README.md
+++ b/colossalai/shardformer/README.md
@@ -104,7 +104,8 @@ We will follow this roadmap to develop Shardformer:
     - [ ] Audio
       - [x] Whisper
     - [ ] Multi-modal
-      - [ ] To be added
+      - [x] SAM
+      - [x] BLIP-2
 
 ## 💡 API Design
 
diff --git a/colossalai/shardformer/modeling/blip2.py b/colossalai/shardformer/modeling/blip2.py
new file mode 100644
index 000000000000..b7945423ae83
--- /dev/null
+++ b/colossalai/shardformer/modeling/blip2.py
@@ -0,0 +1,60 @@
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+
+def forward_fn():
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+
+        bsz, tgt_len, embed_dim = hidden_states.size()
+
+        mixed_qkv = self.qkv(hidden_states)
+
+        # modified from original code, which is:
+        # mixed_qkv = mixed_qkv.reshape(bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads).permute(
+        #     2, 0, 3, 1, 4
+        # )
+        # to:
+        mixed_qkv = mixed_qkv.reshape(bsz, tgt_len, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+        query_states, key_states, value_states = (
+            mixed_qkv[0],
+            mixed_qkv[1],
+            mixed_qkv[2],
+        )
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2))
+
+        attention_scores = attention_scores * self.scale
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_states).permute(0, 2, 1, 3)
+
+        new_context_layer_shape = context_layer.size()[:-2] + (self.embed_dim,)
+        context_layer = context_layer.reshape(new_context_layer_shape)
+
+        output = self.projection(context_layer)
+
+        outputs = (output, attention_probs) if output_attentions else (output, None)
+
+        return outputs
+
+    return forward
diff --git a/colossalai/shardformer/modeling/sam.py b/colossalai/shardformer/modeling/sam.py
index 00e2d744e219..63ebfe89d5fa 100644
--- a/colossalai/shardformer/modeling/sam.py
+++ b/colossalai/shardformer/modeling/sam.py
@@ -1,6 +1,4 @@
 import torch
-import torch.distributed as dist
-from torch.distributed import ProcessGroup
 
 
 def forward_fn():
diff --git a/colossalai/shardformer/policies/autopolicy.py b/colossalai/shardformer/policies/autopolicy.py
index 77583dd77cf0..55f99ed83819 100644
--- a/colossalai/shardformer/policies/autopolicy.py
+++ b/colossalai/shardformer/policies/autopolicy.py
@@ -112,6 +112,12 @@ class PolicyLocation:
     # Sam
     "transformers.models.sam.modeling_sam.SamModel":
         PolicyLocation(file_name="sam", class_name="SamModelPolicy"),
+
+    # Blip2
+    "transformers.models.blip_2.modeling_blip_2.Blip2Model":
+        PolicyLocation(file_name="blip2", class_name="Blip2ModelPolicy"),
+    "transformers.models.blip_2.modeling_blip_2.Blip2ForConditionalGeneration":
+        PolicyLocation(file_name="blip2", class_name="Blip2ForConditionalGenerationPolicy"),
 }
 
 
diff --git a/colossalai/shardformer/policies/blip2.py b/colossalai/shardformer/policies/blip2.py
new file mode 100644
index 000000000000..43aa1adc1c5b
--- /dev/null
+++ b/colossalai/shardformer/policies/blip2.py
@@ -0,0 +1,304 @@
+import torch.nn as nn
+
+import colossalai.shardformer.layer as col_nn
+
+from .._utils import getattr_, setattr_
+from ..modeling.blip2 import forward_fn
+from .basepolicy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
+
+__all__ = ['BlipPolicy', 'BlipModelPolicy']
+
+
+class BlipPolicy(Policy):
+
+    def config_sanity_check(self):
+        pass
+
+    def preprocess(self):
+        # reshape the embedding layer
+        r"""
+        Resize the token embedding so that the vocabulary size is divisible by the tensor parallel size
+        """
+        # TODO:
+        vocab_size = self.model.config.qformer_config.vocab_size
+        world_size = self.shard_config.tensor_parallel_size
+        if vocab_size % world_size != 0:
+            new_vocab_size = vocab_size + world_size - vocab_size % world_size
+            self.model.resize_token_embeddings(new_vocab_size)
+        return self.model
+
+    def module_policy(self):
+        from transformers.models.blip_2.modeling_blip_2 import (
+            Blip2Attention,
+            Blip2EncoderLayer,
+            Blip2QFormerLayer,
+            Blip2QFormerModel,
+            Blip2VisionModel,
+        )
+        from transformers.models.opt.modeling_opt import OPTDecoderLayer, OPTForCausalLM
+
+        policy = {}
+
+        if self.shard_config.enable_tensor_parallelism:
+            policy[Blip2EncoderLayer] = ModulePolicyDescription(attribute_replacement={
+                "self_attn.num_heads":
+                    self.model.config.vision_config.num_attention_heads // self.shard_config.tensor_parallel_size,
+                "self_attn.embed_dim":
+                    self.model.config.vision_config.hidden_size // self.shard_config.tensor_parallel_size,
+            },
+                                                                sub_module_replacement=[
+                                                                    SubModuleReplacementDescription(
+                                                                        suffix="self_attn.dropout",
+                                                                        target_module=col_nn.DropoutForParallelInput,
+                                                                    ),
+                                                                    SubModuleReplacementDescription(
+                                                                        suffix="self_attn.qkv",
+                                                                        target_module=col_nn.FusedLinear1D_Col,
+                                                                        kwargs={
+                                                                            "n_fused": 3,
+                                                                        }),
+                                                                    SubModuleReplacementDescription(
+                                                                        suffix="self_attn.projection",
+                                                                        target_module=col_nn.Linear1D_Row,
+                                                                    ),
+                                                                    SubModuleReplacementDescription(
+                                                                        suffix="mlp.fc1",
+                                                                        target_module=col_nn.Linear1D_Col,
+                                                                    ),
+                                                                    SubModuleReplacementDescription(
+                                                                        suffix="mlp.fc2",
+                                                                        target_module=col_nn.Linear1D_Row,
+                                                                    ),
+                                                                ])
+
+            policy[Blip2QFormerModel] = ModulePolicyDescription(sub_module_replacement=[
+                SubModuleReplacementDescription(
+                    suffix="dropout",
+                    target_module=col_nn.DropoutForParallelInput,
+                ),
+            ])
+
+            policy[Blip2QFormerLayer] = ModulePolicyDescription(attribute_replacement={
+                "attention.attention.num_attention_heads":
+                    self.model.config.qformer_config.num_attention_heads // self.shard_config.tensor_parallel_size,
+                "attention.attention.all_head_size":
+                    self.model.config.qformer_config.hidden_size // self.shard_config.tensor_parallel_size,
+                "crossattention.attention.num_attention_heads":
+                    self.model.config.qformer_config.num_attention_heads // self.shard_config.tensor_parallel_size,
+                "crossattention.attention.all_head_size":
+                    self.model.config.qformer_config.hidden_size // self.shard_config.tensor_parallel_size,
+            },
+                                                                sub_module_replacement=[
+                                                                    SubModuleReplacementDescription(
+                                                                        suffix="attention.attention.query",
+                                                                        target_module=col_nn.Linear1D_Col,
+                                                                    ),
+                                                                    SubModuleReplacementDescription(
+                                                                        suffix="attention.attention.key",
+                                                                        target_module=col_nn.Linear1D_Col,
+                                                                    ),
+                                                                    SubModuleReplacementDescription(
+                                                                        suffix="attention.attention.value",
+                                                                        target_module=col_nn.Linear1D_Col,
+                                                                    ),
+                                                                    SubModuleReplacementDescription(
+                                                                        suffix="attention.attention.dropout",
+                                                                        target_module=col_nn.DropoutForParallelInput,
+                                                                    ),
+                                                                    SubModuleReplacementDescription(
+                                                                        suffix="attention.output.dense",
+                                                                        target_module=col_nn.Linear1D_Row,
+                                                                    ),
+                                                                    SubModuleReplacementDescription(
+                                                                        suffix="attention.output.dropout",
+                                                                        target_module=col_nn.DropoutForParallelInput,
+                                                                    ),
+                                                                    SubModuleReplacementDescription(
+                                                                        suffix="crossattention.attention.query",
+                                                                        target_module=col_nn.Linear1D_Col,
+                                                                    ),
+                                                                    SubModuleReplacementDescription(
+                                                                        suffix="crossattention.attention.key",
+                                                                        target_module=col_nn.Linear1D_Col,
+                                                                    ),
+                                                                    SubModuleReplacementDescription(
+                                                                        suffix="crossattention.attention.value",
+                                                                        target_module=col_nn.Linear1D_Col,
+                                                                    ),
+                                                                    SubModuleReplacementDescription(
+                                                                        suffix="crossattention.attention.dropout",
+                                                                        target_module=col_nn.DropoutForParallelInput,
+                                                                    ),
+                                                                    SubModuleReplacementDescription(
+                                                                        suffix="crossattention.output.dense",
+                                                                        target_module=col_nn.Linear1D_Row,
+                                                                    ),
+                                                                    SubModuleReplacementDescription(
+                                                                        suffix="crossattention.output.dropout",
+                                                                        target_module=col_nn.DropoutForParallelInput,
+                                                                    ),
+                                                                    SubModuleReplacementDescription(
+                                                                        suffix="intermediate_query.dense",
+                                                                        target_module=col_nn.Linear1D_Col,
+                                                                    ),
+                                                                    SubModuleReplacementDescription(
+                                                                        suffix="output_query.dense",
+                                                                        target_module=col_nn.Linear1D_Row,
+                                                                    ),
+                                                                    SubModuleReplacementDescription(
+                                                                        suffix="output_query.dropout",
+                                                                        target_module=col_nn.DropoutForParallelInput,
+                                                                    )
+                                                                ])
+
+            policy[OPTDecoderLayer] = ModulePolicyDescription(attribute_replacement={
+                "self_attn.embed_dim":
+                    self.model.config.text_config.hidden_size // self.shard_config.tensor_parallel_size,
+                "self_attn.num_heads":
+                    self.model.config.text_config.num_attention_heads // self.shard_config.tensor_parallel_size
+            },
+                                                              sub_module_replacement=[
+                                                                  SubModuleReplacementDescription(
+                                                                      suffix="self_attn.q_proj",
+                                                                      target_module=col_nn.Linear1D_Col,
+                                                                  ),
+                                                                  SubModuleReplacementDescription(
+                                                                      suffix="self_attn.k_proj",
+                                                                      target_module=col_nn.Linear1D_Col,
+                                                                  ),
+                                                                  SubModuleReplacementDescription(
+                                                                      suffix="self_attn.v_proj",
+                                                                      target_module=col_nn.Linear1D_Col,
+                                                                  ),
+                                                                  SubModuleReplacementDescription(
+                                                                      suffix="self_attn.out_proj",
+                                                                      target_module=col_nn.Linear1D_Row,
+                                                                  ),
+                                                                  SubModuleReplacementDescription(
+                                                                      suffix="fc1",
+                                                                      target_module=col_nn.Linear1D_Col,
+                                                                  ),
+                                                                  SubModuleReplacementDescription(
+                                                                      suffix="fc2",
+                                                                      target_module=col_nn.Linear1D_Row,
+                                                                  )
+                                                              ])
+
+            policy[OPTForCausalLM] = ModulePolicyDescription(sub_module_replacement=[
+                SubModuleReplacementDescription(
+                    suffix="model.decoder.embed_tokens",
+                    target_module=col_nn.VocabParallelEmbedding1D,
+                ),
+                SubModuleReplacementDescription(
+                    suffix="lm_head",
+                    target_module=col_nn.Linear1D_Col,
+                    kwargs={"gather_output": True},
+                ),
+            ])
+
+            policy[Blip2Attention] = ModulePolicyDescription(method_replacement={"forward": forward_fn()})
+
+        # optimization configuration
+        if self.shard_config.enable_fused_normalization:
+            # Handle Blip2EncoderLayer layer
+            self.append_or_create_submodule_replacement(description=[
+                SubModuleReplacementDescription(
+                    suffix="layer_norm1",
+                    target_module=col_nn.FusedLayerNorm,
+                ),
+                SubModuleReplacementDescription(
+                    suffix="layer_norm2",
+                    target_module=col_nn.FusedLayerNorm,
+                )
+            ],
+                                                        policy=policy,
+                                                        target_key=Blip2EncoderLayer)
+
+            # handle Blip2VisionModel layer
+            self.append_or_create_submodule_replacement(description=[
+                SubModuleReplacementDescription(
+                    suffix="post_layernorm",
+                    target_module=col_nn.FusedLayerNorm,
+                )
+            ],
+                                                        policy=policy,
+                                                        target_key=Blip2VisionModel)
+
+            # handle Blip2QFormerModel layer
+            self.append_or_create_submodule_replacement(
+                description=[SubModuleReplacementDescription(
+                    suffix="layernorm",
+                    target_module=col_nn.FusedLayerNorm,
+                )],
+                policy=policy,
+                target_key=Blip2QFormerModel)
+
+            # handle Blip2QFormerLayer layer
+            self.append_or_create_submodule_replacement(description=[
+                SubModuleReplacementDescription(
+                    suffix="attention.output.LayerNorm",
+                    target_module=col_nn.FusedLayerNorm,
+                ),
+                SubModuleReplacementDescription(
+                    suffix="crossattention.output.LayerNorm",
+                    target_module=col_nn.FusedLayerNorm,
+                ),
+                SubModuleReplacementDescription(
+                    suffix="output_query.LayerNorm",
+                    target_module=col_nn.FusedLayerNorm,
+                )
+            ],
+                                                        policy=policy,
+                                                        target_key=Blip2QFormerLayer)
+
+            # handle OPTForCausalLM layer
+            self.append_or_create_submodule_replacement(description=[
+                SubModuleReplacementDescription(
+                    suffix="model.decoder.final_layer_norm",
+                    target_module=col_nn.FusedLayerNorm,
+                )
+            ],
+                                                        policy=policy,
+                                                        target_key=OPTForCausalLM)
+
+            # handle OPTDecoderLayer layer
+            self.append_or_create_submodule_replacement(description=[
+                SubModuleReplacementDescription(
+                    suffix="self_attn_layer_norm",
+                    target_module=col_nn.FusedLayerNorm,
+                ),
+                SubModuleReplacementDescription(
+                    suffix="final_layer_norm",
+                    target_module=col_nn.FusedLayerNorm,
+                )
+            ],
+                                                        policy=policy,
+                                                        target_key=OPTDecoderLayer)
+
+        return policy
+
+    def postprocess(self):
+        binding_map = {
+            'language_model.model.decoder.embed_tokens': 'language_model.lm_head',
+        }
+
+        for k, v in binding_map.items():
+            src_mod = getattr_(self.model, k)
+            dst_mod = getattr_(self.model, v)
+            dst_mod.weight = src_mod.weight
+
+        return self.model
+
+
+# Blip2Model
+class Blip2ModelPolicy(BlipPolicy):
+
+    def __init__(self) -> None:
+        super().__init__()
+
+
+# Blip2ForConditionalGeneration
+class Blip2ForConditionalGenerationPolicy(BlipPolicy):
+
+    def __init__(self) -> None:
+        super().__init__()
diff --git a/tests/kit/model_zoo/transformers/__init__.py b/tests/kit/model_zoo/transformers/__init__.py
index 08a118e5783d..823ca032fc30 100644
--- a/tests/kit/model_zoo/transformers/__init__.py
+++ b/tests/kit/model_zoo/transformers/__init__.py
@@ -1,5 +1,6 @@
 from .albert import *
 from .bert import *
+from .blip2 import *
 from .bloom import *
 from .chatglm import *
 from .gpt import *
diff --git a/tests/kit/model_zoo/transformers/blip2.py b/tests/kit/model_zoo/transformers/blip2.py
new file mode 100644
index 000000000000..7338f740be7f
--- /dev/null
+++ b/tests/kit/model_zoo/transformers/blip2.py
@@ -0,0 +1,61 @@
+import torch
+import transformers
+
+from ..registry import ModelAttribute, model_zoo
+
+# ===============================
+# Register Blip2 models
+# ===============================
+
+
+# define data gen function
+def data_gen():
+    # Generated from following code snippet
+    #
+    # from PIL import Image
+    # import requests
+    # from transformers import Blip2Processor, Blip2Model
+    # import torch
+
+    # processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
+    # url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+    # image = Image.open(requests.get(url, stream=True).raw)
+
+    # prompt = "Question: how many cats are there? Answer:"
+    # inputs = processor(images=image, text=prompt, return_tensors="pt").to(device, torch.float16)
+
+    pixel_values = torch.rand(1, 3, 224, 224, dtype=torch.float32)
+    input_ids = torch.tensor([[2, 45641, 35, 141, 171, 10017, 32, 89, 116, 31652, 35]], dtype=torch.int64)
+    attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=torch.int64)
+    labels = torch.tensor([[34, 56]], dtype=torch.int64)
+    return dict(pixel_values=pixel_values, input_ids=input_ids, attention_mask=attention_mask, labels=labels)
+
+
+# define output transform function
+output_transform_fn = lambda x: x
+
+# define loss function
+loss_fn_blip2_model = lambda x: x.loss
+
+config = transformers.Blip2Config()
+config.text_config.num_hidden_layers = 1
+config.qformer_config.num_hidden_layers = 1
+config.vision_config.num_hidden_layers = 1
+config.qformer_config.attention_probs_dropout_prob = 0
+config.qformer_config.hidden_dropout_prob = 0
+config.text_config.dropout = 0
+
+# register the blip2 variants
+model_zoo.register(name='transformers_blip2',
+                   model_fn=lambda: transformers.Blip2Model(config),
+                   data_gen_fn=data_gen,
+                   output_transform_fn=output_transform_fn,
+                   loss_fn=loss_fn_blip2_model,
+                   model_attribute=ModelAttribute(has_control_flow=True))
+
+model_zoo.register(name='transformers_blip2_conditional_gerneration',
+                   model_fn=lambda: transformers.Blip2ForConditionalGeneration(config),
+                   data_gen_fn=data_gen,
+                   output_transform_fn=output_transform_fn,
+                   loss_fn=loss_fn_blip2_model,
+                   model_attribute=ModelAttribute(has_control_flow=True))
diff --git a/tests/test_booster/test_plugin/test_gemini_plugin.py b/tests/test_booster/test_plugin/test_gemini_plugin.py
index d29c92926066..27f8bbd136b8 100644
--- a/tests/test_booster/test_plugin/test_gemini_plugin.py
+++ b/tests/test_booster/test_plugin/test_gemini_plugin.py
@@ -88,7 +88,8 @@ def check_gemini_plugin(init_method: str = 'none', early_stop: bool = True):
                 'torchvision_vit_b_16', 'torchvision_convnext_base', 'torchvision_swin_s', 'transformers_albert',
                 'transformers_albert_for_pretraining', 'transformers_bert', 'transformers_bert_for_pretraining',
                 'transformers_gpt_double_heads', 'torchaudio_hubert_base', 'torchaudio_wav2vec2_base',
-                'transformers_t5_for_conditional_generation', 'transformers_t5', 'transformers_t5_encoder_model'
+                'transformers_t5_for_conditional_generation', 'transformers_t5', 'transformers_t5_encoder_model',
+                'transformers_blip2', 'transformers_vit', 'transformers_vit_for_masked_image_modeling'
         ]:
             continue
 
diff --git a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py
index eedd8c59a3a8..26157be9a2c5 100644
--- a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py
+++ b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py
@@ -17,7 +17,8 @@
 # These models will get stuck
 _STUCK_MODELS = [
     'diffusers_vq_model', 'transformers_albert', 'transformers_albert_for_pretraining', 'transformers_bert',
-    'transformers_bert_for_pretraining', 'transformers_gpt_double_heads'
+    'transformers_bert_for_pretraining', 'transformers_gpt_double_heads', 'transformers_sam',
+    'transformers_bert_lm_head_model', 'transformers_bert_for_masked_lm', 'transformers_vit'
 ]
 
 
diff --git a/tests/test_lazy/test_distribute.py b/tests/test_lazy/test_distribute.py
index 622d9deb601d..461145a1c91b 100644
--- a/tests/test_lazy/test_distribute.py
+++ b/tests/test_lazy/test_distribute.py
@@ -71,7 +71,8 @@ def run_dist_lazy_init(subset, seed: int = 42):
 
     for name, entry in sub_model_zoo.items():
         # TODO(ver217): lazy init does not support weight norm, skip these models
-        if name in ('torchaudio_wav2vec2_base', 'torchaudio_hubert_base') or name.startswith('transformers_llama'):
+        if name in ('torchaudio_wav2vec2_base', 'torchaudio_hubert_base') or name.startswith(
+            ('transformers_llama', 'transformers_blip2')):
             continue
         print_rank_0(name)
         model_fn, data_gen_fn, output_transform_fn, _, model_attr = entry
diff --git a/tests/test_shardformer/test_model/test_shard_blip2.py b/tests/test_shardformer/test_model/test_shard_blip2.py
new file mode 100644
index 000000000000..f96299e55a49
--- /dev/null
+++ b/tests/test_shardformer/test_model/test_shard_blip2.py
@@ -0,0 +1,107 @@
+import pytest
+import torch
+
+import colossalai
+from colossalai.logging import disable_existing_loggers
+from colossalai.tensor.d_tensor.api import is_customized_distributed_tensor, is_distributed_tensor
+from colossalai.testing import (
+    assert_hf_output_close,
+    clear_cache_before_run,
+    parameterize,
+    rerun_if_address_is_in_use,
+    spawn,
+)
+from tests.kit.model_zoo import model_zoo
+from tests.test_shardformer.test_model._utils import build_model, run_forward
+
+
+def check_forward_backward(org_model, sharded_model, data_gen_fn, output_transform_fn, loss_fn):
+    # check forward
+    org_output, org_loss, shard_output, shard_loss = run_forward(org_model, sharded_model, data_gen_fn,
+                                                                 output_transform_fn, loss_fn)
+    assert_hf_output_close(org_output, shard_output, ignore_keys=['past_key_values'])
+
+    # do backward
+    org_loss.backward()
+    shard_loss.backward()
+
+    assert torch.allclose(org_loss, shard_loss,
+                          atol=1e-5), f"shard model loss is not equal to orgin model loss\n{org_loss}\n{shard_loss}"
+
+    # check grad
+
+    blip2 = org_model
+    sharded_blip2 = sharded_model
+
+    # compare vision_model grad
+
+    org_grad = blip2.vision_model.encoder.layers[0].self_attn.qkv.weight.grad
+    shard_grad = sharded_blip2.vision_model.encoder.layers[0].self_attn.qkv.weight.grad
+    shard_weight = sharded_blip2.vision_model.encoder.layers[0].self_attn.qkv.weight
+
+    if is_distributed_tensor(shard_weight) or is_customized_distributed_tensor(shard_weight):
+        shard_grad_list = [torch.zeros([*shard_grad.shape]).to('cuda') for _ in range(2)]
+        shard_grad = torch.distributed.all_gather(shard_grad_list, shard_grad)
+        all_shard_grad = torch.cat(shard_grad_list, dim=0)
+    else:
+        all_shard_grad = shard_grad
+    assert torch.allclose(org_grad, all_shard_grad,
+                          atol=1e-5), f"shard model grad is not equal to orgin model grad\n{org_grad}\n{all_shard_grad}"
+
+    # compare qformer grad
+    org_grad = blip2.qformer.encoder.layer[0].attention.attention.query.weight.grad
+    shard_grad = sharded_blip2.qformer.encoder.layer[0].attention.attention.query.weight.grad
+    shard_weight = sharded_blip2.qformer.encoder.layer[0].attention.attention.query.weight
+
+    if is_distributed_tensor(shard_weight) or is_customized_distributed_tensor(shard_weight):
+        shard_grad_list = [torch.zeros([*shard_grad.shape]).to('cuda') for _ in range(2)]
+        shard_grad = torch.distributed.all_gather(shard_grad_list, shard_grad)
+        all_shard_grad = torch.cat(shard_grad_list, dim=0)
+    else:
+        all_shard_grad = shard_grad
+
+    assert torch.allclose(org_grad, all_shard_grad,
+                          atol=1e-5), f"shard model grad is not equal to orgin model grad\n{org_grad}\n{all_shard_grad}"
+
+    # compare language_model grad
+    org_grad = blip2.language_model.model.decoder.layers[0].self_attn.k_proj.weight.grad
+    shard_grad = sharded_blip2.language_model.model.decoder.layers[0].self_attn.k_proj.weight.grad
+    shard_weight = sharded_blip2.language_model.model.decoder.layers[0].self_attn.k_proj.weight
+
+    if is_distributed_tensor(shard_weight) or is_customized_distributed_tensor(shard_weight):
+        shard_grad_list = [torch.zeros([*shard_grad.shape]).to('cuda') for _ in range(2)]
+        shard_grad = torch.distributed.all_gather(shard_grad_list, shard_grad)
+        all_shard_grad = torch.cat(shard_grad_list, dim=0)
+    else:
+        all_shard_grad = shard_grad
+
+    assert torch.allclose(org_grad, all_shard_grad,
+                          atol=1e-5), f"shard model grad is not equal to orgin model grad\n{org_grad}\n{all_shard_grad}"
+
+
+@parameterize('enable_fused_normalization', [True, False])
+@parameterize('enable_tensor_parallelism', [True, False])
+def run_blip2_test(enable_fused_normalization, enable_tensor_parallelism):
+    sub_model_zoo = model_zoo.get_sub_registry('transformers_blip2')
+    for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
+        org_model, sharded_model = build_model(model_fn, enable_fused_normalization, enable_tensor_parallelism)
+        check_forward_backward(org_model, sharded_model, data_gen_fn, output_transform_fn, loss_fn)
+
+    torch.cuda.empty_cache()
+
+
+def check_blip2(rank, world_size, port):
+    disable_existing_loggers()
+    colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    run_blip2_test()
+
+
+@pytest.mark.dist
+@rerun_if_address_is_in_use()
+@clear_cache_before_run()
+def test_blip2():
+    spawn(check_blip2, 2)
+
+
+if __name__ == "__main__":
+    test_blip2()

From bb3a5899d6791437559032d493bf79b40628fdbd Mon Sep 17 00:00:00 2001
From: Mingyan Jiang <1829166702@qq.com>
Date: Tue, 25 Jul 2023 19:13:27 +0800
Subject: [PATCH 19/21] [shardformer] blip2 support flash attention and jit
 operator

---
 colossalai/shardformer/modeling/blip2.py      | 60 +++++++++++++++++++
 colossalai/shardformer/policies/blip2.py      | 32 +++++++++-
 tests/kit/model_zoo/transformers/blip2.py     |  6 +-
 .../test_model/test_shard_blip2.py            |  7 ++-
 .../test_model/test_shard_vit.py              |  4 +-
 5 files changed, 101 insertions(+), 8 deletions(-)

diff --git a/colossalai/shardformer/modeling/blip2.py b/colossalai/shardformer/modeling/blip2.py
index b7945423ae83..c5c6b14ba993 100644
--- a/colossalai/shardformer/modeling/blip2.py
+++ b/colossalai/shardformer/modeling/blip2.py
@@ -1,3 +1,4 @@
+import math
 from typing import Optional, Tuple, Union
 
 import torch
@@ -58,3 +59,62 @@ def forward(
         return outputs
 
     return forward
+
+
+def get_blip2_flash_attention_forward():
+
+    from transformers.models.blip_2.modeling_blip_2 import Blip2Attention
+
+    from colossalai.kernel.cuda_native.flash_attention import AttnMaskType, ColoAttention
+
+    def forward(
+        self: Blip2Attention,
+        hidden_states: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+
+        bsz, tgt_len, embed_dim = hidden_states.size()
+        mixed_qkv = self.qkv(hidden_states)
+        mixed_qkv = mixed_qkv.reshape(bsz, tgt_len, 3, self.num_heads, -1).permute(2, 0, 1, 3, 4)
+        query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2]
+
+        attention = ColoAttention(embed_dim=self.embed_dim,
+                                  num_heads=self.num_heads,
+                                  dropout=self.dropout.p,
+                                  scale=self.scale)
+        context_layer = attention(query_states, key_states, value_states)
+
+        output = self.projection(context_layer)
+        outputs = (output, None)
+
+        return outputs
+
+    return forward
+
+
+def get_jit_fused_blip2_QFormer_self_output_forward():
+
+    from transformers.models.blip_2.modeling_blip_2 import Blip2QFormerSelfOutput
+
+    def forward(self: Blip2QFormerSelfOutput, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout_add(hidden_states, input_tensor, self.dropout.p, self.dropout.training)
+        hidden_states = self.LayerNorm(hidden_states)
+        return hidden_states
+
+    return forward
+
+
+def get_jit_fused_blip2_QFormer_output_forward():
+
+    from transformers.models.blip_2.modeling_blip_2 import Blip2QFormerOutput
+
+    def forward(self: Blip2QFormerOutput, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout_add(hidden_states, input_tensor, self.dropout.p, self.dropout.training)
+        hidden_states = self.LayerNorm(hidden_states)
+        return hidden_states
+
+    return forward
diff --git a/colossalai/shardformer/policies/blip2.py b/colossalai/shardformer/policies/blip2.py
index 43aa1adc1c5b..3440fd6f0cc9 100644
--- a/colossalai/shardformer/policies/blip2.py
+++ b/colossalai/shardformer/policies/blip2.py
@@ -3,7 +3,14 @@
 import colossalai.shardformer.layer as col_nn
 
 from .._utils import getattr_, setattr_
-from ..modeling.blip2 import forward_fn
+from ..modeling.blip2 import (
+    forward_fn,
+    get_blip2_flash_attention_forward,
+    get_jit_fused_blip2_QFormer_output_forward,
+    get_jit_fused_blip2_QFormer_self_output_forward,
+)
+from ..modeling.jit import get_jit_fused_dropout_add_func
+from ..modeling.opt import get_opt_flash_attention_forward
 from .basepolicy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
 
 __all__ = ['BlipPolicy', 'BlipModelPolicy']
@@ -33,9 +40,11 @@ def module_policy(self):
             Blip2EncoderLayer,
             Blip2QFormerLayer,
             Blip2QFormerModel,
+            Blip2QFormerOutput,
+            Blip2QFormerSelfOutput,
             Blip2VisionModel,
         )
-        from transformers.models.opt.modeling_opt import OPTDecoderLayer, OPTForCausalLM
+        from transformers.models.opt.modeling_opt import OPTAttention, OPTDecoderLayer, OPTForCausalLM
 
         policy = {}
 
@@ -274,7 +283,26 @@ def module_policy(self):
             ],
                                                         policy=policy,
                                                         target_key=OPTDecoderLayer)
+        # use flash attention
+        if self.shard_config.enable_flash_attention:
+            policy[Blip2Attention] = ModulePolicyDescription(method_replacement={
+                'forward': get_blip2_flash_attention_forward(),
+            })
+            policy[OPTAttention] = ModulePolicyDescription(method_replacement={
+                'forward': get_opt_flash_attention_forward(),
+            })
 
+        # use jit operator
+        if self.shard_config.enable_jit_fused:
+            policy[Blip2QFormerSelfOutput] = ModulePolicyDescription(
+                method_replacement={
+                    'forward': get_jit_fused_blip2_QFormer_self_output_forward(),
+                    'dropout_add': get_jit_fused_dropout_add_func(),
+                })
+            policy[Blip2QFormerOutput] = ModulePolicyDescription(method_replacement={
+                'forward': get_jit_fused_blip2_QFormer_output_forward(),
+                'dropout_add': get_jit_fused_dropout_add_func(),
+            })
         return policy
 
     def postprocess(self):
diff --git a/tests/kit/model_zoo/transformers/blip2.py b/tests/kit/model_zoo/transformers/blip2.py
index 7338f740be7f..f68de942367f 100644
--- a/tests/kit/model_zoo/transformers/blip2.py
+++ b/tests/kit/model_zoo/transformers/blip2.py
@@ -25,8 +25,9 @@ def data_gen():
     # inputs = processor(images=image, text=prompt, return_tensors="pt").to(device, torch.float16)
 
     pixel_values = torch.rand(1, 3, 224, 224, dtype=torch.float32)
-    input_ids = torch.tensor([[2, 45641, 35, 141, 171, 10017, 32, 89, 116, 31652, 35]], dtype=torch.int64)
-    attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=torch.int64)
+    input_ids = torch.tensor([[2, 45641, 35, 141, 171, 10017, 32, 89, 116, 31652, 35, 32, 89, 116, 31652, 35]],
+                             dtype=torch.int64)
+    attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=torch.int64)
     labels = torch.tensor([[34, 56]], dtype=torch.int64)
     return dict(pixel_values=pixel_values, input_ids=input_ids, attention_mask=attention_mask, labels=labels)
 
@@ -38,6 +39,7 @@ def data_gen():
 loss_fn_blip2_model = lambda x: x.loss
 
 config = transformers.Blip2Config()
+config.vision_config.patch_size = 14
 config.text_config.num_hidden_layers = 1
 config.qformer_config.num_hidden_layers = 1
 config.vision_config.num_hidden_layers = 1
diff --git a/tests/test_shardformer/test_model/test_shard_blip2.py b/tests/test_shardformer/test_model/test_shard_blip2.py
index f96299e55a49..0564af329d35 100644
--- a/tests/test_shardformer/test_model/test_shard_blip2.py
+++ b/tests/test_shardformer/test_model/test_shard_blip2.py
@@ -81,10 +81,13 @@ def check_forward_backward(org_model, sharded_model, data_gen_fn, output_transfo
 
 @parameterize('enable_fused_normalization', [True, False])
 @parameterize('enable_tensor_parallelism', [True, False])
-def run_blip2_test(enable_fused_normalization, enable_tensor_parallelism):
+@parameterize('enable_flash_attention', [True, False])
+@parameterize('enable_jit_fused', [True, False])
+def run_blip2_test(enable_fused_normalization, enable_tensor_parallelism, enable_flash_attention, enable_jit_fused):
     sub_model_zoo = model_zoo.get_sub_registry('transformers_blip2')
     for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
-        org_model, sharded_model = build_model(model_fn, enable_fused_normalization, enable_tensor_parallelism)
+        org_model, sharded_model = build_model(model_fn, enable_fused_normalization, enable_tensor_parallelism,
+                                               enable_flash_attention, enable_jit_fused)
         check_forward_backward(org_model, sharded_model, data_gen_fn, output_transform_fn, loss_fn)
 
     torch.cuda.empty_cache()
diff --git a/tests/test_shardformer/test_model/test_shard_vit.py b/tests/test_shardformer/test_model/test_shard_vit.py
index a8048a9bdd12..2b02c83e0d27 100644
--- a/tests/test_shardformer/test_model/test_shard_vit.py
+++ b/tests/test_shardformer/test_model/test_shard_vit.py
@@ -21,7 +21,7 @@ def check_forward_backward(org_model, sharded_model, data_gen_fn, output_transfo
     # check forward
     org_output, org_loss, shard_output, shard_loss = run_forward(org_model, sharded_model, data_gen_fn,
                                                                  output_transform_fn, loss_fn)
-    assert_hf_output_close(org_output, shard_output, atol=1e-4, rtol=1e-4)
+    assert_hf_output_close(org_output, shard_output, atol=1e-3, rtol=1e-3)
     # do backward
     org_loss.backward()
     shard_loss.backward()
@@ -50,7 +50,7 @@ def check_forward_backward(org_model, sharded_model, data_gen_fn, output_transfo
         all_shard_grad = shard_grad
     assert torch.allclose(org_grad, all_shard_grad,
                           atol=1e-5), f"shard model grad is not equal to orgin model grad\n{org_grad}\n{shard_grad}"
-    
+
 
 @parameterize('enable_fused_normalization', [True, False])
 @parameterize('enable_tensor_parallelism', [True, False])

From ce1eccf230566048e15fa59b4b9461012b1ea58f Mon Sep 17 00:00:00 2001
From: Mingyan Jiang <1829166702@qq.com>
Date: Tue, 25 Jul 2023 19:20:27 +0800
Subject: [PATCH 20/21] [shardformer] blip2 support flash attention and jit
 operator

---
 tests/test_shardformer/test_model/test_shard_vit.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_shardformer/test_model/test_shard_vit.py b/tests/test_shardformer/test_model/test_shard_vit.py
index 2b02c83e0d27..13cde0e611ff 100644
--- a/tests/test_shardformer/test_model/test_shard_vit.py
+++ b/tests/test_shardformer/test_model/test_shard_vit.py
@@ -21,7 +21,7 @@ def check_forward_backward(org_model, sharded_model, data_gen_fn, output_transfo
     # check forward
     org_output, org_loss, shard_output, shard_loss = run_forward(org_model, sharded_model, data_gen_fn,
                                                                  output_transform_fn, loss_fn)
-    assert_hf_output_close(org_output, shard_output, atol=1e-3, rtol=1e-3)
+    assert_hf_output_close(org_output, shard_output, atol=1e-4, rtol=1e-4)
     # do backward
     org_loss.backward()
     shard_loss.backward()

From 537e00588a49214716444f817617e04f9598fdce Mon Sep 17 00:00:00 2001
From: Mingyan Jiang <1829166702@qq.com>
Date: Tue, 25 Jul 2023 19:25:46 +0800
Subject: [PATCH 21/21] [shardformer] blip2 support flash attention and jit
 operator

---
 colossalai/shardformer/policies/blip2.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/colossalai/shardformer/policies/blip2.py b/colossalai/shardformer/policies/blip2.py
index 92bafced2ac4..888e209f9e7c 100644
--- a/colossalai/shardformer/policies/blip2.py
+++ b/colossalai/shardformer/policies/blip2.py
@@ -10,7 +10,6 @@
     get_jit_fused_blip2_QFormer_self_output_forward,
 )
 from ..modeling.jit import get_jit_fused_dropout_add_func
-from ..modeling.opt import get_opt_flash_attention_forward
 from .basepolicy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
 
 __all__ = ['BlipPolicy', 'BlipModelPolicy']
@@ -44,7 +43,7 @@ def module_policy(self):
             Blip2QFormerSelfOutput,
             Blip2VisionModel,
         )
-        from transformers.models.opt.modeling_opt import OPTAttention, OPTDecoderLayer, OPTForCausalLM
+        from transformers.models.opt.modeling_opt import OPTDecoderLayer, OPTForCausalLM
 
         policy = {}
 
@@ -289,9 +288,6 @@ def module_policy(self):
             policy[Blip2Attention] = ModulePolicyDescription(method_replacement={
                 'forward': get_blip2_flash_attention_forward(),
             })
-            policy[OPTAttention] = ModulePolicyDescription(method_replacement={
-                'forward': get_opt_flash_attention_forward(),
-            })
 
         # use jit operator
         if self.shard_config.enable_jit_fused: