From 2c2c3cd994a8b4bc78d3216f47280b6c8f2733d3 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Thu, 7 Mar 2024 10:35:27 +0800
Subject: [PATCH 1/8] fix

---
 applications/Colossal-LLaMA-2/train.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/applications/Colossal-LLaMA-2/train.py b/applications/Colossal-LLaMA-2/train.py
index 2e4bab75a085..d97da61e4dc8 100644
--- a/applications/Colossal-LLaMA-2/train.py
+++ b/applications/Colossal-LLaMA-2/train.py
@@ -56,6 +56,7 @@ def format_numel_str(numel: int) -> str:
 
 def all_reduce_mean(tensor: torch.Tensor) -> torch.Tensor:
     dist.all_reduce(tensor=tensor, op=dist.ReduceOp.SUM)
+    tensor = tensor.data
     tensor.div_(dist.get_world_size())
     return tensor
 

From c66002f37644b9d23689d7b22a5f667861585ce8 Mon Sep 17 00:00:00 2001
From: Hongxin Liu <lhx0217@gmail.com>
Date: Thu, 7 Mar 2024 23:36:07 +0800
Subject: [PATCH 2/8] [release] update version (#5411)

---
 version.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/version.txt b/version.txt
index c2c0004f0e2a..449d7e73a966 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.3.5
+0.3.6

From caa99f34c2db1c36668ae617b4ea2d6e5e0a9fb6 Mon Sep 17 00:00:00 2001
From: digger yu <digger-yu@outlook.com>
Date: Tue, 12 Mar 2024 11:25:16 +0800
Subject: [PATCH 3/8] [hotfix] fix typo s/keywrods/keywords etc. (#5429)

---
 .../ColossalQA/colossalqa/chain/retrieval_qa/base.py      | 8 ++++----
 applications/ColossalQA/colossalqa/prompt/prompt.py       | 6 +++---
 .../ColossalQA/colossalqa/retrieval_conversation_en.py    | 2 +-
 .../colossalqa/retrieval_conversation_universal.py        | 2 +-
 .../ColossalQA/colossalqa/retrieval_conversation_zh.py    | 2 +-
 .../ColossalQA/examples/retrieval_conversation_chatgpt.py | 2 +-
 .../ColossalQA/examples/retrieval_conversation_en.py      | 4 ++--
 .../retrieval_conversation_en_customer_service.py         | 2 +-
 .../examples/retrieval_conversation_universal.py          | 2 +-
 .../ColossalQA/examples/retrieval_conversation_zh.py      | 2 +-
 .../ColossalQA/examples/webui_demo/RAG_ChatBot.py         | 2 +-
 colossalai/inference/README.md                            | 2 +-
 colossalai/legacy/inference/README.md                     | 2 +-
 colossalai/legacy/inference/hybridengine/engine.py        | 4 ++--
 colossalai/nn/optimizer/README.md                         | 2 +-
 colossalai/pipeline/schedule/generate.py                  | 4 ++--
 .../diffusion/ldm/modules/diffusionmodules/openaimodel.py | 2 +-
 17 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/applications/ColossalQA/colossalqa/chain/retrieval_qa/base.py b/applications/ColossalQA/colossalqa/chain/retrieval_qa/base.py
index e80befdaccfa..a6e87e6bea9f 100644
--- a/applications/ColossalQA/colossalqa/chain/retrieval_qa/base.py
+++ b/applications/ColossalQA/colossalqa/chain/retrieval_qa/base.py
@@ -117,8 +117,8 @@ def _call(
             ) = copy.deepcopy(buffered_history_backup), copy.deepcopy(summarized_history_temp_backup)
 
         # if rejection_trigger_keywords is not given, return the response from LLM directly
-        rejection_trigger_keywrods = inputs.get('rejection_trigger_keywrods', [])
-        answer = answer if all([rej not in answer for rej in rejection_trigger_keywrods]) else None
+        rejection_trigger_keywords = inputs.get('rejection_trigger_keywords', [])
+        answer = answer if all([rej not in answer for rej in rejection_trigger_keywords]) else None
         if answer is None: 
             answer = inputs.get('rejection_answer', "抱歉，根据提供的信息无法回答该问题。")
         if self.combine_documents_chain.memory is not None:
@@ -161,8 +161,8 @@ async def _acall(
             input_documents=docs, question=question, callbacks=_run_manager.get_child(), **kwargs
         )
         # if rejection_trigger_keywords is not given, return the response from LLM directly
-        rejection_trigger_keywrods = inputs.get('rejection_trigger_keywrods', [])
-        answer = answer if all([rej not in answer for rej in rejection_trigger_keywrods]) or len(rejection_trigger_keywrods)==0 else None
+        rejection_trigger_keywords = inputs.get('rejection_trigger_keywords', [])
+        answer = answer if all([rej not in answer for rej in rejection_trigger_keywords]) or len(rejection_trigger_keywords)==0 else None
         if answer is None:
             answer = inputs.get('rejection_answer', "抱歉，根据提供的信息无法回答该问题。")
         self.combine_documents_chain.memory.save_context({"question": question}, {"output": answer})
diff --git a/applications/ColossalQA/colossalqa/prompt/prompt.py b/applications/ColossalQA/colossalqa/prompt/prompt.py
index 533f0bd552b9..d62249ba9c51 100644
--- a/applications/ColossalQA/colossalqa/prompt/prompt.py
+++ b/applications/ColossalQA/colossalqa/prompt/prompt.py
@@ -75,7 +75,7 @@
 # Below are English retrieval qa prompts
 
 _EN_RETRIEVAL_QA_PROMPT = """[INST] <<SYS>>Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist content.
-If the answer cannot be infered based on the given context, please say "I cannot answer the question based on the information given.".<</SYS>>
+If the answer cannot be inferred based on the given context, please say "I cannot answer the question based on the information given.".<</SYS>>
 Use the context and chat history to answer the question.
 
 context:
@@ -97,8 +97,8 @@
 Human: I have a friend, Mike. Do you know him?
 Assistant: Yes, I know a person named Mike
 
-sentence: What's his favorate food?
-disambiguated sentence: What's Mike's favorate food?
+sentence: What's his favorite food?
+disambiguated sentence: What's Mike's favorite food?
 [/INST]
 Chat history:
 {chat_history}
diff --git a/applications/ColossalQA/colossalqa/retrieval_conversation_en.py b/applications/ColossalQA/colossalqa/retrieval_conversation_en.py
index d2626321d68d..96bce82b9ee0 100644
--- a/applications/ColossalQA/colossalqa/retrieval_conversation_en.py
+++ b/applications/ColossalQA/colossalqa/retrieval_conversation_en.py
@@ -80,7 +80,7 @@ def run(self, user_input: str, memory: ConversationBufferWithSummary) -> Tuple[s
             self.retrieval_chain.run(
                 query=user_input,
                 stop=[self.memory.human_prefix + ": "],
-                rejection_trigger_keywrods=["cannot answer the question"],
+                rejection_trigger_keywords=["cannot answer the question"],
                 rejection_answer="Sorry, this question cannot be answered based on the information provided.",
             ).split("\n")[0],
             self.memory,
diff --git a/applications/ColossalQA/colossalqa/retrieval_conversation_universal.py b/applications/ColossalQA/colossalqa/retrieval_conversation_universal.py
index 76bec715fb6e..b23058d6dbe3 100644
--- a/applications/ColossalQA/colossalqa/retrieval_conversation_universal.py
+++ b/applications/ColossalQA/colossalqa/retrieval_conversation_universal.py
@@ -103,7 +103,7 @@ def load_supporting_docs(self, files: List[List[str]] = None, text_splitter: Tex
                     break
                 data_name = input("Enter a short description of the data:")
                 separator = input(
-                    "Enter a separator to force separating text into chunks, if no separator is given, the defaut separator is '\\n\\n', press ENTER directly to skip:"
+                    "Enter a separator to force separating text into chunks, if no separator is given, the default separator is '\\n\\n', press ENTER directly to skip:"
                 )
                 separator = separator if separator != "" else "\n\n"
                 retriever_data = DocumentLoader([[file, data_name.replace(" ", "_")]]).all_data
diff --git a/applications/ColossalQA/colossalqa/retrieval_conversation_zh.py b/applications/ColossalQA/colossalqa/retrieval_conversation_zh.py
index 484be21c1553..4eef41947d11 100644
--- a/applications/ColossalQA/colossalqa/retrieval_conversation_zh.py
+++ b/applications/ColossalQA/colossalqa/retrieval_conversation_zh.py
@@ -87,7 +87,7 @@ def run(self, user_input: str, memory: ConversationBufferWithSummary) -> Tuple[s
                 query=user_input,
                 stop=["</答案>"],
                 doc_prefix="支持文档",
-                rejection_trigger_keywrods=["无法回答该问题"],
+                rejection_trigger_keywords=["无法回答该问题"],
                 rejection_answer="抱歉，根据提供的信息无法回答该问题。",
             ).split("\n")[0],
             self.memory,
diff --git a/applications/ColossalQA/examples/retrieval_conversation_chatgpt.py b/applications/ColossalQA/examples/retrieval_conversation_chatgpt.py
index 00b920d274bc..1042adbf2095 100644
--- a/applications/ColossalQA/examples/retrieval_conversation_chatgpt.py
+++ b/applications/ColossalQA/examples/retrieval_conversation_chatgpt.py
@@ -61,7 +61,7 @@
     information_retriever.add_documents(docs=documents, cleanup="incremental", mode="by_source", embedding=embedding)
 
     prompt_template = """Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
-    If the answer cannot be infered based on the given context, please don't share false information.
+    If the answer cannot be inferred based on the given context, please don't share false information.
     Use the context and chat history to respond to the human's input at the end or carry on the conversation. You should generate one response only. No following up is needed.
 
     context:
diff --git a/applications/ColossalQA/examples/retrieval_conversation_en.py b/applications/ColossalQA/examples/retrieval_conversation_en.py
index e0fe46ae6322..fe2b9b4db3c2 100644
--- a/applications/ColossalQA/examples/retrieval_conversation_en.py
+++ b/applications/ColossalQA/examples/retrieval_conversation_en.py
@@ -67,7 +67,7 @@ def disambiguity(input):
             break
         data_name = input("Enter a short description of the data:")
         separator = input(
-            "Enter a separator to force separating text into chunks, if no separator is given, the defaut separator is '\\n\\n'. Note that"
+            "Enter a separator to force separating text into chunks, if no separator is given, the default separator is '\\n\\n'. Note that"
             + "we use neural text spliter to split texts into chunks, the seperator only serves as a delimiter to force split long passage into"
             + " chunks before passing to the neural network. Press ENTER directly to skip:"
         )
@@ -112,7 +112,7 @@ def disambiguity(input):
         agent_response = retrieval_chain.run(
             query=user_input,
             stop=["Human: "],
-            rejection_trigger_keywrods=EN_RETRIEVAL_QA_TRIGGER_KEYWORDS,
+            rejection_trigger_keywords=EN_RETRIEVAL_QA_TRIGGER_KEYWORDS,
             rejection_answer=EN_RETRIEVAL_QA_REJECTION_ANSWER,
         )
         agent_response = agent_response.split("\n")[0]
diff --git a/applications/ColossalQA/examples/retrieval_conversation_en_customer_service.py b/applications/ColossalQA/examples/retrieval_conversation_en_customer_service.py
index d98a75592372..d4ba73b9468c 100644
--- a/applications/ColossalQA/examples/retrieval_conversation_en_customer_service.py
+++ b/applications/ColossalQA/examples/retrieval_conversation_en_customer_service.py
@@ -142,7 +142,7 @@ def metadata_func(data_sample, additional_fields):
         agent_response = retrieval_chain.run(
             query=user_input,
             stop=["Human: "],
-            rejection_trigger_keywrods=EN_RETRIEVAL_QA_TRIGGER_KEYWORDS,
+            rejection_trigger_keywords=EN_RETRIEVAL_QA_TRIGGER_KEYWORDS,
             rejection_answer=EN_RETRIEVAL_QA_REJECTION_ANSWER,
         )
         agent_response = agent_response.split("\n")[0]
diff --git a/applications/ColossalQA/examples/retrieval_conversation_universal.py b/applications/ColossalQA/examples/retrieval_conversation_universal.py
index 361aa9833d27..5d13a63c3fad 100644
--- a/applications/ColossalQA/examples/retrieval_conversation_universal.py
+++ b/applications/ColossalQA/examples/retrieval_conversation_universal.py
@@ -11,7 +11,7 @@
     parser.add_argument('--sql_file_path', type=str, default=None, help='path to the a empty folder for storing sql files for indexing')
     args = parser.parse_args()
     
-    # Will ask for documents path in runnning time
+    # Will ask for documents path in running time
     session = UniversalRetrievalConversation(files_en=None, 
                 files_zh=None, 
                 zh_model_path=args.zh_model_path, en_model_path=args.en_model_path,
diff --git a/applications/ColossalQA/examples/retrieval_conversation_zh.py b/applications/ColossalQA/examples/retrieval_conversation_zh.py
index cbbbefad7c7b..b143b9baacc1 100644
--- a/applications/ColossalQA/examples/retrieval_conversation_zh.py
+++ b/applications/ColossalQA/examples/retrieval_conversation_zh.py
@@ -107,7 +107,7 @@ def disambiguity(input: str):
             query=user_input,
             stop=["</答案>"],
             doc_prefix="支持文档",
-            rejection_trigger_keywrods=ZH_RETRIEVAL_QA_TRIGGER_KEYWORDS,
+            rejection_trigger_keywords=ZH_RETRIEVAL_QA_TRIGGER_KEYWORDS,
             rejection_answer=ZH_RETRIEVAL_QA_REJECTION_ANSWER,
         )
         print(f"Agent: {agent_response}")
diff --git a/applications/ColossalQA/examples/webui_demo/RAG_ChatBot.py b/applications/ColossalQA/examples/webui_demo/RAG_ChatBot.py
index c58be9c33477..526328dda11b 100644
--- a/applications/ColossalQA/examples/webui_demo/RAG_ChatBot.py
+++ b/applications/ColossalQA/examples/webui_demo/RAG_ChatBot.py
@@ -140,7 +140,7 @@ def run(self, user_input: str, memory: ConversationBufferWithSummary) -> Tuple[s
         result = self.rag_chain.run(
             query=user_input,
             stop=[memory.human_prefix + ": "],
-            rejection_trigger_keywrods=ZH_RETRIEVAL_QA_TRIGGER_KEYWORDS,
+            rejection_trigger_keywords=ZH_RETRIEVAL_QA_TRIGGER_KEYWORDS,
             rejection_answer=ZH_RETRIEVAL_QA_REJECTION_ANSWER,
         )
         return result, memory
diff --git a/colossalai/inference/README.md b/colossalai/inference/README.md
index dfac7cfd9be9..287853a86383 100644
--- a/colossalai/inference/README.md
+++ b/colossalai/inference/README.md
@@ -89,7 +89,7 @@ docker pull hpcaitech/colossalai-inference:v2
 docker run -it --gpus all --name ANY_NAME -v $PWD:/workspace -w /workspace hpcaitech/colossalai-inference:v2 /bin/bash
 
 # enter into docker container
-cd /path/to/CollossalAI
+cd /path/to/ColossalAI
 pip install -e .
 
 ```
diff --git a/colossalai/legacy/inference/README.md b/colossalai/legacy/inference/README.md
index f466f46c1629..63b5f2a75fa8 100644
--- a/colossalai/legacy/inference/README.md
+++ b/colossalai/legacy/inference/README.md
@@ -86,7 +86,7 @@ docker pull hpcaitech/colossalai-inference:v2
 docker run -it --gpus all --name ANY_NAME -v $PWD:/workspace -w /workspace hpcaitech/colossalai-inference:v2 /bin/bash
 
 # enter into docker container
-cd /path/to/CollossalAI
+cd /path/to/ColossalAI
 pip install -e .
 
 # install lightllm
diff --git a/colossalai/legacy/inference/hybridengine/engine.py b/colossalai/legacy/inference/hybridengine/engine.py
index bb0b4c77a2a7..bc4e4fd199c0 100644
--- a/colossalai/legacy/inference/hybridengine/engine.py
+++ b/colossalai/legacy/inference/hybridengine/engine.py
@@ -46,7 +46,7 @@ class CaiInferEngine:
 
     model = LlamaForCausalLM.from_pretrained("your_path_to_model")
     tokenizer = LlamaTokenizer.from_pretrained("/home/lczyh/share/models/llama-7b-hf")
-    # assume the model is infered with 2 pipeline stages
+    # assume the model is inferred with 2 pipeline stages
     inferengine = CaiInferEngine(pp_size=2, model=model, model_policy=LlamaModelInferPolicy())
 
     input = ["Introduce a landmark in China ","Introduce a landmark in China "]
@@ -70,7 +70,7 @@ def __init__(
         max_input_len: int = 32,
         max_output_len: int = 32,
         verbose: bool = False,
-        # TODO: implement early_stopping, and various gerneration options
+        # TODO: implement early_stopping, and various generation options
         early_stopping: bool = False,
         do_sample: bool = False,
         num_beams: int = 1,
diff --git a/colossalai/nn/optimizer/README.md b/colossalai/nn/optimizer/README.md
index e89e6217d596..d3f8badc7313 100644
--- a/colossalai/nn/optimizer/README.md
+++ b/colossalai/nn/optimizer/README.md
@@ -47,7 +47,7 @@ be optimized jointly to further speed up training.
 
 2. Model Accuracy
     - Communication Efficiency
-      - Reduce Volumn of Comm.
+      - Reduce Volume of Comm.
       - Reduce Frequency of Comm.
     - Memory Efficiency
       - Mix-Precision Training
diff --git a/colossalai/pipeline/schedule/generate.py b/colossalai/pipeline/schedule/generate.py
index d6a6aec63a12..48ae54c1fe54 100644
--- a/colossalai/pipeline/schedule/generate.py
+++ b/colossalai/pipeline/schedule/generate.py
@@ -164,7 +164,7 @@ def _gen_token_action(self, model: Module):
             self.timestamps[self.mb_manager.idx].append(time.time())
         assert (
             "logits" in logits
-        ), f"When first stage in GENERATE phase, the ouput should have attribute `logits`, but has {logits.keys()}"
+        ), f"When first stage in GENERATE phase, the output should have attribute `logits`, but has {logits.keys()}"
         new_token = self._get_token_id(logits["logits"])
 
         self.mb_manager.step(new_token)
@@ -401,7 +401,7 @@ def generate_step_broadcast(self, model: Module, data_iter: Iterable) -> Union[t
                             self.timestamps[self.mb_manager.idx].append(time.time())
                         assert (
                             "logits" in logits
-                        ), f"When first stage in GENERATE phase, the ouput should have attribute `logits`, but has {logits.keys()}"
+                        ), f"When first stage in GENERATE phase, the output should have attribute `logits`, but has {logits.keys()}"
                         new_token = self._get_token_id(logits["logits"])
                         self.mb_manager.step(new_token)
                         # If the current micro batch is not DONE, go through blocks
diff --git a/examples/images/diffusion/ldm/modules/diffusionmodules/openaimodel.py b/examples/images/diffusion/ldm/modules/diffusionmodules/openaimodel.py
index 614fe510f20e..6c80f3229ce3 100644
--- a/examples/images/diffusion/ldm/modules/diffusionmodules/openaimodel.py
+++ b/examples/images/diffusion/ldm/modules/diffusionmodules/openaimodel.py
@@ -338,7 +338,7 @@ def count_flops_attn(model, _x, y):
 
 class QKVAttentionLegacy(nn.Module):
     """
-    A module which performs QKV attention. Matches legacy QKVAttention + input/ouput heads shaping
+    A module which performs QKV attention. Matches legacy QKVAttention + input/output heads shaping
     """
 
     def __init__(self, n_heads):

From 4b8909f0f336f6fc5489f4f8a4980f5b318e25a9 Mon Sep 17 00:00:00 2001
From: Hongxin Liu <lhx0217@gmail.com>
Date: Wed, 13 Mar 2024 15:24:13 +0800
Subject: [PATCH 4/8] [devops] fix compatibility (#5444)

* [devops] fix compatibility

* [hotfix] update compatibility test on pr

* [devops] fix compatibility

* [devops] record duration during comp test

* [test] decrease test duration

* fix falcon
---
 .compatibility                                 |  3 +--
 .github/workflows/build_on_schedule.yml        |  3 +--
 .../compatiblity_test_on_dispatch.yml          |  5 ++---
 .github/workflows/compatiblity_test_on_pr.yml  |  5 ++---
 .../compatiblity_test_on_schedule.yml          |  5 ++---
 .../plugin/moe_hybrid_parallel_plugin.py       |  3 +--
 .../test_plugin/test_gemini_plugin.py          | 18 ++----------------
 .../test_gemini_checkpoint_io.py               | 10 +---------
 .../test_model/test_shard_falcon.py            |  3 +++
 9 files changed, 15 insertions(+), 40 deletions(-)

diff --git a/.compatibility b/.compatibility
index a918cb162216..d90a74b584d8 100644
--- a/.compatibility
+++ b/.compatibility
@@ -1,2 +1 @@
-2.0.0-11.7.0
-2.1.0-11.8.0
+2.1.0-12.1.0
diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml
index 510665b46f4b..3ff19b37b4bf 100644
--- a/.github/workflows/build_on_schedule.yml
+++ b/.github/workflows/build_on_schedule.yml
@@ -67,7 +67,6 @@ jobs:
           --durations=0 \
           tests/
         env:
-          NCCL_SHM_DISABLE: 1
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
           LLAMA_PATH: /data/scratch/llama-tiny
 
@@ -83,4 +82,4 @@ jobs:
           SERVER_URL: ${{github.server_url }}
           REPO: ${{ github.repository }}
           RUN_ID: ${{ github.run_id }}
-          WEBHOOK_URL: ${{ secrets.LARK_NOTIFICATION_WEBHOOK_URL }}
\ No newline at end of file
+          WEBHOOK_URL: ${{ secrets.LARK_NOTIFICATION_WEBHOOK_URL }}
diff --git a/.github/workflows/compatiblity_test_on_dispatch.yml b/.github/workflows/compatiblity_test_on_dispatch.yml
index a6f9582ac901..76493880651c 100644
--- a/.github/workflows/compatiblity_test_on_dispatch.yml
+++ b/.github/workflows/compatiblity_test_on_dispatch.yml
@@ -50,7 +50,7 @@ jobs:
       matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
     container:
       image: ${{ matrix.container }}
-      options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
+      options: --gpus all --rm -v /dev/shm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
     timeout-minutes: 120
     steps:
       - name: Install dependencies
@@ -87,9 +87,8 @@ jobs:
           pip install -r requirements/requirements-test.txt
       - name: Unit Testing
         run: |
-          PYTHONPATH=$PWD pytest tests
+          PYTHONPATH=$PWD pytest --durations=0 tests
         env:
           DATA: /data/scratch/cifar-10
-          NCCL_SHM_DISABLE: 1
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
           LLAMA_PATH: /data/scratch/llama-tiny
diff --git a/.github/workflows/compatiblity_test_on_pr.yml b/.github/workflows/compatiblity_test_on_pr.yml
index ede6c380a8ec..f582b30907bf 100644
--- a/.github/workflows/compatiblity_test_on_pr.yml
+++ b/.github/workflows/compatiblity_test_on_pr.yml
@@ -41,7 +41,7 @@ jobs:
       matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
     container:
       image: ${{ matrix.container }}
-      options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
+      options: --gpus all --rm -v /dev/shm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
     timeout-minutes: 120
     concurrency:
       group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-test-${{ matrix.container }}
@@ -82,9 +82,8 @@ jobs:
           pip install -r requirements/requirements-test.txt
       - name: Unit Testing
         run: |
-          PYTHONPATH=$PWD pytest tests
+          PYTHONPATH=$PWD pytest --durations=0 tests
         env:
           DATA: /data/scratch/cifar-10
-          NCCL_SHM_DISABLE: 1
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
           LLAMA_PATH: /data/scratch/llama-tiny
diff --git a/.github/workflows/compatiblity_test_on_schedule.yml b/.github/workflows/compatiblity_test_on_schedule.yml
index 1cf456ff62c1..3348b51ecc6e 100644
--- a/.github/workflows/compatiblity_test_on_schedule.yml
+++ b/.github/workflows/compatiblity_test_on_schedule.yml
@@ -38,7 +38,7 @@ jobs:
       matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
     container:
       image: ${{ matrix.container }}
-      options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
+      options: --gpus all --rm -v /dev/shm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
     timeout-minutes: 120
     steps:
       - name: Install dependencies
@@ -80,10 +80,9 @@ jobs:
 
       - name: Unit Testing
         run: |
-          PYTHONPATH=$PWD pytest tests
+          PYTHONPATH=$PWD pytest --durations=0 tests
         env:
           DATA: /data/scratch/cifar-10
-          NCCL_SHM_DISABLE: 1
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
           LLAMA_PATH: /data/scratch/llama-tiny
 
diff --git a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
index 454710fccaa7..ae372dd034e0 100644
--- a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
@@ -182,7 +182,7 @@ def __init__(
         overlap_communication: bool = True,
         use_ep_inside: bool = True,
         custom_policy: Policy = None,
-        checkpoint_io: Optional[MoECheckpintIO] = None,
+        checkpoint_io: Optional[MoECheckpointIO] = None,
     ) -> None:
         assert (
             dist.get_world_size() % (tp_size * pp_size) == 0
@@ -341,7 +341,6 @@ def seed_worker(worker_id):
             **_kwargs,
         )
 
-
     def get_checkpoint_io(self) -> MoECheckpointIO:
         if self.checkpoint_io is None:
             self.checkpoint_io = MoECheckpointIO(self.dp_group, self.pp_group, self.tp_group, self.zero_stage)
diff --git a/tests/test_booster/test_plugin/test_gemini_plugin.py b/tests/test_booster/test_plugin/test_gemini_plugin.py
index 0f72d2bcd3e4..89214477239b 100644
--- a/tests/test_booster/test_plugin/test_gemini_plugin.py
+++ b/tests/test_booster/test_plugin/test_gemini_plugin.py
@@ -1,7 +1,6 @@
 from contextlib import nullcontext
 from typing import Optional
 
-import pytest
 import torch
 import torch.distributed as dist
 
@@ -12,13 +11,7 @@
 from colossalai.lazy.lazy_init import LazyInitContext
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.tensor.colo_parameter import ColoParameter
-from colossalai.testing import (
-    clear_cache_before_run,
-    parameterize,
-    rerun_if_address_is_in_use,
-    skip_if_not_enough_gpus,
-    spawn,
-)
+from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
 from tests.kit.model_zoo import COMMON_MODELS, IS_FAST_TEST, model_zoo
 
 
@@ -177,12 +170,5 @@ def test_gemini_plugin(early_stop: bool = True):
     spawn(run_dist, 4, early_stop=early_stop)
 
 
-@pytest.mark.largedist
-@skip_if_not_enough_gpus(8)
-@rerun_if_address_is_in_use()
-def test_gemini_plugin_3d(early_stop: bool = True):
-    spawn(run_dist, 8, early_stop=early_stop)
-
-
 if __name__ == "__main__":
-    test_gemini_plugin(early_stop=False)
\ No newline at end of file
+    test_gemini_plugin(early_stop=False)
diff --git a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py
index daddf6dc7ca0..ece3b40360e8 100644
--- a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py
+++ b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py
@@ -16,7 +16,6 @@
     clear_cache_before_run,
     parameterize,
     rerun_if_address_is_in_use,
-    skip_if_not_enough_gpus,
     spawn,
 )
 from tests.kit.model_zoo import model_zoo
@@ -178,12 +177,5 @@ def test_gemini_ckpIO():
     spawn(run_dist, 4)
 
 
-@pytest.mark.largedist
-@skip_if_not_enough_gpus(min_gpus=8)
-@rerun_if_address_is_in_use()
-def test_gemini_ckpIO_3d():
-    spawn(run_dist, 8)
-
-
 if __name__ == "__main__":
-    test_gemini_ckpIO()
\ No newline at end of file
+    test_gemini_ckpIO()
diff --git a/tests/test_shardformer/test_model/test_shard_falcon.py b/tests/test_shardformer/test_model/test_shard_falcon.py
index 9630451799c0..5e2efcd80367 100644
--- a/tests/test_shardformer/test_model/test_shard_falcon.py
+++ b/tests/test_shardformer/test_model/test_shard_falcon.py
@@ -1,5 +1,6 @@
 import pytest
 import torch
+import torch.distributed as dist
 
 import colossalai
 from colossalai.logging import disable_existing_loggers
@@ -72,6 +73,8 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     if stage_manager is None or stage_manager.is_first_stage():
         if test_config["precision"] == "fp32":
             atol, rtol = 2e-4, 1e-3
+            if dist.get_world_size() > 4:
+                atol, rtol = 4e-4, 3e-2
         else:
             atol, rtol = 5e-3, 5e-3
         check_weight(falcon, sharded_falcon, col_layer_for_check, tp_group, atol=atol, rtol=rtol, dim=1, verbose=False)

From 972eddb43bdbc8996516b040372f72658ed9170d Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Mon, 18 Mar 2024 15:55:11 +0800
Subject: [PATCH 5/8] [shardformer] fix gathering output when using tensor
 parallelism (#5431)

* fix

* padding vocab_size when using pipeline parallellism

padding vocab_size when using pipeline parallellism

fix

fix

* fix

* fix

fix

fix

* fix gather output

* fix

* fix

* fix

fix resize embedding

fix resize embedding

* fix resize embedding

fix

* revert

* revert

* revert
---
 .../booster/plugin/hybrid_parallel_plugin.py     | 10 +++++++++-
 colossalai/shardformer/modeling/gpt2.py          | 16 ++++++++++++----
 colossalai/shardformer/modeling/llama.py         | 11 ++++++-----
 colossalai/shardformer/policies/base_policy.py   |  2 +-
 colossalai/shardformer/shard/shard_config.py     |  4 +++-
 tests/test_booster/test_plugin/test_3d_plugin.py |  2 +-
 6 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py
index 8cc76dd3e0f3..c37a6b4df72d 100644
--- a/colossalai/booster/plugin/hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -199,7 +199,12 @@ def get_param_info(optim: Optimizer):
 
     if optim is None:
         return {}
-    param_info = {"param_groups": [], "param2id": {}, "id2param": {}, "param2shape": {}}
+    param_info = {
+        "param_groups": [],
+        "param2id": {},
+        "id2param": {},
+        "param2shape": {},
+    }
     start_index = 0
     for group in optim.param_groups:
         packed_group = {k: v for k, v in group.items() if k != "params"}
@@ -899,6 +904,7 @@ class HybridParallelPlugin(PipelinePluginBase):
         enable_jit_fused (bool, optional): Whether to switch on JIT in Shardformer. Default to False.
         enable_sequence_parallelism (bool): Whether to turn on sequence parallelism in Shardformer. Defaults to False.
         enable_sequence_overlap (bool): Whether to turn on sequence overlap in Shardformer. Defaults to False.
+        parallel_output (bool): Whether to keep the output parallel when enabling tensor parallelism. Default to True.
         num_microbatches (int, optional): Number of microbatches when using pipeline parallelism. Defaults to None.
         microbatch_size (int, optional): Microbatch size when using pipeline parallelism.
             Either ``num_microbatches`` or ``microbatch_size`` should be provided if using pipeline.
@@ -939,6 +945,7 @@ def __init__(
         enable_jit_fused: bool = False,
         enable_sequence_parallelism: bool = False,
         enable_sequence_overlap: bool = False,
+        parallel_output: bool = True,
         num_microbatches: Optional[int] = None,
         microbatch_size: Optional[int] = None,
         initial_scale: float = 2**16,
@@ -1035,6 +1042,7 @@ def __init__(
             enable_jit_fused=self.enable_jit_fused,
             enable_sequence_parallelism=enable_sequence_parallelism,
             enable_sequence_overlap=enable_sequence_overlap,
+            parallel_output=parallel_output,
         )
         self.amp_config = dict(
             initial_scale=initial_scale,
diff --git a/colossalai/shardformer/modeling/gpt2.py b/colossalai/shardformer/modeling/gpt2.py
index 3e5cc6015adc..1e22d9094eae 100644
--- a/colossalai/shardformer/modeling/gpt2.py
+++ b/colossalai/shardformer/modeling/gpt2.py
@@ -25,6 +25,7 @@
 from colossalai.shardformer.shard import ShardConfig
 
 from ..layer import cross_entropy_1d
+from ..layer._operation import gather_forward_split_backward
 
 
 class GPT2PipelineForwards:
@@ -337,6 +338,9 @@ def gpt2_lmhead_model_forward(
             else:
                 loss = loss_fct(shift_logits, shift_labels)
 
+            if not shard_config.parallel_output:
+                lm_logits = gather_forward_split_backward(lm_logits, -1, shard_config.tensor_parallel_process_group)
+
         if not return_dict:
             output = (lm_logits,) + outputs[1:]
             return ((loss,) + output) if loss is not None else output
@@ -793,11 +797,12 @@ def forward(
             scale = scale * (1 / float(self.layer_idx + 1))
 
         # use coloattention
-        attention = ColoAttention(
-            embed_dim=self.embed_dim, num_heads=self.num_heads, dropout=self.attn_dropout.p, scale=scale
-        )
+        if not hasattr(self, "attention"):
+            self.attention = ColoAttention(
+                embed_dim=self.embed_dim, num_heads=self.num_heads, dropout=self.attn_dropout.p, scale=scale
+            )
 
-        attn_output = attention(query, key, value, attn_mask=flash_attention_mask, attn_mask_type=attn_mask_type)
+        attn_output = self.attention(query, key, value, attn_mask=flash_attention_mask, attn_mask_type=attn_mask_type)
 
         attn_output = self.c_proj(attn_output)
         attn_output = self.resid_dropout(attn_output)
@@ -1083,6 +1088,9 @@ def forward(
             else:
                 loss = loss_fct(shift_logits, shift_labels)
 
+        if not shard_config.parallel_output:
+            lm_logits = gather_forward_split_backward(lm_logits, -1, shard_config.tensor_parallel_process_group)
+
         if not return_dict:
             output = (lm_logits,) + transformer_outputs[1:]
             return ((loss,) + output) if loss is not None else output
diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index f20ceb2d6760..eb8e9f748527 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -16,7 +16,7 @@
 from colossalai.shardformer.shard import ShardConfig
 
 from ..layer import cross_entropy_1d
-from ..layer._operation import _gather
+from ..layer._operation import gather_forward_split_backward
 
 try:
     from transformers.models.llama.modeling_llama import _prepare_4d_causal_attention_mask
@@ -290,7 +290,7 @@ def llama_for_causal_lm_forward(
                     loss = loss_fct(shift_logits, shift_labels)
 
             if not shard_config.parallel_output:
-                logits = _gather(logits, -1, shard_config.tensor_parallel_process_group)
+                logits = gather_forward_split_backward(logits, -1, shard_config.tensor_parallel_process_group)
 
             if not return_dict:
                 output = (logits,) + outputs[1:]
@@ -485,8 +485,9 @@ def forward(
             flash_attention_mask = ~(attention_mask[:, :, -1].squeeze(1).to(torch.bool)).contiguous()
             attn_mask_type = AttnMaskType.paddedcausal
 
-        attention = ColoAttention(embed_dim=self.hidden_size, num_heads=self.num_heads)
-        attn_output = attention(
+        if not hasattr(self, "attention"):
+            self.attention = ColoAttention(embed_dim=self.hidden_size, num_heads=self.num_heads)
+        attn_output = self.attention(
             query_states,
             key_states,
             value_states,
@@ -593,7 +594,7 @@ def forward(
                 loss = loss_fct(shift_logits, shift_labels)
 
         if not shard_config.parallel_output:
-            logits = _gather(logits, -1, shard_config.tensor_parallel_process_group)
+            logits = gather_forward_split_backward(logits, -1, shard_config.tensor_parallel_process_group)
 
         if not return_dict:
             output = (logits,) + outputs[1:]
diff --git a/colossalai/shardformer/policies/base_policy.py b/colossalai/shardformer/policies/base_policy.py
index 1d2b7a570681..9a49b1ba6a14 100644
--- a/colossalai/shardformer/policies/base_policy.py
+++ b/colossalai/shardformer/policies/base_policy.py
@@ -242,4 +242,4 @@ def get_stage_index(
             end_idx = num_layers_per_stage_accumulated[stage + model_chunk * num_stages + 1]
             stage_indices.append([start_idx, end_idx])
 
-        return stage_indices[0] if num_model_chunks == 1 else stage_indices
+        return stage_indices[0] if num_model_chunks == 1 else stage_indices
\ No newline at end of file
diff --git a/colossalai/shardformer/shard/shard_config.py b/colossalai/shardformer/shard/shard_config.py
index 415fc6dd5f06..da27341d9c29 100644
--- a/colossalai/shardformer/shard/shard_config.py
+++ b/colossalai/shardformer/shard/shard_config.py
@@ -34,8 +34,10 @@ class ShardConfig:
     enable_all_optimization: bool = False
     enable_sequence_parallelism: bool = False
     enable_sequence_overlap: bool = False
-    parallel_output = True
+    parallel_output: bool = True
     extra_kwargs: Dict[str, Any] = field(default_factory=dict)
+    # TODO padding vocab
+    # make_vocab_size_divisible_by: int = 128
     # pipeline_parallel_size: int
     # data_parallel_size: int
     # tensor_parallel_mode: Literal['1d', '2d', '2.5d', '3d']
diff --git a/tests/test_booster/test_plugin/test_3d_plugin.py b/tests/test_booster/test_plugin/test_3d_plugin.py
index 285c4866c441..38361d803c49 100644
--- a/tests/test_booster/test_plugin/test_3d_plugin.py
+++ b/tests/test_booster/test_plugin/test_3d_plugin.py
@@ -260,7 +260,7 @@ def run_grad_acc_test(test_args):
         origin_model, origin_optimizer, dataloader=dataloader
     )
     for p1, p2 in zip(model.unwrap().parameters(), origin_model.unwrap().parameters()):
-        assert_close(p1.to(p2.dtype), p2, atol=1e-2, rtol=1e-2)
+        assert_close(p1.to(p2.dtype), p2, atol=1e-2, rtol=1e-2)      
 
 
 def run_dist(rank, world_size, port, early_stop: bool = True):

From 5f873ca8896fed31e5c4aa970f4070f789b386fc Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Mon, 18 Mar 2024 18:31:18 +0800
Subject: [PATCH 6/8] [doc] release Open-Sora 1.0 with model weights (#5468)

* [doc] release Open-Sora 1.0 with model weights

* [doc] release Open-Sora 1.0 with model weights

* [doc] release Open-Sora 1.0 with model weights
---
 README.md              | 9 ++++++---
 docs/README-zh-Hans.md | 9 ++++++---
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index f045c56043be..2f6aa60678ef 100644
--- a/README.md
+++ b/README.md
@@ -25,6 +25,7 @@
 </div>
 
 ## Latest News
+* [2024/03] [Open-Sora: Revealing Complete Model Parameters, Training Details, and Everything for Sora-like Video Generation Models](https://hpc-ai.com/blog/open-sora-v1.0)
 * [2024/03] [Open-Sora：Sora Replication Solution with 46% Cost Reduction, Sequence Expansion to Nearly a Million](https://hpc-ai.com/blog/open-sora)
 * [2024/01] [Inference Performance Improved by 46%, Open Source Solution Breaks the Length Limit of LLM for Multi-Round Conversations](https://hpc-ai.com/blog/Colossal-AI-SwiftInfer)
 * [2024/01] [Construct Refined 13B Private Model With Just $5000 USD, Upgraded Colossal-AI Llama-2 Open Source](https://hpc-ai.com/blog/colossal-llama-2-13b)
@@ -40,7 +41,7 @@
  <li>
    <a href="#Colossal-AI-in-the-Real-World">Colossal-AI for Real World Applications</a>
    <ul>
-     <li><a href="#Open-Sora">Open-Sora: Open-Sora：Sora Replication Solution with 46% Cost Reduction, Sequence Expansion to Nearly a Million</a></li>
+     <li><a href="#Open-Sora">Open-Sora: Revealing Complete Model Parameters, Training Details, and Everything for Sora-like Video Generation Models</a></li>
      <li><a href="#Colossal-LLaMA-2">Colossal-LLaMA-2: One Half-Day of Training Using a Few Hundred Dollars Yields Similar Results to Mainstream Large Models, Open-Source and Commercial-Free Domain-Specific Llm Solution</a></li>
      <li><a href="#ColossalChat">ColossalChat: An Open-Source Solution for Cloning ChatGPT With a Complete RLHF Pipeline</a></li>
      <li><a href="#AIGC">AIGC: Acceleration of Stable Diffusion</a></li>
@@ -126,9 +127,11 @@ distributed training and inference in a few lines.
 ## Colossal-AI in the Real World
 ### Open-Sora
 
-[Open-Sora](https://github.com/hpcaitech/Open-Sora)：Sora Replication Solution with 46% Cost Reduction, Sequence Expansion to Nearly a Million
+[Open-Sora](https://github.com/hpcaitech/Open-Sora)：Revealing Complete Model Parameters, Training Details, and Everything for Sora-like Video Generation Models
 [[code]](https://github.com/hpcaitech/Open-Sora)
-[[blog]](https://hpc-ai.com/blog/open-sora)
+[[blog]](https://hpc-ai.com/blog/open-sora-v1.0)
+[[HuggingFace model weights]](https://huggingface.co/hpcai-tech/Open-Sora)
+[[Demo]](https://github.com/hpcaitech/Open-Sora?tab=readme-ov-file#-latest-demo)
 
 <p id="diffusion_demo" align="center">
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/sora/open-sora-1.png" width=600/>
diff --git a/docs/README-zh-Hans.md b/docs/README-zh-Hans.md
index 110e1a502b0f..7d267b16f442 100644
--- a/docs/README-zh-Hans.md
+++ b/docs/README-zh-Hans.md
@@ -24,6 +24,7 @@
 </div>
 
 ## 新闻
+* [2024/03] [Open-Sora: Revealing Complete Model Parameters, Training Details, and Everything for Sora-like Video Generation Models](https://hpc-ai.com/blog/open-sora-v1.0)
 * [2024/03] [Open-Sora：Sora Replication Solution with 46% Cost Reduction, Sequence Expansion to Nearly a Million](https://hpc-ai.com/blog/open-sora)
 * [2024/01] [Inference Performance Improved by 46%, Open Source Solution Breaks the Length Limit of LLM for Multi-Round Conversations](https://hpc-ai.com/blog/Colossal-AI-SwiftInfer)
 * [2024/01] [Construct Refined 13B Private Model With Just $5000 USD, Upgraded Colossal-AI Llama-2 Open Source](https://hpc-ai.com/blog/colossal-llama-2-13b)
@@ -39,7 +40,7 @@
  <li>
    <a href="#Colossal-AI-in-the-Real-World">Colossal-AI 成功案例</a>
    <ul>
-     <li><a href="#Open-Sora">Open-Sora：开源Sora复现方案，成本降低46%，序列扩充至近百万</a></li>
+     <li><a href="#Open-Sora">Open-Sora：全面开源类Sora模型参数和所有训练细节</a></li>
      <li><a href="#Colossal-LLaMA-2">Colossal-LLaMA-2: 千元预算半天训练，效果媲美主流大模型，开源可商用中文LLaMA-2</a></li>
      <li><a href="#ColossalChat">ColossalChat：完整RLHF流程0门槛克隆ChatGPT</a></li>
      <li><a href="#AIGC">AIGC: 加速 Stable Diffusion</a></li>
@@ -121,9 +122,11 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
 ## Colossal-AI 成功案例
 ### Open-Sora
 
-[Open-Sora](https://github.com/hpcaitech/Open-Sora)：开源Sora复现方案，成本降低46%，序列扩充至近百万
+[Open-Sora](https://github.com/hpcaitech/Open-Sora)：全面开源类Sora模型参数和所有训练细节
 [[代码]](https://github.com/hpcaitech/Open-Sora)
-[[博客]](https://hpc-ai.com/blog/open-sora)
+[[博客]](https://hpc-ai.com/blog/open-sora-v1.0)
+[[模型权重]](https://huggingface.co/hpcai-tech/Open-Sora)
+[[演示样例]](https://github.com/hpcaitech/Open-Sora?tab=readme-ov-file#-latest-demo)
 
 <p id="diffusion_demo" align="center">
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/sora/open-sora-1.png" width=600/>

From 119b62a4320434420bba2250cfc4e5d374b74be0 Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Wed, 20 Mar 2024 16:08:41 +0800
Subject: [PATCH 7/8] [doc] update open-sora demo (#5479)

* [doc] update open-sora demo

* [doc] update open-sora demo

* [doc] update open-sora demo
---
 README.md              | 13 ++++++-------
 applications/README.md |  2 +-
 docs/README-zh-Hans.md | 12 +++++-------
 3 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 2f6aa60678ef..7c234b15e75e 100644
--- a/README.md
+++ b/README.md
@@ -133,14 +133,13 @@ distributed training and inference in a few lines.
 [[HuggingFace model weights]](https://huggingface.co/hpcai-tech/Open-Sora)
 [[Demo]](https://github.com/hpcaitech/Open-Sora?tab=readme-ov-file#-latest-demo)
 
-<p id="diffusion_demo" align="center">
-<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/sora/open-sora-1.png" width=600/>
-</p>
-
-<p id="diffusion_demo" align="center">
-<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/sora/open-sora-2.png" width=600/>
-</p>
+<div align="center">
+   <a href="https://www.youtube.com/watch?v=iDTxepqixuc">
+   <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/sora/sora-demo.png" width="700" />
+   </a>
+</div>
 
+<p align="right">(<a href="#top">back to top</a>)</p>
 
 ### Colossal-LLaMA-2
 
diff --git a/applications/README.md b/applications/README.md
index 8abe1e52d96c..120767d5c9ea 100644
--- a/applications/README.md
+++ b/applications/README.md
@@ -4,7 +4,7 @@ This directory contains the applications that are powered by Colossal-AI.
 
 The list of applications include:
 
-- [X] [Open-Sora](https://github.com/hpcaitech/Open-Sora): Sora Replication Solution with 46% Cost Reduction, Sequence Expansion to Nearly a Million
+- [X] [Open-Sora](https://github.com/hpcaitech/Open-Sora): Revealing Complete Model Parameters, Training Details, and Everything for Sora-like Video Generation Models
 - [X] [Colossal-LLaMA-2](./Colossal-LLaMA-2/): Continual Pre-training of LLaMA-2.
 - [X] [ColossalEval](./ColossalEval): Evaluation Pipeline for LLMs.
 - [X] [ColossalChat](./Chat/README.md): Replication of ChatGPT with RLHF.
diff --git a/docs/README-zh-Hans.md b/docs/README-zh-Hans.md
index 7d267b16f442..93045ea6adc6 100644
--- a/docs/README-zh-Hans.md
+++ b/docs/README-zh-Hans.md
@@ -128,13 +128,11 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
 [[模型权重]](https://huggingface.co/hpcai-tech/Open-Sora)
 [[演示样例]](https://github.com/hpcaitech/Open-Sora?tab=readme-ov-file#-latest-demo)
 
-<p id="diffusion_demo" align="center">
-<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/sora/open-sora-1.png" width=600/>
-</p>
-
-<p id="diffusion_demo" align="center">
-<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/sora/open-sora-2.png" width=600/>
-</p>
+<div align="center">
+   <a href="https://www.bilibili.com/video/BV1dW421c7MN">
+   <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/sora/sora-demo-cn.png" width="700" />
+   </a>
+</div>
 
 ### Colossal-LLaMA-2
 

From 72320d7c0ea3a244262234244b2b1c3be2bc3950 Mon Sep 17 00:00:00 2001
From: Hongxin Liu <lhx0217@gmail.com>
Date: Thu, 21 Mar 2024 18:07:22 +0800
Subject: [PATCH 8/8] [example] add grok-1 inference (#5485)

* [misc] add submodule

* remove submodule

* [example] support grok-1 tp inference

* [example] add grok-1 inference script

* [example] refactor code

* [example] add grok-1 readme

* [exmaple] add test ci

* [exmaple] update readme
---
 examples/language/grok-1/README.md            | 43 ++++++++
 examples/language/grok-1/grok1_policy.py      | 99 +++++++++++++++++++
 examples/language/grok-1/inference.py         | 32 ++++++
 examples/language/grok-1/inference_tp.py      | 50 ++++++++++
 examples/language/grok-1/requirements.txt     |  4 +
 .../language/grok-1/run_inference_fast.sh     | 11 +++
 .../language/grok-1/run_inference_slow.sh     | 11 +++
 examples/language/grok-1/test_ci.sh           |  1 +
 examples/language/grok-1/utils.py             | 46 +++++++++
 9 files changed, 297 insertions(+)
 create mode 100644 examples/language/grok-1/README.md
 create mode 100644 examples/language/grok-1/grok1_policy.py
 create mode 100644 examples/language/grok-1/inference.py
 create mode 100644 examples/language/grok-1/inference_tp.py
 create mode 100644 examples/language/grok-1/requirements.txt
 create mode 100755 examples/language/grok-1/run_inference_fast.sh
 create mode 100755 examples/language/grok-1/run_inference_slow.sh
 create mode 100755 examples/language/grok-1/test_ci.sh
 create mode 100644 examples/language/grok-1/utils.py

diff --git a/examples/language/grok-1/README.md b/examples/language/grok-1/README.md
new file mode 100644
index 000000000000..c523f941262d
--- /dev/null
+++ b/examples/language/grok-1/README.md
@@ -0,0 +1,43 @@
+# Grok-1 Inference
+
+## Install
+
+```bash
+# Make sure you install colossalai from the latest source code
+git clone https://github.com/hpcaitech/ColossalAI.git
+cd ColossalAI
+pip install .
+cd examples/language/grok-1
+pip install -r requirements.txt
+```
+
+## Tokenizer preparation
+
+You should download the tokenizer from the official grok-1 repository.
+
+```bash
+wget https://github.com/xai-org/grok-1/raw/main/tokenizer.model
+```
+
+## Inference
+
+You need 8x A100 80GB or equivalent GPUs to run the inference.
+
+We provide two scripts for inference. `run_inference_fast.sh` uses tensor parallelism provided by ColossalAI, and it is faster. `run_inference_slow.sh` uses auto device provided by transformers, and it is slower.
+
+Command format:
+
+```bash
+./run_inference_fast.sh <model_name_or_path> <tokenizer_path>
+./run_inference_slow.sh <model_name_or_path> <tokenizer_path>
+```
+
+`model_name_or_path` can be a local path or a model name from Hugging Face model hub. We provided weights on model hub, named `hpcaitech/grok-1`.
+
+Command example:
+
+```bash
+./run_inference_fast.sh hpcaitech/grok-1 tokenizer.model
+```
+
+It will take 5-10 minutes to load checkpoints. Don't worry, it's not stuck.
diff --git a/examples/language/grok-1/grok1_policy.py b/examples/language/grok-1/grok1_policy.py
new file mode 100644
index 000000000000..aefea6f3df1c
--- /dev/null
+++ b/examples/language/grok-1/grok1_policy.py
@@ -0,0 +1,99 @@
+from typing import Dict, Union
+
+import torch.nn as nn
+
+from colossalai.shardformer.layer import Linear1D_Col, Linear1D_Row, VocabParallelEmbedding1D
+from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
+
+
+class Grok1Policy(Policy):
+    def config_sanity_check(self):
+        pass
+
+    def preprocess(self) -> nn.Module:
+        if self.shard_config.enable_tensor_parallelism:
+            vocab_size = self.model.config.vocab_size
+            world_size = self.shard_config.tensor_parallel_size
+            assert vocab_size % world_size == 0, f"vocab_size {vocab_size} must be divisible by world_size {world_size}"
+        return self.model
+
+    def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
+        policy = {}
+        if self.shard_config.enable_tensor_parallelism:
+            decoder_attribute_replacement = {
+                "attn.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size,
+                "attn.num_heads": self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size,
+                "attn.num_key_value_heads": self.model.config.num_key_value_heads
+                // self.shard_config.tensor_parallel_size,
+            }
+            decoder_submodule_replacement = [
+                SubModuleReplacementDescription(
+                    suffix="attn.q_proj",
+                    target_module=Linear1D_Col,
+                ),
+                SubModuleReplacementDescription(
+                    suffix="attn.k_proj",
+                    target_module=Linear1D_Col,
+                ),
+                SubModuleReplacementDescription(
+                    suffix="attn.v_proj",
+                    target_module=Linear1D_Col,
+                ),
+                SubModuleReplacementDescription(
+                    suffix="attn.o_proj",
+                    target_module=Linear1D_Row,
+                ),
+            ]
+            for i in range(self.model.config.num_experts):
+                decoder_submodule_replacement.extend(
+                    [
+                        SubModuleReplacementDescription(
+                            suffix=f"moe_block.experts[{i}].linear",
+                            target_module=Linear1D_Col,
+                        ),
+                        SubModuleReplacementDescription(
+                            suffix=f"moe_block.experts[{i}].linear_v",
+                            target_module=Linear1D_Col,
+                        ),
+                        SubModuleReplacementDescription(
+                            suffix=f"moe_block.experts[{i}].linear_1",
+                            target_module=Linear1D_Row,
+                        ),
+                    ]
+                )
+
+            policy["DecoderLayer"] = ModulePolicyDescription(
+                attribute_replacement=decoder_attribute_replacement,
+                sub_module_replacement=decoder_submodule_replacement,
+            )
+            self.append_or_create_submodule_replacement(
+                description=SubModuleReplacementDescription(
+                    suffix="embed_tokens",
+                    target_module=VocabParallelEmbedding1D,
+                ),
+                policy=policy,
+                target_key="Grok1Model",
+            )
+        return policy
+
+    def postprocess(self):
+        return self.model
+
+
+class Grok1ModelPolicy(Grok1Policy):
+    pass
+
+
+class Grok1ForCausalLMPolicy(Grok1Policy):
+    def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
+        policy = super().module_policy()
+        self.append_or_create_submodule_replacement(
+            description=SubModuleReplacementDescription(
+                suffix="lm_head",
+                target_module=Linear1D_Col,
+                kwargs={"gather_output": not self.shard_config.parallel_output},
+            ),
+            policy=policy,
+            target_key="Grok1ModelForCausalLM",
+        )
+        return policy
diff --git a/examples/language/grok-1/inference.py b/examples/language/grok-1/inference.py
new file mode 100644
index 000000000000..ca0ad0d4fe95
--- /dev/null
+++ b/examples/language/grok-1/inference.py
@@ -0,0 +1,32 @@
+import time
+
+import torch
+from sentencepiece import SentencePieceProcessor
+from transformers import AutoModelForCausalLM
+from utils import get_defualt_parser, inference, print_output
+
+if __name__ == "__main__":
+    parser = get_defualt_parser()
+    args = parser.parse_args()
+    start = time.time()
+    torch.set_default_dtype(torch.bfloat16)
+    model = AutoModelForCausalLM.from_pretrained(
+        args.pretrained,
+        trust_remote_code=True,
+        device_map="auto",
+        torch_dtype=torch.bfloat16,
+    )
+    sp = SentencePieceProcessor(model_file=args.tokenizer)
+    for text in args.text:
+        output = inference(
+            model,
+            sp,
+            text,
+            max_new_tokens=args.max_new_tokens,
+            do_sample=args.do_sample,
+            temperature=args.temperature,
+            top_k=args.top_k,
+            top_p=args.top_p,
+        )
+        print_output(text, sp.decode(output))
+    print(f"Overall time: {time.time() - start} seconds.")
diff --git a/examples/language/grok-1/inference_tp.py b/examples/language/grok-1/inference_tp.py
new file mode 100644
index 000000000000..99de60e1f6be
--- /dev/null
+++ b/examples/language/grok-1/inference_tp.py
@@ -0,0 +1,50 @@
+import time
+
+import torch
+from grok1_policy import Grok1ForCausalLMPolicy
+from sentencepiece import SentencePieceProcessor
+from transformers import AutoModelForCausalLM
+from utils import get_defualt_parser, inference, print_output
+
+import colossalai
+from colossalai.booster import Booster
+from colossalai.booster.plugin import HybridParallelPlugin
+from colossalai.cluster import DistCoordinator
+from colossalai.lazy import LazyInitContext
+from colossalai.utils import get_current_device
+
+if __name__ == "__main__":
+    parser = get_defualt_parser()
+    args = parser.parse_args()
+    start = time.time()
+    colossalai.launch_from_torch({})
+    coordinator = DistCoordinator()
+    plugin = HybridParallelPlugin(
+        tp_size=coordinator.world_size,
+        pp_size=1,
+        precision="bf16",
+        parallel_output=False,
+        custom_policy=Grok1ForCausalLMPolicy(),
+    )
+    booster = Booster(plugin=plugin)
+    torch.set_default_dtype(torch.bfloat16)
+    with LazyInitContext(default_device=get_current_device()):
+        model = AutoModelForCausalLM.from_pretrained(
+            args.pretrained, trust_remote_code=True, torch_dtype=torch.bfloat16
+        )
+    model, *_ = booster.boost(model)
+    sp = SentencePieceProcessor(model_file=args.tokenizer)
+    for text in args.text:
+        output = inference(
+            model.unwrap(),
+            sp,
+            text,
+            max_new_tokens=args.max_new_tokens,
+            do_sample=args.do_sample,
+            temperature=args.temperature,
+            top_k=args.top_k,
+            top_p=args.top_p,
+        )
+        if coordinator.is_master():
+            print_output(text, sp.decode(output))
+    coordinator.print_on_master(f"Overall time: {time.time() - start} seconds.")
diff --git a/examples/language/grok-1/requirements.txt b/examples/language/grok-1/requirements.txt
new file mode 100644
index 000000000000..15d5ea53a15e
--- /dev/null
+++ b/examples/language/grok-1/requirements.txt
@@ -0,0 +1,4 @@
+torch>=2.1.0,<2.2.0
+colossalai>=0.3.6
+sentencepiece==0.1.99
+transformers==4.35.0
diff --git a/examples/language/grok-1/run_inference_fast.sh b/examples/language/grok-1/run_inference_fast.sh
new file mode 100755
index 000000000000..0dc398c53e33
--- /dev/null
+++ b/examples/language/grok-1/run_inference_fast.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+
+PRETRAINED=${1:-"hpcaitech/grok-1"}
+TOKENIZER=${2:-"tokenizer.model"}
+
+torchrun --standalone --nproc_per_node 8 inference_tp.py --pretrained "$PRETRAINED" \
+    --tokenizer "$TOKENIZER" \
+    --max_new_tokens 64 \
+    --text "The company's annual conference, featuring keynote speakers and exclusive product launches, will be held at the Los Angeles Convention Center from October 20th to October 23rd, 2021. Extract the date mentioned in the above sentence." \
+            "将以下句子翻译成英语。 我喜欢看电影和读书。" \
+            "All books have the same weight, 10 books weigh 5kg, what is the weight of 2 books?"
diff --git a/examples/language/grok-1/run_inference_slow.sh b/examples/language/grok-1/run_inference_slow.sh
new file mode 100755
index 000000000000..c64dd93b9e62
--- /dev/null
+++ b/examples/language/grok-1/run_inference_slow.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+
+PRETRAINED=${1:-"hpcaitech/grok-1"}
+TOKENIZER=${2:-"tokenizer.model"}
+
+python3 inference.py --pretrained "$PRETRAINED" \
+    --tokenizer "$TOKENIZER" \
+    --max_new_tokens 64 \
+    --text "The company's annual conference, featuring keynote speakers and exclusive product launches, will be held at the Los Angeles Convention Center from October 20th to October 23rd, 2021. Extract the date mentioned in the above sentence." \
+            "将以下句子翻译成英语。 我喜欢看电影和读书。" \
+            "All books have the same weight, 10 books weigh 5kg, what is the weight of 2 books?"
diff --git a/examples/language/grok-1/test_ci.sh b/examples/language/grok-1/test_ci.sh
new file mode 100755
index 000000000000..f6a0d658462b
--- /dev/null
+++ b/examples/language/grok-1/test_ci.sh
@@ -0,0 +1 @@
+pip install -r requirements.txt
diff --git a/examples/language/grok-1/utils.py b/examples/language/grok-1/utils.py
new file mode 100644
index 000000000000..f113f852eff6
--- /dev/null
+++ b/examples/language/grok-1/utils.py
@@ -0,0 +1,46 @@
+import argparse
+
+import torch
+
+
+class Bcolors:
+    HEADER = "\033[95m"
+    OKBLUE = "\033[94m"
+    OKCYAN = "\033[96m"
+    OKGREEN = "\033[92m"
+    WARNING = "\033[93m"
+    FAIL = "\033[91m"
+    ENDC = "\033[0m"
+    BOLD = "\033[1m"
+    UNDERLINE = "\033[4m"
+
+
+def print_output(text, output):
+    print(f"-----\n{Bcolors.OKBLUE}{text}{Bcolors.ENDC}{output[len(text):]}")
+
+
+@torch.no_grad()
+def inference(model, sp, text, **generate_kwargs):
+    input_ids = sp.encode(text)
+    input_ids = torch.tensor([input_ids]).cuda()
+    attention_mask = torch.ones_like(input_ids)
+    inputs = {
+        "input_ids": input_ids,
+        "attention_mask": attention_mask,
+        **generate_kwargs,
+    }
+    outputs = model.generate(**inputs)
+    return outputs[0].tolist()
+
+
+def get_defualt_parser():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--pretrained", type=str, default="hpcaitech/grok-1")
+    parser.add_argument("--tokenizer", type=str, default="tokenizer.model")
+    parser.add_argument("--text", type=str, nargs="+", default=["Hi, what's your name?"])
+    parser.add_argument("--max_new_tokens", type=int, default=30)
+    parser.add_argument("--do_sample", action="store_true", default=False)
+    parser.add_argument("--temperature", type=float, default=0.3, help="Set temperature value")
+    parser.add_argument("--top_k", type=int, default=50, help="Set top_k value for top-k-filtering")
+    parser.add_argument("--top_p", type=float, default=0.95, help="Set top_p value for generation")
+    return parser